diff --git a/projects/hip-tests/catch/perftests/compute/hipPerfDotProduct.cc b/projects/hip-tests/catch/perftests/compute/hipPerfDotProduct.cc index aad97f0630..eddcf4bb2a 100644 --- a/projects/hip-tests/catch/perftests/compute/hipPerfDotProduct.cc +++ b/projects/hip-tests/catch/perftests/compute/hipPerfDotProduct.cc @@ -18,10 +18,10 @@ */ /** -* @addtogroup hipPerfDotProduct hipPerfDotProduct -* @{ -* @ingroup perfComputeTest -*/ + * @addtogroup hipPerfDotProduct hipPerfDotProduct + * @{ + * @ingroup perfComputeTest + */ #include #include @@ -31,11 +31,9 @@ using namespace std; template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_not_equal(int n, - const double* __restrict__ x, - const double* __restrict__ y, - double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ + void vectors_not_equal(int n, const double* __restrict__ x, const double* __restrict__ y, + double* __restrict__ workspace) { int gid = blockIdx.x * blockDim.x + threadIdx.x; double sum = 0.0; @@ -93,9 +91,8 @@ __global__ void vectors_not_equal(int n, } template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_equal(int n, const double* __restrict__ x, - double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ + void vectors_equal(int n, const double* __restrict__ x, double* __restrict__ workspace) { int gid = blockIdx.x * blockDim.x + threadIdx.x; double sum = 0.0; @@ -129,7 +126,7 @@ __global__ void vectors_equal(int n, const double* __restrict__ x, __syncthreads(); if (threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; + sdata[threadIdx.x] += sdata[threadIdx.x + 8]; } __syncthreads(); @@ -149,12 +146,11 @@ __global__ void vectors_equal(int n, const double* __restrict__ x, if (threadIdx.x == 0) { workspace[blockIdx.x] = sdata[0]; - } + } } template -__launch_bounds__(BLOCKSIZE) -__global__ void dot_reduction(double* __restrict__ workspace) { +__launch_bounds__(BLOCKSIZE) __global__ void dot_reduction(double* __restrict__ workspace) { __shared__ double sdata[BLOCKSIZE]; sdata[threadIdx.x] = workspace[threadIdx.x]; @@ -187,7 +183,8 @@ __global__ void dot_reduction(double* __restrict__ workspace) { if (threadIdx.x < 4) { sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } __syncthreads(); + } + __syncthreads(); if (threadIdx.x < 2) { sdata[threadIdx.x] += sdata[threadIdx.x + 2]; @@ -203,8 +200,7 @@ __global__ void dot_reduction(double* __restrict__ workspace) { } } -void computeDotProduct(int n, const double* x, const double* y, double& result, - double* workspace) { +void computeDotProduct(int n, const double* x, const double* y, double& result, double* workspace) { dim3 blocks(DOT_DIM); dim3 threadsPerBlock(DOT_DIM); @@ -225,16 +221,16 @@ void computeDotProduct(int n, const double* x, const double* y, double& result, } /** -* Test Description -* ------------------------ -* - Verify the device kernel results comparing it with the host results. -* Test source -* ------------------------ -* - perftests/compute/hipPerfDotProduct.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the device kernel results comparing it with the host results. + * Test source + * ------------------------ + * - perftests/compute/hipPerfDotProduct.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDotProduct") { int nGpu = 0; @@ -252,120 +248,120 @@ TEST_CASE("Perf_hipPerfDotProduct") { for (unsigned int testCase = 0; testCase < 3; testCase++) { vector vectorSize = {200, 300, 50}; switch (testCase) { - case 0: - nx = vectorSize[0]; - ny = vectorSize[0]; - nz = vectorSize[0]; - break; + case 0: + nx = vectorSize[0]; + ny = vectorSize[0]; + nz = vectorSize[0]; + break; - case 1: - nx = vectorSize[1]; - ny = vectorSize[1]; - nz = vectorSize[1]; - break; + case 1: + nx = vectorSize[1]; + ny = vectorSize[1]; + nz = vectorSize[1]; + break; - case 2: - nx = vectorSize[0]; - ny = vectorSize[1]; - nz = vectorSize[2]; - break; + case 2: + nx = vectorSize[0]; + ny = vectorSize[1]; + nz = vectorSize[2]; + break; - default: - break; - } + default: + break; + } - int trials = 200; - int size = nx * ny * nz; + int trials = 200; + int size = nx * ny * nz; - vector hx(size); - vector hy(size); - double hresult_xy = 0.0; - double hresult_xx = 0.0; + vector hx(size); + vector hy(size); + double hresult_xy = 0.0; + double hresult_xx = 0.0; - srand(time(NULL)); + srand(time(NULL)); - for (int i = 0; i < size; ++i) { - hx[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; - hy[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; + for (int i = 0; i < size; ++i) { + hx[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; + hy[i] = 2.0 * static_cast(rand()) / static_cast(RAND_MAX) - 1.0; - hresult_xy += hx[i] * hy[i]; - hresult_xx += hx[i] * hx[i]; - } + hresult_xy += hx[i] * hy[i]; + hresult_xx += hx[i] * hx[i]; + } - double* dx; - double* dy; - double* workspace; - double dresult; + double* dx; + double* dy; + double* workspace; + double dresult; - HIP_CHECK(hipMalloc(reinterpret_cast(&dx), sizeof(double) * size)); - HIP_CHECK(hipMalloc(reinterpret_cast(&dy), sizeof(double) * size)); - HIP_CHECK(hipMalloc(reinterpret_cast(&workspace), sizeof(double) * DOT_DIM)); + HIP_CHECK(hipMalloc(reinterpret_cast(&dx), sizeof(double) * size)); + HIP_CHECK(hipMalloc(reinterpret_cast(&dy), sizeof(double) * size)); + HIP_CHECK(hipMalloc(reinterpret_cast(&workspace), sizeof(double) * DOT_DIM)); - HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - // Warm up - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - - // Timed run for - HIP_CHECK(hipDeviceSynchronize()); - auto all_start = std::chrono::steady_clock::now(); - - for (int i = 0; i < trials; ++i) { + // Warm up + computeDotProduct(size, dx, dy, dresult, workspace); + computeDotProduct(size, dx, dy, dresult, workspace); computeDotProduct(size, dx, dy, dresult, workspace); - } - float time = 0; - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); + // Timed run for + HIP_CHECK(hipDeviceSynchronize()); + auto all_start = std::chrono::steady_clock::now(); - time /= trials; + for (int i = 0; i < trials; ++i) { + computeDotProduct(size, dx, dy, dresult, workspace); + } - double bw = sizeof(double) * size * 2.0 / 1e9; - double gf = 2.0 * size / 1e9; + float time = 0; + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + time = all_kernel_time.count(); - cout << "\nVector Size: " << size << "\n[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; + time /= trials; - // Verify the device kernel results comparing it with the host results - REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8)); + double bw = sizeof(double) * size * 2.0 / 1e9; + double gf = 2.0 * size / 1e9; - // Warm up - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); + CONSOLE_PRINT("\nVector Size: %d\n[ddot] %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", size, + time, bw / (time / 1e3), gf / (time / 1e3)); - // Timed run for - HIP_CHECK(hipDeviceSynchronize()); - all_start = std::chrono::steady_clock::now(); + // Verify the device kernel results comparing it with the host results + REQUIRE(std::abs(dresult - hresult_xy) < std::max(dresult * 1e-10, 1e-8)); - for (int i = 0; i < trials; ++i) { + // Warm up + computeDotProduct(size, dx, dx, dresult, workspace); + computeDotProduct(size, dx, dx, dresult, workspace); computeDotProduct(size, dx, dx, dresult, workspace); - } - all_end = std::chrono::steady_clock::now(); - all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); + // Timed run for + HIP_CHECK(hipDeviceSynchronize()); + all_start = std::chrono::steady_clock::now(); - time /= trials; - bw = sizeof(double) * size / 1e9; + for (int i = 0; i < trials; ++i) { + computeDotProduct(size, dx, dx, dresult, workspace); + } - cout << "[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; + all_end = std::chrono::steady_clock::now(); + all_kernel_time = all_end - all_start; + time = all_kernel_time.count(); - // Verify the device kernel results comparing it with the host results - REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8)); + time /= trials; + bw = sizeof(double) * size / 1e9; - HIP_CHECK(hipFree(dx)); - HIP_CHECK(hipFree(dy)); - HIP_CHECK(hipFree(workspace)); + CONSOLE_PRINT("[ddot] %.6f msec ; %.6f GByte/s ; %.6f GFlop/s", time, bw / (time / 1e3), + gf / (time / 1e3)); + + // Verify the device kernel results comparing it with the host results + REQUIRE(abs(dresult - hresult_xx) < max(dresult * 1e-10, 1e-8)); + + HIP_CHECK(hipFree(dx)); + HIP_CHECK(hipFree(dy)); + HIP_CHECK(hipFree(workspace)); } } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. + * @} + */ diff --git a/projects/hip-tests/catch/perftests/compute/hipPerfMandelbrot.cc b/projects/hip-tests/catch/perftests/compute/hipPerfMandelbrot.cc index a500b7df3e..ef007e06ce 100644 --- a/projects/hip-tests/catch/perftests/compute/hipPerfMandelbrot.cc +++ b/projects/hip-tests/catch/perftests/compute/hipPerfMandelbrot.cc @@ -18,10 +18,10 @@ */ /** -* @addtogroup hipPerfMandelbrot hipPerfMandelbrot -* @{ -* @ingroup perfComputeTest -*/ + * @addtogroup hipPerfMandelbrot hipPerfMandelbrot + * @{ + * @ingroup perfComputeTest + */ #include #include @@ -45,36 +45,35 @@ coordRec coords[] = { static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); template -__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, - T xStep, T yStep, uint maxIter) { +__global__ void float_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*i); - float y0 = static_cast(yPos + yStep*j); + float x0 = static_cast(xPos + xStep * i); + float y0 = static_cast(yPos + yStep * j); float x = x0; float y = y0; uint iter = 0; float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; } template -__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - +__global__ void float_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*static_cast(i)); - float y0 = static_cast(yPos + yStep*static_cast(j)); + float x0 = static_cast(xPos + xStep * static_cast(i)); + float y0 = static_cast(yPos + yStep * static_cast(j)); float x = x0; float y = y0; @@ -84,72 +83,71 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, float tmp; int stay; int ccount = 0; - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; float savx = x; float savy = y; #ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) { + for (iter = 0; (iter < maxIter); iter += 16) { #else - for (iter = 0; stay && (iter < maxIter); iter+=16) { + for (iter = 0; stay && (iter < maxIter); iter += 16) { #endif x = savx; y = savy; // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; savx = (stay ? x : savx); savy = (stay ? y : savy); - ccount += stay*16; + ccount += stay * 16; #ifdef FAST - if (!stay) - break; + if (!stay) break; #endif } // Handle remainder @@ -158,10 +156,10 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, do { x = savx; y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); tmp = x; - x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); ccount += stay; iter--; savx = (stay ? x : savx); @@ -172,36 +170,36 @@ __global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, } template -__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { +__global__ void double_mad_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - double x0 = static_cast(xPos + xStep*i); - double y0 = static_cast(yPos + yStep*j); + double x0 = static_cast(xPos + xStep * i); + double y0 = static_cast(yPos + yStep * j); double x = x0; double y = y0; uint iter = 0; double tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; - x = fma(-y, y,fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; }; template -__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { +__global__ void double_mandel_unroll_kernel(uint* out, uint width, T xPos, T yPos, T xStep, T yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - double x0 = static_cast(xPos + xStep*static_cast(i)); - double y0 = static_cast(yPos + yStep*static_cast(j)); + double x0 = static_cast(xPos + xStep * static_cast(i)); + double y0 = static_cast(yPos + yStep * static_cast(j)); double x = x0; double y = y0; @@ -211,13 +209,13 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, double tmp; int stay; int ccount = 0; - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; double savx = x; double savy = y; #ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) + for (iter = 0; (iter < maxIter); iter += 16) #else - for (iter = 0; stay && (iter < maxIter); iter+=16) + for (iter = 0; stay && (iter < maxIter); iter += 16) #endif { x = savx; @@ -225,141 +223,131 @@ __global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); // Two iterations - tmp = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*x, y, y0); - x = fma(-y, y, fma(tmp, tmp, x0)); - y = fma(2.0f*tmp, y, y0); + tmp = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * x, y, y0); + x = fma(-y, y, fma(tmp, tmp, x0)); + y = fma(2.0f * tmp, y, y0); - stay = (x*x+y*y) <= 4.0; + stay = (x * x + y * y) <= 4.0; savx = (stay ? x : savx); savy = (stay ? y : savy); - ccount += stay*16; + ccount += stay * 16; #ifdef FAST - if (!stay) - break; + if (!stay) break; #endif - } + } // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x, x, x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } - while (stay && iter); - } - out[tid] = (uint)ccount; + if (!stay) { + iter = 16; + do { + x = savx; + y = savy; + stay = ((x * x + y * y) <= 4.0) && (ccount < maxIter); + tmp = x; + x = fma(-y, y, fma(x, x, x0)); + y = fma(2.0f * tmp, y, y0); + ccount += stay; + iter--; + savx = (stay ? x : savx); + savy = (stay ? y : savy); + } while (stay && iter); + } + out[tid] = (uint)ccount; }; // Expected results for each kernel run at each coord unsigned long long expectedIters[] = { - 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, - 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, - 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, - 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; + 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, 120254651ull, + 203277748ull, 2147483648ull, 120254651ull, 203315114ull, 2147483648ull, 120042599ull, + 203315114ull, 2147483648ull, 120042599ull, 203280620ull, 2147483648ull, 120485704ull, + 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, 120485704ull, + 203315114ull, 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; class hipPerfMandelBrot { public: hipPerfMandelBrot(); ~hipPerfMandelBrot(); - void setNumKernels(unsigned int num) { - numKernels = num; - } + void setNumKernels(unsigned int num) { numKernels = num; } - unsigned int getNumKernels() { - return numKernels; - } + unsigned int getNumKernels() { return numKernels; } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } + void setNumStreams(unsigned int num) { numStreams = num; } + unsigned int getNumStreams() { return numStreams; } void open(int deviceID); bool run(unsigned int testCase); void printResults(void); // array of funtion pointers - typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + typedef void (hipPerfMandelBrot::*funPtr)(uint* out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); // Wrappers - void float_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); + void float_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); + void float_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + void double_mad(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); - void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); + void double_mandel_unroll(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter, hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt); hipStream_t streams[2]; private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numKernels; unsigned int numStreams; @@ -387,9 +375,9 @@ void hipPerfMandelBrot::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; + + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); numCUs = props.multiProcessorCount; } @@ -397,52 +385,52 @@ void hipPerfMandelBrot::open(int deviceId) { void hipPerfMandelBrot::printResults() { int numStreams = getNumStreams(); - std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " - << numStreams << " streams (s)" << std::endl; + CONSOLE_PRINT("Measured perf for kernels in GFLOPS on %d streams (s)", numStreams); - std::map>:: iterator itr; + std::map>::iterator itr; for (itr = results.begin(); itr != results.end(); itr++) { - std::cout << "\n" << std::setw(20) << itr->first << " "; - for (auto i : results[itr->first]) { - std::cout << std::setw(10) << i << " "; - } - } + CONSOLE_PRINT("\n%s ", itr->first.c_str()); + for (auto i : results[itr->first]) { + CONSOLE_PRINT("%10f ", i); + } + } results.clear(); - std::cout << std::endl; + CONSOLE_PRINT("\n"); } // Wrappers for the kernel launches -void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::float_mad(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, - maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::float_mandel_unroll(uint* out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, int threads_per_block, + int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::double_mad(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter, hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } -void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { +void hipPerfMandelBrot::double_mandel_unroll(uint* out, uint width, float xPos, float yPos, + float xStep, float yStep, uint maxIter, + hipStream_t* streams, int blocks, + int threads_per_block, int kernelCnt) { int streamCnt = getNumStreams(); hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); + streams[kernelCnt % streamCnt], out, width, xPos, yPos, xStep, yStep, maxIter); } bool hipPerfMandelBrot::run(unsigned int testCase) { @@ -450,18 +438,18 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { coordIdx = testCase % numCoords; funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, - &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; + &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; // Maximum iteration count maxIter = 32768; - uint ** hPtr = new uint *[numKernels]; - uint ** dPtr = new uint *[numKernels]; + uint** hPtr = new uint*[numKernels]; + uint** dPtr = new uint*[numKernels]; // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. width_ = 256; - bufSize = width_ * width_ * sizeof(uint); + bufSize = width_ * width_ * sizeof(uint); // Create streams for concurrency for (uint i = 0; i < numStreams; i++) { @@ -470,15 +458,15 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { // Allocate memory on the host and device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) } // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); + int threads = (bufSize / sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % threads_per_block); // Copy memory asynchronously and concurrently from host to device for (uint i = 0; i < numKernels; i++) { @@ -489,90 +477,88 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { HIP_CHECK(hipStreamSynchronize(0)); int kernelIdx; - if(testCase == 0 || testCase == 5 || testCase == 10) { + if (testCase == 0 || testCase == 5 || testCase == 10) { kernelIdx = 0; - } else if(testCase == 1 || testCase == 6 || testCase == 11) { + } else if (testCase == 1 || testCase == 6 || testCase == 11) { kernelIdx = 1; - } else if(testCase == 2 || testCase == 7 || testCase == 12) { + } else if (testCase == 2 || testCase == 7 || testCase == 12) { kernelIdx = 2; - } else if(testCase == 3 || testCase == 8 || testCase == 13){ + } else if (testCase == 3 || testCase == 8 || testCase == 13) { kernelIdx = 3; } double totalTime = 0.0; for (unsigned int k = 0; k < numLoops; k++) { - if ((testCase == 0 || testCase == 1 || testCase == 2 || - testCase == 5 || testCase == 6 || testCase == 7 || - testCase == 10 || testCase == 11 || testCase == 12)) { - float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); - float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); - float xPos = static_cast(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = static_cast(coords[coordIdx].y + 0.5 * coords[coordIdx].width); + if ((testCase == 0 || testCase == 1 || testCase == 2 || testCase == 5 || testCase == 6 || + testCase == 7 || testCase == 10 || testCase == 11 || testCase == 12)) { + float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); + float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); + float xPos = static_cast(coords[coordIdx].x - 0.5 * coords[coordIdx].width); + float yPos = static_cast(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } - // Synchronize all the concurrent streams to have completed execution - HIP_CHECK(hipStreamSynchronize(0)); + // Synchronize all the concurrent streams to have completed execution + HIP_CHECK(hipStreamSynchronize(0)); - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } else { - double xStep = coords[coordIdx].width / static_cast(width_); - double yStep = -coords[coordIdx].width / static_cast(width_); - double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; - double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } else { + double xStep = coords[coordIdx].width / static_cast(width_); + double yStep = -coords[coordIdx].width / static_cast(width_); + double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; + double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - // Synchronize all the concurrent streams to have completed execution - HIP_CHECK(hipStreamSynchronize(0)); + // Time the kernel execution + auto all_start = std::chrono::steady_clock::now(); + for (uint i = 0; i < numKernels; i++) { + (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, + threads_per_block, i); + } + // Synchronize all the concurrent streams to have completed execution + HIP_CHECK(hipStreamSynchronize(0)); - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } + auto all_end = std::chrono::steady_clock::now(); + std::chrono::duration all_kernel_time = all_end - all_start; + totalTime += all_kernel_time.count(); + } } // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); - } - for(uint i = 0; i < numKernels; i++) { - checkData(hPtr[i]); - int j =0; - while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { - j++; + for (uint i = 0; i < numKernels; i++) { + HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); } + for (uint i = 0; i < numKernels; i++) { + checkData(hPtr[i]); + int j = 0; + while ((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { + j++; + } - if(j==30) { - std::cout << "Incorrect iteration count detected. "; - } + if (j == 30) { + CONSOLE_PRINT("Incorrect iteration count detected. "); + } } // Compute GFLOPS. There are 7 FLOPs per iteration - double perf = (static_cast(totalIters*numKernels) * 7 * static_cast(1e-09)) / - (totalTime / (double)numLoops); + double perf = (static_cast(totalIters * numKernels) * 7 * static_cast(1e-09)) / + (totalTime / (double)numLoops); - std::vector kernelName = {"float", "float_unroll", - "double", "double_unroll"}; + std::vector kernelName = {"float", "float_unroll", "double", "double_unroll"}; // Print results except for Warm-up kernel if (testCase != 100) { - results[kernelName[testCase % 4]].push_back(perf); - } + results[kernelName[testCase % 4]].push_back(perf); + } - for(uint i = 0 ; i < numStreams; i++) { + for (uint i = 0; i < numStreams; i++) { HIP_CHECK(hipStreamDestroy(streams[i])); } @@ -581,19 +567,19 @@ bool hipPerfMandelBrot::run(unsigned int testCase) { HIP_CHECK(hipHostFree(hPtr[i])); HIP_CHECK(hipFree(dPtr[i])); } - delete [] hPtr; - delete [] dPtr; + delete[] hPtr; + delete[] dPtr; return true; } -void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; +void hipPerfMandelBrot::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; for (unsigned int i = 0; i < width_ * width_; i++) { - ptr2[i] = value; + ptr2[i] = value; } } -void hipPerfMandelBrot::checkData(uint *ptr) { +void hipPerfMandelBrot::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_ * width_; i++) { totalIters += ptr[i]; @@ -601,30 +587,30 @@ void hipPerfMandelBrot::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the warm-up kernel default stream executes serially. -* - verify by running all kernels - sync. -* - verify by running all kernels - async. -* Test source -* ------------------------ -* - perftests/compute/hipPerfMandelbrot.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the warm-up kernel default stream executes serially. + * - verify by running all kernels - sync. + * - verify by running all kernels - async. + * Test source + * ------------------------ + * - perftests/compute/hipPerfMandelbrot.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfMandelbrot") { hipPerfMandelBrot mandelbrotCompute; int deviceId = 0; mandelbrotCompute.open(deviceId); - #if HT_AMD +#if HT_AMD SECTION("warm-up kernel default stream executes serially") { mandelbrotCompute.setNumStreams(1); mandelbrotCompute.setNumKernels(1); - REQUIRE(true == mandelbrotCompute.run(100/*Random number*/)); + REQUIRE(true == mandelbrotCompute.run(100 /*Random number*/)); } - #endif +#endif SECTION("run all - sync") { int i = 0; do { @@ -632,7 +618,7 @@ TEST_CASE("Perf_hipPerfMandelbrot") { mandelbrotCompute.setNumKernels(1); REQUIRE(true == mandelbrotCompute.run(i)); i++; - }while(i < 12); + } while (i < 12); mandelbrotCompute.printResults(); } @@ -643,12 +629,12 @@ TEST_CASE("Perf_hipPerfMandelbrot") { mandelbrotCompute.setNumKernels(2); REQUIRE(true == mandelbrotCompute.run(i)); i++; - }while(i < 12); + } while (i < 12); mandelbrotCompute.printResults(); } } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. + * @} + */ diff --git a/projects/hip-tests/catch/perftests/stream/hipPerfDeviceConcurrency.cc b/projects/hip-tests/catch/perftests/stream/hipPerfDeviceConcurrency.cc index b07c9f49ed..dfe1d83c0a 100644 --- a/projects/hip-tests/catch/perftests/stream/hipPerfDeviceConcurrency.cc +++ b/projects/hip-tests/catch/perftests/stream/hipPerfDeviceConcurrency.cc @@ -18,12 +18,12 @@ */ /** -* @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency -* @{ -* @ingroup perfStreamTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfDeviceConcurrency hipPerfDeviceConcurrency + * @{ + * @ingroup perfStreamTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. + */ #include @@ -34,28 +34,28 @@ typedef struct { } coordRec; static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black + {0.0, 0.0, 0.00001}, // All black }; static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); -__global__ void mandelbrot(uint *out, uint width, float xPos, - float yPos, float xStep, float yStep, uint maxIter) { +__global__ void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, float yStep, + uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); int i = tid % width; int j = tid / width; - float x0 = static_cast(xPos + xStep*i); - float y0 = static_cast(yPos + yStep*j); + float x0 = static_cast(xPos + xStep * i); + float y0 = static_cast(yPos + yStep * j); float x = x0; float y = y0; uint iter = 0; float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { + for (iter = 0; (x * x + y * y <= 4.0f) && (iter < maxIter); iter++) { tmp = x; x = fma(-y, y, fma(x, x, x0)); - y = fma(2.0f*tmp, y, y0); + y = fma(2.0f * tmp, y, y0); } out[tid] = iter; }; @@ -65,20 +65,16 @@ class hipPerfDeviceConcurrency { hipPerfDeviceConcurrency(); ~hipPerfDeviceConcurrency(); - void setNumGpus(unsigned int num) { - numDevices = num; - } - unsigned int getNumGpus() { - return numDevices; - } + void setNumGpus(unsigned int num) { numDevices = num; } + unsigned int getNumGpus() { return numDevices; } void open(void); void close(void); bool run(unsigned int testCase, int numGpus); private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numDevices; unsigned int width_; @@ -100,17 +96,16 @@ void hipPerfDeviceConcurrency::open(void) { } } -void hipPerfDeviceConcurrency::close() { -} +void hipPerfDeviceConcurrency::close() {} bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { static int deviceId; - uint ** hPtr = new uint*[numGpus]; - uint ** dPtr = new uint*[numGpus]; - hipStream_t * streams = new hipStream_t[numGpus]; - int *numCUs = new int[numGpus]; - unsigned int *maxIter = new unsigned int[numGpus]; - unsigned long long *expectedIters = new unsigned long long[numGpus]; + uint** hPtr = new uint*[numGpus]; + uint** dPtr = new uint*[numGpus]; + hipStream_t* streams = new hipStream_t[numGpus]; + int* numCUs = new int[numGpus]; + unsigned int* maxIter = new unsigned int[numGpus]; + unsigned long long* expectedIters = new unsigned long long[numGpus]; int threads, threads_per_block, blocks; float xStep, yStep, xPos, yPos; @@ -124,25 +119,21 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, i)); if (testCase != 0) { - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount - << " CUs" << " and device ID: " << i << std::endl; + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID, + props.name, props.multiProcessorCount, i); } - numCUs[i] = props.multiProcessorCount; int clkFrequency = 0; - HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, - hipDeviceAttributeClockRate, i)); + HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); if (clkFrequency == 0) { - std::cout << "clkFrequency = 0, set it to 1000000\n"; + CONSOLE_PRINT("clkFrequency = 0, set it to 1000000"); clkFrequency = 1000000; } - clkFrequency =(unsigned int)clkFrequency/1000; + clkFrequency = (unsigned int)clkFrequency / 1000; // Maximum iteration count // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) - * numCUs[i]) / 128); + maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); maxIter[i] = (maxIter[i] + 15) & ~15; // Width is divisible by 4 because the mandelbrot @@ -153,15 +144,14 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { HIP_CHECK(hipStreamCreate(&streams[i])); // Allocate memory on the host and device - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), - bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) // Prepare kernel launch parameters - threads = (bufSize/sizeof(uint)); - threads_per_block = 64; - blocks = (threads/threads_per_block) + (threads % threads_per_block); + threads = (bufSize / sizeof(uint)); + threads_per_block = 64; + blocks = (threads / threads_per_block) + (threads % threads_per_block); coordIdx = testCase % numCoords; xStep = static_cast(coords[coordIdx].width / static_cast(width_)); @@ -180,10 +170,9 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { deviceId = i; } - HIP_CHECK(hipSetDevice(deviceId)); - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, - streams[i], dPtr[i], width_, xPos, yPos, xStep, - yStep, maxIter[i]); + HIP_CHECK(hipSetDevice(deviceId)); + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], dPtr[i], + width_, xPos, yPos, xStep, yStep, maxIter[i]); } for (int i = 0; i < numGpus; i++) { HIP_CHECK(hipStreamSynchronize(0)); @@ -192,8 +181,8 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { auto all_end = std::chrono::steady_clock::now(); std::chrono::duration all_kernel_time = all_end - all_start; - for(int i = 0; i < numGpus; i++) { - if(testCase != 0) { + for (int i = 0; i < numGpus; i++) { + if (testCase != 0) { deviceId = i; } HIP_CHECK(hipSetDevice(deviceId)); @@ -201,11 +190,11 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { // Copy data back from device to the host HIP_CHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); checkData(hPtr[i]); - expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; + expectedIters[i] = width_ * width_ * (unsigned long long)maxIter[i]; if (testCase != 0) { checkData(hPtr[i]); if (totalIters != expectedIters[i]) { - std::cout << "Incorrect iteration count detected" << std::endl; + CONSOLE_PRINT("Incorrect iteration count detected"); } } @@ -216,31 +205,30 @@ bool hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { } if (testCase != 0) { - std::cout << '\n' << "Measured time for kernel computation on " << numGpus - << " device (s): " << all_kernel_time.count() << " (s) " - << '\n' << std::endl; + CONSOLE_PRINT("\nMeasured time for kernel computation on %d device(s): %.6f (s)\n", numGpus, + all_kernel_time.count()); } if (testCase == 0) { deviceId++; } - delete [] hPtr; - delete [] dPtr; - delete [] streams; - delete [] numCUs; - delete [] maxIter; - delete [] expectedIters; + delete[] hPtr; + delete[] dPtr; + delete[] streams; + delete[] numCUs; + delete[] maxIter; + delete[] expectedIters; return true; } -void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_ ; i++) { - ptr2[i] = value; +void hipPerfDeviceConcurrency::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; + for (unsigned int i = 0; i < width_ * width_; i++) { + ptr2[i] = value; } } -void hipPerfDeviceConcurrency::checkData(uint *ptr) { +void hipPerfDeviceConcurrency::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_ * width_; i++) { totalIters += ptr[i]; @@ -248,16 +236,16 @@ void hipPerfDeviceConcurrency::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the different levels of device concurrency. -* Test source -* ------------------------ -* - perftests/stream/hipPerfDeviceConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the different levels of device concurrency. + * Test source + * ------------------------ + * - perftests/stream/hipPerfDeviceConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfDeviceConcurrency") { hipPerfDeviceConcurrency deviceConcurrency; @@ -279,6 +267,6 @@ TEST_CASE("Perf_hipPerfDeviceConcurrency") { } /** -* End doxygen group perfStreamTest. -* @} -*/ + * End doxygen group perfStreamTest. + * @} + */ diff --git a/projects/hip-tests/catch/perftests/stream/hipPerfStreamConcurrency.cc b/projects/hip-tests/catch/perftests/stream/hipPerfStreamConcurrency.cc index ba4a04aa9d..aa069e2fdd 100644 --- a/projects/hip-tests/catch/perftests/stream/hipPerfStreamConcurrency.cc +++ b/projects/hip-tests/catch/perftests/stream/hipPerfStreamConcurrency.cc @@ -18,12 +18,12 @@ */ /** -* @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency -* @{ -* @ingroup perfComputeTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfStreamConcurrency hipPerfStreamConcurrency + * @{ + * @ingroup perfComputeTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. + */ #include #include @@ -55,23 +55,23 @@ static coordRec coords[] = { static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); -__global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter) { +__global__ static void mandelbrot(uint* out, uint width, float xPos, float yPos, float xStep, + float yStep, uint maxIter) { int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % (width/4); - int j = tid / (width/4); - int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3); + int i = tid % (width / 4); + int j = tid / (width / 4); + int4 veci = make_int4(4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3); int4 vecj = make_int4(j, j, j, j); float4 x0; - x0.x = static_cast(xPos + xStep*veci.x); - x0.y = static_cast(xPos + xStep*veci.y); - x0.z = static_cast(xPos + xStep*veci.z); - x0.w = static_cast(xPos + xStep*veci.w); + x0.x = static_cast(xPos + xStep * veci.x); + x0.y = static_cast(xPos + xStep * veci.y); + x0.z = static_cast(xPos + xStep * veci.z); + x0.w = static_cast(xPos + xStep * veci.w); float4 y0; - y0.x = static_cast(yPos + yStep*vecj.x); - y0.y = static_cast(yPos + yStep*vecj.y); - y0.z = static_cast(yPos + yStep*vecj.z); - y0.w = static_cast(yPos + yStep*vecj.w); + y0.x = static_cast(yPos + yStep * vecj.x); + y0.y = static_cast(yPos + yStep * vecj.y); + y0.z = static_cast(yPos + yStep * vecj.z); + y0.w = static_cast(yPos + yStep * vecj.w); float4 x = x0; float4 y = y0; uint iter = 0; @@ -80,53 +80,52 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, int4 ccount = make_int4(0, 0, 0, 0); float4 savx = x; float4 savy = y; - stay.x = (x.x*x.x+y.x*y.x) <= static_cast(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= static_cast(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= static_cast(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= static_cast(4.0f); - for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); - iter+=16) { + stay.x = (x.x * x.x + y.x * y.x) <= static_cast(4.0f); + stay.y = (x.y * x.y + y.y * y.y) <= static_cast(4.0f); + stay.z = (x.z * x.z + y.z * y.z) <= static_cast(4.0f); + stay.w = (x.w * x.w + y.w * y.w) <= static_cast(4.0f); + for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); iter += 16) { x = savx; y = savy; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; // Two iterations - tmp = x*x + x0 - y*y; + tmp = x * x + x0 - y * y; y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; + x = tmp * tmp + x0 - y * y; y = 2.0f * tmp * y + y0; - stay.x = (x.x*x.x+y.x*y.x) <= static_cast(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= static_cast(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= static_cast(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= static_cast(4.0f); + stay.x = (x.x * x.x + y.x * y.x) <= static_cast(4.0f); + stay.y = (x.y * x.y + y.y * y.y) <= static_cast(4.0f); + stay.z = (x.z * x.z + y.z * y.z) <= static_cast(4.0f); + stay.w = (x.w * x.w + y.w * y.w) <= static_cast(4.0f); savx.x = static_cast(stay.x ? x.x : savx.x); savx.y = static_cast(stay.y ? x.y : savx.y); savx.z = static_cast(stay.z ? x.z : savx.z); @@ -135,10 +134,10 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, savy.y = static_cast(stay.y ? y.y : savy.y); savy.z = static_cast(stay.z ? y.z : savy.z); savy.w = static_cast(stay.w ? y.w : savy.w); - ccount.x -= stay.x*16; - ccount.y -= stay.y*16; - ccount.z -= stay.z*16; - ccount.w -= stay.w*16; + ccount.x -= stay.x * 16; + ccount.y -= stay.y * 16; + ccount.z -= stay.z * 16; + ccount.w -= stay.w * 16; } // Handle remainder if (!(stay.x & stay.y & stay.z & stay.w)) { @@ -146,13 +145,13 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, do { x = savx; y = savy; - stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter); - stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter); - stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter); - stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter); + stay.x = ((x.x * x.x + y.x * y.x) <= 4.0f) && (ccount.x < maxIter); + stay.y = ((x.y * x.y + y.y * y.y) <= 4.0f) && (ccount.y < maxIter); + stay.z = ((x.z * x.z + y.z * y.z) <= 4.0f) && (ccount.z < maxIter); + stay.w = ((x.w * x.w + y.w * y.w) <= 4.0f) && (ccount.w < maxIter); tmp = x; - x = x*x + x0 - y*y; - y = 2.0f*tmp*y + y0; + x = x * x + x0 - y * y; + y = 2.0f * tmp * y + y0; ccount.x += stay.x; ccount.y += stay.y; ccount.z += stay.z; @@ -168,7 +167,7 @@ __global__ static void mandelbrot(uint *out, uint width, float xPos, float yPos, savy.w = (stay.w ? y.w : savy.w); } while ((stay.x | stay.y | stay.z | stay.w) && iter); } - uint4 *vecOut = reinterpret_cast(out); + uint4* vecOut = reinterpret_cast(out); vecOut[tid].x = (uint)(ccount.x); vecOut[tid].y = (uint)(ccount.y); vecOut[tid].z = (uint)(ccount.z); @@ -180,27 +179,19 @@ class hipPerfStreamConcurrency { hipPerfStreamConcurrency(); ~hipPerfStreamConcurrency(); - void setNumKernels(unsigned int num) { - numKernels = num; - } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } + void setNumKernels(unsigned int num) { numKernels = num; } + void setNumStreams(unsigned int num) { numStreams = num; } + unsigned int getNumStreams() { return numStreams; } - unsigned int getNumKernels() { - return numKernels; - } + unsigned int getNumKernels() { return numKernels; } bool open(int deviceID); bool run(unsigned int testCase, unsigned int deviceId); void close(void); private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); + void setData(void* ptr, unsigned int value); + void checkData(uint* ptr); unsigned int numKernels; unsigned int numStreams; @@ -227,38 +218,34 @@ bool hipPerfStreamConcurrency::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount << " CUs" - << " and device id: " << deviceId << std::endl; + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device ID: %d", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); + numCUs = props.multiProcessorCount; return true; } -void hipPerfStreamConcurrency::close() { -} +void hipPerfStreamConcurrency::close() {} -bool hipPerfStreamConcurrency::run(unsigned int testCase, - unsigned int deviceId) { +bool hipPerfStreamConcurrency::run(unsigned int testCase, unsigned int deviceId) { int clkFrequency = 0; unsigned int numStreams = getNumStreams(); unsigned int numKernels = getNumKernels(); - HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, - hipDeviceAttributeClockRate, deviceId)); + HIP_CHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId)); if (clkFrequency == 0) { - std::cout << "clkFrequency = 0, set it to 1000000\n"; + CONSOLE_PRINT("clkFrequency = 0, set it to 1000000\n"); clkFrequency = 1000000; } - clkFrequency =(unsigned int)clkFrequency/1000; + clkFrequency = (unsigned int)clkFrequency / 1000; // Maximum iteration count // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter = (unsigned int)(((8388608 * (static_cast(clkFrequency) / 1000)) - * numCUs) / 128); + maxIter = (unsigned int)(((8388608 * (static_cast(clkFrequency) / 1000)) * numCUs) / 128); maxIter = (maxIter + 15) & ~15; - hipStream_t *streams = new hipStream_t[numStreams]; - uint ** hPtr = new uint*[numKernels]; - uint ** dPtr = new uint*[numKernels]; + hipStream_t* streams = new hipStream_t[numStreams]; + uint** hPtr = new uint*[numKernels]; + uint** dPtr = new uint*[numKernels]; // Width is divisible by 4 because the mandelbrot kernel // processes 4 pixels at once. @@ -271,16 +258,15 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Allocate memory on the host and device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), - bufSize, hipHostMallocDefault)); + HIP_CHECK(hipHostMalloc(reinterpret_cast(&hPtr[i]), bufSize, hipHostMallocDefault)); setData(hPtr[i], 0xdeadbeef); - HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) + HIP_CHECK(hipMalloc(reinterpret_cast(&dPtr[i]), bufSize)) } // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); + int threads = (bufSize / sizeof(uint)); + int threads_per_block = 64; + int blocks = (threads / threads_per_block) + (threads % threads_per_block); coordIdx = testCase % numCoords; float xStep = static_cast(coords[coordIdx].width / static_cast(width_)); float yStep = static_cast(-coords[coordIdx].width / static_cast(width_)); @@ -289,8 +275,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Copy memory asynchronously and concurrently from host to device for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), - hPtr[i], bufSize, streams[i % numStreams])); + HIP_CHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), hPtr[i], bufSize, + streams[i % numStreams])); } // Synchronize to make sure all the copies are completed @@ -305,9 +291,8 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, auto all_start = std::chrono::steady_clock::now(); for (uint i = 0; i < numKernels; i++) { - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), - 0, streams[i%numStreams], dPtr[i], width_, xPos, yPos, xStep, - yStep, maxIter); + hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, + streams[i % numStreams], dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter); } // Synchronize all the concurrent streans to have completed execution @@ -320,17 +305,16 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, // Copy data back from device to the host for (uint i = 0; i < numKernels; i++) { - HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], - reinterpret_cast(dPtr[i]), bufSize, - streams[i % numStreams])); + HIP_CHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast(dPtr[i]), bufSize, + streams[i % numStreams])); } if (testCase != 0) { - std::cout <<"Measured time for " << numKernels <<" kernels (s) on " - << numStreams <<" stream (s): " << all_kernel_time.count() << std::endl; + CONSOLE_PRINT("Measured time for %d kernels (s) on %d stream(s): %e\n", numKernels, numStreams, + all_kernel_time.count()); } - for (uint i = 0 ; i < numStreams; i++) { + for (uint i = 0; i < numStreams; i++) { HIP_CHECK(hipStreamDestroy(streams[i])); } @@ -340,20 +324,20 @@ bool hipPerfStreamConcurrency::run(unsigned int testCase, HIP_CHECK(hipFree(dPtr[i])); } - delete [] streams; - delete [] hPtr; - delete [] dPtr; + delete[] streams; + delete[] hPtr; + delete[] dPtr; return true; } -void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ ; i++) { - ptr2[i] = value; +void hipPerfStreamConcurrency::setData(void* ptr, unsigned int value) { + unsigned int* ptr2 = (unsigned int*)ptr; + for (unsigned int i = 0; i < width_; i++) { + ptr2[i] = value; } } -void hipPerfStreamConcurrency::checkData(uint *ptr) { +void hipPerfStreamConcurrency::checkData(uint* ptr) { totalIters = 0; for (unsigned int i = 0; i < width_; i++) { totalIters += ptr[i]; @@ -361,16 +345,16 @@ void hipPerfStreamConcurrency::checkData(uint *ptr) { } /** -* Test Description -* ------------------------ -* - Verify the different levels of stream concurrency. -* Test source -* ------------------------ -* - perftests/stream/hipPerfStreamConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the different levels of stream concurrency. + * Test source + * ------------------------ + * - perftests/stream/hipPerfStreamConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfStreamConcurrency") { hipPerfStreamConcurrency streamConcurrency; @@ -386,10 +370,10 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") { break; case 1: - // default stream executes serially - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; + // default stream executes serially + streamConcurrency.setNumStreams(1); + streamConcurrency.setNumKernels(1); + break; case 2: // 2-way concurrency @@ -419,6 +403,6 @@ TEST_CASE("Perf_hipPerfStreamConcurrency") { } /** -* End doxygen group perfComputeTest. -* @} -*/ + * End doxygen group perfComputeTest. + * @} + */ diff --git a/projects/hip-tests/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc b/projects/hip-tests/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc index edbe4c0046..9b240ac13f 100644 --- a/projects/hip-tests/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc +++ b/projects/hip-tests/catch/perftests/stream/hipPerfStreamCreateCopyDestroy.cc @@ -18,19 +18,17 @@ */ /** -* @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy -* @{ -* @ingroup perfStreamTest -* `hipError_t hipStreamCreate(hipStream_t* stream)` - -* Create an asynchronous stream. -*/ + * @addtogroup hipPerfStreamCreateCopyDestroy hipPerfStreamCreateCopyDestroy + * @{ + * @ingroup perfStreamTest + * `hipError_t hipStreamCreate(hipStream_t* stream)` - + * Create an asynchronous stream. + */ #include #include #include -using namespace std; - #define BufSize 0x1000 #define Iterations 0x100 #define TotalStreams 4 @@ -39,17 +37,20 @@ using namespace std; class hipPerfStreamCreateCopyDestroy { private: - unsigned int numBuffers_; - unsigned int numStreams_; - const size_t totalStreams_[TotalStreams]; - const size_t totalBuffers_[TotalBufs]; + unsigned int numBuffers_; + unsigned int numStreams_; + const size_t totalStreams_[TotalStreams]; + const size_t totalBuffers_[TotalBufs]; + public: - hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0), - totalStreams_{1, 2, 4, 8}, - totalBuffers_{1, 100, 1000, 5000} {}; - ~hipPerfStreamCreateCopyDestroy() {}; - bool open(int deviceID); - bool run(unsigned int testNumber); + hipPerfStreamCreateCopyDestroy() + : numBuffers_(0), + numStreams_(0), + totalStreams_{1, 2, 4, 8}, + totalBuffers_{1, 100, 1000, 5000} {}; + ~hipPerfStreamCreateCopyDestroy(){}; + bool open(int deviceID); + bool run(unsigned int testNumber); }; bool hipPerfStreamCreateCopyDestroy::open(int deviceId) { @@ -61,20 +62,20 @@ bool hipPerfStreamCreateCopyDestroy::open(int deviceId) { HIP_CHECK(hipSetDevice(deviceId)); hipDeviceProp_t props; HIP_CHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID - << " " << props.name << " with " << props.multiProcessorCount << " CUs" - << " and device id: " << deviceId << std::endl; + + CONSOLE_PRINT("info: running on bus 0x%x %s with %d CUs and device id: %d\n", props.pciBusID, + props.name, props.multiProcessorCount, deviceId); return true; } bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { numStreams_ = totalStreams_[testNumber % TotalStreams]; - size_t iter = Iterations / (numStreams_ * (static_cast(1) - << (testNumber / TotalBufs + 1))); - hipStream_t *streams = new hipStream_t[numStreams_]; + size_t iter = + Iterations / (numStreams_ * (static_cast(1) << (testNumber / TotalBufs + 1))); + hipStream_t* streams = new hipStream_t[numStreams_]; numBuffers_ = totalBuffers_[testNumber / TotalBufs]; - float ** dSrc = new float*[numBuffers_]; + float** dSrc = new float*[numBuffers_]; size_t nBytes = BufSize * sizeof(float); for (size_t b = 0; b < numBuffers_; ++b) { @@ -97,8 +98,7 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { for (size_t s = 0; s < numStreams_; ++s) { for (size_t b = 0; b < numBuffers_; ++b) { - HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, - hipMemcpyHostToDevice, streams[s])); + HIP_CHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s])); } } @@ -112,31 +112,31 @@ bool hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { auto time = static_cast(diff.count() * 1000 / (iter * numStreams_)); - cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and " - << setw(4) << numBuffers_ << " buffers " << " and " << setw(4) - << iter << " iterations " << time << " (ms) " << endl; + CONSOLE_PRINT( + "Create+Copy+Destroy time for %u streams and %u buffers and %zu iterations %.6f (ms)\n", + numStreams_, numBuffers_, iter, time); - delete [] hSrc; + delete[] hSrc; for (size_t b = 0; b < numBuffers_; ++b) { HIP_CHECK(hipFree(dSrc[b])); } - delete [] streams; - delete [] dSrc; + delete[] streams; + delete[] dSrc; return true; } /** -* Test Description -* ------------------------ -* - Verify the Create+Copy+Destroy time for different stream. -* Test source -* ------------------------ -* - perftests/stream/hipPerfDeviceConcurrency.cc -* Test requirements -* ------------------------ -* - HIP_VERSION >= 5.6 -*/ + * Test Description + * ------------------------ + * - Verify the Create+Copy+Destroy time for different stream. + * Test source + * ------------------------ + * - perftests/stream/hipPerfDeviceConcurrency.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") { hipPerfStreamCreateCopyDestroy streamCCD; @@ -149,6 +149,6 @@ TEST_CASE("Perf_hipPerfStreamCreateCopyDestroy") { } /** -* End doxygen group perfStreamTest. -* @} -*/ + * End doxygen group perfStreamTest. + * @} + */ diff --git a/projects/hip-tests/perftests/compute/hipPerfDotProduct.cpp b/projects/hip-tests/perftests/compute/hipPerfDotProduct.cpp deleted file mode 100644 index e30d5ab039..0000000000 --- a/projects/hip-tests/perftests/compute/hipPerfDotProduct.cpp +++ /dev/null @@ -1,382 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include - -#define DOT_DIM 256 - -using namespace std; - -template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_not_equal(int n, - const double* __restrict__ x, - const double* __restrict__ y, - double* __restrict__ workspace) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - - double sum = 0.0; - for(int idx = gid; idx < n; idx += hipGridDim_x * hipBlockDim_x) { - sum = fma(y[idx], x[idx], sum); - } - - __shared__ double sdata[BLOCKSIZE]; - sdata[threadIdx.x] = sum; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - __syncthreads(); - - if(threadIdx.x < 64){ - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32){ - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } - __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[blockIdx.x] = sdata[0]; - } - -} - -template -__launch_bounds__(BLOCKSIZE) -__global__ void vectors_equal(int n, const double* __restrict__ x, - double* __restrict__ workspace) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - - double sum = 0.0; - for(int idx = gid; idx < n; idx += hipGridDim_x * blockDim.x) { - sum = fma(x[idx], x[idx], sum); - } - - __shared__ double sdata[BLOCKSIZE]; - sdata[threadIdx.x] = sum; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - __syncthreads(); - - if(threadIdx.x < 64) { - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32) { - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } - __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[blockIdx.x] = sdata[0]; - } -} - -template -__launch_bounds__(BLOCKSIZE) -__global__ void dot_reduction(double* __restrict__ workspace) { - - __shared__ double sdata[BLOCKSIZE]; - - sdata[threadIdx.x] = workspace[threadIdx.x]; - - __syncthreads(); - - if(threadIdx.x < 128) { - sdata[threadIdx.x] += sdata[threadIdx.x + 128]; - } - __syncthreads(); - - if(threadIdx.x < 64) { - sdata[threadIdx.x] += sdata[threadIdx.x + 64]; - } - __syncthreads(); - - if(threadIdx.x < 32) { - sdata[threadIdx.x] += sdata[threadIdx.x + 32]; - } - __syncthreads(); - - if(threadIdx.x < 16) { - sdata[threadIdx.x] += sdata[threadIdx.x + 16]; - } - __syncthreads(); - - if(threadIdx.x < 8) { - sdata[threadIdx.x] += sdata[threadIdx.x + 8]; - } - __syncthreads(); - - if(threadIdx.x < 4) { - sdata[threadIdx.x] += sdata[threadIdx.x + 4]; - } __syncthreads(); - - if(threadIdx.x < 2) { - sdata[threadIdx.x] += sdata[threadIdx.x + 2]; - } - __syncthreads(); - - if(threadIdx.x < 1) { - sdata[threadIdx.x] += sdata[threadIdx.x + 1]; - } - - if(threadIdx.x == 0) { - workspace[0] = sdata[0]; - } - -} - -void computeDotProduct(int n, const double* x, const double* y, double& result, - double* workspace) -{ - dim3 blocks(DOT_DIM); - dim3 threadsPerBlock(DOT_DIM); - - if(x != y) { - hipLaunchKernelGGL(vectors_not_equal, blocks, threadsPerBlock, 0, 0, n, x, y, - workspace); - } - else { - hipLaunchKernelGGL(vectors_equal, blocks, threadsPerBlock, 0, 0, n, x, workspace); - } - - // Part 2 of dot product computation - hipLaunchKernelGGL(dot_reduction, dim3(1), threadsPerBlock, 0, 0, workspace); - - // Copy the final dot product result back from the device - HIPCHECK(hipMemcpy(&result, workspace, sizeof(double), hipMemcpyDeviceToHost)); - - return; -} - -int main(int argc, char* argv[]) { - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - - if (nGpu < 1) { - failed("No GPU!"); - } - hipDeviceProp_t props = {0}; - props = {0}; - HIPCHECK(hipSetDevice(p_gpuDevice)); - HIPCHECK(hipGetDeviceProperties(&props, p_gpuDevice)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << p_gpuDevice - << std::endl; - - int nx, ny, nz; - - for (unsigned int testCase = 0; testCase < 3; testCase++) { - - vector vectorSize = {200, 300, 50}; - switch(testCase) { - - case 0: - nx = vectorSize[0]; - ny = vectorSize[0]; - nz = vectorSize[0]; - break; - - case 1: - nx = vectorSize[1]; - ny = vectorSize[1]; - nz = vectorSize[1]; - break; - - case 2: - nx = vectorSize[0]; - ny = vectorSize[1]; - nz = vectorSize[2]; - break; - - default: - break; - - } - - int trials = 200; - - int size = nx * ny * nz; - - vector hx(size); - vector hy(size); - double hresult_xy = 0.0; - double hresult_xx = 0.0; - - srand(time(NULL)); - - for(int i = 0; i < size; ++i) { - hx[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0; - hy[i] = 2.0 * (double)rand() / (double)RAND_MAX - 1.0; - - hresult_xy += hx[i] * hy[i]; - hresult_xx += hx[i] * hx[i]; - } - - double* dx; - double* dy; - double* workspace; - double dresult; - - HIPCHECK(hipMalloc((void**)&dx, sizeof(double) * size)); - HIPCHECK(hipMalloc((void**)&dy, sizeof(double) * size)); - HIPCHECK(hipMalloc((void**)&workspace, sizeof(double) * DOT_DIM)); - - HIPCHECK(hipMemcpy(dx, hx.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - HIPCHECK(hipMemcpy(dy, hy.data(), sizeof(double) * size, hipMemcpyHostToDevice)); - - // Warm up - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - computeDotProduct(size, dx, dy, dresult, workspace); - - // Timed run for - HIPCHECK(hipDeviceSynchronize()); - auto all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < trials; ++i) { - computeDotProduct(size, dx, dy, dresult, workspace); - } - - float time = 0; - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); - - time /= trials; - - double bw = sizeof(double) * size * 2.0 / 1e9; - double gf = 2.0 * size / 1e9; - - cout << "\nVector Size: " << size << "\n[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; - - // Verify the device kernel results comparing it with the host results - if(std::abs(dresult - hresult_xy) > std::max(dresult * 1e-10, 1e-8)) { - cerr << " Device results inconsistent with host results. " - << " Host result: " << hresult_xy - << " Device result: " << dresult; - } - - // Warm up - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - computeDotProduct(size, dx, dx, dresult, workspace); - - // Timed run for - HIPCHECK(hipDeviceSynchronize()); - all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < trials; ++i) { - computeDotProduct(size, dx, dx, dresult, workspace); - } - - all_end = std::chrono::steady_clock::now(); - all_kernel_time = all_end - all_start; - time = all_kernel_time.count(); - - time /= trials; - bw = sizeof(double) * size / 1e9; - - cout << "[ddot] " << time << "msec ;" << bw/ (time / 1e3) << " GByte/s ;" - << gf/(time / 1e3) << " GFlop/s" << endl; - - // Verify the device kernel results comparing it with the host results - if(abs(dresult - hresult_xx) > max(dresult * 1e-10, 1e-8)) { - cerr << " Device results inconsistent with host results" - << " Host result: " << hresult_xy - << " Device result: " << dresult; - } - - HIPCHECK(hipFree(dx)); - HIPCHECK(hipFree(dy)); - HIPCHECK(hipFree(workspace)); - - } - passed(); - return 0; -} diff --git a/projects/hip-tests/perftests/compute/hipPerfMandelbrot.cpp b/projects/hip-tests/perftests/compute/hipPerfMandelbrot.cpp deleted file mode 100644 index 9f9d6b404e..0000000000 --- a/projects/hip-tests/perftests/compute/hipPerfMandelbrot.cpp +++ /dev/null @@ -1,743 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include -#include -#include -#include -#include - -typedef struct { - double x; - double y; - double width; -} coordRec; - -coordRec coords[] = { - {0.0, 0.0, 4.0}, // Whole set - {0.0, 0.0, 0.00001}, // All black - {-0.0180789661868, 0.6424294066162, 0.00003824140}, // Hit detail -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -template -__global__ void float_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -template -__global__ void float_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*(float)i); - float y0 = (float)(yPos + yStep*(float)j); - - float x = x0; - float y = y0; - -#define FAST - uint iter = 0; - float tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - float savx = x; - float savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) { -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) { -#endif - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } while (stay && iter); - } - - - out[tid] = (uint)ccount; - -}; - - -template -__global__ void double_mad_kernel(uint *out, uint width, T xPos, T yPos, T xStep, T yStep, - uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*i); - double y0 = (double)(yPos + yStep*j); - - double x = x0; - double y = y0; - - uint iter = 0; - double tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - out[tid] = iter; -}; - - -template -__global__ void double_mandel_unroll_kernel(uint *out, uint width, T xPos, - T yPos, T xStep, T yStep, uint maxIter) { - -#pragma FP_CONTRACT ON - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - - int i = tid % width; - int j = tid / width; - double x0 = (double)(xPos + xStep*(double)i); - double y0 = (double)(yPos + yStep*(double)j); - - double x = x0; - double y = y0; - -#define FAST - uint iter = 0; - double tmp; - int stay; - int ccount = 0; - stay = (x*x+y*y) <= 4.0; - double savx = x; - double savy = y; -#ifdef FAST - for (iter = 0; (iter < maxIter); iter+=16) -#else - for (iter = 0; stay && (iter < maxIter); iter+=16) -#endif - { - x = savx; - y = savy; - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - // Two iterations - tmp = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*x,y,y0); - x = fma(-y,y, fma(tmp,tmp,x0)); - y = fma(2.0f*tmp,y,y0); - - stay = (x*x+y*y) <= 4.0; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - ccount += stay*16; -#ifdef FAST - if (!stay) - break; -#endif - } - // Handle remainder - if (!stay) { - iter = 16; - do { - x = savx; - y = savy; - stay = ((x*x+y*y) <= 4.0) && (ccount < maxIter); - tmp = x; - x = fma(-y,y, fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - ccount += stay; - iter--; - savx = (stay ? x : savx); - savy = (stay ? y : savy); - } - while (stay && iter); - - } - out[tid] = (uint)ccount; -}; - -static const unsigned int FMA_EXPECTEDVALUES_INDEX = 15; - -// Expected results for each kernel run at each coord -unsigned long long expectedIters[] = { - 203277748ull, 2147483648ull, 120254651ull, 203277748ull, 2147483648ull, - 120254651ull, 203277748ull, 2147483648ull, 120254651ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull, - 203280620ull, 2147483648ull, 120485704ull, 203280620ull, 2147483648ull, - 120485704ull, 203280620ull, 2147483648ull, 120485704ull, 203315114ull, - 2147483648ull, 120042599ull, 203315114ull, 2147483648ull, 120042599ull}; - -class hipPerfMandelBrot { - public: - hipPerfMandelBrot(); - ~hipPerfMandelBrot(); - - void setNumKernels(unsigned int num) { - numKernels = num; - } - - unsigned int getNumKernels() { - return numKernels; - } - - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } - - void open(int deviceID); - void run(unsigned int testCase, unsigned int deviceId); - void printResults(void); - - // array of funtion pointers - typedef void (hipPerfMandelBrot::*funPtr)(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - // Wrappers - void float_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt); - - void double_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - void double_mandel_unroll(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, int blocks, - int threads_per_block, int kernelCnt); - - hipStream_t streams[2]; - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numKernels; - unsigned int numStreams; - - std::map> results; - unsigned int width_; - unsigned int bufSize; - unsigned int maxIter; - unsigned int coordIdx; - volatile unsigned long long totalIters = 0; - int numCUs; - static const unsigned int numLoops = 10; -}; - - -hipPerfMandelBrot::hipPerfMandelBrot() {} - -hipPerfMandelBrot::~hipPerfMandelBrot() {} - -void hipPerfMandelBrot::open(int deviceId) { - - - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId - << std::endl; - - numCUs = props.multiProcessorCount; -} - - -void hipPerfMandelBrot::printResults() { - - int numkernels = getNumKernels(); - int numStreams = getNumStreams(); - - std::cout << "\n" <<"Measured perf for kernels in GFLOPS on " - << numStreams << " streams (s)" << std::endl; - - std::map>:: iterator itr; - for (itr = results.begin(); itr != results.end(); itr++) { - std::cout << "\n" << std::setw(20) << itr->first << " "; - for(auto i : results[itr->first]) { - std::cout << std::setw(10) << i << " "; - } - } - results.clear(); - - std::cout << std::endl; -} - - -// Wrappers for the kernel launches -void hipPerfMandelBrot::float_mad(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter, hipStream_t* streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, - maxIter); - - -} - - -void hipPerfMandelBrot::float_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mad(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(double_mad_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter, hipStream_t * streams, - int blocks, int threads_per_block, int kernelCnt) { - - int streamCnt = getNumStreams(); - hipLaunchKernelGGL(float_mandel_unroll_kernel, dim3(blocks), dim3(threads_per_block), 0, - streams[kernelCnt % streamCnt], out, width_, xPos, yPos, xStep, yStep, maxIter); - -} - - -void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) { - - unsigned int numStreams = getNumStreams(); - coordIdx = testCase % numCoords; - - funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll, - &hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll}; - - // Maximum iteration count - maxIter = 32768; - - uint * hPtr[numKernels]; - uint * dPtr[numKernels]; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - for (uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - - // Allocate memory on the host and device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - } - - - // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory asynchronously and concurrently from host to device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - } - - // Synchronize to make sure all the copies are completed - HIPCHECK(hipStreamSynchronize(0)); - - int kernelIdx; - if(testCase == 0 || testCase == 5 || testCase == 10) { - kernelIdx = 0; - } - - else if(testCase == 1 || testCase == 6 || testCase == 11) { - kernelIdx = 1; - } - else if(testCase == 2 || testCase == 7 || testCase == 12) { - kernelIdx = 2; - } - else if(testCase == 3 || testCase == 8 || testCase == 13){ - kernelIdx = 3; - } - - - double totalTime = 0.0; - - for (unsigned int k = 0; k < numLoops; k++) { - if ((testCase == 0 || testCase == 1 || testCase == 2 || - testCase == 5 || testCase == 6 || testCase == 7 || - testCase == 10 || testCase == 11 || testCase == 12)) { - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - - } - - - else { - double xStep = coords[coordIdx].width / (double)width_; - double yStep = -coords[coordIdx].width / (double)width_; - double xPos = coords[coordIdx].x - 0.5 * coords[coordIdx].width; - double yPos = coords[coordIdx].y + 0.5 * coords[coordIdx].width; - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - (this->*p[kernelIdx])(dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter, streams, blocks, - threads_per_block, i); - } - - - // Synchronize all the concurrent streams to have completed execution - HIPCHECK(hipStreamSynchronize(0)); - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - totalTime += all_kernel_time.count(); - } - - - } - - // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpy(hPtr[i] ,dPtr[i], bufSize, hipMemcpyDeviceToHost)); - } - - - for(uint i = 0; i < numKernels; i++) { - checkData(hPtr[i]); - - int j =0; - while((totalIters != expectedIters[j] && totalIters > expectedIters[j]) && j < 30) { - j++; - } - - if(j==30) { - std::cout << "Incorrect iteration count detected. "; - } - - } - - - // Compute GFLOPS. There are 7 FLOPs per iteration - double perf = ((double)(totalIters*numKernels) * 7 * (double)(1e-09)) / - (totalTime / (double)numLoops); - - - std::vector kernelName = {"float", "float_unroll", - "double", "double_unroll"}; - - // Print results except for Warm-up kernel - if(testCase!=100) { - results[kernelName[testCase % 4]].push_back(perf); - } - - - for(uint i = 0 ; i < numStreams; i++) { - HIPCHECK(hipStreamDestroy(streams[i])); - } - - - // Free host and device memory - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - -} - - -void hipPerfMandelBrot::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_; i++) { - ptr2[i] = value; - } -} - - -void hipPerfMandelBrot::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfMandelBrot mandelbrotCompute; - int deviceId = 0; - - mandelbrotCompute.open(deviceId); - - for (unsigned int testCase = 0; testCase < 3; testCase++) { - - - switch (testCase) { - - - case 0: { - // Warmup-kernel - default stream executes serially - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - mandelbrotCompute.run(100/*Random number*/, deviceId); - break; - } - - - case 1: { - // run all - sync - int i = 0; - do { - mandelbrotCompute.setNumStreams(1); - mandelbrotCompute.setNumKernels(1); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - } - - - case 2: { - // run all - async - int i = 0; - do { - mandelbrotCompute.setNumStreams(2); - mandelbrotCompute.setNumKernels(2); - mandelbrotCompute.run(i, deviceId); - i++; - }while(i < 12); - mandelbrotCompute.printResults(); - - break; - - } - - - default: { - break; - } - - - } - - - - } - - - passed(); -} diff --git a/projects/hip-tests/perftests/stream/hipPerfDeviceConcurrency.cpp b/projects/hip-tests/perftests/stream/hipPerfDeviceConcurrency.cpp deleted file mode 100644 index 664bdb47ed..0000000000 --- a/projects/hip-tests/perftests/stream/hipPerfDeviceConcurrency.cpp +++ /dev/null @@ -1,284 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -typedef struct { - double x; - double y; - double width; -} coordRec; - -static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, float xStep, - float yStep, uint maxIter) { - - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % width; - int j = tid / width; - float x0 = (float)(xPos + xStep*i); - float y0 = (float)(yPos + yStep*j); - - float x = x0; - float y = y0; - - uint iter = 0; - float tmp; - for (iter = 0; (x*x + y*y <= 4.0f) && (iter < maxIter); iter++) { - tmp = x; - x = fma(-y,y,fma(x,x,x0)); - y = fma(2.0f*tmp,y,y0); - } - - out[tid] = iter; -}; - -class hipPerfDeviceConcurrency { - public: - hipPerfDeviceConcurrency(); - ~hipPerfDeviceConcurrency(); - - void setNumGpus(unsigned int num) { - numDevices = num; - } - unsigned int getNumGpus() { - return numDevices; - } - - void open(void); - void close(void); - void run(unsigned int testCase, int numGpus); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numDevices; - unsigned int width_; - unsigned int bufSize; - unsigned int coordIdx; - unsigned long long totalIters = 0; -}; - - -hipPerfDeviceConcurrency::hipPerfDeviceConcurrency() {} - -hipPerfDeviceConcurrency::~hipPerfDeviceConcurrency() {} - -void hipPerfDeviceConcurrency::open(void) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - setNumGpus(nGpu); - if (nGpu < 1) { - failed("No GPU!"); - } -} - - -void hipPerfDeviceConcurrency::close() { -} - -void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) { - - - static int deviceId; - uint * hPtr[numGpus]; - uint * dPtr[numGpus]; - hipStream_t streams[numGpus]; - int numCUs[numGpus]; - unsigned int maxIter[numGpus]; - unsigned long long expectedIters[numGpus]; - - int threads, threads_per_block, blocks; - float xStep, yStep, xPos, yPos; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, i)); - - if (testCase != 0) { - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device ID: " - << i << std::endl; - } - - numCUs[i] = props.multiProcessorCount; - int clkFrequency = 0; - HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, i)); - - clkFrequency =(unsigned int)clkFrequency/1000; - - // Maximum iteration count - // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter[i] = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs[i]) / 128); - maxIter[i] = (maxIter[i] + 15) & ~15; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * width_ * sizeof(uint); - - // Create streams for concurrency - HIPCHECK(hipStreamCreate(&streams[i])); - - // Allocate memory on the host and device - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - - // Prepare kernel launch parameters - threads = (bufSize/sizeof(uint)); - threads_per_block = 64; - blocks = (threads/threads_per_block) + (threads % threads_per_block); - - coordIdx = testCase % numCoords; - xStep = (float)(coords[coordIdx].width / (double)width_); - yStep = (float)(-coords[coordIdx].width / (double)width_); - xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory from host to device - HIPCHECK(hipMemcpy(dPtr[i], hPtr[i], bufSize, hipMemcpyHostToDevice)); - - } - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - - HIPCHECK(hipSetDevice(deviceId)); - - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i], - dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter[i]); - - } - - for(int i = 0; i < numGpus; i++) { - HIPCHECK(hipStreamSynchronize(0)); - } - - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - for(int i = 0; i < numGpus; i++) { - - if(testCase != 0) { - deviceId = i; - } - HIPCHECK(hipSetDevice(deviceId)); - - // Copy data back from device to the host - HIPCHECK(hipMemcpy(hPtr[i], dPtr[i], bufSize, hipMemcpyDeviceToHost)); - - checkData(hPtr[i]); - expectedIters[i] = width_ * width_ * (unsigned long long) maxIter[i]; - - if (testCase != 0) { - checkData(hPtr[i]); - if(totalIters != expectedIters[i]) { - std::cout << "Incorrect iteration count detected" << std::endl; - } - } - - - HIPCHECK(hipStreamDestroy(streams[i])); - - // Free host and device memory - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - if (testCase != 0) { - std::cout << '\n' << "Measured time for kernel computation on " << numGpus << " device (s): " - << all_kernel_time.count() << " (s) " << '\n' << std::endl; - } - - if(testCase == 0) { - deviceId++; - } - - -} - - -void hipPerfDeviceConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ * width_ ; i++) { - ptr2[i] = value; - } -} - - -void hipPerfDeviceConcurrency::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_ * width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfDeviceConcurrency deviceConcurrency; - - deviceConcurrency.open(); - - int nGpu = deviceConcurrency.getNumGpus(); - - // testCase = 0 refers to warmup kernel run - int testCase = 0; - - for (int i = 0; i < nGpu; i++) { - // Warm-up kernel on all devices - deviceConcurrency.run(testCase, 1); - } - - // Time for kernel on 1 device - deviceConcurrency.run(++testCase, 1); - - // Time for kernel on all available devices - deviceConcurrency.run(++testCase, nGpu); - - passed(); -} diff --git a/projects/hip-tests/perftests/stream/hipPerfStreamConcurrency.cpp b/projects/hip-tests/perftests/stream/hipPerfStreamConcurrency.cpp deleted file mode 100644 index 16e29bc06b..0000000000 --- a/projects/hip-tests/perftests/stream/hipPerfStreamConcurrency.cpp +++ /dev/null @@ -1,432 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" -#include - -#ifdef __HIP_PLATFORM_NVIDIA__ -inline __device__ float4 operator*(float s, float4 a) -{ - return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); -} -inline __device__ float4 operator*(float4 a, float4 b) -{ - return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); -} -inline __device__ float4 operator+(float4 a, float4 b) -{ - return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); -} -inline __device__ float4 operator-(float4 a, float4 b) -{ - return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); -} -#endif - -typedef struct { - double x; - double y; - double width; -} coordRec; - -static coordRec coords[] = { - {0.0, 0.0, 0.00001}, // All black -}; - -static unsigned int numCoords = sizeof(coords) / sizeof(coordRec); - -__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos, - float xStep, float yStep, uint maxIter) { - int tid = (blockIdx.x * blockDim.x + threadIdx.x); - int i = tid % (width/4); - int j = tid / (width/4); - int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3); - int4 vecj = make_int4(j, j, j, j); - float4 x0; - x0.x = (float)(xPos + xStep*veci.x); - x0.y = (float)(xPos + xStep*veci.y); - x0.z = (float)(xPos + xStep*veci.z); - x0.w = (float)(xPos + xStep*veci.w); - float4 y0; - y0.x = (float)(yPos + yStep*vecj.x); - y0.y = (float)(yPos + yStep*vecj.y); - y0.z = (float)(yPos + yStep*vecj.z); - y0.w = (float)(yPos + yStep*vecj.w); - float4 x = x0; - float4 y = y0; - uint iter = 0; - float4 tmp; - int4 stay; - int4 ccount = make_int4(0, 0, 0, 0); - float4 savx = x; - float4 savy = y; - stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); - for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter); - iter+=16) { - x = savx; - y = savy; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - // Two iterations - tmp = x*x + x0 - y*y; - y = 2.0f * x * y + y0; - x = tmp*tmp + x0 - y*y; - y = 2.0f * tmp * y + y0; - stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f); - stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f); - stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f); - stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f); - savx.x = (bool)(stay.x ? x.x : savx.x); - savx.y = (bool)(stay.y ? x.y : savx.y); - savx.z = (bool)(stay.z ? x.z : savx.z); - savx.w = (bool)(stay.w ? x.w : savx.w); - savy.x = (bool)(stay.x ? y.x : savy.x); - savy.y = (bool)(stay.y ? y.y : savy.y); - savy.z = (bool)(stay.z ? y.z : savy.z); - savy.w = (bool)(stay.w ? y.w : savy.w); - ccount.x -= stay.x*16; - ccount.y -= stay.y*16; - ccount.z -= stay.z*16; - ccount.w -= stay.w*16; - } - // Handle remainder - if (!(stay.x & stay.y & stay.z & stay.w)) - { - iter = 16; - do - { - x = savx; - y = savy; - stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter); - stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter); - stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter); - stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter); - tmp = x; - x = x*x + x0 - y*y; - y = 2.0f*tmp*y + y0; - ccount.x += stay.x; - ccount.y += stay.y; - ccount.z += stay.z; - ccount.w += stay.w; - iter--; - savx.x = (stay.x ? x.x : savx.x); - savx.y = (stay.y ? x.y : savx.y); - savx.z = (stay.z ? x.z : savx.z); - savx.w = (stay.w ? x.w : savx.w); - savy.x = (stay.x ? y.x : savy.x); - savy.y = (stay.y ? y.y : savy.y); - savy.z = (stay.z ? y.z : savy.z); - savy.w = (stay.w ? y.w : savy.w); - } while ((stay.x | stay.y | stay.z | stay.w) && iter); - } - uint4 *vecOut = (uint4 *)out; - vecOut[tid].x = (uint)(ccount.x); - vecOut[tid].y = (uint)(ccount.y); - vecOut[tid].z = (uint)(ccount.z); - vecOut[tid].w = (uint)(ccount.w); -} - -class hipPerfStreamConcurrency { - public: - hipPerfStreamConcurrency(); - ~hipPerfStreamConcurrency(); - - void setNumKernels(unsigned int num) { - numKernels = num; - } - void setNumStreams(unsigned int num) { - numStreams = num; - } - unsigned int getNumStreams() { - return numStreams; - } - - unsigned int getNumKernels() { - return numKernels; - } - - void open(int deviceID); - void run(unsigned int testCase, unsigned int deviceId); - void close(void); - - private: - void setData(void *ptr, unsigned int value); - void checkData(uint *ptr); - - unsigned int numKernels; - unsigned int numStreams; - - unsigned int width_; - unsigned int bufSize; - unsigned int maxIter; - unsigned int coordIdx; - unsigned long long totalIters; - int numCUs; - -}; - - -hipPerfStreamConcurrency::hipPerfStreamConcurrency() {} - -hipPerfStreamConcurrency::~hipPerfStreamConcurrency() {} - -void hipPerfStreamConcurrency::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl; - - numCUs = props.multiProcessorCount; -} - - -void hipPerfStreamConcurrency::close() { -} - - -void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId) { - - int clkFrequency = 0; - unsigned int numStreams = getNumStreams(); - unsigned int numKernels = getNumKernels(); - - HIPCHECK(hipDeviceGetAttribute(&clkFrequency, hipDeviceAttributeClockRate, deviceId)); - - clkFrequency =(unsigned int)clkFrequency/1000; - - // Maximum iteration count - // maxIter = 8388608 * (engine_clock / 1000).serial execution - maxIter = (unsigned int)(((8388608 * ((float)clkFrequency / 1000)) * numCUs) / 128); - maxIter = (maxIter + 15) & ~15; - - hipStream_t streams[numStreams]; - - uint * hPtr[numKernels]; - uint * dPtr[numKernels]; - - // Width is divisible by 4 because the mandelbrot kernel processes 4 pixels at once. - width_ = 256; - - bufSize = width_ * sizeof(uint); - - // Create streams for concurrency - for (uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamCreate(&streams[i])); - } - - - // Allocate memory on the host and device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostMalloc((void **)&hPtr[i], bufSize, hipHostMallocDefault)); - setData(hPtr[i], 0xdeadbeef); - HIPCHECK(hipMalloc((uint **)&dPtr[i], bufSize)) - } - - - // Prepare kernel launch parameters - int threads = (bufSize/sizeof(uint)); - int threads_per_block = 64; - int blocks = (threads/threads_per_block) + (threads % threads_per_block); - - coordIdx = testCase % numCoords; - float xStep = (float)(coords[coordIdx].width / (double)width_); - float yStep = (float)(-coords[coordIdx].width / (double)width_); - float xPos = (float)(coords[coordIdx].x - 0.5 * coords[coordIdx].width); - float yPos = (float)(coords[coordIdx].y + 0.5 * coords[coordIdx].width); - - // Copy memory asynchronously and concurrently from host to device - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams])); - } - - - // Synchronize to make sure all the copies are completed - for(uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamSynchronize(streams[i])); - } - - // Warm-up kernel with lower iteration - if (testCase == 0) { - maxIter = 256; - } - - // Time the kernel execution - auto all_start = std::chrono::steady_clock::now(); - - for (uint i = 0; i < numKernels; i++) { - hipLaunchKernelGGL(mandelbrot, dim3(blocks), dim3(threads_per_block), 0, streams[i%numStreams], - dPtr[i], width_, xPos, yPos, xStep, yStep, maxIter); - } - - - // Synchronize all the concurrent streans to have completed execution - for(uint i = 0; i < numStreams; i++) { - HIPCHECK(hipStreamSynchronize(streams[i])); - } - - - auto all_end = std::chrono::steady_clock::now(); - std::chrono::duration all_kernel_time = all_end - all_start; - - // Copy data back from device to the host - for(uint i = 0; i < numKernels; i++) { - HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast(dPtr[i]), bufSize, streams[i % numStreams])); - } - - - if (testCase != 0) { - std::cout <<"Measured time for " << numKernels <<" kernels (s) on " << numStreams <<" stream (s): " - << all_kernel_time.count() << std::endl; - } - - - unsigned long long expected = - (unsigned long long)width_ * (unsigned long long)maxIter; - - for(uint i = 0 ; i < numStreams; i++) { - HIPCHECK(hipStreamDestroy(streams[i])); - } - - - // Free host and device memory - for (uint i = 0; i < numKernels; i++) { - HIPCHECK(hipHostFree(hPtr[i])); - HIPCHECK(hipFree(dPtr[i])); - } - - -} - - -void hipPerfStreamConcurrency::setData(void *ptr, unsigned int value) { - unsigned int *ptr2 = (unsigned int *)ptr; - for (unsigned int i = 0; i < width_ ; i++) { - ptr2[i] = value; - } -} - - -void hipPerfStreamConcurrency::checkData(uint *ptr) { - totalIters = 0; - for (unsigned int i = 0; i < width_; i++) { - totalIters += ptr[i]; - } -} - - -int main(int argc, char* argv[]) { - hipPerfStreamConcurrency streamConcurrency; - int deviceId = 0; - - streamConcurrency.open(deviceId); - - for (unsigned int testCase = 0; testCase < 5; testCase++) { - - - switch (testCase) { - - - case 0: - // Warm-up kernel - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; - - case 1: - // default stream executes serially - streamConcurrency.setNumStreams(1); - streamConcurrency.setNumKernels(1); - break; - - case 2: - // 2-way concurrency - streamConcurrency.setNumStreams(2); - streamConcurrency.setNumKernels(2); - break; - - case 3: - // 4-way concurrency - streamConcurrency.setNumStreams(4); - streamConcurrency.setNumKernels(4); - break; - - case 4: - streamConcurrency.setNumStreams(2); - streamConcurrency.setNumKernels(4); - break; - - case 5: - break; - - default: - break; - } - streamConcurrency.run(testCase, deviceId); - - } - - - passed(); -} diff --git a/projects/hip-tests/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp b/projects/hip-tests/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp deleted file mode 100644 index 103f40c7bb..0000000000 --- a/projects/hip-tests/perftests/stream/hipPerfStreamCreateCopyDestroy.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved. - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. - */ - -/* HIT_START - * BUILD: %t %s ../../src/test_common.cpp - * TEST: %t - * HIT_END - */ - -#include -#include -#include "test_common.h" - -using namespace std; - -#define BufSize 0x1000 -#define Iterations 0x100 -#define TotalStreams 4 -#define TotalBufs 4 - - -class hipPerfStreamCreateCopyDestroy { - private: - unsigned int numBuffers_; - unsigned int numStreams_; - const size_t totalStreams_[TotalStreams]; - const size_t totalBuffers_[TotalBufs]; - public: - hipPerfStreamCreateCopyDestroy() : numBuffers_(0), numStreams_(0), - totalStreams_{1, 2, 4, 8}, - totalBuffers_{1, 100, 1000, 5000} {}; - ~hipPerfStreamCreateCopyDestroy() {}; - void open(int deviceID); - void run(unsigned int testNumber); -}; - -void hipPerfStreamCreateCopyDestroy::open(int deviceId) { - int nGpu = 0; - HIPCHECK(hipGetDeviceCount(&nGpu)); - if (nGpu < 1) { - failed("No GPU!"); - } - - HIPCHECK(hipSetDevice(deviceId)); - hipDeviceProp_t props = {0}; - HIPCHECK(hipGetDeviceProperties(&props, deviceId)); - std::cout << "info: running on bus " << "0x" << props.pciBusID << " " << props.name - << " with " << props.multiProcessorCount << " CUs" << " and device id: " << deviceId << std::endl; -} - -void hipPerfStreamCreateCopyDestroy::run(unsigned int testNumber) { - numStreams_ = totalStreams_[testNumber % TotalStreams]; - size_t iter = Iterations / (numStreams_ * ((size_t)1 << (testNumber / TotalBufs + 1))); - hipStream_t streams[numStreams_]; - - numBuffers_ = totalBuffers_[testNumber / TotalBufs]; - float* dSrc[numBuffers_]; - size_t nBytes = BufSize * sizeof(float); - - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipMalloc(&dSrc[b], nBytes)); - } - - float* hSrc; - hSrc = new float[nBytes]; - HIPCHECK(hSrc == 0 ? hipErrorOutOfMemory : hipSuccess); - for (size_t i = 0; i < BufSize; i++) { - hSrc[i] = 1.618f + i; - } - - auto start = std::chrono::steady_clock::now(); - - for (size_t i = 0; i < iter; ++i) { - for (size_t s = 0; s < numStreams_; ++s) { - HIPCHECK(hipStreamCreate(&streams[s])); - } - - for (size_t s = 0; s < numStreams_; ++s) { - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipMemcpyWithStream(dSrc[b], hSrc, nBytes, hipMemcpyHostToDevice, streams[s])); - } - } - - for (size_t s = 0; s < numStreams_; ++s) { - HIPCHECK(hipStreamDestroy(streams[s])); - } - } - - auto end = std::chrono::steady_clock::now(); - std::chrono::duration diff = end - start; - - auto time = static_cast(diff.count() * 1000 / (iter * numStreams_)); - - cout << "Create+Copy+Destroy time for " << numStreams_ << " streams and " - << setw(4) << numBuffers_ << " buffers " << " and " << setw(4) - << iter << " iterations " << time << " (ms) " << endl; - - delete [] hSrc; - for (size_t b = 0; b < numBuffers_; ++b) { - HIPCHECK(hipFree(dSrc[b])); - } -} - -int main(int argc, char* argv[]) { - hipPerfStreamCreateCopyDestroy streamCCD; - - int deviceId = 0; - streamCCD.open(deviceId); - - for (auto testCase = 0; testCase < TotalStreams * TotalBufs; testCase++) { - streamCCD.run(testCase); - } - - passed(); -}