SWDEV-299773 - Enable performance tests on NV (#2337)

1. Simply enable test on NV
   Some need minor fix
performance/compute/hipPerfDotProduct.cpp
performance/dispatch/hipPerfDispatchSpeed.cpp
performance/memory/hipPerfBufferCopyRectSpeed.cpp
performance/memory/hipPerfBufferCopySpeed.cpp
performance/memory/hipPerfDevMemReadSpeed.cpp
performance/memory/hipPerfDevMemWriteSpeed.cpp
performance/memory/hipPerfMemcpy.cpp
performance/memory/hipPerfMemset.cpp
performance/memory/hipPerfSharedMemReadSpeed.cpp
performance/stream/hipPerfDeviceConcurrency.cpp
performance/stream/hipPerfStreamCreateCopyDestroy.cpp

2. Enable and fix on NV
performance/compute/hipPerfMandelbrot.cpp
   Root cause: coordIdx is random
   Solution: Initialize coordIdx correctly
performance/memory/hipPerfMemFill.cpp
   Root cause: Hip ext Apis called.
   Solution: Exclude case with Hip ext Apis involved
performance/memory/hipPerfMemMallocCpyFree.cpp
   Root cause: Test allocates device memory more than GPU has.
   Solution: Allocate device memory in terms of GPU capacity.
tests/performance/memory/hipPerfSampleRate.cpp
   Root cause: Cuda has no operators += for float2 and float4.
   Solution: Provide the operators.
performance/stream/hipPerfStreamConcurrency.cpp
   Root cause:float4 format doesn't match cude.
              operators are missing in cuda lib.
   Solution: Use (x, y, z, w) format.
             Add necessary float4 operatoris for cuda.

Change-Id: I5add29ebabcfb21fb3ef89d09004c5d13423a291
This commit is contained in:
TomSang
2021-09-14 04:07:13 -04:00
committed by GitHub
parent 3fd16c0b5b
commit 9035ae3154
16 changed files with 133 additions and 112 deletions
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -489,6 +489,7 @@ void hipPerfMandelBrot::double_mandel_unroll(uint *out, uint width, float xPos,
void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
unsigned int numStreams = getNumStreams();
coordIdx = testCase % numCoords;
funPtr p[] = {&hipPerfMandelBrot::float_mad, &hipPerfMandelBrot::float_mandel_unroll,
&hipPerfMandelBrot::double_mad, &hipPerfMandelBrot::double_mandel_unroll};
@@ -555,9 +556,6 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
double totalTime = 0.0;
for (unsigned int k = 0; k < numLoops; k++) {
coordIdx = testCase % numCoords;
if ((testCase == 0 || testCase == 1 || testCase == 2 ||
testCase == 5 || testCase == 6 || testCase == 7 ||
testCase == 10 || testCase == 11 || testCase == 12)) {
@@ -653,7 +651,7 @@ void hipPerfMandelBrot::run(unsigned int testCase,unsigned int deviceId) {
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp ../../src/timer.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
+11 -2
View File
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -136,6 +136,7 @@ class hipPerfMemFill {
}
HIPCHECK(hipSetDevice(deviceId));
memset(&props_, 0, sizeof(props_));
HIPCHECK(hipGetDeviceProperties(&props_, deviceId));
std::cout << "Info: running on device: id: " << deviceId << ", bus: 0x"
<< props_.pciBusID << " " << props_.name << " with "
@@ -397,8 +398,9 @@ class hipPerfMemFill {
return true;
}
/* This fuction should be via device attribute query*/
/* This function should be via device attribute query*/
bool supportDeviceMallocFinegrained() {
#ifdef __HIP_PLATFORM_AMD__
T *A = nullptr;
hipExtMallocWithFlags((void **)&A, sizeof(T), hipDeviceMallocFinegrained);
if (!A) {
@@ -406,6 +408,9 @@ class hipPerfMemFill {
}
HIPCHECK(hipFree(A));
return true;
#else
return false;
#endif
}
unsigned int setNumBlocks(size_t size) {
@@ -419,6 +424,7 @@ class hipPerfMemFill {
#endif
}
#ifdef __HIP_PLATFORM_AMD__
bool testExtDeviceMemoryHostFill(size_t size, unsigned int flags) {
double GBytes = (double) size / (1024.0 * 1024.0 * 1024.0);
@@ -481,6 +487,7 @@ class hipPerfMemFill {
return true;
}
#endif
bool run() {
if (supportLargeBar()) {
@@ -499,11 +506,13 @@ class hipPerfMemFill {
return false;
}
#ifdef __HIP_PLATFORM_AMD__
if (supportDeviceMallocFinegrained()) {
if (!testExtDeviceMemory()) {
return false;
}
}
#endif
return true;
}
@@ -22,7 +22,7 @@ THE SOFTWARE.
#include <time.h>
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -37,10 +37,15 @@ void valSet(int* A, int val, size_t size) {
}
}
void setup(size_t *size, const int num, int **pA) {
void setup(size_t *size, int &num, int **pA, const size_t totalGlobalMem) {
std::cout << "size: ";
for (int i = 0; i < num; i++) {
size[i] = 1 << (i + 6);
if((NUM_ITER + 1) * size[i] > totalGlobalMem) {
num = i;
break;
}
std::cout << size[i] << " ";
}
std::cout << std::endl;
@@ -77,11 +82,16 @@ int main() {
size_t size[NUM_SIZE] = { 0 };
int *Ad[NUM_ITER] = { nullptr };
int *A;
hipDeviceProp_t props;
memset(&props, 0, sizeof(props));
HIPCHECK(hipGetDeviceProperties(&props, 0));
std::cout << "totalGlobalMem: " << props.totalGlobalMem << std::endl;
setup(size, NUM_SIZE, &A);
int num = NUM_SIZE;
setup(size, num, &A, props.totalGlobalMem);
testInit(size[0], A);
for (int i = 0; i < NUM_SIZE; i++) {
for (int i = 0; i < num; i++) {
std::cout << size[i] << std::endl;
start = clock();
for (int j = 0; j < NUM_ITER; j++) {
+1 -1
View File
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
+3 -3
View File
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -140,13 +140,13 @@ void hipPerfMemset::run1D(unsigned int test, T memsetval, enum MemsetType type,
HIPCHECK(hipStreamCreate(&stream));
// Warm-up
HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
auto start = chrono::steady_clock::now();
for (uint i = 0; i < NUM_ITER; i++) {
if (type == hipMemsetTypeDefault && !async) {
HIPCHECK(hipMemset((hipDeviceptr_t)A_d, memsetval, bufSize_));
HIPCHECK(hipMemset((void *)A_d, memsetval, bufSize_));
}
else if (type == hipMemsetTypeDefault && async) {
HIPCHECK(hipMemsetAsync(A_d, memsetval, bufSize_, stream));
+17 -4
View File
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -42,6 +42,18 @@ vector<unsigned int> sizes = {1, 2, 4, 8, 16, 32,
#define NUM_BUFS 6
#define MAX_BUFS (1 << (NUM_BUFS - 1))
#ifdef __HIP_PLATFORM_NVIDIA__
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
a.x += b.x; a.y += b.y;
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
#endif
template <typename T>
__global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int writeIt,
T **inBuffer, int numBufs) {
@@ -49,7 +61,8 @@ __global__ void sampleRate(T * outBuffer, unsigned int inBufSize, unsigned int w
uint gid = (blockIdx.x * blockDim.x + threadIdx.x);
uint inputIdx = gid % inBufSize;
T tmp = (T)0.0f;
T tmp;
memset(&tmp, 0, sizeof(T));
for(int i = 0; i < numBufs; i++) {
tmp += *(*(inBuffer+i)+inputIdx);
}
@@ -264,11 +277,11 @@ void hipPerfSampleRate::run(unsigned int test) {
// Free host and device memory
for (uint i = 0; i < numBufs_; i++) {
HIPCHECK(hipFree(hInPtr[i]));
HIPCHECK(hipHostFree(hInPtr[i]));
HIPCHECK(hipFree(dInPtr[i]));
}
HIPCHECK(hipFree(hOutPtr));
HIPCHECK(hipHostFree(hOutPtr));
HIPCHECK(hipFree(dPtr));
}
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -226,7 +226,7 @@ void hipPerfDeviceConcurrency::run(unsigned int testCase, int numGpus) {
HIPCHECK(hipStreamDestroy(streams[i]));
// Free host and device memory
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/
@@ -28,6 +28,24 @@
#include "test_common.h"
#include <hip/hip_vector_types.h>
#ifdef __HIP_PLATFORM_NVIDIA__
inline __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __device__ float4 operator*(float4 a, float4 b)
{
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
inline __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
#endif
typedef struct {
double x;
@@ -39,157 +57,130 @@ static coordRec coords[] = {
{0.0, 0.0, 0.00001}, // All black
};
static unsigned int numCoords = sizeof(coords) / sizeof(coordRec);
__global__ void mandelbrot(uint *out, uint width, float xPos, float yPos,
float xStep, float yStep, uint maxIter) {
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
int i = tid % (width/4);
int j = tid / (width/4);
int4 veci = make_int4(4*i, 4*i+1, 4*i+2, 4*i+3);
int4 vecj = make_int4(j, j, j, j);
float4 x0;
x0.data[0] = (float)(xPos + xStep*veci.data[0]);
x0.data[1] = (float)(xPos + xStep*veci.data[1]);
x0.data[2] = (float)(xPos + xStep*veci.data[2]);
x0.data[3] = (float)(xPos + xStep*veci.data[3]);
x0.x = (float)(xPos + xStep*veci.x);
x0.y = (float)(xPos + xStep*veci.y);
x0.z = (float)(xPos + xStep*veci.z);
x0.w = (float)(xPos + xStep*veci.w);
float4 y0;
y0.data[0] = (float)(yPos + yStep*vecj.data[0]);
y0.data[1] = (float)(yPos + yStep*vecj.data[1]);
y0.data[2] = (float)(yPos + yStep*vecj.data[2]);
y0.data[3] = (float)(yPos + yStep*vecj.data[3]);
y0.x = (float)(yPos + yStep*vecj.x);
y0.y = (float)(yPos + yStep*vecj.y);
y0.z = (float)(yPos + yStep*vecj.z);
y0.w = (float)(yPos + yStep*vecj.w);
float4 x = x0;
float4 y = y0;
uint iter = 0;
float4 tmp;
int4 stay;
int4 ccount = make_int4(0, 0, 0, 0);
float4 savx = x;
float4 savy = y;
stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
for (iter = 0; (stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && (iter < maxIter);
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
for (iter = 0; (stay.x | stay.y | stay.z | stay.w) && (iter < maxIter);
iter+=16) {
x = savx;
y = savy;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
// Two iterations
tmp = x*x + x0 - y*y;
y = 2.0f * x * y + y0;
x = tmp*tmp + x0 - y*y;
y = 2.0f * tmp * y + y0;
stay.data[0] = (x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= (float)(4.0f);
stay.data[1] = (x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= (float)(4.0f);
stay.data[2] = (x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= (float)(4.0f);
stay.data[3] = (x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= (float)(4.0f);
savx.data[0] = (bool)(stay.data[0] ? x.data[0] : savx.data[0]);
savx.data[1] = (bool)(stay.data[1] ? x.data[1] : savx.data[1]);
savx.data[2] = (bool)(stay.data[2] ? x.data[2] : savx.data[2]);
savx.data[3] = (bool)(stay.data[3] ? x.data[3] : savx.data[3]);
savy.data[0] = (bool)(stay.data[0] ? y.data[0] : savy.data[0]);
savy.data[1] = (bool)(stay.data[1] ? y.data[1] : savy.data[1]);
savy.data[2] = (bool)(stay.data[2] ? y.data[2] : savy.data[2]);
savy.data[3] = (bool)(stay.data[3] ? y.data[3] : savy.data[3]);
ccount.data[0] -= stay.data[0]*16;
ccount.data[1] -= stay.data[1]*16;
ccount.data[2] -= stay.data[2]*16;
ccount.data[3] -= stay.data[3]*16;
}
stay.x = (x.x*x.x+y.x*y.x) <= (float)(4.0f);
stay.y = (x.y*x.y+y.y*y.y) <= (float)(4.0f);
stay.z = (x.z*x.z+y.z*y.z) <= (float)(4.0f);
stay.w = (x.w*x.w+y.w*y.w) <= (float)(4.0f);
savx.x = (bool)(stay.x ? x.x : savx.x);
savx.y = (bool)(stay.y ? x.y : savx.y);
savx.z = (bool)(stay.z ? x.z : savx.z);
savx.w = (bool)(stay.w ? x.w : savx.w);
savy.x = (bool)(stay.x ? y.x : savy.x);
savy.y = (bool)(stay.y ? y.y : savy.y);
savy.z = (bool)(stay.z ? y.z : savy.z);
savy.w = (bool)(stay.w ? y.w : savy.w);
ccount.x -= stay.x*16;
ccount.y -= stay.y*16;
ccount.z -= stay.z*16;
ccount.w -= stay.w*16;
}
// Handle remainder
if (!(stay.data[0] & stay.data[1] & stay.data[2] & stay.data[3]))
if (!(stay.x & stay.y & stay.z & stay.w))
{
iter = 16;
do
{
x = savx;
y = savy;
stay.x = ((x.data[0]*x.data[0]+y.data[0]*y.data[0]) <= 4.0f) && (ccount.data[0] < maxIter);
stay.y = ((x.data[1]*x.data[1]+y.data[1]*y.data[1]) <= 4.0f) && (ccount.data[1] < maxIter);
stay.z = ((x.data[2]*x.data[2]+y.data[2]*y.data[2]) <= 4.0f) && (ccount.data[2] < maxIter);
stay.w = ((x.data[3]*x.data[3]+y.data[3]*y.data[3]) <= 4.0f) && (ccount.data[3] < maxIter);
stay.x = ((x.x*x.x+y.x*y.x) <= 4.0f) && (ccount.x < maxIter);
stay.y = ((x.y*x.y+y.y*y.y) <= 4.0f) && (ccount.y < maxIter);
stay.z = ((x.z*x.z+y.z*y.z) <= 4.0f) && (ccount.z < maxIter);
stay.w = ((x.w*x.w+y.w*y.w) <= 4.0f) && (ccount.w < maxIter);
tmp = x;
x = x*x + x0 - y*y;
y = 2.0f*tmp*y + y0;
ccount.data[0] += stay.data[0];
ccount.data[1] += stay.data[1];
ccount.data[2] += stay.data[2];
ccount.data[3] += stay.data[3];
ccount.x += stay.x;
ccount.y += stay.y;
ccount.z += stay.z;
ccount.w += stay.w;
iter--;
savx.data[0] = (stay.data[0] ? x.data[0] : savx.data[0]);
savx.data[1] = (stay.data[1] ? x.data[1] : savx.data[1]);
savx.data[2] = (stay.data[2] ? x.data[2] : savx.data[2]);
savx.data[3] = (stay.data[3] ? x.data[3] : savx.data[3]);
savy.data[0] = (stay.data[0] ? y.data[0] : savy.data[0]);
savy.data[1] = (stay.data[1] ? y.data[1] : savy.data[1]);
savy.data[2] = (stay.data[2] ? y.data[2] : savy.data[2]);
savy.data[3] = (stay.data[3] ? y.data[3] : savy.data[3]);
} while ((stay.data[0] | stay.data[1] | stay.data[2] | stay.data[3]) && iter);
savx.x = (stay.x ? x.x : savx.x);
savx.y = (stay.y ? x.y : savx.y);
savx.z = (stay.z ? x.z : savx.z);
savx.w = (stay.w ? x.w : savx.w);
savy.x = (stay.x ? y.x : savy.x);
savy.y = (stay.y ? y.y : savy.y);
savy.z = (stay.z ? y.z : savy.z);
savy.w = (stay.w ? y.w : savy.w);
} while ((stay.x | stay.y | stay.z | stay.w) && iter);
}
uint4 *vecOut = (uint4 *)out;
vecOut[tid].data[0] = (uint)(ccount.data[0]);
vecOut[tid].data[1] = (uint)(ccount.data[1]);
vecOut[tid].data[2] = (uint)(ccount.data[2]);
vecOut[tid].data[3] = (uint)(ccount.data[3]);
vecOut[tid].x = (uint)(ccount.x);
vecOut[tid].y = (uint)(ccount.y);
vecOut[tid].z = (uint)(ccount.z);
vecOut[tid].w = (uint)(ccount.w);
}
class hipPerfStreamConcurrency {
public:
hipPerfStreamConcurrency();
@@ -307,7 +298,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
// Copy memory asynchronously and concurrently from host to device
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpyHtoDAsync(dPtr[i], hPtr[i], bufSize, streams[i % numStreams]));
HIPCHECK(hipMemcpyHtoDAsync(reinterpret_cast<hipDeviceptr_t>(dPtr[i]), hPtr[i], bufSize, streams[i % numStreams]));
}
@@ -341,7 +332,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
// Copy data back from device to the host
for(uint i = 0; i < numKernels; i++) {
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i] ,dPtr[i], bufSize, streams[i % numStreams]));
HIPCHECK(hipMemcpyDtoHAsync(hPtr[i], reinterpret_cast<hipDeviceptr_t>(dPtr[i]), bufSize, streams[i % numStreams]));
}
@@ -361,7 +352,7 @@ void hipPerfStreamConcurrency::run(unsigned int testCase,unsigned int deviceId)
// Free host and device memory
for (uint i = 0; i < numKernels; i++) {
HIPCHECK(hipFree(hPtr[i]));
HIPCHECK(hipHostFree(hPtr[i]));
HIPCHECK(hipFree(dPtr[i]));
}
@@ -18,7 +18,7 @@
*/
/* HIT_START
* BUILD: %t %s ../../src/test_common.cpp EXCLUDE_HIP_PLATFORM nvidia
* BUILD: %t %s ../../src/test_common.cpp
* TEST: %t
* HIT_END
*/