Make hipMultiThreadStreams1 test a little harsher.

Fail faster if synchronization rules are violated. Run vectorADDReverse so that the kernel reads the last elements of the array first - if the vector add kernel starts before the preceding copy finishes, we will read stale data and flag the error. Increase the default array sizes so that synchronization errors are more easily exposed. [ROCm/hip commit: 2e1fec47ab]
2017-05-16 18:56:40 -05:00
@@ -29,6 +29,8 @@ THE SOFTWARE.
 #include "hip/hip_runtime.h"
 #include "test_common.h"

+int p_iters=10;
+
 void printSep()
 {
    printf ("======================================================================================\n");
@@ -43,7 +45,7 @@ template<
 	class P=HipTest::Unpinned, 
 	class C=HipTest::Memcpy
 >
-void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
+void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
 {
 	using HipTest::MemTraits;

@@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
    T *A_h, *B_h, *C_h;

    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned);
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1000.0f;
+		B_h[i] = 2000.0f;
+		C_h[i] = -1;
+	}
+
+
+	MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
+	MemTraits<C>::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream);
+    HIPCHECK (hipDeviceSynchronize());
+
+	for (size_t i=0; i<numElements; i++) {
+		A_h[i] = 1.0f;
+		B_h[i] = 2.0f;
+		C_h[i] = -1;
+	}
+


 	for (int i=0; i<iters; i++) {
@@ -66,7 +86,11 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 		MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
 		MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);

-		hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		//HIPCHECK(hipStreamSynchronize(stream));
+
+		// This is the null stream?
+		//hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
+		hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);

 		MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);

@@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
 	}

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned);
+	std::cout <<"  pid" << pid << " success\n";
    HIPCHECK (hipDeviceSynchronize());

-	std::cout <<"  pid" << pid << " success\n";
 }

 template<typename T, class C>
@@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s
 	printf ("%s\n", __func__);
 	std::cout << testName << std::endl;

+	size_t numElements = N;
+
 	// Test 2 threads operating on same stream:
-    std::thread t1 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream0);
+    std::thread t1 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream0);
    if (serialize) {
        t1.join();
    }
-    std::thread t2 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 100/*iters*/, stream1);
+    std::thread t2 (simpleVectorAdd<T, HipTest::Pinned, C>, numElements, p_iters/*iters*/, stream1);
    if (serialize) {
        t2.join();
    }
@@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s

 int main(int argc, char *argv[])
 {
+	N = 8000000;
    HipTest::parseStandardArguments(argc, argv, true);

    printf ("info: set device to %d\n", p_gpuDevice);
@@ -121,8 +148,8 @@ int main(int argc, char *argv[])
        hipStream_t stream;
        HIPCHECK (hipStreamCreate(&stream));

-        simpleVectorCopy<float, HipTest::Pinned, HipTest::MemcpyAsync>	(2000000/*mb*/, 10/*iters*/, stream);
-        simpleVectorCopy<float, HipTest::Pinned, HipTest::Memcpy>		(2000000/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::MemcpyAsync>	(N/*mb*/, 10/*iters*/, stream);
+        simpleVectorAdd<float, HipTest::Pinned, HipTest::Memcpy>		(N/*mb*/, 10/*iters*/, stream);

        HIPCHECK(hipStreamDestroy(stream));
    }
@@ -139,8 +166,8 @@ int main(int argc, char *argv[])
    }

    if (p_tests & 0x4) {
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
-		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
+		//test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
 		test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with one stream",  stream0, stream0, false);
 	}

@@ -146,6 +146,23 @@ vectorADD(hipLaunchParm lp,
 }


+template <typename T>
+__global__ void
+vectorADDReverse(hipLaunchParm lp,  // Element-wise C_d = A_d + B_d; each thread visits its indices from the end of the arrays down toward 0. lp is not referenced in the body.
+            const T *A_d,  // first addend array
+            const T *B_d,  // second addend array
+            T *C_d,  // output array receiving the sums
+            size_t NELEM)  // number of elements to process
+{
+    size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);  // this thread's index within the whole grid (x dimension)
+    size_t stride = hipBlockDim_x * hipGridDim_x ;  // threads per block times number of blocks in x
+
+    for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) {  // start inside the last stride-sized window and step downward; i is signed so the i>=0 test can end the loop
+        C_d[i] = A_d[i] + B_d[i];  // one element per iteration
+	}
+}
+
+
 template <typename T>
 __global__ void
 addCount( const T *A_d,
@@ -343,7 +360,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d,
 // Assumes C_h contains vector add of A_h + B_h
 // Calls the test "failed" macro if a mismatch is detected.
 template <typename T>
-void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true)
+size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true)
 {
    size_t  mismatchCount = 0;
    size_t  firstMismatch = 0;
@@ -364,15 +381,19 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true
        }
    }

-    if (expectMatch) {
-        if (mismatchCount) {
-            failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+	if (reportMismatch) {
+        if (expectMatch) {
+            if (mismatchCount) {
+                failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch);
+            }
+        } else {
+            if (mismatchCount == 0) {
+                failed("expected mismatches but did not detect any!");
+            }
        }
-    } else {
-        if (mismatchCount == 0) {
-            failed("expected mismatches but did not detect any!");
-        }
-    }
+	}
+
+	return mismatchCount;

 }