diff --git a/hipamd/tests/src/CMakeLists.txt b/hipamd/tests/src/CMakeLists.txt index c1582ecf88..cc6af0b5d2 100644 --- a/hipamd/tests/src/CMakeLists.txt +++ b/hipamd/tests/src/CMakeLists.txt @@ -122,6 +122,7 @@ make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisio make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) +make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -137,6 +138,7 @@ make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) make_test(hipPointerAttrib " " ) +make_test(hipMultiThreadStreams " " ) make_test(hipMemcpy " " ) make_test(hipMemcpyAsync " " ) diff --git a/hipamd/tests/src/hipMemcpy.cpp b/hipamd/tests/src/hipMemcpy.cpp index f9bde2df9f..1d4efcbc3f 100644 --- a/hipamd/tests/src/hipMemcpy.cpp +++ b/hipamd/tests/src/hipMemcpy.cpp @@ -63,11 +63,6 @@ void simpleTest1() } -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif //--- diff --git a/hipamd/tests/src/hipMultiThreadStreams.cpp b/hipamd/tests/src/hipMultiThreadStreams.cpp index f9bde2df9f..a3dd94e077 100644 --- a/hipamd/tests/src/hipMultiThreadStreams.cpp +++ b/hipamd/tests/src/hipMultiThreadStreams.cpp @@ -23,6 +23,7 @@ THE SOFTWARE. #include "test_common.h" + void printSep() { printf ("======================================================================================\n"); @@ -31,189 +32,63 @@ void printSep() //--- // Test simple H2D copies and back. // Designed to stress a small number of simple smoke tests -void simpleTest1() + +template< + typename T=float, + class P=HipTest::Unpinned, + class C=HipTest::Memcpy +> +void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) { - printf ("test: %s\n", __func__); - size_t Nbytes = N*sizeof(int); - printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); + using HipTest::MemTraits; - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); - - printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); - - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); - - HIPCHECK (hipDeviceSynchronize()); - - HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); - HIPCHECK (hipDeviceReset()); - - printf (" %s success\n", __func__); -} - - -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif - - -//--- -// Test many different kinds of memory copies. -// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. -// -// IN: numElements controls the number of elements used for allocations. -// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. -// IN: useHostToHost : If true, add an extra host-to-host copy. -// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. -// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. -// -template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) -{ - size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", - __func__, - TYPENAME(T), - sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + std::thread::id pid = std::this_thread::get_id(); + printf ("test: %s <%s> %s %s\n", __func__, TYPENAME(T), P::str(), C::str()); + size_t Nbytes = numElements*sizeof(T); + printf ("numElements=%zu Nbytes=%6.2fMB\n", numElements, Nbytes/1024.0/1024.0); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (int i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } + HIPCHECK (hipDeviceSynchronize()); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + } - if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + HIPCHECK (hipDeviceSynchronize()); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } - - HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); - - printf (" %s success\n", __func__); + std::cout <<" pid" << pid << " success\n"; } - -//--- -//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault -template -void memcpytest2_loop(size_t numElements) +template +void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t stream1, bool serialize) { - printSep(); + printSep(); + printf ("%s\n", __func__); + std::cout << testName << std::endl; - for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { - for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO - for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { - for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - } - } - } - } -} - - -//--- -//Try many different sizes to memory copy. -template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) -{ - printSep(); - printf ("test: %s<%s>\n", __func__, TYPENAME(T)); - - int deviceId; - HIPCHECK(hipGetDevice(&deviceId)); - - size_t free, total; - HIPCHECK(hipMemGetInfo(&free, &total)); - - if (maxElem == 0) { - maxElem = free/sizeof(T)/5; - } - - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); - - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host - } -} - - -//--- -//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: -template -void multiThread_1(bool serialize, bool usePinnedHost) -{ - printSep(); - printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + // Test 2 threads operating on same stream: + std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 1000, stream0); if (serialize) { t1.join(); } - - - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 1000, stream1); if (serialize) { t2.join(); } @@ -222,8 +97,9 @@ void multiThread_1(bool serialize, bool usePinnedHost) t1.join(); t2.join(); } -} + HIPCHECK(hipDeviceSynchronize()); +}; int main(int argc, char *argv[]) @@ -236,36 +112,30 @@ int main(int argc, char *argv[]) if (p_tests & 0x1) { HIPCHECK ( hipDeviceReset() ); - simpleTest1(); + + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + + //HIPCHECK(hipStreamDestroy(stream)); } + if (p_tests & 0x2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - } + hipStream_t stream0, stream1; + HIPCHECK (hipStreamCreate(&stream0)); + HIPCHECK (hipStreamCreate(&stream1)); - if (p_tests & 0x4) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); - } + // Easy tests to verify the test works - these don't allow overlap between the threads: + test_multiThread_1 ("Multithread NULL with serialized", NULL, NULL, true); + test_multiThread_1 ("Multithread with serialized", stream0, stream1, true); - if (p_tests & 0x8) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - multiThread_1(true, true); - multiThread_1(false, true); - multiThread_1(false, false); // TODO - } + test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); + test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); + } passed();