From b12ec2180667f0231eb7009b0782b338e9d553a3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 21:29:51 -0600 Subject: [PATCH 1/6] Tweak version numbers --- RELEASE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 9fea5d4c78..ae0a0d2b4e 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,17 +17,17 @@ Stay tuned - the work for many of these features is already in-flight. ## Revision History: =================================================================================================== -Release:0.80.01.00 +Release:0.80.01 Date: 2016.02.18 - Improve reporting and support for device-side math functions. - Update Runtime Documentation. - Improve implementations of cross-lane operations (_ballot, _any, _all). - Provide shuffle intrinsics (performance optimization in-progress). - Support hipDeviceAttribute for querying "one-shot" device attributes, as an alternative to hipDeviceGetProperties. -- + =================================================================================================== -Release:0.80.00.00 : +Release:0.80.00 : Date: 2016.01.25 Initial release with GPUOpen Launch. From b63470f4cc5d0cc8a7cf3d8a409e0dda012c923f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 18 Feb 2016 03:05:53 -0600 Subject: [PATCH 2/6] remove extra : --- include/nvcc_detail/hip_runtime_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index 7a1e9bc6e9..0ef4b38c67 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -243,7 +243,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att case hipDeviceAttributeClockRate: cdattr = cudaDevAttrClockRate; break; case hipDeviceAttributeMemoryClockRate: - cdattr = cudaDevAttrMemoryClockRate:; break; + cdattr = cudaDevAttrMemoryClockRate; break; case hipDeviceAttributeMultiprocessorCount: cdattr = cudaDevAttrMultiProcessorCount; break; case hipDeviceAttributeComputeMode: From c2d66a48a70cc0ce053f971f0033916709c5cfe1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 22 Feb 2016 15:09:23 -0600 Subject: [PATCH 3/6] Fix memcpy for Titan. Add to common includes --- include/hip_runtime.h | 5 +++++ tests/src/CMakeLists.txt | 4 ++++ tests/src/hipMemcpy.cpp | 13 ++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/hip_runtime.h b/include/hip_runtime.h index 59d3d6c4c9..0594726c90 100644 --- a/include/hip_runtime.h +++ b/include/hip_runtime.h @@ -43,6 +43,11 @@ THE SOFTWARE. #include #include +#ifdef __cplusplus +#include +#endif + + #include #if defined(__HIP_PLATFORM_HCC__) and not defined (__HIP_PLATFORM_NVCC__) diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index 09c0ca7162..c1582ecf88 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -41,6 +41,10 @@ if (${HIP_PLATFORM} STREQUAL "hcc") elseif (${HIP_PLATFORM} STREQUAL "nvcc") MESSAGE ("HIP_PLATFORM=nvcc") + + #Need C++11 for threads in some of the tests. + add_definitions(-std=c++11) + # NVCC does not not support -rdynamic option set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS ) set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS ) diff --git a/tests/src/hipMemcpy.cpp b/tests/src/hipMemcpy.cpp index 8286454098..f9bde2df9f 100644 --- a/tests/src/hipMemcpy.cpp +++ b/tests/src/hipMemcpy.cpp @@ -63,6 +63,13 @@ void simpleTest1() } +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + //--- // Test many different kinds of memory copies. // THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. @@ -79,7 +86,7 @@ void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, boo size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, - typeid(T).name(), + TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); @@ -169,7 +176,7 @@ template void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) { printSep(); - printf ("test: %s<%s>\n", __func__, typeid(T).name()); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); int deviceId; HIPCHECK(hipGetDevice(&deviceId)); @@ -199,7 +206,7 @@ template void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); - printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, typeid(T).name(), serialize, usePinnedHost); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); if (serialize) { t1.join(); From 7090f5c3f91642c0cda29ad60f37da843e1b84de Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 Feb 2016 12:08:22 -0600 Subject: [PATCH 4/6] Add tests for multi-threaded streams --- src/hip_hcc.cpp | 2 + tests/src/hipMultiThreadStreams.cpp | 272 ++++++++++++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 tests/src/hipMultiThreadStreams.cpp diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 8a0d0df1d4..2085ccbb19 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1183,6 +1183,8 @@ hipError_t hipDeviceReset(void) } #endif + // TODO - reset all streams on the device. + return ihipLogStatus(hipSuccess); } diff --git a/tests/src/hipMultiThreadStreams.cpp b/tests/src/hipMultiThreadStreams.cpp new file mode 100644 index 0000000000..f9bde2df9f --- /dev/null +++ b/tests/src/hipMultiThreadStreams.cpp @@ -0,0 +1,272 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "hip_runtime.h" +#include "test_common.h" + + +void printSep() +{ + printf ("======================================================================================\n"); +} + +//--- +// Test simple H2D copies and back. +// Designed to stress a small number of simple smoke tests +void simpleTest1() +{ + printf ("test: %s\n", __func__); + size_t Nbytes = N*sizeof(int); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); + + int *A_d, *B_d, *C_d; + int *A_h, *B_h, *C_h; + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + + printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + HIPCHECK (hipDeviceSynchronize()); + + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK (hipDeviceReset()); + + printf (" %s success\n", __func__); +} + + +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + +//--- +// Test many different kinds of memory copies. +// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. +// +// IN: numElements controls the number of elements used for allocations. +// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. +// IN: useHostToHost : If true, add an extra host-to-host copy. +// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. +// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. +// +template +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +{ + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + __func__, + TYPENAME(T), + sizeElements, sizeElements/1024.0/1024.0, + usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + + + T *A_d, *B_d, *C_d; + T *A_h, *B_h, *C_h; + + + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + T *A_hh = NULL; + T *B_hh = NULL; + T *C_dd = NULL; + + + + if (useHostToHost) { + if (usePinnedHost) { + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + + + // Do some extra host-to-host copies here to mix things up: + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + + + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } else { + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + } + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + + if (useDeviceToDevice) { + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + + // Do an extra device-to-device copies here to mix things up: + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + + //Destroy the original C_d: + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); + + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } else { + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + } + + HIPCHECK ( hipDeviceSynchronize() ); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); + + printf (" %s success\n", __func__); +} + + +//--- +//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault +template +void memcpytest2_loop(size_t numElements) +{ + printSep(); + + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { + for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO + for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { + for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + } + } + } + } +} + + +//--- +//Try many different sizes to memory copy. +template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + HIPCHECK ( hipDeviceReset() ); + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +//--- +//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: +template +void multiThread_1(bool serialize, bool usePinnedHost) +{ + printSep(); + printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); + std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t1.join(); + } + + + std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + printf ("info: set device to %d\n", p_gpuDevice); + HIPCHECK(hipSetDevice(p_gpuDevice)); + + + if (p_tests & 0x1) { + HIPCHECK ( hipDeviceReset() ); + simpleTest1(); + } + + if (p_tests & 0x2) { + HIPCHECK ( hipDeviceReset() ); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } + + if (p_tests & 0x4) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + memcpytest2_sizes(1024*1024, 50); + } + + if (p_tests & 0x8) { + HIPCHECK ( hipDeviceReset() ); + printSep(); + multiThread_1(true, true); + multiThread_1(false, true); + multiThread_1(false, false); // TODO + } + + passed(); + +} From ecec7e36d90fe281c9f023c2c3c3fb7e5dff4930 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 04:21:24 -0600 Subject: [PATCH 5/6] Add abstraction for pinned/unpinned, and sync/async mem copies selection in tests --- tests/src/test_common.h | 80 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 1bf89f1604..e37eec7e86 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -16,6 +16,16 @@ #define KCYN "\x1B[36m" #define KWHT "\x1B[37m" + + +#ifdef __HIP_PLATFORM_HCC +#define TYPENAME(T) typeid(T).name() +#else +#define TYPENAME(T) "?" +#endif + + + #define passed() \ printf ("%sPASSED!%s\n",KGRN, KNRM);\ exit(0); @@ -82,12 +92,12 @@ vectorADD(hipLaunchParm lp, const T *A_d, const T *B_d, T *C_d, - size_t N) + size_t NELEM) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (size_t i=offset; i struct MemTraits; + + +template<> +struct MemTraits +{ + + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + { + HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind)); + } +}; + + +template<> +struct MemTraits +{ + + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + { + HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream)); + } +}; + }; // namespace HipTest From 7e45addbee2f88e7ef2013444651577a8af7046a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 Feb 2016 04:22:34 -0600 Subject: [PATCH 6/6] Add test for thread-safety on streams --- tests/src/CMakeLists.txt | 2 + tests/src/hipMemcpy.cpp | 5 - tests/src/hipMultiThreadStreams.cpp | 240 +++++++--------------------- 3 files changed, 57 insertions(+), 190 deletions(-) diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index c1582ecf88..cc6af0b5d2 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -122,6 +122,7 @@ make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisio make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp) make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp) make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp) +make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp) target_link_libraries(hipMathFunctionsHost m) make_test(hip_ballot " " ) @@ -137,6 +138,7 @@ make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size. make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy make_test(hipGridLaunch " " ) make_test(hipPointerAttrib " " ) +make_test(hipMultiThreadStreams " " ) make_test(hipMemcpy " " ) make_test(hipMemcpyAsync " " ) diff --git a/tests/src/hipMemcpy.cpp b/tests/src/hipMemcpy.cpp index f9bde2df9f..1d4efcbc3f 100644 --- a/tests/src/hipMemcpy.cpp +++ b/tests/src/hipMemcpy.cpp @@ -63,11 +63,6 @@ void simpleTest1() } -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif //--- diff --git a/tests/src/hipMultiThreadStreams.cpp b/tests/src/hipMultiThreadStreams.cpp index f9bde2df9f..a3dd94e077 100644 --- a/tests/src/hipMultiThreadStreams.cpp +++ b/tests/src/hipMultiThreadStreams.cpp @@ -23,6 +23,7 @@ THE SOFTWARE. #include "test_common.h" + void printSep() { printf ("======================================================================================\n"); @@ -31,189 +32,63 @@ void printSep() //--- // Test simple H2D copies and back. // Designed to stress a small number of simple smoke tests -void simpleTest1() + +template< + typename T=float, + class P=HipTest::Unpinned, + class C=HipTest::Memcpy +> +void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) { - printf ("test: %s\n", __func__); - size_t Nbytes = N*sizeof(int); - printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); + using HipTest::MemTraits; - int *A_d, *B_d, *C_d; - int *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); - - printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); - - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); - - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); - - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); - - HIPCHECK (hipDeviceSynchronize()); - - HipTest::checkVectorADD(A_h, B_h, C_h, N); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false); - HIPCHECK (hipDeviceReset()); - - printf (" %s success\n", __func__); -} - - -#ifdef __HIP_PLATFORM_HCC -#define TYPENAME(T) typeid(T).name() -#else -#define TYPENAME(T) "?" -#endif - - -//--- -// Test many different kinds of memory copies. -// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result. -// -// IN: numElements controls the number of elements used for allocations. -// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc. -// IN: useHostToHost : If true, add an extra host-to-host copy. -// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced. -// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. -// -template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) -{ - size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", - __func__, - TYPENAME(T), - sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + std::thread::id pid = std::this_thread::get_id(); + printf ("test: %s <%s> %s %s\n", __func__, TYPENAME(T), P::str(), C::str()); + size_t Nbytes = numElements*sizeof(T); + printf ("numElements=%zu Nbytes=%6.2fMB\n", numElements, Nbytes/1024.0/1024.0); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (int i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - } + HIPCHECK (hipDeviceSynchronize()); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + } - if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + HIPCHECK (hipDeviceSynchronize()); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); - } - - HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); - - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); - - printf (" %s success\n", __func__); + std::cout <<" pid" << pid << " success\n"; } - -//--- -//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault -template -void memcpytest2_loop(size_t numElements) +template +void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t stream1, bool serialize) { - printSep(); + printSep(); + printf ("%s\n", __func__); + std::cout << testName << std::endl; - for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { - for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO - for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { - for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - } - } - } - } -} - - -//--- -//Try many different sizes to memory copy. -template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) -{ - printSep(); - printf ("test: %s<%s>\n", __func__, TYPENAME(T)); - - int deviceId; - HIPCHECK(hipGetDevice(&deviceId)); - - size_t free, total; - HIPCHECK(hipMemGetInfo(&free, &total)); - - if (maxElem == 0) { - maxElem = free/sizeof(T)/5; - } - - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); - - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host - } -} - - -//--- -//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic: -template -void multiThread_1(bool serialize, bool usePinnedHost) -{ - printSep(); - printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + // Test 2 threads operating on same stream: + std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 1000, stream0); if (serialize) { t1.join(); } - - - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 1000, stream1); if (serialize) { t2.join(); } @@ -222,8 +97,9 @@ void multiThread_1(bool serialize, bool usePinnedHost) t1.join(); t2.join(); } -} + HIPCHECK(hipDeviceSynchronize()); +}; int main(int argc, char *argv[]) @@ -236,36 +112,30 @@ int main(int argc, char *argv[]) if (p_tests & 0x1) { HIPCHECK ( hipDeviceReset() ); - simpleTest1(); + + hipStream_t stream; + HIPCHECK (hipStreamCreate(&stream)); + + simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + + //HIPCHECK(hipStreamDestroy(stream)); } + if (p_tests & 0x2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - memcpytest2_loop(N); - } + hipStream_t stream0, stream1; + HIPCHECK (hipStreamCreate(&stream0)); + HIPCHECK (hipStreamCreate(&stream1)); - if (p_tests & 0x4) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); - } + // Easy tests to verify the test works - these don't allow overlap between the threads: + test_multiThread_1 ("Multithread NULL with serialized", NULL, NULL, true); + test_multiThread_1 ("Multithread with serialized", stream0, stream1, true); - if (p_tests & 0x8) { - HIPCHECK ( hipDeviceReset() ); - printSep(); - multiThread_1(true, true); - multiThread_1(false, true); - multiThread_1(false, false); // TODO - } + test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); + test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); + } passed();