Conflicts:
	tests/src/hipMemcpy.cpp
This commit is contained in:
Ben Sander
2016-02-25 23:24:22 -06:00
کامیت 69bb511b10
3فایلهای تغییر یافته به همراه135 افزوده شده و 187 حذف شده
@@ -122,6 +122,7 @@ make_hip_executable (hipMathFunctionsHost hipMathFunctions.cpp hipSinglePrecisio
make_hip_executable (hipMathFunctionsDevice hipMathFunctions.cpp hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp)
make_hip_executable (hipIntrinsics hipMathFunctions.cpp hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp)
make_hip_executable (hipPointerAttrib hipPointerAttrib.cpp)
make_hip_executable (hipMultiThreadStreams hipMultiThreadStreams.cpp)
target_link_libraries(hipMathFunctionsHost m)
make_test(hip_ballot " " )
@@ -137,6 +138,7 @@ make_test(hipMemset --N 10013 --memsetval 0x5a ) # oddball size.
make_test(hipMemset --N 256M --memsetval 0xa6 ) # big copy
make_test(hipGridLaunch " " )
make_test(hipPointerAttrib " " )
make_test(hipMultiThreadStreams " " )
make_test(hipMemcpy " " )
make_test(hipMemcpyAsync " " )
@@ -23,6 +23,7 @@ THE SOFTWARE.
#include "test_common.h"
void printSep()
{
printf ("======================================================================================\n");
@@ -31,189 +32,63 @@ void printSep()
//---
// Test simple H2D copies and back.
// Designed to stress a small number of simple smoke tests
void simpleTest1()
template<
typename T=float,
class P=HipTest::Unpinned,
class C=HipTest::Memcpy
>
void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream)
{
printf ("test: %s\n", __func__);
size_t Nbytes = N*sizeof(int);
printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0);
using HipTest::MemTraits;
int *A_d, *B_d, *C_d;
int *A_h, *B_h, *C_h;
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
printf ("A_d=%p B_d=%p C_d=%p A_h=%p B_h=%p C_h=%p\n", A_d, B_d, C_d, A_h, B_d, C_h);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK (hipDeviceReset());
printf (" %s success\n", __func__);
}
#ifdef __HIP_PLATFORM_HCC
#define TYPENAME(T) typeid(T).name()
#else
#define TYPENAME(T) "?"
#endif
//---
// Test many different kinds of memory copies.
// THe subroutine allocates memory , copies to device, runs a vector add kernel, copies back, and checks the result.
//
// IN: numElements controls the number of elements used for allocations.
// IN: usePinnedHost : If true, allocate host with hipMallocHost and is pinned ; else allocate host memory with malloc.
// IN: useHostToHost : If true, add an extra host-to-host copy.
// IN: useDeviceToDevice : If true, add an extra deviceto-device copy after result is produced.
// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction.
//
template <typename T>
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
size_t sizeElements = numElements * sizeof(T);
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
__func__,
TYPENAME(T),
sizeElements, sizeElements/1024.0/1024.0,
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
std::thread::id pid = std::this_thread::get_id();
printf ("test: %s <%s> %s %s\n", __func__, TYPENAME(T), P::str(), C::str());
size_t Nbytes = numElements*sizeof(T);
printf ("numElements=%zu Nbytes=%6.2fMB\n", numElements, Nbytes/1024.0/1024.0);
T *A_d, *B_d, *C_d;
T *A_h, *B_h, *C_h;
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
T *A_hh = NULL;
T *B_hh = NULL;
T *C_dd = NULL;
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned);
for (int i=0; i<iters; i++) {
if (useHostToHost) {
if (usePinnedHost) {
HIPCHECK ( hipMallocHost(&A_hh, sizeElements) );
HIPCHECK ( hipMallocHost(&B_hh, sizeElements) );
} else {
A_hh = (T*)malloc(sizeElements);
B_hh = (T*)malloc(sizeElements);
}
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
MemTraits<C>::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream);
MemTraits<C>::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream);
// Do some extra host-to-host copies here to mix things up:
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
MemTraits<C>::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream);
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
} else {
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
HIPCHECK (hipDeviceSynchronize());
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
}
if (useDeviceToDevice) {
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned);
HIPCHECK (hipDeviceSynchronize());
// Do an extra device-to-device copies here to mix things up:
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
//Destroy the original C_d:
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
} else {
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
}
HIPCHECK ( hipDeviceSynchronize() );
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
printf (" %s success\n", __func__);
std::cout <<" pid" << pid << " success\n";
}
//---
//Try all the 16 possible combinations to memcpytest2 - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault
template<typename T>
void memcpytest2_loop(size_t numElements)
template<typename T, class C>
void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t stream1, bool serialize)
{
printSep();
printSep();
printf ("%s\n", __func__);
std::cout << testName << std::endl;
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
}
}
}
}
}
//---
//Try many different sizes to memory copy.
template<typename T>
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
{
printSep();
printf ("test: %s<%s>\n", __func__, TYPENAME(T));
int deviceId;
HIPCHECK(hipGetDevice(&deviceId));
size_t free, total;
HIPCHECK(hipMemGetInfo(&free, &total));
if (maxElem == 0) {
maxElem = free/sizeof(T)/5;
}
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
HIPCHECK ( hipDeviceReset() );
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
HIPCHECK ( hipDeviceReset() );
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
}
}
//---
//Create multiple threads to stress multi-thread locking behavior in the allocation/deallocation/tracking logic:
template<typename T>
void multiThread_1(bool serialize, bool usePinnedHost)
{
printSep();
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost);
std::thread t1 (memcpytest2<T>,N, usePinnedHost,0,0,0);
// Test 2 threads operating on same stream:
std::thread t1 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 1000, stream0);
if (serialize) {
t1.join();
}
std::thread t2 (memcpytest2<T>,N, usePinnedHost,0,0,0);
std::thread t2 (simpleVectorCopy<T, HipTest::Pinned, C>, 2000000/*mb*/, 1000, stream1);
if (serialize) {
t2.join();
}
@@ -222,8 +97,9 @@ void multiThread_1(bool serialize, bool usePinnedHost)
t1.join();
t2.join();
}
}
HIPCHECK(hipDeviceSynchronize());
};
int main(int argc, char *argv[])
@@ -236,36 +112,30 @@ int main(int argc, char *argv[])
if (p_tests & 0x1) {
HIPCHECK ( hipDeviceReset() );
simpleTest1();
hipStream_t stream;
HIPCHECK (hipStreamCreate(&stream));
simpleVectorCopy<float, HipTest::Pinned, HipTest::MemcpyAsync> (2000000/*mb*/, 10/*iters*/, stream);
simpleVectorCopy<float, HipTest::Pinned, HipTest::Memcpy> (2000000/*mb*/, 10/*iters*/, stream);
//HIPCHECK(hipStreamDestroy(stream));
}
if (p_tests & 0x2) {
HIPCHECK ( hipDeviceReset() );
memcpytest2_loop<float>(N);
memcpytest2_loop<double>(N);
memcpytest2_loop<char>(N);
memcpytest2_loop<int>(N);
}
hipStream_t stream0, stream1;
HIPCHECK (hipStreamCreate(&stream0));
HIPCHECK (hipStreamCreate(&stream1));
if (p_tests & 0x4) {
HIPCHECK ( hipDeviceReset() );
printSep();
memcpytest2_sizes<float>(0,0);
printSep();
memcpytest2_sizes<float>(0,64);
printSep();
memcpytest2_sizes<float>(1024*1024, 13);
printSep();
memcpytest2_sizes<float>(1024*1024, 50);
}
// Easy tests to verify the test works - these don't allow overlap between the threads:
test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread NULL with serialized", NULL, NULL, true);
test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with serialized", stream0, stream1, true);
if (p_tests & 0x8) {
HIPCHECK ( hipDeviceReset() );
printSep();
multiThread_1<float>(true, true);
multiThread_1<float>(false, true);
multiThread_1<float>(false, false); // TODO
}
test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with NULL stream", NULL, NULL, false);
test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with two streams", stream0, stream1, false);
test_multiThread_1<float, HipTest::MemcpyAsync> ("Multithread with one stream", stream0, stream0, false);
}
passed();
+78 -2
مشاهده پرونده
@@ -16,6 +16,16 @@
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"
#ifdef __HIP_PLATFORM_HCC
#define TYPENAME(T) typeid(T).name()
#else
#define TYPENAME(T) "?"
#endif
#define passed() \
printf ("%sPASSED!%s\n",KGRN, KNRM);\
exit(0);
@@ -82,12 +92,12 @@ vectorADD(hipLaunchParm lp,
const T *A_d,
const T *B_d,
T *C_d,
size_t N)
size_t NELEM)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
for (size_t i=offset; i<N; i+=stride) {
for (size_t i=offset; i<NELEM; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
}
}
@@ -220,4 +230,70 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true
}
//---
struct Pinned {
static const bool isPinned = true;
static const char *str() { return "Pinned"; };
static void *Alloc(size_t sizeBytes)
{
void *p;
HIPCHECK(hipMallocHost(&p, sizeBytes));
return p;
};
};
//---
struct Unpinned
{
static const bool isPinned = false;
static const char *str() { return "Unpinned"; };
static void *Alloc(size_t sizeBytes)
{
void *p = malloc (sizeBytes);
HIPASSERT(p);
return p;
};
};
struct Memcpy
{
static const char *str() { return "Memcpy"; };
};
struct MemcpyAsync
{
static const char *str() { return "MemcpyAsync"; };
};
template <typename C> struct MemTraits;
template<>
struct MemTraits<Memcpy>
{
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind));
}
};
template<>
struct MemTraits<MemcpyAsync>
{
static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream)
{
HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream));
}
};
}; // namespace HipTest