Refactor hipMemcpy test to share mem alloc for multiple copies.

This commit is contained in:
Ben Sander
2017-04-24 11:53:31 -05:00
vanhempi dc001ef9b4
commit 2bc0a6030e
2 muutettua tiedostoa jossa 182 lisäystä ja 58 poistoa
-1
Näytä tiedosto
@@ -1 +0,0 @@
:set makeprg=make\ -C\ build.hcc-LC.db
+182 -57
Näytä tiedosto
@@ -38,6 +38,130 @@ void printSep()
printf ("======================================================================================\n");
}
//-------
template<typename T>
class DeviceMemory
{
public:
DeviceMemory(size_t numElements);
~DeviceMemory();
public:
T * A_d;
T* B_d;
T* C_d;
T* C_dd;
size_t _maxNumElements;
};
template<typename T>
DeviceMemory<T>::DeviceMemory(size_t numElements)
: _maxNumElements(numElements)
{
T ** np = nullptr;
HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0);
size_t sizeElements = numElements * sizeof(T);
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
}
template<typename T>
DeviceMemory<T>::~DeviceMemory ()
{
T * np = nullptr;
HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0);
HIPCHECK (hipFree(C_dd));
C_dd = NULL;
};
//-------
template<typename T>
class HostMemory
{
public:
HostMemory(size_t numElements, bool usePinnedHost);
void reset(size_t numElements, bool full=false) ;
~HostMemory();
public:
// Host arrays
T * A_h;
T* B_h;
T* C_h;
// Host arrays, secondary copy
T * A_hh;
T* B_hh;
size_t _maxNumElements;
bool _usePinnedHost;
};
template<typename T>
HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
: _maxNumElements(numElements),
_usePinnedHost(usePinnedHost)
{
T ** np = nullptr;
HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost);
A_hh = NULL;
B_hh = NULL;
size_t sizeElements = numElements * sizeof(T);
if (usePinnedHost) {
HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
} else {
A_hh = (T*)malloc(sizeElements);
B_hh = (T*)malloc(sizeElements);
}
}
template<typename T>
void
HostMemory<T>::reset(size_t numElements, bool full)
{
// Initialize the host data:
for (size_t i=0; i<numElements; i++) {
(A_hh)[i] = 1097.0 + i;
(B_hh)[i] = 1492.0 + i; // Phi
if (full) {
(A_h)[i] = 3.146f + i; // Pi
(B_h)[i] = 1.618f + i; // Phi
}
}
}
template<typename T>
HostMemory<T>::~HostMemory ()
{
HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost);
if (_usePinnedHost) {
HIPCHECK (hipHostFree(A_hh));
HIPCHECK (hipHostFree(B_hh));
} else {
free(A_hh);
free(B_hh);
}
T *A_hh = NULL;
T *B_hh = NULL;
};
@@ -52,71 +176,55 @@ void printSep()
// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction.
//
template <typename T>
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
size_t sizeElements = numElements * sizeof(T);
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
__func__,
TYPENAME(T),
sizeElements, sizeElements/1024.0/1024.0,
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
T *A_d, *B_d, *C_d;
T *A_h, *B_h, *C_h;
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
hmem->reset(numElements);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
T *A_hh = NULL;
T *B_hh = NULL;
T *C_dd = NULL;
assert (numElements <= dmem->_maxNumElements);
assert (numElements <= hmem->_maxNumElements);
if (useHostToHost) {
if (usePinnedHost) {
HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
} else {
A_hh = (T*)malloc(sizeElements);
B_hh = (T*)malloc(sizeElements);
}
// Do some extra host-to-host copies here to mix things up:
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
} else {
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements);
if (useDeviceToDevice) {
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
// Do an extra device-to-device copy here to mix things up:
HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
// Do an extra device-to-device copies here to mix things up:
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
//Destroy the original dmem->C_d:
HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements));
//Destroy the original C_d:
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
} else {
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
}
HIPCHECK ( hipDeviceSynchronize() );
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements);
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
printf (" %s success\n", __func__);
}
@@ -129,11 +237,15 @@ void memcpytest2_for_type(size_t numElements)
{
printSep();
DeviceMemory<T> memD(numElements);
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
HostMemory<T> memP(numElements, 1/*usePinnedHost*/);
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
}
}
}
@@ -156,17 +268,19 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
HIPCHECK(hipMemGetInfo(&free, &total));
if (maxElem == 0) {
maxElem = free/sizeof(T)/5;
maxElem = free/sizeof(T)/20;
}
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
HIPCHECK ( hipDeviceReset() );
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
HIPCHECK ( hipDeviceReset() );
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
HIPCHECK ( hipDeviceReset() );
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
memcpytest2<T>(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host
memcpytest2<T>(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host
}
}
@@ -178,13 +292,17 @@ void multiThread_1(bool serialize, bool usePinnedHost)
{
printSep();
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost);
std::thread t1 (memcpytest2<T>,N, usePinnedHost,0,0,0);
DeviceMemory<T> memD(N);
HostMemory<T> mem1(N, usePinnedHost);
HostMemory<T> mem2(N, usePinnedHost);
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, usePinnedHost,0,0,0);
if (serialize) {
t1.join();
}
std::thread t2 (memcpytest2<T>,N, usePinnedHost,0,0,0);
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, usePinnedHost,0,0,0);
if (serialize) {
t2.join();
}
@@ -218,24 +336,30 @@ int main(int argc, char *argv[])
if (p_tests & 0x2) {
// Some tests around the 64MB boundary which have historically shown issues:
printf ("\n\n=== tests&0x2 (64MB boundary)\n");
#if 0
// Some tests around the 64KB boundary which have historically shown issues:
printf ("\n\n=== tests&0x2 (64KB boundary)\n");
size_t maxElem = 32*1024*1024;
DeviceMemory<float> memD(maxElem);
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
// These all pass:
memcpytest2<float>(15*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(16*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(16*1024*1024+16*1024, 1, 0, 0, 0);
#endif
memcpytest2<float>(&memD, &memP, 15*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0);
// Just over 64MB:
memcpytest2<float>(16*1024*1024+512*1024, 1, 0, 0, 0);
memcpytest2<float>(17*1024*1024+1024, 1, 0, 0, 0);
memcpytest2<float>(32*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(32*1024*1024, 0, 0, 0, 0);
memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);
memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
}
if (p_tests & 0x4) {
printf ("\n\n=== tests&4 (test sizes and offsets)\n");
HIPCHECK ( hipDeviceReset() );
@@ -270,6 +394,7 @@ int main(int argc, char *argv[])
}
passed();
}