Refactor hipMemcpy test to share mem alloc for multiple copies.
This commit is contained in:
@@ -1 +0,0 @@
|
||||
:set makeprg=make\ -C\ build.hcc-LC.db
|
||||
@@ -38,6 +38,130 @@ void printSep()
|
||||
printf ("======================================================================================\n");
|
||||
}
|
||||
|
||||
//-------
|
||||
template<typename T>
|
||||
class DeviceMemory
|
||||
{
|
||||
public:
|
||||
DeviceMemory(size_t numElements);
|
||||
~DeviceMemory();
|
||||
public:
|
||||
T * A_d;
|
||||
T* B_d;
|
||||
T* C_d;
|
||||
T* C_dd;
|
||||
|
||||
size_t _maxNumElements;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
DeviceMemory<T>::DeviceMemory(size_t numElements)
|
||||
: _maxNumElements(numElements)
|
||||
{
|
||||
T ** np = nullptr;
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0);
|
||||
|
||||
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
|
||||
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
DeviceMemory<T>::~DeviceMemory ()
|
||||
{
|
||||
T * np = nullptr;
|
||||
HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0);
|
||||
|
||||
HIPCHECK (hipFree(C_dd));
|
||||
|
||||
C_dd = NULL;
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-------
|
||||
template<typename T>
|
||||
class HostMemory
|
||||
{
|
||||
public:
|
||||
HostMemory(size_t numElements, bool usePinnedHost);
|
||||
void reset(size_t numElements, bool full=false) ;
|
||||
~HostMemory();
|
||||
public:
|
||||
// Host arrays
|
||||
T * A_h;
|
||||
T* B_h;
|
||||
T* C_h;
|
||||
|
||||
// Host arrays, secondary copy
|
||||
T * A_hh;
|
||||
T* B_hh;
|
||||
|
||||
size_t _maxNumElements;
|
||||
bool _usePinnedHost;
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
|
||||
: _maxNumElements(numElements),
|
||||
_usePinnedHost(usePinnedHost)
|
||||
{
|
||||
T ** np = nullptr;
|
||||
HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost);
|
||||
|
||||
A_hh = NULL;
|
||||
B_hh = NULL;
|
||||
|
||||
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
|
||||
if (usePinnedHost) {
|
||||
HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
|
||||
HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
|
||||
} else {
|
||||
A_hh = (T*)malloc(sizeElements);
|
||||
B_hh = (T*)malloc(sizeElements);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void
|
||||
HostMemory<T>::reset(size_t numElements, bool full)
|
||||
{
|
||||
// Initialize the host data:
|
||||
for (size_t i=0; i<numElements; i++) {
|
||||
(A_hh)[i] = 1097.0 + i;
|
||||
(B_hh)[i] = 1492.0 + i; // Phi
|
||||
|
||||
if (full) {
|
||||
(A_h)[i] = 3.146f + i; // Pi
|
||||
(B_h)[i] = 1.618f + i; // Phi
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
HostMemory<T>::~HostMemory ()
|
||||
{
|
||||
HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost);
|
||||
|
||||
if (_usePinnedHost) {
|
||||
HIPCHECK (hipHostFree(A_hh));
|
||||
HIPCHECK (hipHostFree(B_hh));
|
||||
|
||||
} else {
|
||||
free(A_hh);
|
||||
free(B_hh);
|
||||
}
|
||||
T *A_hh = NULL;
|
||||
T *B_hh = NULL;
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -52,71 +176,55 @@ void printSep()
|
||||
// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction.
|
||||
//
|
||||
template <typename T>
|
||||
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
{
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
|
||||
__func__,
|
||||
TYPENAME(T),
|
||||
sizeElements, sizeElements/1024.0/1024.0,
|
||||
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
|
||||
|
||||
T *A_d, *B_d, *C_d;
|
||||
T *A_h, *B_h, *C_h;
|
||||
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
|
||||
hmem->reset(numElements);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
|
||||
T *A_hh = NULL;
|
||||
T *B_hh = NULL;
|
||||
T *C_dd = NULL;
|
||||
assert (numElements <= dmem->_maxNumElements);
|
||||
assert (numElements <= hmem->_maxNumElements);
|
||||
|
||||
|
||||
|
||||
if (useHostToHost) {
|
||||
if (usePinnedHost) {
|
||||
HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
|
||||
HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
|
||||
} else {
|
||||
A_hh = (T*)malloc(sizeElements);
|
||||
B_hh = (T*)malloc(sizeElements);
|
||||
}
|
||||
|
||||
|
||||
// Do some extra host-to-host copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
|
||||
|
||||
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements);
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
|
||||
// Do an extra device-to-device copy here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
|
||||
// Do an extra device-to-device copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
//Destroy the original dmem->C_d:
|
||||
HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements));
|
||||
|
||||
//Destroy the original C_d:
|
||||
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
|
||||
|
||||
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
HIPCHECK ( hipDeviceSynchronize() );
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
|
||||
HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements);
|
||||
|
||||
|
||||
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
|
||||
|
||||
printf (" %s success\n", __func__);
|
||||
}
|
||||
@@ -129,11 +237,15 @@ void memcpytest2_for_type(size_t numElements)
|
||||
{
|
||||
printSep();
|
||||
|
||||
DeviceMemory<T> memD(numElements);
|
||||
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
|
||||
HostMemory<T> memP(numElements, 1/*usePinnedHost*/);
|
||||
|
||||
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
|
||||
for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO
|
||||
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
|
||||
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
|
||||
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -156,17 +268,19 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
if (maxElem == 0) {
|
||||
maxElem = free/sizeof(T)/5;
|
||||
maxElem = free/sizeof(T)/20;
|
||||
}
|
||||
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
|
||||
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
DeviceMemory<T> memD(maxElem);
|
||||
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
|
||||
|
||||
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
|
||||
memcpytest2<T>(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
@@ -178,13 +292,17 @@ void multiThread_1(bool serialize, bool usePinnedHost)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost);
|
||||
std::thread t1 (memcpytest2<T>,N, usePinnedHost,0,0,0);
|
||||
DeviceMemory<T> memD(N);
|
||||
HostMemory<T> mem1(N, usePinnedHost);
|
||||
HostMemory<T> mem2(N, usePinnedHost);
|
||||
|
||||
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, usePinnedHost,0,0,0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
std::thread t2 (memcpytest2<T>,N, usePinnedHost,0,0,0);
|
||||
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, usePinnedHost,0,0,0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
}
|
||||
@@ -218,24 +336,30 @@ int main(int argc, char *argv[])
|
||||
|
||||
|
||||
if (p_tests & 0x2) {
|
||||
// Some tests around the 64MB boundary which have historically shown issues:
|
||||
printf ("\n\n=== tests&0x2 (64MB boundary)\n");
|
||||
#if 0
|
||||
// Some tests around the 64KB boundary which have historically shown issues:
|
||||
printf ("\n\n=== tests&0x2 (64KB boundary)\n");
|
||||
size_t maxElem = 32*1024*1024;
|
||||
DeviceMemory<float> memD(maxElem);
|
||||
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
|
||||
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
|
||||
// These all pass:
|
||||
memcpytest2<float>(15*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(16*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(16*1024*1024+16*1024, 1, 0, 0, 0);
|
||||
#endif
|
||||
memcpytest2<float>(&memD, &memP, 15*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0);
|
||||
|
||||
// Just over 64MB:
|
||||
memcpytest2<float>(16*1024*1024+512*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(17*1024*1024+1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(32*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(32*1024*1024, 0, 0, 0, 0);
|
||||
memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);
|
||||
memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);
|
||||
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
|
||||
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (p_tests & 0x4) {
|
||||
printf ("\n\n=== tests&4 (test sizes and offsets)\n");
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
@@ -270,6 +394,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
|
||||
|
||||
|
||||
passed();
|
||||
|
||||
}
|
||||
|
||||
Viittaa uudesa ongelmassa
Block a user