Add corrected test for offsets

This commit is contained in:
Ben Sander
2017-04-24 12:51:17 -05:00
orang tua 6b16f56f38
melakukan b44a3eefd1
@@ -24,6 +24,7 @@ THE SOFTWARE.
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11
* RUN_NAMED: %t hipMemcpy-modes --tests 0x1
* RUN_NAMED: %t hipMemcpy-size --tests 0x6
* RUN_NAMED: %t hipMemcpy-offsets --tests 0x10
* RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8
* HIT_END
*/
@@ -45,27 +46,42 @@ class DeviceMemory
public:
DeviceMemory(size_t numElements);
~DeviceMemory();
public:
T * A_d;
T* B_d;
T* C_d;
T* C_dd;
T *A_d() const { return _A_d + _offset; };
T *B_d() const { return _B_d + _offset; };
T *C_d() const { return _C_d + _offset; };
T *C_dd() const { return _C_dd + _offset; };
size_t maxNumElements() const { return _maxNumElements; };
void offset(int offset) { _offset = offset; };
int offset() const { return _offset; };
private:
T * _A_d;
T* _B_d;
T* _C_d;
T* _C_dd;
size_t _maxNumElements;
int _offset;
};
template<typename T>
DeviceMemory<T>::DeviceMemory(size_t numElements)
: _maxNumElements(numElements)
: _maxNumElements(numElements),
_offset(0)
{
T ** np = nullptr;
HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0);
HipTest::initArrays (&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0);
size_t sizeElements = numElements * sizeof(T);
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
HIPCHECK ( hipMalloc(&_C_dd, sizeElements) );
}
@@ -73,11 +89,11 @@ template<typename T>
DeviceMemory<T>::~DeviceMemory ()
{
T * np = nullptr;
HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0);
HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0);
HIPCHECK (hipFree(C_dd));
HIPCHECK (hipFree(_C_dd));
C_dd = NULL;
_C_dd = NULL;
};
@@ -90,6 +106,8 @@ public:
HostMemory(size_t numElements, bool usePinnedHost);
void reset(size_t numElements, bool full=false) ;
~HostMemory();
size_t maxNumElements() const { return _maxNumElements; };
public:
// Host arrays
T * A_h;
@@ -176,21 +194,22 @@ HostMemory<T>::~HostMemory ()
// IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction.
//
template <typename T>
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
size_t sizeElements = numElements * sizeof(T);
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:%+d\n",
__func__,
TYPENAME(T),
sizeElements, sizeElements/1024.0/1024.0,
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault,
dmem->offset());
hmem->reset(numElements);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
assert (numElements <= dmem->_maxNumElements);
assert (numElements <= hmem->_maxNumElements);
assert (numElements <= dmem->maxNumElements());
assert (numElements <= hmem->maxNumElements());
@@ -200,25 +219,25 @@ void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements,
HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
} else {
HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements);
if (useDeviceToDevice) {
// Do an extra device-to-device copy here to mix things up:
HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
HIPCHECK ( hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
//Destroy the original dmem->C_d:
HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements));
//Destroy the original dmem->C_d():
HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements));
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
} else {
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
}
HIPCHECK ( hipDeviceSynchronize() );
@@ -245,7 +264,7 @@ void memcpytest2_for_type(size_t numElements)
for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
memcpytest2<T>(&memD, usePinnedHost ? &memP : &memU, numElements, useHostToHost, useDeviceToDevice, useMemkindDefault);
}
}
}
@@ -256,7 +275,7 @@ void memcpytest2_for_type(size_t numElements)
//---
//Try many different sizes to memory copy.
template<typename T>
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
void memcpytest2_sizes(size_t maxElem=0)
{
printSep();
printf ("test: %s<%s>\n", __func__, TYPENAME(T));
@@ -268,19 +287,59 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
HIPCHECK(hipMemGetInfo(&free, &total));
if (maxElem == 0) {
maxElem = free/sizeof(T)/20;
maxElem = free/sizeof(T)/5;
}
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
HIPCHECK ( hipDeviceReset() );
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
memcpytest2<T>(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host
memcpytest2<T>(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host
for (size_t elem=1; elem<=maxElem; elem*=2) {
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
}
}
//---
//Try many different sizes to memory copy.
template<typename T>
void memcpytest2_offsets(size_t maxElem)
{
printSep();
printf ("test: %s<%s>\n", __func__, TYPENAME(T));
int deviceId;
HIPCHECK(hipGetDevice(&deviceId));
size_t free, total;
HIPCHECK(hipMemGetInfo(&free, &total));
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
HIPCHECK ( hipDeviceReset() );
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
size_t elem = maxElem / 2;
for (int offset=0; offset < 512; offset++) {
assert (elem + offset < maxElem);
memD.offset(offset);
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
}
for (int offset=512; offset < maxElem; offset*=2) {
assert (elem + offset < maxElem);
memD.offset(offset);
memcpytest2<T>(&memD, &memU, elem, 1, 1, 0); // unpinned host
memcpytest2<T>(&memD, &memP, elem, 1, 1, 0); // pinned host
}
}
@@ -296,13 +355,13 @@ void multiThread_1(bool serialize, bool usePinnedHost)
HostMemory<T> mem1(N, usePinnedHost);
HostMemory<T> mem2(N, usePinnedHost);
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, usePinnedHost,0,0,0);
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, 0,0,0);
if (serialize) {
t1.join();
}
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, usePinnedHost,0,0,0);
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, 0,0,0);
if (serialize) {
t2.join();
}
@@ -343,17 +402,17 @@ int main(int argc, char *argv[])
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
// These all pass:
memcpytest2<float>(&memD, &memP, 15*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 15*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0);
// Just over 64MB:
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 0, 0, 0);
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
}
@@ -361,16 +420,19 @@ int main(int argc, char *argv[])
if (p_tests & 0x4) {
printf ("\n\n=== tests&4 (test sizes and offsets)\n");
printf ("\n\n=== tests&4 (test sizes)\n");
HIPCHECK ( hipDeviceReset() );
memcpytest2_sizes<float>(0);
printSep();
memcpytest2_sizes<float>(0,0);
printSep();
memcpytest2_sizes<float>(0,64);
printSep();
memcpytest2_sizes<float>(1024*1024, 13);
printSep();
memcpytest2_sizes<float>(1024*1024, 50);
}
if (p_tests & 0x10) {
printf ("\n\n=== tests&4 (test offsets)\n");
HIPCHECK ( hipDeviceReset() );
memcpytest2_offsets<char>(256*1024*1024);
memcpytest2_offsets<float>(256*1024*1024);
memcpytest2_offsets<double>(256*1024*1024);
}
if (p_tests & 0x8) {