@@ -31,7 +31,6 @@ THE SOFTWARE.
|
||||
#include <list>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <hc.hpp>
|
||||
#include <hc_am.hpp>
|
||||
@@ -61,6 +60,7 @@ int HIP_PRINT_ENV = 0;
|
||||
int HIP_TRACE_API= 0;
|
||||
int HIP_LAUNCH_BLOCKING = 0;
|
||||
int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */
|
||||
int HIP_STAGING_DOUBLE_BUFFER = 1;
|
||||
|
||||
#define TRACE_API 0x1 /* trace API calls and return values */
|
||||
#define TRACE_SYNC 0x2 /* trace synchronization pieces */
|
||||
@@ -123,22 +123,23 @@ struct ihipEvent_t {
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
struct StagingBuffer {
|
||||
static const int numBuffers = 2;
|
||||
|
||||
int _bufferIndex; // Operating on buffer 0 or 1?
|
||||
|
||||
ihipDevice_t *_device;
|
||||
size_t _bufferSize; // Size of the buffers.
|
||||
static const int _numBuffers = 2;
|
||||
|
||||
|
||||
StagingBuffer(ihipDevice_t *device, size_t bufferSize) ;
|
||||
|
||||
StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ;
|
||||
~StagingBuffer();
|
||||
|
||||
void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes);
|
||||
void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes);
|
||||
|
||||
private:
|
||||
char *_pinnedStagingBuffer[numBuffers];
|
||||
hsa_signal_t _completion_signal[numBuffers];
|
||||
ihipDevice_t *_device;
|
||||
size_t _bufferSize; // Size of the buffers.
|
||||
bool _double_buffer;
|
||||
|
||||
char *_pinnedStagingBuffer[_numBuffers];
|
||||
hsa_signal_t _completion_signal[_numBuffers];
|
||||
};
|
||||
|
||||
|
||||
@@ -179,7 +180,7 @@ public:
|
||||
//Device may be reset multiple times, and may be reset after init.
|
||||
void ihipDevice_t::reset()
|
||||
{
|
||||
_staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024);
|
||||
_staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER);
|
||||
_staging_device2host = NULL;
|
||||
};
|
||||
|
||||
@@ -519,6 +520,7 @@ void ihipInit()
|
||||
READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes.");
|
||||
READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
|
||||
READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" );
|
||||
READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" );
|
||||
|
||||
/*
|
||||
* Build a table of valid compute devices.
|
||||
@@ -1568,12 +1570,13 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------------------
|
||||
StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) :
|
||||
_bufferIndex(0),
|
||||
StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) :
|
||||
_device(device),
|
||||
_bufferSize(bufferSize)
|
||||
_bufferSize(bufferSize),
|
||||
_double_buffer(doubleBuffer)
|
||||
{
|
||||
for (int i=0; i<numBuffers; i++) {
|
||||
for (int i=0; i<_numBuffers; i++) {
|
||||
// TODO - experiment with alignment here.
|
||||
_pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned);
|
||||
if (_pinnedStagingBuffer[i] == NULL) {
|
||||
throw;
|
||||
@@ -1585,7 +1588,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) :
|
||||
//---
|
||||
StagingBuffer::~StagingBuffer()
|
||||
{
|
||||
for (int i=0; i<numBuffers; i++) {
|
||||
for (int i=0; i<_numBuffers; i++) {
|
||||
if (_pinnedStagingBuffer[i]) {
|
||||
hc::AM_free(_pinnedStagingBuffer[i]);
|
||||
_pinnedStagingBuffer[i] = NULL;
|
||||
@@ -1596,33 +1599,98 @@ StagingBuffer::~StagingBuffer()
|
||||
|
||||
|
||||
//---
|
||||
void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) {
|
||||
void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes)
|
||||
{
|
||||
const char *srcp = static_cast<const char*> (src);
|
||||
char *dstp = static_cast<char*> (dst);
|
||||
|
||||
assert(sizeBytes < UINT64_MAX/2); // TODO
|
||||
for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) {
|
||||
for (int i=0; i<_numBuffers; i++) {
|
||||
hsa_signal_store_relaxed(_completion_signal[i], 0);
|
||||
}
|
||||
|
||||
assert(sizeBytes < UINT64_MAX/2); // TODO
|
||||
int bufferIndex = 0;
|
||||
for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) {
|
||||
|
||||
// TODO - double-buffer these guys.
|
||||
size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;
|
||||
|
||||
tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]);
|
||||
tprintf (TRACE_COPY2, "waiting... on completion signal\n");
|
||||
hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
|
||||
memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes);
|
||||
tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]);
|
||||
// TODO - use uncached memcpy, someday.
|
||||
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes);
|
||||
|
||||
tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp);
|
||||
tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp);
|
||||
|
||||
hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]);
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
|
||||
|
||||
tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status);
|
||||
if (hsa_status == HSA_STATUS_SUCCESS) {
|
||||
hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw
|
||||
|
||||
srcp += theseBytes;
|
||||
dstp += theseBytes;
|
||||
if (_double_buffer) {
|
||||
bufferIndex = (bufferIndex + 1) % _numBuffers;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (int i=0; i<_numBuffers; i++) {
|
||||
hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
}
|
||||
}
|
||||
|
||||
//---
|
||||
void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes)
|
||||
{
|
||||
const char *srcp0 = static_cast<const char*> (src);
|
||||
char *dstp1 = static_cast<char*> (dst);
|
||||
|
||||
int numBuffers = _double_buffer ? _numBuffers : 1;
|
||||
|
||||
for (int i=0; i<numBuffers; i++) {
|
||||
hsa_signal_store_relaxed(_completion_signal[i], 0);
|
||||
}
|
||||
|
||||
assert(sizeBytes < UINT64_MAX/2); // TODO
|
||||
|
||||
int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer.
|
||||
int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest
|
||||
|
||||
while (bytesRemaining1 > 0) {
|
||||
// First launch the async copies to copy from dest to host
|
||||
for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) {
|
||||
|
||||
size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0;
|
||||
|
||||
tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
|
||||
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
|
||||
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
|
||||
assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw
|
||||
|
||||
srcp0 += theseBytes;
|
||||
}
|
||||
|
||||
// Now unload the staging buffers:
|
||||
for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) {
|
||||
|
||||
size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1;
|
||||
|
||||
tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1);
|
||||
hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
|
||||
tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
|
||||
memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes);
|
||||
|
||||
dstp1 += theseBytes;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//for (int i=0; i<_numBuffers; i++) {
|
||||
// hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
|
||||
//}
|
||||
}
|
||||
|
||||
|
||||
@@ -1657,10 +1725,18 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size
|
||||
if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) {
|
||||
if (useStagingBuffer) {
|
||||
device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes);
|
||||
} else {
|
||||
hc::AM_copy(dst, src, sizeBytes);
|
||||
}
|
||||
} else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) {
|
||||
// TODO - optimize the copy here.
|
||||
hc::AM_copy(dst, src, sizeBytes);
|
||||
if (useStagingBuffer) {
|
||||
device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes);
|
||||
} else {
|
||||
hc::AM_copy(dst, src, sizeBytes);
|
||||
}
|
||||
} else if (kind == hipMemcpyHostToHost) {
|
||||
memcpy(dst, src, sizeBytes);
|
||||
|
||||
} else {
|
||||
// Let HSA runtime handle it:
|
||||
// TODO - need buffer pool for the signals:
|
||||
|
||||
@@ -22,8 +22,11 @@ THE SOFTWARE.
|
||||
#include "hip_runtime.h"
|
||||
#include "test_common.h"
|
||||
|
||||
//:w #include <typeinfo>
|
||||
|
||||
void printSep()
|
||||
{
|
||||
printf ("======================================================================================\n");
|
||||
}
|
||||
|
||||
// Test simple H2D copies and back.
|
||||
void simpleTest1()
|
||||
@@ -61,21 +64,22 @@ void simpleTest1()
|
||||
// Test many different kinds of memory copies:
|
||||
|
||||
template <typename T>
|
||||
void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
|
||||
{
|
||||
printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
|
||||
size_t sizeElements = numElements * sizeof(T);
|
||||
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
|
||||
__func__,
|
||||
typeid(T).name(),
|
||||
sizeElements, sizeElements/1024.0/1024.0,
|
||||
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
|
||||
|
||||
T *A_d, *B_d, *C_d;
|
||||
T *A_h, *B_h, *C_h;
|
||||
|
||||
size_t Nbytes = N*sizeof(T);
|
||||
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
|
||||
|
||||
T *A_hh = NULL;
|
||||
T *B_hh = NULL;
|
||||
@@ -85,44 +89,44 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice,
|
||||
|
||||
if (useHostToHost) {
|
||||
if (usePinnedHost) {
|
||||
HIPCHECK ( hipMallocHost(&A_hh, Nbytes) );
|
||||
HIPCHECK ( hipMallocHost(&B_hh, Nbytes) );
|
||||
HIPCHECK ( hipMallocHost(&A_hh, sizeElements) );
|
||||
HIPCHECK ( hipMallocHost(&B_hh, sizeElements) );
|
||||
} else {
|
||||
A_hh = (T*)malloc(Nbytes);
|
||||
B_hh = (T*)malloc(Nbytes);
|
||||
A_hh = (T*)malloc(sizeElements);
|
||||
B_hh = (T*)malloc(sizeElements);
|
||||
}
|
||||
|
||||
|
||||
// Do some extra host-to-host copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
|
||||
|
||||
|
||||
HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
|
||||
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
|
||||
|
||||
if (useDeviceToDevice) {
|
||||
HIPCHECK ( hipMalloc(&C_dd, Nbytes) );
|
||||
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
|
||||
|
||||
// Do an extra device-to-device copies here to mix things up:
|
||||
HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
|
||||
|
||||
//Destroy the original C_d:
|
||||
HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes));
|
||||
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
|
||||
|
||||
HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
} else {
|
||||
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
HIPCHECK ( hipDeviceSynchronize() );
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, N);
|
||||
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
|
||||
|
||||
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
|
||||
HIPCHECK ( hipDeviceReset() );
|
||||
@@ -132,8 +136,10 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice,
|
||||
|
||||
|
||||
template<typename T>
|
||||
void memcpytest2_loop()
|
||||
void memcpytest2_loop(size_t numElements)
|
||||
{
|
||||
printSep();
|
||||
|
||||
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
|
||||
#define USE_HOST_2_HOST
|
||||
#ifdef USE_HOST_2_HOST
|
||||
@@ -143,7 +149,7 @@ void memcpytest2_loop()
|
||||
#endif
|
||||
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
|
||||
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
|
||||
memcpytest2<T>(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -151,20 +157,88 @@ void memcpytest2_loop()
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s>\n", __func__, typeid(T).name());
|
||||
|
||||
int deviceId;
|
||||
HIPCHECK(hipGetDevice(&deviceId));
|
||||
|
||||
size_t free, total;
|
||||
HIPCHECK(hipMemGetInfo(&free, &total));
|
||||
|
||||
if (maxElem == 0) {
|
||||
maxElem = free/sizeof(T)/5;
|
||||
}
|
||||
|
||||
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
|
||||
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
|
||||
|
||||
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
|
||||
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
|
||||
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void multiThread_1(bool serialize)
|
||||
{
|
||||
printSep();
|
||||
printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize);
|
||||
std::thread t1 (memcpytest2<T>,N, 0,0,0,0);
|
||||
if (serialize) {
|
||||
t1.join();
|
||||
}
|
||||
|
||||
|
||||
std::thread t2 (memcpytest2<T>,N, 0,0,0,0);
|
||||
if (serialize) {
|
||||
t2.join();
|
||||
}
|
||||
|
||||
if (!serialize) {
|
||||
t1.join();
|
||||
t2.join();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
HipTest::parseStandardArguments(argc, argv, true);
|
||||
|
||||
|
||||
simpleTest1();
|
||||
if (p_tests & 0x1) {
|
||||
simpleTest1();
|
||||
}
|
||||
|
||||
//memcpytest2<char>(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/);
|
||||
if (p_tests & 0x2) {
|
||||
memcpytest2_loop<float>(N);
|
||||
memcpytest2_loop<double>(N);
|
||||
memcpytest2_loop<char>(N);
|
||||
memcpytest2_loop<int>(N);
|
||||
}
|
||||
|
||||
memcpytest2_loop<float>();
|
||||
memcpytest2_loop<double>();
|
||||
memcpytest2_loop<char>();
|
||||
memcpytest2_loop<int>();
|
||||
if (p_tests & 0x4) {
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(0,0);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(0,64);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(1024*1024, 13);
|
||||
printSep();
|
||||
memcpytest2_sizes<float>(1024*1024, 50);
|
||||
}
|
||||
|
||||
if (p_tests & 0x8) {
|
||||
printSep();
|
||||
multiThread_1<float>(true);
|
||||
multiThread_1<float>(false);
|
||||
}
|
||||
|
||||
passed();
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user