[ROCm/hip commit: b314777bc1]
Этот коммит содержится в:
Ben Sander
2016-02-13 01:15:23 -06:00
родитель 7a633dc4b8
Коммит 2d468c6ce2
2 изменённых файлов: 212 добавлений и 62 удалений
+107 -31
Просмотреть файл
@@ -31,7 +31,6 @@ THE SOFTWARE.
#include <list>
#include <sys/types.h>
#include <unistd.h>
#include <unordered_map>
#include <hc.hpp>
#include <hc_am.hpp>
@@ -61,6 +60,7 @@ int HIP_PRINT_ENV = 0;
int HIP_TRACE_API= 0;
int HIP_LAUNCH_BLOCKING = 0;
int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */
int HIP_STAGING_DOUBLE_BUFFER = 1;
#define TRACE_API 0x1 /* trace API calls and return values */
#define TRACE_SYNC 0x2 /* trace synchronization pieces */
@@ -123,22 +123,23 @@ struct ihipEvent_t {
//-------------------------------------------------------------------------------------------------
struct StagingBuffer {
static const int numBuffers = 2;
int _bufferIndex; // Operating on buffer 0 or 1?
ihipDevice_t *_device;
size_t _bufferSize; // Size of the buffers.
static const int _numBuffers = 2;
StagingBuffer(ihipDevice_t *device, size_t bufferSize) ;
StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ;
~StagingBuffer();
void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes);
void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes);
private:
char *_pinnedStagingBuffer[numBuffers];
hsa_signal_t _completion_signal[numBuffers];
ihipDevice_t *_device;
size_t _bufferSize; // Size of the buffers.
bool _double_buffer;
char *_pinnedStagingBuffer[_numBuffers];
hsa_signal_t _completion_signal[_numBuffers];
};
@@ -179,7 +180,7 @@ public:
//Device may be reset multiple times, and may be reset after init.
void ihipDevice_t::reset()
{
_staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024);
_staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER);
_staging_device2host = NULL;
};
@@ -519,6 +520,7 @@ void ihipInit()
READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes.");
READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." );
READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" );
READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" );
/*
* Build a table of valid compute devices.
@@ -1568,12 +1570,13 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou
//-------------------------------------------------------------------------------------------------
StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) :
_bufferIndex(0),
StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) :
_device(device),
_bufferSize(bufferSize)
_bufferSize(bufferSize),
_double_buffer(doubleBuffer)
{
for (int i=0; i<numBuffers; i++) {
for (int i=0; i<_numBuffers; i++) {
// TODO - experiment with alignment here.
_pinnedStagingBuffer[i] = hc::AM_alloc(_bufferSize, device->_acc, amHostPinned);
if (_pinnedStagingBuffer[i] == NULL) {
throw;
@@ -1585,7 +1588,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) :
//---
StagingBuffer::~StagingBuffer()
{
for (int i=0; i<numBuffers; i++) {
for (int i=0; i<_numBuffers; i++) {
if (_pinnedStagingBuffer[i]) {
hc::AM_free(_pinnedStagingBuffer[i]);
_pinnedStagingBuffer[i] = NULL;
@@ -1596,33 +1599,98 @@ StagingBuffer::~StagingBuffer()
//---
void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) {
void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes)
{
const char *srcp = static_cast<const char*> (src);
char *dstp = static_cast<char*> (dst);
assert(sizeBytes < UINT64_MAX/2); // TODO
for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) {
for (int i=0; i<_numBuffers; i++) {
hsa_signal_store_relaxed(_completion_signal[i], 0);
}
assert(sizeBytes < UINT64_MAX/2); // TODO
int bufferIndex = 0;
for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) {
// TODO - double-buffer these guys.
size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining;
tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]);
tprintf (TRACE_COPY2, "waiting... on completion signal\n");
hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes);
tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]);
// TODO - use uncached memcpy, someday.
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes);
tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp);
tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp);
hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1);
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]);
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status);
if (hsa_status == HSA_STATUS_SUCCESS) {
hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
}
assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw
srcp += theseBytes;
dstp += theseBytes;
if (_double_buffer) {
bufferIndex = (bufferIndex + 1) % _numBuffers;
}
}
for (int i=0; i<_numBuffers; i++) {
hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
}
}
//---
void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes)
{
const char *srcp0 = static_cast<const char*> (src);
char *dstp1 = static_cast<char*> (dst);
int numBuffers = _double_buffer ? _numBuffers : 1;
for (int i=0; i<numBuffers; i++) {
hsa_signal_store_relaxed(_completion_signal[i], 0);
}
assert(sizeBytes < UINT64_MAX/2); // TODO
int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer.
int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest
while (bytesRemaining1 > 0) {
// First launch the async copies to copy from dest to host
for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) {
size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0;
tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]);
hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1);
hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]);
assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw
srcp0 += theseBytes;
}
// Now unload the staging buffers:
for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) {
size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1;
tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1);
hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1);
memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes);
dstp1 += theseBytes;
}
}
//for (int i=0; i<_numBuffers; i++) {
// hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE);
//}
}
@@ -1657,10 +1725,18 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size
if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) {
if (useStagingBuffer) {
device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes);
} else {
hc::AM_copy(dst, src, sizeBytes);
}
} else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) {
// TODO - optimize the copy here.
hc::AM_copy(dst, src, sizeBytes);
if (useStagingBuffer) {
device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes);
} else {
hc::AM_copy(dst, src, sizeBytes);
}
} else if (kind == hipMemcpyHostToHost) {
memcpy(dst, src, sizeBytes);
} else {
// Let HSA runtime handle it:
// TODO - need buffer pool for the signals:
+105 -31
Просмотреть файл
@@ -22,8 +22,11 @@ THE SOFTWARE.
#include "hip_runtime.h"
#include "test_common.h"
//:w #include <typeinfo>
void printSep()
{
printf ("======================================================================================\n");
}
// Test simple H2D copies and back.
void simpleTest1()
@@ -61,21 +64,22 @@ void simpleTest1()
// Test many different kinds of memory copies:
template <typename T>
void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
size_t sizeElements = numElements * sizeof(T);
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n",
__func__,
typeid(T).name(),
sizeElements, sizeElements/1024.0/1024.0,
usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
T *A_d, *B_d, *C_d;
T *A_h, *B_h, *C_h;
size_t Nbytes = N*sizeof(T);
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
T *A_hh = NULL;
T *B_hh = NULL;
@@ -85,44 +89,44 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice,
if (useHostToHost) {
if (usePinnedHost) {
HIPCHECK ( hipMallocHost(&A_hh, Nbytes) );
HIPCHECK ( hipMallocHost(&B_hh, Nbytes) );
HIPCHECK ( hipMallocHost(&A_hh, sizeElements) );
HIPCHECK ( hipMallocHost(&B_hh, sizeElements) );
} else {
A_hh = (T*)malloc(Nbytes);
B_hh = (T*)malloc(Nbytes);
A_hh = (T*)malloc(sizeElements);
B_hh = (T*)malloc(sizeElements);
}
// Do some extra host-to-host copies here to mix things up:
HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
} else {
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
}
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);
if (useDeviceToDevice) {
HIPCHECK ( hipMalloc(&C_dd, Nbytes) );
HIPCHECK ( hipMalloc(&C_dd, sizeElements) );
// Do an extra device-to-device copies here to mix things up:
HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));
//Destroy the original C_d:
HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes));
HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));
HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
} else {
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
}
HIPCHECK ( hipDeviceSynchronize() );
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::checkVectorADD(A_h, B_h, C_h, numElements);
HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
HIPCHECK ( hipDeviceReset() );
@@ -132,8 +136,10 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice,
template<typename T>
void memcpytest2_loop()
void memcpytest2_loop(size_t numElements)
{
printSep();
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
#define USE_HOST_2_HOST
#ifdef USE_HOST_2_HOST
@@ -143,7 +149,7 @@ void memcpytest2_loop()
#endif
for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) {
for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) {
memcpytest2<T>(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
memcpytest2<T>(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);
}
}
}
@@ -151,20 +157,88 @@ void memcpytest2_loop()
}
template<typename T>
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
{
printSep();
printf ("test: %s<%s>\n", __func__, typeid(T).name());
int deviceId;
HIPCHECK(hipGetDevice(&deviceId));
size_t free, total;
HIPCHECK(hipMemGetInfo(&free, &total));
if (maxElem == 0) {
maxElem = free/sizeof(T)/5;
}
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);
for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host
memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host
}
}
template<typename T>
void multiThread_1(bool serialize)
{
printSep();
printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize);
std::thread t1 (memcpytest2<T>,N, 0,0,0,0);
if (serialize) {
t1.join();
}
std::thread t2 (memcpytest2<T>,N, 0,0,0,0);
if (serialize) {
t2.join();
}
if (!serialize) {
t1.join();
t2.join();
}
}
int main(int argc, char *argv[])
{
HipTest::parseStandardArguments(argc, argv, true);
simpleTest1();
if (p_tests & 0x1) {
simpleTest1();
}
//memcpytest2<char>(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/);
if (p_tests & 0x2) {
memcpytest2_loop<float>(N);
memcpytest2_loop<double>(N);
memcpytest2_loop<char>(N);
memcpytest2_loop<int>(N);
}
memcpytest2_loop<float>();
memcpytest2_loop<double>();
memcpytest2_loop<char>();
memcpytest2_loop<int>();
if (p_tests & 0x4) {
printSep();
memcpytest2_sizes<float>(0,0);
printSep();
memcpytest2_sizes<float>(0,64);
printSep();
memcpytest2_sizes<float>(1024*1024, 13);
printSep();
memcpytest2_sizes<float>(1024*1024, 50);
}
if (p_tests & 0x8) {
printSep();
multiThread_1<float>(true);
multiThread_1<float>(false);
}
passed();