diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index 0b7c81a365..4921a61c72 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -31,7 +31,6 @@ THE SOFTWARE. #include #include #include -#include #include #include @@ -61,6 +60,7 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ +int HIP_STAGING_DOUBLE_BUFFER = 1; #define TRACE_API 0x1 /* trace API calls and return values */ #define TRACE_SYNC 0x2 /* trace synchronization pieces */ @@ -123,22 +123,23 @@ struct ihipEvent_t { //------------------------------------------------------------------------------------------------- struct StagingBuffer { - static const int numBuffers = 2; - - int _bufferIndex; // Operating on buffer 0 or 1? - - ihipDevice_t *_device; - size_t _bufferSize; // Size of the buffers. + static const int _numBuffers = 2; - StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) ; ~StagingBuffer(); + void CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); private: - char *_pinnedStagingBuffer[numBuffers]; - hsa_signal_t _completion_signal[numBuffers]; + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + bool _double_buffer; + + char *_pinnedStagingBuffer[_numBuffers]; + hsa_signal_t _completion_signal[_numBuffers]; }; @@ -179,7 +180,7 @@ public: //Device may be reset multiple times, and may be reset after init. void ihipDevice_t::reset() { - _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_DOUBLE_BUFFER); _staging_device2host = NULL; }; @@ -519,6 +520,7 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of staging buffer, in KB" ); + READ_ENV_I(release, HIP_STAGING_DOUBLE_BUFFER, 0, "Double-buffer copies to device" ); /* * Build a table of valid compute devices. @@ -1568,12 +1570,13 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : - _bufferIndex(0), +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, bool doubleBuffer) : _device(device), - _bufferSize(bufferSize) + _bufferSize(bufferSize), + _double_buffer(doubleBuffer) { - for (int i=0; i_acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { throw; @@ -1585,7 +1588,7 @@ StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : //--- StagingBuffer::~StagingBuffer() { - for (int i=0; i (src); char *dstp = static_cast (dst); - assert(sizeBytes < UINT64_MAX/2); // TODO - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } + + assert(sizeBytes < UINT64_MAX/2); // TODO + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { - // TODO - double-buffer these guys. size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + tprintf (TRACE_COPY2, "waiting... on completion signal\n"); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + // TODO - use uncached memcpy, someday. + memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[bufferIndex], dstp); - hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); - tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status); - if (hsa_status == HSA_STATUS_SUCCESS) { - hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw srcp += theseBytes; dstp += theseBytes; + if (_double_buffer) { + bufferIndex = (bufferIndex + 1) % _numBuffers; + } } + + + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } +} + +//--- +void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes) +{ + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); + + int numBuffers = _double_buffer ? _numBuffers : 1; + + for (int i=0; i 0) { + // First launch the async copies to copy from dest to host + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; + + tprintf (TRACE_COPY2, "D2H: async_copy %zu bytes src:%p to staging:%p\n", theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], srcp0, theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[bufferIndex]); + assert(hsa_status == HSA_STATUS_SUCCESS); // TODO - throw + + srcp0 += theseBytes; + } + + // Now unload the staging buffers: + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; + + tprintf (TRACE_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (TRACE_COPY2, "D2H: copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); + + dstp1 += theseBytes; + } + } + + + //for (int i=0; i<_numBuffers; i++) { + // hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //} } @@ -1657,10 +1725,18 @@ void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t size if ((kind == hipMemcpyHostToDevice) && (srcNotTracked)) { if (useStagingBuffer) { device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); } } else if ((kind == hipMemcpyDeviceToHost) && (dstNotTracked)) { - // TODO - optimize the copy here. - hc::AM_copy(dst, src, sizeBytes); + if (useStagingBuffer) { + device->_staging_host2device->CopyDeviceToHost(dst, src, sizeBytes); + } else { + hc::AM_copy(dst, src, sizeBytes); + } + } else if (kind == hipMemcpyHostToHost) { + memcpy(dst, src, sizeBytes); + } else { // Let HSA runtime handle it: // TODO - need buffer pool for the signals: diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 0de1b0b7a0..509f4a1177 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -22,8 +22,11 @@ THE SOFTWARE. #include "hip_runtime.h" #include "test_common.h" -//:w #include +void printSep() +{ + printf ("======================================================================================\n"); +} // Test simple H2D copies and back. void simpleTest1() @@ -61,21 +64,22 @@ void simpleTest1() // Test many different kinds of memory copies: template -void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { - printf ("test: %s<%s> usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + size_t sizeElements = numElements * sizeof(T); + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, typeid(T).name(), + sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; - size_t Nbytes = N*sizeof(T); - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, usePinnedHost); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; @@ -85,44 +89,44 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, if (useHostToHost) { if (usePinnedHost) { - HIPCHECK ( hipMallocHost(&A_hh, Nbytes) ); - HIPCHECK ( hipMallocHost(&B_hh, Nbytes) ); + HIPCHECK ( hipMallocHost(&A_hh, sizeElements) ); + HIPCHECK ( hipMallocHost(&B_hh, sizeElements) ); } else { - A_hh = (T*)malloc(Nbytes); - B_hh = (T*)malloc(Nbytes); + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, Nbytes) ); + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, Nbytes)); + HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(C_h, C_dd, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, N); + HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); HIPCHECK ( hipDeviceReset() ); @@ -132,8 +136,10 @@ void memcpytest2(bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, template -void memcpytest2_loop() +void memcpytest2_loop(size_t numElements) { + printSep(); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { #define USE_HOST_2_HOST #ifdef USE_HOST_2_HOST @@ -143,7 +149,7 @@ void memcpytest2_loop() #endif for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -151,20 +157,88 @@ void memcpytest2_loop() } +template +void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, typeid(T).name()); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + if (maxElem == 0) { + maxElem = free/sizeof(T)/5; + } + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + + for (size_t elem=64; elem+offset<=maxElem; elem*=2) { + memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host + memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + } +} + + +template +void multiThread_1(bool serialize) +{ + printSep(); + printf ("test: %s<%s> serialize=%d\n", __func__, typeid(T).name(), serialize); + std::thread t1 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t1.join(); + } + + + std::thread t2 (memcpytest2,N, 0,0,0,0); + if (serialize) { + t2.join(); + } + + if (!serialize) { + t1.join(); + t2.join(); + } +} + + + int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); - simpleTest1(); + if (p_tests & 0x1) { + simpleTest1(); + } - //memcpytest2(0/*usePinnedHost*/, 0/*useHostToHost*/, 0/*useDeviceToDevice*/, 1/*useMemkindDefault*/); + if (p_tests & 0x2) { + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + memcpytest2_loop(N); + } - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); - memcpytest2_loop(); + if (p_tests & 0x4) { + printSep(); + memcpytest2_sizes(0,0); + printSep(); + memcpytest2_sizes(0,64); + printSep(); + memcpytest2_sizes(1024*1024, 13); + printSep(); + memcpytest2_sizes(1024*1024, 50); + } + if (p_tests & 0x8) { + printSep(); + multiThread_1(true); + multiThread_1(false); + } passed();