From c441d5ec29b8d4e59a41dedace57118165371e9a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 04:30:09 -0600 Subject: [PATCH] Step1 in staging buffer copy. - use StagingBuffer class for copies. - refactor g_device to use array rather than vector. (keeps pointers from moving). [ROCm/hip commit: 24c1fdb8643d25be2218fac99095c1ed0e4ea059] --- projects/hip/include/hcc_detail/AM.h | 8 +- projects/hip/src/hc_AM.cpp | 18 +-- projects/hip/src/hip_hcc.cpp | 200 +++++++++++++++++++++---- projects/hip/tests/src/hipMemcpy.cpp | 2 +- projects/hip/tests/src/test_common.cpp | 2 +- projects/hip/util/vim/hip.vim | 3 + 6 files changed, 188 insertions(+), 45 deletions(-) diff --git a/projects/hip/include/hcc_detail/AM.h b/projects/hip/include/hcc_detail/AM.h index c183844869..04804ffaa5 100644 --- a/projects/hip/include/hcc_detail/AM.h +++ b/projects/hip/include/hcc_detail/AM.h @@ -22,8 +22,8 @@ struct AmPointerInfo { bool _isInDeviceMem; ///< Memory is physically resident on a device (if false, memory is located on host) bool _isAmManaged; ///< Memory was allocated by AM and should be freed when am_reset is called. - int _appId; ///< App-specific storage. Used by HIP to store deviceID. - unsigned _appAllocationFlags; ///< App-specific allocation flags. Used by HIP to store allocation flags. + int _appId; ///< App-specific storage. (Used by HIP to store deviceID.) + unsigned _appAllocationFlags; ///< App-specific allocation flags. (Used by HIP to store allocation flags.) AmPointerInfo() {}; @@ -91,7 +91,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t size); * * @see AM_memtracker_add, */ -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr); +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr); //TODO-doc @@ -99,7 +99,7 @@ am_status_t am_memtracker_add(void* ptr, size_t sizeBytes, hc::accelerator acc, //TODO-doc -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags); +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags); /** diff --git a/projects/hip/src/hc_AM.cpp b/projects/hip/src/hc_AM.cpp index 3a6d116261..2d22b49fd4 100644 --- a/projects/hip/src/hc_AM.cpp +++ b/projects/hip/src/hc_AM.cpp @@ -24,10 +24,10 @@ //#include struct AmMemoryRange { - void * _basePointer; - void * _endPointer; - AmMemoryRange(void *basePointer, size_t sizeBytes) : - _basePointer(basePointer), _endPointer((unsigned char*)basePointer + sizeBytes - 1) {}; + const void * _basePointer; + const void * _endPointer; + AmMemoryRange(const void *basePointer, size_t sizeBytes) : + _basePointer(basePointer), _endPointer((const unsigned char*)basePointer + sizeBytes - 1) {}; }; // Functor to compare ranges: @@ -63,7 +63,7 @@ public: void insert(void *pointer, const hc::AmPointerInfo &p); int remove(void *pointer); - MapTrackerType::iterator find(void *hostPtr) ; + MapTrackerType::iterator find(const void *hostPtr) ; MapTrackerType::iterator readerLockBegin() { _mutex.lock(); return _tracker.begin(); } ; MapTrackerType::iterator end() { return _tracker.end(); } ; @@ -107,7 +107,7 @@ int AmPointerTracker::remove (void *pointer) //--- -AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (void *pointer) +AmPointerTracker::MapTrackerType::iterator AmPointerTracker::find (const void *pointer) { // TODO-mutex- read lock std::lock_guard l (_mutex); @@ -144,7 +144,7 @@ size_t AmPointerTracker::reset (hc::accelerator acc) for (auto iter = _tracker.begin() ; iter != _tracker.end(); ) { if (iter->second._acc == acc) { if (iter->second._isAmManaged) { - hsa_memory_free(iter->first._basePointer); + hsa_memory_free(const_cast (iter->first._basePointer)); } count++; @@ -278,7 +278,7 @@ am_status_t AM_copy(void* dst, const void* src, size_t sizeBytes) } -am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) +am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, const void *ptr) { auto infoI = g_amPointerTracker.find(ptr); if (infoI != g_amPointerTracker.end()) { @@ -290,7 +290,7 @@ am_status_t am_memtracker_getinfo(hc::AmPointerInfo *info, void *ptr) } -am_status_t am_memtracker_update(void* ptr, int appId, unsigned allocationFlags) +am_status_t am_memtracker_update(const void* ptr, int appId, unsigned allocationFlags) { auto iter = g_amPointerTracker.find(ptr); if (iter != g_amPointerTracker.end()) { diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index fe273aa21c..08f7859271 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -43,7 +43,7 @@ THE SOFTWARE. #include "hc_AM.cpp" -#define USE_ASYNC_COPY 0 +#define USE_ASYNC_COPY 1 #define USE_AM_TRACKER 1 /* use new AM memory tracker features */ #define INLINE static inline @@ -60,10 +60,12 @@ static const int release = 1; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; int HIP_LAUNCH_BLOCKING = 0; +int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -#define TRACE_API 0x1 /* trace API calls and return values */ -#define TRACE_SYNC 0x2 /* trace synchronization pieces */ -#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_API 0x1 /* trace API calls and return values */ +#define TRACE_SYNC 0x2 /* trace synchronization pieces */ +#define TRACE_MEM 0x4 /* trace memory allocation / deallocation */ +#define TRACE_COPY2 0x8 /* trace memory copy commands. Detailed. */ #define tprintf(trace_level, ...) {\ if (HIP_TRACE_API & trace_level) {\ @@ -119,6 +121,28 @@ struct ihipEvent_t { } ; +//------------------------------------------------------------------------------------------------- +struct StagingBuffer { + static const int numBuffers = 2; + + int _bufferIndex; // Operating on buffer 0 or 1? + + ihipDevice_t *_device; + size_t _bufferSize; // Size of the buffers. + + + StagingBuffer(ihipDevice_t *device, size_t bufferSize) ; + + void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes); + +private: + char *_pinnedStagingBuffer[numBuffers]; + hsa_signal_t _completion_signal[numBuffers]; +}; + + + +//------------------------------------------------------------------------------------------------- struct ihipDevice_t { unsigned _device_index; // index into g_devices. @@ -135,8 +159,11 @@ struct ihipDevice_t unsigned _compute_units; + StagingBuffer *_staging_host2device; + StagingBuffer *_staging_device2host; + public: - ihipDevice_t(unsigned device_index, hc::accelerator acc); + void init(unsigned device_index, hc::accelerator acc); hipError_t getProperties(hipDeviceProp_t* prop); // TODO- create a copy constructor. @@ -145,10 +172,10 @@ public: //================================================================================================= -ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) - : _device_index(device_index), - _acc(acc) +void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { + _device_index = device_index; + _acc = acc; hsa_agent_t *agent = static_cast (acc.get_default_view().get_hsa_agent()); if (agent) { int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units); @@ -166,6 +193,9 @@ ihipDevice_t::ihipDevice_t(unsigned device_index, hc::accelerator acc) _null_stream = new ihipStream_t(device_index, acc.get_default_view(), hipStreamDefault); this->_streams.push_back(_null_stream); tprintf(TRACE_SYNC, "created device with null_stream=%p\n", _null_stream); + + _staging_host2device = new StagingBuffer(this, HIP_STAGING_SIZE*1024); + _staging_device2host = NULL; }; #if 0 @@ -187,7 +217,8 @@ thread_local int tls_defaultDevice = 0; // Global initialization. std::once_flag hip_initialized; -std::vector g_devices; // Vector of all non-emulated (ie GPU) accelerators in the system. +ihipDevice_t *g_devices; // Array of all non-emulated (ie GPU) accelerators in the system. +unsigned g_deviceCnt; //================================================================================================= @@ -462,25 +493,36 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c //It is called with C++11 call_once, which provided thread-safety. void ihipInit() { - - /* - * Build a table of valid compute devices. - */ - auto accs = hc::accelerator::get_all(); - g_devices.reserve(accs.size()); - for (int i=0; i"); @@ -489,7 +531,7 @@ void ihipInit() INLINE bool ihipIsValidDevice(unsigned deviceIndex) { // deviceIndex is unsigned so always > 0 - return (deviceIndex < g_devices.size()); + return (deviceIndex < g_deviceCnt); } @@ -508,7 +550,7 @@ INLINE ihipDevice_t *ihipGetTlsDefaultDevice() //--- INLINE ihipDevice_t *ihipGetDevice(int deviceId) { - if ((deviceId >= 0) && (deviceId < g_devices.size())) { + if ((deviceId >= 0) && (deviceId < g_deviceCnt)) { return &g_devices[deviceId]; } else { return NULL; @@ -675,7 +717,7 @@ hipError_t hipGetDeviceCount(int *count) { std::call_once(hip_initialized, ihipInit); - *count = g_devices.size(); + *count = g_deviceCnt; if (*count > 0) { return ihipLogStatus(hipSuccess); @@ -764,7 +806,7 @@ hipError_t hipSetDevice(int device) { std::call_once(hip_initialized, ihipInit); - if ((device < 0) || (device >= g_devices.size())) { + if ((device < 0) || (device >= g_deviceCnt)) { return ihipLogStatus(hipErrorInvalidDevice); } else { tls_defaultDevice = device; @@ -1299,6 +1341,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; + if (attributes->device < 0) { + e = hipErrorInvalidDevice; + } + } else { attributes->memoryType = hipMemoryTypeDevice; @@ -1482,6 +1528,7 @@ hipError_t hipMallocHost(void** ptr, size_t sizeBytes) } +//--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { #ifdef USE_MEMCPYTOSYMBOL @@ -1500,6 +1547,102 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou } +//------------------------------------------------------------------------------------------------- +StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize) : + _bufferIndex(0), + _device(device), + _bufferSize(bufferSize) +{ + for (int i=0; i_acc, amHostPinned); + if (_pinnedStagingBuffer[i] == NULL) { + throw; + } + hsa_signal_create(0, 0, NULL, &_completion_signal[i]); + } +}; + + +//--- +void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes) { + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); + + assert(sizeBytes < UINT64_MAX/2); // TODO + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0; bytesRemaining -= _bufferSize) { + + // TODO - double-buffer these guys. + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (TRACE_COPY2, "copy %zu bytes %p to stagingBuf[%d]:%p\n", theseBytes, srcp, _bufferIndex, _pinnedStagingBuffer[_bufferIndex]); + + memcpy(_pinnedStagingBuffer[_bufferIndex], srcp, theseBytes); + + tprintf (TRACE_COPY2, "async_copy %zu bytes %p to %p\n", theseBytes, _pinnedStagingBuffer[_bufferIndex], dstp); + + hsa_signal_store_relaxed(_completion_signal[_bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _pinnedStagingBuffer[_bufferIndex], theseBytes, _device->_hsa_agent, 0, NULL, _completion_signal[_bufferIndex]); + + tprintf (TRACE_COPY2, "waiting... status=%d\n", hsa_status); + if (hsa_status == HSA_STATUS_SUCCESS) { + hsa_signal_wait_acquire(_completion_signal[_bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + + srcp += theseBytes; + dstp += theseBytes; + } +} + + + + +#if USE_AM_TRACKER +// TODO - add mutex to limit in/out: +void ihipAsyncCopy(ihipDevice_t *device, void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) +{ + hc::AmPointerInfo dstPtrInfo, srcPtrInfo; + + bool dstNotTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) != AM_SUCCESS); + bool srcNotTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) != AM_SUCCESS); + + bool useStagingBuffer = true; + + // Resolve default to a specific Kind, since we use different algorithms: + if (kind == hipMemcpyDefault) { + bool dstIsHost = (dstNotTracked || dstPtrInfo._isInDeviceMem); + bool srcIsHost = (srcNotTracked || srcPtrInfo._isInDeviceMem); + if (srcIsHost && !dstIsHost) { + kind = hipMemcpyHostToDevice; + } else if (!srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyHostToHost; + } else if (srcIsHost && dstIsHost) { + kind = hipMemcpyDeviceToDevice; + } + } + + switch (kind) { + case hipMemcpyHostToDevice: + if (srcNotTracked) { + device->_staging_host2device->CopyHostToDevice(dst, src, sizeBytes); + } else { + assert(0); // TODO + //hsa_signal_wait_relaxed(completion_signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + break; + case hipMemcpyDeviceToHost: + // TODO - optimize the copy here. + hc::AM_copy(dst, src, sizeBytes); + break; + default: + assert(0); // TODO + } +} +#endif + + + //--- hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -1517,11 +1660,8 @@ hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind ihipDevice_t *device = &g_devices[stream->_device_index]; - hsa_signal_t completion_signal; // init/obtain from pool. + ihipAsyncCopy(device, dst, src, sizeBytes, kind); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, src, size, device->_hsa_agent, 0, NULL, &completion_signal); - - e = (hsa_status == HSA_STATUS_SUCCESS) ? hipSuccess : hipErrorTbd; } else { e = hipErrorInvalidResourceHandle; } diff --git a/projects/hip/tests/src/hipMemcpy.cpp b/projects/hip/tests/src/hipMemcpy.cpp index 5db2b270d6..7664cfb581 100644 --- a/projects/hip/tests/src/hipMemcpy.cpp +++ b/projects/hip/tests/src/hipMemcpy.cpp @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) size_t Nbytes = N*sizeof(int); - printf ("N=%zu \n", N); + printf ("N=%zu Nbytes=%6.2fMB\n", N, Nbytes/1024.0/1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; diff --git a/projects/hip/tests/src/test_common.cpp b/projects/hip/tests/src/test_common.cpp index 02deb51c85..3da5568b7c 100644 --- a/projects/hip/tests/src/test_common.cpp +++ b/projects/hip/tests/src/test_common.cpp @@ -88,7 +88,7 @@ int parseStandardArguments(int argc, char *argv[], bool failOnUndefinedArg) if (!strcmp(arg, " ")) { // skip NULL args. - } else if (!strcmp(arg, "--N")) { + } else if (!strcmp(arg, "--N") || (!strcmp(arg, "-N"))) { if (++i >= argc || !HipTest::parseSize(argv[i], &N)) { failed("Bad N size argument"); } diff --git a/projects/hip/util/vim/hip.vim b/projects/hip/util/vim/hip.vim index 01f3b3f2ad..e2236f4fbc 100644 --- a/projects/hip/util/vim/hip.vim +++ b/projects/hip/util/vim/hip.vim @@ -151,6 +151,9 @@ syn keyword hipFunctionName hipUnbindTexture syn keyword hipFlags hipFilterModePoint syn keyword hipFlags hipMemcpyHostToDevice syn keyword hipFlags hipMemcpyDeviceToHost +syn keyword hipFlags hipMemcpyHostToHost +syn keyword hipFlags hipMemcpyDeviceToDevice +syn keyword hipFlags hipMemcpyDefault syn keyword hipFlags hipReadModeElementType syn keyword hipFlags hipSuccess syn keyword hipFlags hipTextureType1D