From e7586adb3308313a413547aa953be072029e2f75 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 17 Mar 2016 00:20:56 -0500 Subject: [PATCH] Refactor to isolate staging buffer code. --- include/hcc_detail/staging_buffer.h | 6 +-- include/hip_runtime_api.h | 2 + src/hip_hcc.cpp | 17 +++---- src/staging_buffer.cpp | 76 +++++++++++++++++------------ tests/src/CMakeLists.txt | 7 +-- 5 files changed, 61 insertions(+), 47 deletions(-) diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/staging_buffer.h index b8c08410a9..d5f5860ce6 100644 --- a/include/hcc_detail/staging_buffer.h +++ b/include/hcc_detail/staging_buffer.h @@ -2,14 +2,13 @@ #include "hsa.h" -struct ihipDevice_t; //------------------------------------------------------------------------------------------------- struct StagingBuffer { static const int _max_buffers = 4; - StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) ; + StagingBuffer(hc::accelerator &acc, size_t bufferSize, int numBuffers) ; ~StagingBuffer(); void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); @@ -20,7 +19,8 @@ struct StagingBuffer { private: - ihipDevice_t *_device; + hc::accelerator &_acc; + hsa_agent_t _hsa_agent; size_t _bufferSize; // Size of the buffers. int _numBuffers; diff --git a/include/hip_runtime_api.h b/include/hip_runtime_api.h index f73ebeeb9b..9e191e138a 100644 --- a/include/hip_runtime_api.h +++ b/include/hip_runtime_api.h @@ -153,6 +153,8 @@ typedef enum hipError_t { ,hipErrorNoDevice ///< Call to hipGetDeviceCount returned 0 devices ,hipErrorNotReady ///< Indicates that asynchronous operations enqueued earlier are not ready. This is not actually an error, but is used to distinguish from hipSuccess (which indicates completion). APIs that return this error include hipEventQuery and hipStreamQuery. ,hipErrorUnknown ///< Unknown error. + ,hipErrorRuntimeMemory ///< HSA runtime memory call returned error. Typically not seen in production systems. + ,hipErrorRuntimeOther ///< HSA runtime call other than memory returned error. Typically not seen in production systems. ,hipErrorTbd ///< Marker that more error codes are needed. } hipError_t; diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 0cef779790..468caf436f 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -77,9 +77,10 @@ int HIP_DISABLE_HW_KERNEL_DEP = 1; int HIP_DISABLE_HW_COPY_DEP = 1; int HIP_DISABLE_BIDIR_MEMCPY = 0; -int HIP_ONESHOT_COPY_DEP = 1; // TODO - setting this =1 is a good thing, reduces input deps +#define HIP_HCC + // If set, thread-safety is enforced on all stream functions. // Stream functions will acquire a mutex before entering critical sections. #define STREAM_THREAD_SAFE 1 @@ -641,8 +642,8 @@ inline void ihipStream_t::postCopyCommand() //Device may be reset multiple times, and may be reset after init. void ihipDevice_t::reset() { - _staging_buffer[0] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); - _staging_buffer[1] = new StagingBuffer(this, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[0] = new StagingBuffer(_acc, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _staging_buffer[1] = new StagingBuffer(_acc, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); }; @@ -651,7 +652,7 @@ void ihipDevice_t::init(unsigned device_index, hc::accelerator acc) { _device_index = device_index; _acc = acc; - hsa_agent_t *agent = static_cast (acc.get_default_view().get_hsa_agent()); + hsa_agent_t *agent = static_cast (acc.get_hsa_agent()); if (agent) { int err = hsa_agent_get_info(*agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &_compute_units); if (err != HSA_STATUS_SUCCESS) { @@ -1018,8 +1019,6 @@ void ihipInit() READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host. -1 means ignore these dependencies. (debug mode)"); READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host. -1 means ifnore these dependencies (debug mode)"); READ_ENV_I(release, HIP_DISABLE_BIDIR_MEMCPY, 0, "Disable simultaneous H2D memcpy and D2H memcpy to same device"); - READ_ENV_I(release, HIP_ONESHOT_COPY_DEP, 0, "If set, only set the copy input dependency for the first copy command in a staged copy. If clear, set the dep for each copy."); - /* @@ -1508,6 +1507,8 @@ const char *hipGetErrorName(hipError_t hip_error) case hipErrorInvalidMemcpyDirection : return "hipErrorInvalidMemcpyDirection"; case hipErrorNoDevice : return "hipErrorNoDevice"; case hipErrorNotReady : return "hipErrorNotReady"; + case hipErrorRuntimeMemory : return "hipErrorRuntimeMemory"; + case hipErrorRuntimeOther : return "hipErrorRuntimeOther"; case hipErrorUnknown : return "hipErrorUnknown"; case hipErrorTbd : return "hipErrorTbd"; default : return "hipErrorUnknown"; @@ -2189,6 +2190,7 @@ hipError_t hipHostUnregister(void *hostPtr){ return ihipLogStatus(hip_status); } + //--- hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t count, size_t offset, hipMemcpyKind kind) { @@ -2212,9 +2214,6 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou - - - void ihipStream_t::ihipSyncCopy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { ihipDevice_t *device = this->getDevice(); diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index 18f8d9a512..11e271fbf4 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -1,25 +1,41 @@ +#include + +#include "hsa_ext_amd.h" + #include "hcc_detail/staging_buffer.h" +#ifndef tprintf +#define tprintf(trace_level, ...) +#endif + +#ifdef HIP_HCC +#define THROW_ERROR(e) throw ihipException(e) +#else +#define THROW_ERROR(e) throw +#endif //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(ihipDevice_t *device, size_t bufferSize, int numBuffers) : - _device(device), +StagingBuffer::StagingBuffer(hc::accelerator &acc, size_t bufferSize, int numBuffers) : + _acc(acc), _bufferSize(bufferSize), _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) { + hsa_agent_t *agentPtr = static_cast (acc.get_hsa_agent()); + _hsa_agent = *agentPtr; for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. - _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, device->_acc, amHostPinned); + _pinnedStagingBuffer[i] = hc::am_alloc(_bufferSize, _acc, amHostPinned); if (_pinnedStagingBuffer[i] == NULL) { - throw ihipException(hipErrorMemoryAllocation); + THROW_ERROR(hipErrorMemoryAllocation); } hsa_signal_create(0, 0, NULL, &_completion_signal[i]); } }; + //--- StagingBuffer::~StagingBuffer() { @@ -34,9 +50,10 @@ StagingBuffer::~StagingBuffer() +//--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsa_agent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { @@ -48,7 +65,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ } if (sizeBytes >= UINT64_MAX/2) { - throw (ihipException(hipErrorInvalidValue)); + THROW_ERROR (hipErrorInvalidValue); } int bufferIndex = 0; for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { @@ -58,26 +75,24 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO void *locked_srcp; - hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_device->_hsa_agent, 1, &locked_srcp); - //hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_device->_hsa_agent, 1, &locked_srcp); + hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsa_agent, 1, &locked_srcp); + //hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_hsa_agent, 1, &locked_srcp); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); if (hsa_status != HSA_STATUS_SUCCESS) { - throw (ihipException(hipErrorUnknown)); + THROW_ERROR (hipErrorRuntimeMemory); } hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, locked_srcp, _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { - throw (ihipException(hipErrorUnknown)); + THROW_ERROR (hipErrorRuntimeMemory); } srcp += theseBytes; @@ -86,9 +101,8 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ bufferIndex = 0; } - if (HIP_ONESHOT_COPY_DEP) { - waitFor = NULL; // TODO - don't need dependency after first copy submitted? - } + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } // TODO - @@ -106,7 +120,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _device) +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsa_agent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { @@ -117,7 +131,9 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[i], 0); } - assert(sizeBytes < UINT64_MAX/2); // TODO + if (sizeBytes >= UINT64_MAX/2) { + THROW_ERROR (hipErrorInvalidValue); + } int bufferIndex = 0; for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { @@ -133,11 +149,11 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _device->_hsa_agent, _pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { - throw (ihipException(hipErrorUnknown)); + THROW_ERROR ((hipErrorRuntimeMemory)); } srcp += theseBytes; @@ -146,9 +162,8 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte bufferIndex = 0; } - if (HIP_ONESHOT_COPY_DEP) { - waitFor = NULL; // TODO - don't need dependency after first copy submitted? - } + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } @@ -159,7 +174,7 @@ void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeByte //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from agent this buffer is assocaited with (via _device). +//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsa_agent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) @@ -171,7 +186,9 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte hsa_signal_store_relaxed(_completion_signal[i], 0); } - assert(sizeBytes < UINT64_MAX/2); // TODO + if (sizeBytes >= UINT64_MAX/2) { + THROW_ERROR (hipErrorInvalidValue); + } int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest @@ -184,17 +201,16 @@ void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeByte tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _device->_hsa_agent, srcp0, _device->_hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _hsa_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { - throw (ihipException(hipErrorUnknown)); + THROW_ERROR (hipErrorRuntimeMemory); } srcp0 += theseBytes; - if (HIP_ONESHOT_COPY_DEP) { - waitFor = NULL; // TODO - don't need dependency after first copy submitted? - } + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } // Now unload the staging buffers: diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index 3223bdb9f8..946fbdc22d 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -35,8 +35,8 @@ if (${HIP_PLATFORM} STREQUAL "hcc") include_directories(${HIP_PATH}/include) # hip_hcc.o: - add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ) - #add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ${HIP_PATH}/src/staging_buffer.cpp) + #add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ) + add_library(hip_hcc STATIC ${HIP_PATH}/src/hip_hcc.cpp ${HIP_PATH}/src/staging_buffer.cpp) target_include_directories(hip_hcc PRIVATE ${HSA_PATH}/include) @@ -139,9 +139,6 @@ make_hip_executable (hipHostGetFlags hipHostGetFlags.cpp) make_hip_executable (hipHostRegister hipHostRegister.cpp) make_hip_executable (hipRandomMemcpyAsync hipRandomMemcpyAsync.cpp) -# not needed since hipcc now includes -lm by default: -#target_link_libraries(hipMathFunctionsHost m) - make_test(hip_ballot " " ) make_test(hip_anyall " " ) make_test(hip_popc " " )