diff --git a/include/hcc_detail/staging_buffer.h b/include/hcc_detail/staging_buffer.h index 799de58c3e..fe28a93e16 100644 --- a/include/hcc_detail/staging_buffer.h +++ b/include/hcc_detail/staging_buffer.h @@ -41,21 +41,21 @@ struct StagingBuffer { static const int _max_buffers = 4; - StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) ; + StagingBuffer(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; ~StagingBuffer(); - void CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHost (void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHost (int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); private: - hsa_agent_t _hsa_agent; - hsa_agent_t _cpu_agent; + hsa_agent_t _hsaAgent; + hsa_agent_t _cpuAgent; size_t _bufferSize; // Size of the buffers. int _numBuffers; @@ -63,6 +63,9 @@ private: hsa_signal_t _completion_signal[_max_buffers]; hsa_signal_t _completion_signal2[_max_buffers]; // P2P needs another set of signals. std::mutex _copy_lock; // provide thread-safe access + int _hipH2DTransferThresholdDirectOrStaging; + int _hipH2DTransferThresholdStagingOrPininplace; + int _hipD2HTransferThreshold; }; #endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 6cfc73d037..82375c7754 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -567,8 +567,8 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato initProperties(&_props); - _stagingBuffer[0] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); - _stagingBuffer[1] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS); + _stagingBuffer[0] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); + _stagingBuffer[1] = new StagingBuffer(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost); } @@ -1547,28 +1547,17 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); if(HIP_OPTIMAL_MEM_TRANSFER) { - if((device->_isLargeBar)&&(sizeBytes < HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING)){ - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); - } - else{ - if(sizeBytes > HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE){ - //if (HIP_PININPLACE) { - device->_stagingBuffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - device->_stagingBuffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - // The copy waits for inputs and then completes before returning so can reset queue to empty: - this->wait(crit, true); - } + device->_stagingBuffer[0]->CopyHostToDevice(1,device->_isLargeBar,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { if (HIP_PININPLACE) { device->_stagingBuffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } else { - device->_stagingBuffer[0]->CopyHostToDevice(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_stagingBuffer[0]->CopyHostToDevice(0,0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } } + // The copy waits for inputs and then completes before returning so can reset queue to empty: + this->wait(crit, true); } else { // TODO - remove, slow path. @@ -1608,15 +1597,12 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const //printf ("staged-copy- read dep signals\n"); if(HIP_OPTIMAL_MEM_TRANSFER) { - if(sizeBytes> HIP_D2H_MEM_TRANSFER_THRESHOLD){ - device->_stagingBuffer[1]->CopyDeviceToHostPinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - }else { - //printf ("staged-copy- read dep signals\n"); - device->_stagingBuffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - }else + //printf ("staged-copy- read dep signals\n"); + device->_stagingBuffer[1]->CopyDeviceToHost(1,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } + else { - device->_stagingBuffer[1]->CopyDeviceToHost(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_stagingBuffer[1]->CopyDeviceToHost(0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); } // The copy completes before returning so can reset queue to empty: this->wait(crit, true); diff --git a/src/staging_buffer.cpp b/src/staging_buffer.cpp index 69f22e38b0..9ce5722559 100644 --- a/src/staging_buffer.cpp +++ b/src/staging_buffer.cpp @@ -32,16 +32,16 @@ THE SOFTWARE. #define tprintf(trace_level, ...) #endif -void error_check1(hsa_status_t hsa_error_code, int line_num, std::string str) { +void errorCheck(hsa_status_t hsa_error_code, int line_num, std::string str) { if ((hsa_error_code != HSA_STATUS_SUCCESS)&& (hsa_error_code != HSA_STATUS_INFO_BREAK)) { printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num); } } -#define ErrorCheck(x) error_check1(x, __LINE__, __FILE__) +#define ErrorCheck(x) errorCheck(x, __LINE__, __FILE__) hsa_amd_memory_pool_t sys_pool_; -hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) { +hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) { if (NULL == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -62,13 +62,16 @@ hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data) { } //------------------------------------------------------------------------------------------------- -StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers) : - _hsa_agent(hsaAgent), - _cpu_agent(cpuAgent), +StagingBuffer::StagingBuffer(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) : + _hsaAgent(hsaAgent), + _cpuAgent(cpuAgent), _bufferSize(bufferSize), - _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers) + _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers), + _hipH2DTransferThresholdDirectOrStaging(thresholdH2DDirectStaging), + _hipH2DTransferThresholdStagingOrPininplace(thresholdH2DStagingPinInPlace), + _hipD2HTransferThreshold(thresholdD2H) { - hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpu_agent, FindGlobalPool, &sys_pool_); + hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpuAgent, findGlobalPool, &sys_pool_); ErrorCheck(err); for (int i=0; i<_numBuffers; i++) { // TODO - experiment with alignment here. @@ -107,7 +110,7 @@ StagingBuffer::~StagingBuffer() //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsa_agent) +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { @@ -131,8 +134,8 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO void *locked_srcp; - //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsa_agent, 1, &locked_srcp); - hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_hsa_agent, 1, &locked_srcp); + //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsaAgent, 1, &locked_srcp); + hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_hsaAgent, 1, &locked_srcp); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); //printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); @@ -142,7 +145,7 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, locked_srcp, _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -161,56 +164,66 @@ void StagingBuffer::CopyHostToDevicePinInPlace(void* dst, const void* src, size_ //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsa_agent) +//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyHostToDevice(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void StagingBuffer::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); - - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){ + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); + } + else if((tempIndex==1) && (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace)){ + CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); } + else + { + std::lock_guard l (_copy_lock); - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - int bufferIndex = 0; - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { + const char *srcp = static_cast (src); + char *dstp = static_cast (dst); - size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); - // TODO - use uncached memcpy, someday. - memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - - - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsa_agent, _pinnedStagingBuffer[bufferIndex], _cpu_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR ((hipErrorRuntimeMemory)); + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); } - srcp += theseBytes; - dstp += theseBytes; - if (++bufferIndex >= _numBuffers) { - bufferIndex = 0; + if (sizeBytes >= UINT64_MAX/2) { + THROW_ERROR (hipErrorInvalidValue); + } + int bufferIndex = 0; + for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { + + size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; + + tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + + tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); + // TODO - use uncached memcpy, someday. + memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); + + + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); + if (hsa_status != HSA_STATUS_SUCCESS) { + THROW_ERROR ((hipErrorRuntimeMemory)); + } + + srcp += theseBytes; + dstp += theseBytes; + if (++bufferIndex >= _numBuffers) { + bufferIndex = 0; + } + + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } + for (int i=0; i<_numBuffers; i++) { + hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + } + } } @@ -232,7 +245,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_ size_t theseBytes= sizeBytes; void *locked_destp; - hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (dstp), theseBytes, &_hsa_agent, 1, &locked_destp); + hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (dstp), theseBytes, &_hsaAgent, 1, &locked_destp); if (hsa_status != HSA_STATUS_SUCCESS) { @@ -241,7 +254,7 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_ hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpu_agent , srcp, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); @@ -256,67 +269,74 @@ void StagingBuffer::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_ //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsa_agent). +//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void StagingBuffer::CopyDeviceToHost(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void StagingBuffer::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); - - const char *srcp0 = static_cast (src); - char *dstp1 = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){ + CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); } + else + { + std::lock_guard l (_copy_lock); - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } + const char *srcp0 = static_cast (src); + char *dstp1 = static_cast (dst); - int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. - int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest + for (int i=0; i<_numBuffers; i++) { + hsa_signal_store_relaxed(_completion_signal[i], 0); + } - while (bytesRemaining1 > 0) { - // First launch the async copies to copy from dest to host - for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + if (sizeBytes >= UINT64_MAX/2) { + THROW_ERROR (hipErrorInvalidValue); + } - size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; + int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. + int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest - tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, _hsa_agent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); + while (bytesRemaining1 > 0) + { + // First launch the async copies to copy from dest to host + for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { + + size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; + + tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); + hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + if (hsa_status != HSA_STATUS_SUCCESS) { + THROW_ERROR (hipErrorRuntimeMemory); + } + + srcp0 += theseBytes; + + + // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 + waitFor = NULL; } - srcp0 += theseBytes; + // Now unload the staging buffers: + for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } + tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); + hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - // Now unload the staging buffers: - for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { + tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); + memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); - size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; - - tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); - - dstp1 += theseBytes; - } + dstp1 += theseBytes; + } + } } } //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsa_agent). +//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) @@ -349,7 +369,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpu_agent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -377,7 +397,7 @@ void StagingBuffer::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpu_agent /*not used*/, theseBytes, + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes, hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], _completion_signal2[bufferIndex]);