diff --git a/projects/hip/include/hcc_detail/unpinned_copy_engine.h b/projects/hip/include/hcc_detail/unpinned_copy_engine.h index 2dd7e15d28..f50ff54b55 100644 --- a/projects/hip/include/hcc_detail/unpinned_copy_engine.h +++ b/projects/hip/include/hcc_detail/unpinned_copy_engine.h @@ -39,15 +39,20 @@ THE SOFTWARE. // Staging buffer provides thread-safe access via a mutex. struct UnpinnedCopyEngine { + enum CopyMode {ChooseBest, UsePinInPlace, UseStaging, UseMemcpy} ; + static const int _max_buffers = 4; UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; ~UnpinnedCopyEngine(); - void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + /* Use hueristic to choose best copy algorithm */ + + void CopyHostToDeviceBest(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHost (int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); @@ -60,12 +65,12 @@ private: int _numBuffers; char *_pinnedStagingBuffer[_max_buffers]; - hsa_signal_t _completion_signal[_max_buffers]; - hsa_signal_t _completion_signal2[_max_buffers]; // P2P needs another set of signals. - std::mutex _copy_lock; // provide thread-safe access - int _hipH2DTransferThresholdDirectOrStaging; - int _hipH2DTransferThresholdStagingOrPininplace; - int _hipD2HTransferThreshold; + hsa_signal_t _completionSignal[_max_buffers]; + hsa_signal_t _completionSignal2[_max_buffers]; // P2P needs another set of signals. + std::mutex _copyLock; // provide thread-safe access + size_t _hipH2DTransferThresholdDirectOrStaging; + size_t _hipH2DTransferThresholdStagingOrPininplace; + size_t _hipD2HTransferThreshold; }; #endif diff --git a/projects/hip/src/hip_hcc.cpp b/projects/hip/src/hip_hcc.cpp index f14b4612b8..99fe80753d 100644 --- a/projects/hip/src/hip_hcc.cpp +++ b/projects/hip/src/hip_hcc.cpp @@ -1436,7 +1436,6 @@ void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, c } } -// TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr) @@ -1733,22 +1732,17 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const #endif } + if (kind == hipMemcpyHostToDevice) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D); if(!srcTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - if(HIP_OPTIMAL_MEM_TRANSFER) - { - device->_stagingBuffer[0]->CopyHostToDevice(1,device->_isLargeBar,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - else { - if (HIP_PININPLACE) { - device->_stagingBuffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - device->_stagingBuffer[0]->CopyHostToDevice(0,0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - } + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::ChooseBest; + if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; + } + device->_stagingBuffer[0]->CopyHostToDeviceBest(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); // The copy waits for inputs and then completes before returning so can reset queue to empty: this->wait(crit, true); } diff --git a/projects/hip/src/unpinned_copy_engine.cpp b/projects/hip/src/unpinned_copy_engine.cpp index 5501c66f9d..820dda1a02 100644 --- a/projects/hip/src/unpinned_copy_engine.cpp +++ b/projects/hip/src/unpinned_copy_engine.cpp @@ -85,8 +85,8 @@ UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgen err = hsa_amd_agents_allow_access(1, &hsaAgent, NULL, _pinnedStagingBuffer[i]); ErrorCheck(err); - hsa_signal_create(0, 0, NULL, &_completion_signal[i]); - hsa_signal_create(0, 0, NULL, &_completion_signal2[i]); + hsa_signal_create(0, 0, NULL, &_completionSignal[i]); + hsa_signal_create(0, 0, NULL, &_completionSignal2[i]); } }; @@ -100,8 +100,8 @@ UnpinnedCopyEngine::~UnpinnedCopyEngine() hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } - hsa_signal_destroy(_completion_signal[i]); - hsa_signal_destroy(_completion_signal2[i]); + hsa_signal_destroy(_completionSignal[i]); + hsa_signal_destroy(_completionSignal2[i]); } } @@ -114,13 +114,13 @@ UnpinnedCopyEngine::~UnpinnedCopyEngine() //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -129,8 +129,8 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, int bufferIndex = 0; size_t theseBytes= sizeBytes; - //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - //hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + //hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO void *locked_srcp; @@ -143,22 +143,54 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, THROW_ERROR (hipErrorRuntimeMemory); } - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast (srcp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; } +void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (copyMode == ChooseBest) { + if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { + copyMode = UseMemcpy; + } else if (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace) { + copyMode = UsePinInPlace; + } else { + copyMode = UseStaging; + } + } + + if (copyMode == UseMemcpy) { + + if (!isLargeBar) { + THROW_ERROR (hipErrorInvalidValue); + } + + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); + + } else if (copyMode == UsePinInPlace) { + CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); + + } else if (copyMode == UseStaging) { + CopyHostToDeviceStaging(dst, src, sizeBytes, waitFor); + + } else { + // Unknown copy mode. + THROW_ERROR(hipErrorInvalidValue); + } +} //--- @@ -166,24 +198,16 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){ - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); - } - else if((tempIndex==1) && (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace)){ - CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); - } - else { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -194,16 +218,16 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR ((hipErrorRuntimeMemory)); @@ -221,7 +245,7 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } } @@ -229,13 +253,13 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -252,15 +276,15 @@ void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, THROW_ERROR (hipErrorRuntimeMemory); } - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } - tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast (dstp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 @@ -279,13 +303,13 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s } else { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -303,8 +327,8 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -322,7 +346,7 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); @@ -341,14 +365,14 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); - hsa_signal_store_relaxed(_completion_signal2[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); + hsa_signal_store_relaxed(_completionSignal2[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -365,11 +389,11 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; // Wait to make sure we are not overwriting a buffer before it has been drained: - hsa_signal_wait_acquire(_completion_signal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -392,14 +416,14 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v if (hostWait) { // Host-side wait, should not be necessary: - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal2[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes, - hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], - _completion_signal2[bufferIndex]); + hostWait ? 0:1, hostWait ? NULL : &_completionSignal[bufferIndex], + _completionSignal2[bufferIndex]); dstp1 += theseBytes; } @@ -408,6 +432,6 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v // Wait for the staging-buffer to dest copies to complete: for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } }