diff --git a/projects/clr/hipamd/include/hcc_detail/unpinned_copy_engine.h b/projects/clr/hipamd/include/hcc_detail/unpinned_copy_engine.h index f50ff54b55..653beb89ee 100644 --- a/projects/clr/hipamd/include/hcc_detail/unpinned_copy_engine.h +++ b/projects/clr/hipamd/include/hcc_detail/unpinned_copy_engine.h @@ -46,15 +46,23 @@ struct UnpinnedCopyEngine { UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; ~UnpinnedCopyEngine(); - /* Use hueristic to choose best copy algorithm */ + // Use heuristic to choose best copy algorithm + void CopyHostToDevice(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHost(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyHostToDeviceBest(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + // Specific H2D copy algorithm implementations: void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + // Specific D2H copy algorithm implementations: + void CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + // P2P Copy implementation: void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); diff --git a/projects/clr/hipamd/src/hip_hcc.cpp 
b/projects/clr/hipamd/src/hip_hcc.cpp index 99fe80753d..78285cefa0 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1272,7 +1272,7 @@ void ihipInit() READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL"); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy."); - READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development."); + READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy."); READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); @@ -1738,11 +1738,14 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if(!srcTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::ChooseBest; - if (HIP_PININPLACE) { + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; + + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { copyMode = UnpinnedCopyEngine::UsePinInPlace; - } - device->_stagingBuffer[0]->CopyHostToDeviceBest(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? 
&depSignal : NULL); + } + device->_stagingBuffer[0]->CopyHostToDevice(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); // The copy waits for inputs and then completes before returning so can reset queue to empty: this->wait(crit, true); } @@ -1781,16 +1784,16 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (!dstTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - //printf ("staged-copy- read dep signals\n"); - if(HIP_OPTIMAL_MEM_TRANSFER) - { - //printf ("staged-copy- read dep signals\n"); - device->_stagingBuffer[1]->CopyDeviceToHost(1,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - else - { - device->_stagingBuffer[1]->CopyDeviceToHost(0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; + + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; } + + device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + // The copy completes before returning so can reset queue to empty: this->wait(crit, true); diff --git a/projects/clr/hipamd/src/unpinned_copy_engine.cpp b/projects/clr/hipamd/src/unpinned_copy_engine.cpp index 820dda1a02..f446220e7a 100644 --- a/projects/clr/hipamd/src/unpinned_copy_engine.cpp +++ b/projects/clr/hipamd/src/unpinned_copy_engine.cpp @@ -159,7 +159,20 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, } -void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +// Copy using simple memcpy. Only works on large-bar systems. 
+void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (!isLargeBar) { + THROW_ERROR (hipErrorInvalidValue); + } + + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); +} + + + +void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { if (copyMode == ChooseBest) { if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { @@ -173,12 +186,8 @@ void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyM if (copyMode == UseMemcpy) { - if (!isLargeBar) { - THROW_ERROR (hipErrorInvalidValue); - } - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); + CopyHostToDeviceMemcpy(isLargeBar, dst, src, sizeBytes, waitFor); } else if (copyMode == UsePinInPlace) { CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); @@ -291,17 +300,40 @@ void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, waitFor = NULL; } + +//--- +// Copies sizeBytes from src to dst using the device-to-host algorithm selected by copyMode. +// ChooseBest resolves to UsePinInPlace when sizeBytes > _hipD2HTransferThreshold, otherwise to UseStaging. +// The dispatch is one if/else-if chain so a completed pin-in-place copy cannot fall through to the error branch. +// Any mode other than UsePinInPlace or UseStaging raises hipErrorInvalidValue via THROW_ERROR. +void UnpinnedCopyEngine::CopyDeviceToHost(CopyMode copyMode ,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (copyMode == ChooseBest) { + if (sizeBytes > _hipD2HTransferThreshold) { + copyMode = UsePinInPlace; + } else { + copyMode = UseStaging; + } + } + + + if (copyMode == UsePinInPlace) { + CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); + } else if (copyMode == UseStaging) { + CopyDeviceToHostStaging(dst, src, sizeBytes, waitFor); + } else { + // Unknown copy mode. + THROW_ERROR(hipErrorInvalidValue); + } +} + //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. 
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){ - CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); - } - else { std::lock_guard l (_copyLock);