Refactor Staging Buffer CopyDeviceToHost
Use copyMode. Embed algorithm selection inside the unpinned class.
Change-Id: Ic75fd5931717a3160904402794bbed3ccd445112
[ROCm/clr commit: 77c86934c1]
Esse commit está contido em:
@@ -46,15 +46,23 @@ struct UnpinnedCopyEngine {
|
||||
UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ;
|
||||
~UnpinnedCopyEngine();
|
||||
|
||||
/* Use heuristic to choose best copy algorithm */
|
||||
// Use heuristic to choose best copy algorithm
|
||||
void CopyHostToDevice(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
void CopyDeviceToHost(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
void CopyHostToDeviceBest(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
// Specific H2D copy algorithm implementations:
|
||||
void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
void CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
void CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
// Specific D2H copy algorithm implementations:
|
||||
void CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
|
||||
// P2P Copy implementation:
|
||||
void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor);
|
||||
|
||||
|
||||
|
||||
@@ -1272,7 +1272,7 @@ void ihipInit()
|
||||
READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL");
|
||||
READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" );
|
||||
READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy.");
|
||||
READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development.");
|
||||
READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy.");
|
||||
READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing.");
|
||||
READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing.");
|
||||
READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing.");
|
||||
@@ -1738,11 +1738,14 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
if(!srcTracked){
|
||||
if (HIP_STAGING_BUFFERS) {
|
||||
tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
|
||||
UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::ChooseBest;
|
||||
if (HIP_PININPLACE) {
|
||||
UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging;
|
||||
|
||||
if (HIP_OPTIMAL_MEM_TRANSFER) {
|
||||
copyMode = UnpinnedCopyEngine::ChooseBest;
|
||||
} else if (HIP_PININPLACE) {
|
||||
copyMode = UnpinnedCopyEngine::UsePinInPlace;
|
||||
}
|
||||
device->_stagingBuffer[0]->CopyHostToDeviceBest(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
|
||||
}
|
||||
device->_stagingBuffer[0]->CopyHostToDevice(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
|
||||
// The copy waits for inputs and then completes before returning so can reset queue to empty:
|
||||
this->wait(crit, true);
|
||||
}
|
||||
@@ -1781,16 +1784,16 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const
|
||||
if (!dstTracked){
|
||||
if (HIP_STAGING_BUFFERS) {
|
||||
tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes);
|
||||
//printf ("staged-copy- read dep signals\n");
|
||||
if(HIP_OPTIMAL_MEM_TRANSFER)
|
||||
{
|
||||
//printf ("staged-copy- read dep signals\n");
|
||||
device->_stagingBuffer[1]->CopyDeviceToHost(1,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
device->_stagingBuffer[1]->CopyDeviceToHost(0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
|
||||
UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging;
|
||||
|
||||
if (HIP_OPTIMAL_MEM_TRANSFER) {
|
||||
copyMode = UnpinnedCopyEngine::ChooseBest;
|
||||
} else if (HIP_PININPLACE) {
|
||||
copyMode = UnpinnedCopyEngine::UsePinInPlace;
|
||||
}
|
||||
|
||||
device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL);
|
||||
|
||||
// The copy completes before returning so can reset queue to empty:
|
||||
this->wait(crit, true);
|
||||
|
||||
|
||||
@@ -159,7 +159,20 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src,
|
||||
}
|
||||
|
||||
|
||||
void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
// Copy using simple memcpy. Only works on large-bar systems.
|
||||
void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
if (!isLargeBar) {
|
||||
THROW_ERROR (hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
memcpy(dst,src,sizeBytes);
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
};
|
||||
|
||||
|
||||
|
||||
void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
if (copyMode == ChooseBest) {
|
||||
if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) {
|
||||
@@ -173,12 +186,7 @@ void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyM
|
||||
|
||||
if (copyMode == UseMemcpy) {
|
||||
|
||||
if (!isLargeBar) {
|
||||
THROW_ERROR (hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
memcpy(dst,src,sizeBytes);
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
|
||||
} else if (copyMode == UsePinInPlace) {
|
||||
CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor);
|
||||
@@ -291,17 +299,35 @@ void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src,
|
||||
waitFor = NULL;
|
||||
}
|
||||
|
||||
|
||||
//---
// Device-to-host copy entry point: selects a concrete copy algorithm from copyMode and
// forwards the request to it.
// IN: copyMode  - ChooseBest picks the algorithm from sizeBytes: transfers larger than
//                 _hipD2HTransferThreshold use pin-in-place, all others use staging.
//                 UsePinInPlace / UseStaging force the corresponding algorithm.
//                 Any other mode is rejected with hipErrorInvalidValue (via THROW_ERROR).
// IN: dst       - destination pointer, forwarded unchanged to the selected algorithm.
// IN: src       - source pointer, forwarded unchanged to the selected algorithm.
// IN: sizeBytes - number of bytes to copy.
// IN: waitFor   - dependency signal, forwarded unchanged to the selected algorithm.
void UnpinnedCopyEngine::CopyDeviceToHost(CopyMode copyMode ,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
{
    if (copyMode == ChooseBest) {
        if (sizeBytes > _hipD2HTransferThreshold) {
            copyMode = UsePinInPlace;
        } else {
            copyMode = UseStaging;
        }
    }

    // The branches must form a single if / else-if / else chain. Without the "else" on
    // the staging branch, a completed pin-in-place copy would fall into the final
    // "unknown mode" branch and raise an error after the copy had already been done.
    if (copyMode == UsePinInPlace) {
        CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor);
    } else if (copyMode == UseStaging) {
        CopyDeviceToHostStaging(dst, src, sizeBytes, waitFor);
    } else {
        // Unknown copy mode.
        THROW_ERROR(hipErrorInvalidValue);
    }
}
|
||||
|
||||
//---
|
||||
//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy
|
||||
//IN: dst - dest pointer - must be accessible from host CPU.
|
||||
//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent).
|
||||
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency.
|
||||
void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor)
|
||||
{
|
||||
if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){
|
||||
CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::lock_guard<std::mutex> l (_copyLock);
|
||||
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário