Optimize synch operations
- Stall the queue only for HSA copy operations
Change-Id: Ia3debcc0f36284c5f8cd2776d31674f3aeed04ea
Этот коммит содержится в:
@@ -35,11 +35,8 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
|
||||
context_(nullptr) {}
|
||||
|
||||
inline void DmaBlitManager::synchronize() const {
|
||||
// todo TS tracking isn't implemented
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (syncOperation_) {
|
||||
// gpu().waitAllEngines();
|
||||
gpu().releaseGpuMemoryFence();
|
||||
gpu().releasePinnedMem();
|
||||
}
|
||||
}
|
||||
@@ -65,6 +62,9 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
|
||||
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
@@ -149,6 +149,9 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableReadBufferRect_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
|
||||
@@ -184,6 +187,9 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
|
||||
bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
|
||||
const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableReadImage_) {
|
||||
return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch,
|
||||
entire);
|
||||
@@ -213,6 +219,9 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
|
||||
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
@@ -300,6 +309,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
const amd::BufferRect& hostRect,
|
||||
const amd::BufferRect& bufRect, const amd::Coord3D& size,
|
||||
bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
// Use host copy if memory has direct access
|
||||
if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
|
||||
gpuMem(dstMemory).IsPersistentDirectMap()) {
|
||||
@@ -335,6 +347,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
|
||||
bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
const amd::Coord3D& origin, const amd::Coord3D& size,
|
||||
size_t rowPitch, size_t slicePitch, bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableWriteImage_) {
|
||||
return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
|
||||
entire);
|
||||
@@ -350,6 +365,9 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
|
||||
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBuffer_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
|
||||
(dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
|
||||
@@ -364,6 +382,9 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
|
||||
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
if (setup_.disableCopyBufferRect_ ||
|
||||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
|
||||
dstMemory.isHostMemDirectAccess())) {
|
||||
@@ -471,6 +492,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
bool result = false;
|
||||
|
||||
if (setup_.disableCopyImageToBuffer_) {
|
||||
@@ -512,6 +536,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire, size_t rowPitch,
|
||||
size_t slicePitch) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
bool result = false;
|
||||
|
||||
if (setup_.disableCopyBufferToImage_) {
|
||||
@@ -550,6 +577,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
|
||||
bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
|
||||
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
|
||||
const amd::Coord3D& size, bool entire) const {
|
||||
// HSA copy functionality with a possible async operation, hence make sure GPU is done
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
bool result = false;
|
||||
|
||||
if (setup_.disableCopyImage_) {
|
||||
@@ -1626,6 +1656,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
|
||||
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
|
||||
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
// CPU read ahead, hence release GPU memory
|
||||
gpu().releaseGpuMemoryFence();
|
||||
void* src = srcMemory.owner()->getSvmPtr();
|
||||
std::memcpy(dstHost, src, size[0]);
|
||||
// Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
|
||||
@@ -1729,11 +1761,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
|
||||
if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
|
||||
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
|
||||
// CPU write ahead, hence release GPU memory
|
||||
gpu().releaseGpuMemoryFence();
|
||||
void* dst = dstMemory.owner()->getSvmPtr();
|
||||
std::memcpy(dst, srcHost, size[0]);
|
||||
// Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
|
||||
// Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
|
||||
gpu().hasPendingDispatch();
|
||||
synchronize();
|
||||
gpu().releaseGpuMemoryFence();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -1972,6 +2006,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
|
||||
releaseArguments(parameters);
|
||||
} else {
|
||||
//printf("rocm!\n");
|
||||
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
|
||||
}
|
||||
|
||||
|
||||
@@ -922,9 +922,6 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
size_t offset = 0;
|
||||
@@ -1030,9 +1027,6 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
size_t offset = 0;
|
||||
@@ -1229,9 +1223,6 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
cl_command_type type = cmd.type();
|
||||
@@ -1249,9 +1240,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// in-order semantics: previous commands need to be done before we start
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
// no op for FGS supported device
|
||||
if (!dev().isFineGrainedSystem(true)) {
|
||||
@@ -1286,6 +1274,9 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
|
||||
if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space
|
||||
dev().forceFineGrain(srcMem) ||
|
||||
dev().forceFineGrain(dstMem)) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
// If these are from different contexts, then one of them could be in the device memory
|
||||
// This is fine, since spec doesn't allow for copies with pointers from different contexts
|
||||
amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
|
||||
@@ -1328,9 +1319,6 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
Memory* srcDevMem = static_cast<roc::Memory*>(
|
||||
@@ -1424,9 +1412,6 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
// no op for FGS supported device
|
||||
@@ -1447,6 +1432,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
|
||||
LogError("submitSVMMapMemory() - copy failed");
|
||||
cmd.setStatus(CL_MAP_FAILURE);
|
||||
}
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
|
||||
amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
|
||||
@@ -1463,9 +1449,6 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
// no op for FGS supported device
|
||||
@@ -1476,6 +1459,8 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
|
||||
|
||||
if (memory->mapMemory() != nullptr) {
|
||||
if (writeMapInfo->isUnmapWrite()) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
amd::Coord3D srcOrigin(0, 0, 0);
|
||||
Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());
|
||||
|
||||
@@ -1503,9 +1488,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
@@ -1563,8 +1545,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
|
||||
result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
|
||||
cmd.isEntireMemory());
|
||||
void* svmPtr = devMemory->owner()->getSvmPtr();
|
||||
if ((svmPtr != nullptr) &&
|
||||
(hostPtr != svmPtr)) {
|
||||
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
|
||||
}
|
||||
@@ -1608,8 +1590,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
|
||||
LogError("Unmap without map call");
|
||||
return;
|
||||
}
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
// Force buffer write for IMAGE1D_BUFFER
|
||||
@@ -1663,8 +1644,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
|
||||
|
||||
const void* svmPtr = devMemory->owner()->getSvmPtr();
|
||||
void* hostPtr = mapMemory->getHostMem();
|
||||
if ((svmPtr != nullptr) &&
|
||||
(hostPtr != svmPtr)) {
|
||||
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
|
||||
}
|
||||
result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
|
||||
@@ -1751,9 +1733,6 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(),
|
||||
@@ -1767,9 +1746,6 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// in-order semantics: previous commands need to be done before we start
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(cmd);
|
||||
|
||||
amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
|
||||
@@ -1811,9 +1787,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
|
||||
// Make sure VirtualGPU has an exclusive access to the resources
|
||||
amd::ScopedLock lock(execution());
|
||||
|
||||
// Wait on a kernel if one is outstanding
|
||||
releaseGpuMemoryFence();
|
||||
|
||||
profilingBegin(vcmd);
|
||||
|
||||
for (auto itr : vcmd.memObjects()) {
|
||||
|
||||
Ссылка в новой задаче
Block a user