- Stall the queue only for HSA copy operations

Change-Id: Ia3debcc0f36284c5f8cd2776d31674f3aeed04ea
Этот коммит содержится в:
German Andryeyev
2020-04-30 10:03:23 -04:00
родитель 6c5a42b33c
Коммит 7302ebcfbc
2 изменённых файлов: 53 добавлений и 45 удалений
+41 -6
Просмотреть файл
@@ -35,11 +35,8 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
context_(nullptr) {}
// Calls gpu().releaseGpuMemoryFence() unconditionally; when syncOperation_ is
// set, calls it a second time and then gpu().releasePinnedMem().
// NOTE(review): this text appears to be a flattened diff hunk, so the two
// releaseGpuMemoryFence() calls may be one removed and one added line rather
// than both being live -- confirm against the real file before relying on it.
inline void DmaBlitManager::synchronize() const {
// todo TS tracking isn't implemented
gpu().releaseGpuMemoryFence();
if (syncOperation_) {
// Disabled call, kept from the original:
// gpu().waitAllEngines();
gpu().releaseGpuMemoryFence();
gpu().releasePinnedMem();
}
}
@@ -65,6 +62,9 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -149,6 +149,9 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableReadBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -184,6 +187,9 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableReadImage_) {
return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch,
entire);
@@ -213,6 +219,9 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -300,6 +309,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
const amd::BufferRect& hostRect,
const amd::BufferRect& bufRect, const amd::Coord3D& size,
bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
// Use host copy if memory has direct access
if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -335,6 +347,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
const amd::Coord3D& origin, const amd::Coord3D& size,
size_t rowPitch, size_t slicePitch, bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableWriteImage_) {
return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
entire);
@@ -350,6 +365,9 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBuffer_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
(dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
@@ -364,6 +382,9 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
if (setup_.disableCopyBufferRect_ ||
(srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
dstMemory.isHostMemDirectAccess())) {
@@ -471,6 +492,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
bool result = false;
if (setup_.disableCopyImageToBuffer_) {
@@ -512,6 +536,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire, size_t rowPitch,
size_t slicePitch) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
bool result = false;
if (setup_.disableCopyBufferToImage_) {
@@ -550,6 +577,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
const amd::Coord3D& size, bool entire) const {
// HSA copy functionality with a possible async operation, hence make sure the GPU is done
gpu().releaseGpuMemoryFence();
bool result = false;
if (setup_.disableCopyImage_) {
@@ -1626,6 +1656,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
// CPU read ahead, hence release GPU memory
gpu().releaseGpuMemoryFence();
void* src = srcMemory.owner()->getSvmPtr();
std::memcpy(dstHost, src, size[0]);
// Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
@@ -1729,11 +1761,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
// CPU write ahead, hence release the GPU memory fence
gpu().releaseGpuMemoryFence();
void* dst = dstMemory.owner()->getSvmPtr();
std::memcpy(dst, srcHost, size[0]);
// Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
// Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
gpu().hasPendingDispatch();
synchronize();
gpu().releaseGpuMemoryFence();
return true;
}
}
@@ -1972,6 +2006,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
releaseArguments(parameters);
} else {
//printf("rocm!\n");
result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
}
+12 -39
Просмотреть файл
@@ -922,9 +922,6 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
size_t offset = 0;
@@ -1030,9 +1027,6 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
size_t offset = 0;
@@ -1229,9 +1223,6 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
cl_command_type type = cmd.type();
@@ -1249,9 +1240,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
// no op for FGS supported device
if (!dev().isFineGrainedSystem(true)) {
@@ -1286,6 +1274,9 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space
dev().forceFineGrain(srcMem) ||
dev().forceFineGrain(dstMem)) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
// If these are from different contexts, then one of them could be in the device memory
// This is fine, since spec doesn't allow for copies with pointers from different contexts
amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
@@ -1328,9 +1319,6 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
Memory* srcDevMem = static_cast<roc::Memory*>(
@@ -1424,9 +1412,6 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
// no op for FGS supported device
@@ -1447,6 +1432,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
LogError("submitSVMMapMemory() - copy failed");
cmd.setStatus(CL_MAP_FAILURE);
}
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
@@ -1463,9 +1449,6 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
// no op for FGS supported device
@@ -1476,6 +1459,8 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
if (memory->mapMemory() != nullptr) {
if (writeMapInfo->isUnmapWrite()) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
amd::Coord3D srcOrigin(0, 0, 0);
Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());
@@ -1503,9 +1488,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
//! @todo add multi-devices synchronization when supported.
@@ -1563,8 +1545,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
cmd.isEntireMemory());
void* svmPtr = devMemory->owner()->getSvmPtr();
if ((svmPtr != nullptr) &&
(hostPtr != svmPtr)) {
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
}
@@ -1608,8 +1590,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
LogError("Unmap without map call");
return;
}
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
// Force buffer write for IMAGE1D_BUFFER
@@ -1663,8 +1644,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
const void* svmPtr = devMemory->owner()->getSvmPtr();
void* hostPtr = mapMemory->getHostMem();
if ((svmPtr != nullptr) &&
(hostPtr != svmPtr)) {
if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
}
result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
@@ -1751,9 +1733,6 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(),
@@ -1767,9 +1746,6 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
@@ -1811,9 +1787,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
// Make sure VirtualGPU has an exclusive access to the resources
amd::ScopedLock lock(execution());
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(vcmd);
for (auto itr : vcmd.memObjects()) {