diff --git a/rocclr/device/rocm/rocblit.cpp b/rocclr/device/rocm/rocblit.cpp index f8d2d22a96..3a7285c2ad 100755 --- a/rocclr/device/rocm/rocblit.cpp +++ b/rocclr/device/rocm/rocblit.cpp @@ -35,11 +35,8 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup) context_(nullptr) {} inline void DmaBlitManager::synchronize() const { - // todo TS tracking isn't implemented - gpu().releaseGpuMemoryFence(); - if (syncOperation_) { - // gpu().waitAllEngines(); + gpu().releaseGpuMemoryFence(); gpu().releasePinnedMem(); } } @@ -65,6 +62,9 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory& bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableReadBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { @@ -149,6 +149,9 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, const amd::BufferRect& bufRect, const amd::BufferRect& hostRect, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableReadBufferRect_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) { @@ -184,6 +187,9 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost, bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + 
gpu().releaseGpuMemoryFence(); + if (setup_.disableReadImage_) { return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch, entire); @@ -213,6 +219,9 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { @@ -300,6 +309,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem const amd::BufferRect& hostRect, const amd::BufferRect& bufRect, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + // Use host copy if memory has direct access if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() || gpuMem(dstMemory).IsPersistentDirectMap()) { @@ -335,6 +347,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, const amd::Coord3D& origin, const amd::Coord3D& size, size_t rowPitch, size_t slicePitch, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + if (setup_.disableWriteImage_) { return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch, entire); @@ -350,6 +365,9 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory, bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory, const amd::Coord3D& srcOrigin, const amd::Coord3D& 
dstOrigin, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyBuffer_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() && (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) { @@ -364,6 +382,9 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory, const amd::BufferRect& srcRect, const amd::BufferRect& dstRect, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + if (setup_.disableCopyBufferRect_ || (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() && dstMemory.isHostMemDirectAccess())) { @@ -471,6 +492,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + bool result = false; if (setup_.disableCopyImageToBuffer_) { @@ -512,6 +536,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire, size_t rowPitch, size_t slicePitch) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + bool result = false; if (setup_.disableCopyBufferToImage_) { @@ -550,6 +577,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory, const 
amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin, const amd::Coord3D& size, bool entire) const { + // HSA copy functionality with a possible async operation, hence make sure GPU is done + gpu().releaseGpuMemoryFence(); + bool result = false; if (setup_.disableCopyImage_) { @@ -1626,6 +1656,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost, if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) { if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) { + // CPU read ahead, hence release GPU memory + gpu().releaseGpuMemoryFence(); void* src = srcMemory.owner()->getSvmPtr(); std::memcpy(dstHost, src, size[0]); // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush @@ -1729,11 +1761,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) { if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) { + // CPU write ahead, hence release GPU memory + gpu().releaseGpuMemoryFence(); void* dst = dstMemory.owner()->getSvmPtr(); std::memcpy(dst, srcHost, size[0]); - // Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache + // Set HASPENDINGDISPATCH_ FLAG. 
Then releaseGpuMemoryFence() will use barrier to invalidate cache gpu().hasPendingDispatch(); - synchronize(); + gpu().releaseGpuMemoryFence(); return true; } } @@ -1972,6 +2006,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr); releaseArguments(parameters); } else { + // Otherwise defer to the DmaBlitManager copy path result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire); } diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 62173325c7..7a7e3efc1b 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -922,9 +922,6 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); size_t offset = 0; @@ -1030,9 +1027,6 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); size_t offset = 0; @@ -1229,9 +1223,6 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); cl_command_type type = cmd.type(); @@ -1249,9 +1240,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); // no op for FGS supported device if 
(!dev().isFineGrainedSystem(true)) { @@ -1286,6 +1274,9 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) { if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space dev().forceFineGrain(srcMem) || dev().forceFineGrain(dstMem)) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); + // If these are from different contexts, then one of them could be in the device memory // This is fine, since spec doesn't allow for copies with pointers from different contexts amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize()); @@ -1328,9 +1319,6 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); Memory* srcDevMem = static_cast( @@ -1424,9 +1412,6 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); // no op for FGS supported device @@ -1447,6 +1432,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) { LogError("submitSVMMapMemory() - copy failed"); cmd.setStatus(CL_MAP_FAILURE); } + // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); const void* mappedPtr = hsaMapMemory->owner()->getHostMem(); amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]); @@ -1463,9 +1449,6 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); // no op for FGS supported device @@ -1476,6 +1459,8 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) { if 
(memory->mapMemory() != nullptr) { if (writeMapInfo->isUnmapWrite()) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); amd::Coord3D srcOrigin(0, 0, 0); Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory()); @@ -1503,9 +1488,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); //! @todo add multi-devices synchronization when supported. @@ -1563,8 +1545,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) { result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size, cmd.isEntireMemory()); void* svmPtr = devMemory->owner()->getSvmPtr(); - if ((svmPtr != nullptr) && - (hostPtr != svmPtr)) { + if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { + // Wait on a kernel if one is outstanding releaseGpuMemoryFence(); amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]); } @@ -1608,8 +1590,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { LogError("Unmap without map call"); return; } - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); + profilingBegin(cmd); // Force buffer write for IMAGE1D_BUFFER @@ -1663,8 +1644,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) { const void* svmPtr = devMemory->owner()->getSvmPtr(); void* hostPtr = mapMemory->getHostMem(); - if ((svmPtr != nullptr) && - (hostPtr != svmPtr)) { + if ((svmPtr != nullptr) && (hostPtr != svmPtr)) { + // Wait on a kernel if one is outstanding + releaseGpuMemoryFence(); amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]); } result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_, @@ -1751,9 +1733,6 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); 
- // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(cmd); if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(), @@ -1767,9 +1746,6 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // in-order semantics: previous commands need to be done before we start - releaseGpuMemoryFence(); - profilingBegin(cmd); amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst()); @@ -1811,9 +1787,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) { // Make sure VirtualGPU has an exclusive access to the resources amd::ScopedLock lock(execution()); - // Wait on a kernel if one is outstanding - releaseGpuMemoryFence(); - profilingBegin(vcmd); for (auto itr : vcmd.memObjects()) {