Optimize synch operations

- Stall the queue only for HSA copy operations

Change-Id: Ia3debcc0f36284c5f8cd2776d31674f3aeed04ea
2020-04-30 10:03:23 -04:00
@@ -35,11 +35,8 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
      context_(nullptr) {}

 inline void DmaBlitManager::synchronize() const {
-  // todo TS tracking isn't implemented
-  gpu().releaseGpuMemoryFence();
-
  if (syncOperation_) {
-    //        gpu().waitAllEngines();
+    gpu().releaseGpuMemoryFence();
    gpu().releasePinnedMem();
  }
 }
@@ -65,6 +62,9 @@ bool DmaBlitManager::readMemoryStaged(Memory& srcMemory, void* dstHost, Memory&
 bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  // Use host copy if memory has direct access
  if (setup_.disableReadBuffer_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -149,6 +149,9 @@ bool DmaBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
                                    const amd::BufferRect& bufRect, const amd::BufferRect& hostRect,
                                    const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  // Use host copy if memory has direct access
  if (setup_.disableReadBufferRect_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached())) {
@@ -184,6 +187,9 @@ bool DmaBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
 bool DmaBlitManager::readImage(device::Memory& srcMemory, void* dstHost, const amd::Coord3D& origin,
                               const amd::Coord3D& size, size_t rowPitch, size_t slicePitch,
                               bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  if (setup_.disableReadImage_) {
    return HostBlitManager::readImage(srcMemory, dstHost, origin, size, rowPitch, slicePitch,
                                      entire);
@@ -213,6 +219,9 @@ bool DmaBlitManager::writeMemoryStaged(const void* srcHost, Memory& dstMemory, M
 bool DmaBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  // Use host copy if memory has direct access
  if (setup_.disableWriteBuffer_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -300,6 +309,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
                                     const amd::BufferRect& hostRect,
                                     const amd::BufferRect& bufRect, const amd::Coord3D& size,
                                     bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  // Use host copy if memory has direct access
  if (setup_.disableWriteBufferRect_ || dstMemory.isHostMemDirectAccess() ||
      gpuMem(dstMemory).IsPersistentDirectMap()) {
@@ -335,6 +347,9 @@ bool DmaBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMem
 bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                size_t rowPitch, size_t slicePitch, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  if (setup_.disableWriteImage_) {
    return HostBlitManager::writeImage(srcHost, dstMemory, origin, size, rowPitch, slicePitch,
                                       entire);
@@ -350,6 +365,9 @@ bool DmaBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
 bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  if (setup_.disableCopyBuffer_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
      (dev().agent_profile() != HSA_PROFILE_FULL) && dstMemory.isHostMemDirectAccess())) {
@@ -364,6 +382,9 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
 bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
                                    const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                                    const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  if (setup_.disableCopyBufferRect_ ||
      (srcMemory.isHostMemDirectAccess() && !srcMemory.isCpuUncached() &&
       dstMemory.isHostMemDirectAccess())) {
@@ -471,6 +492,9 @@ bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory
                                       const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                       const amd::Coord3D& size, bool entire, size_t rowPitch,
                                       size_t slicePitch) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  bool result = false;

  if (setup_.disableCopyImageToBuffer_) {
@@ -512,6 +536,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
                                       const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                       const amd::Coord3D& size, bool entire, size_t rowPitch,
                                       size_t slicePitch) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  bool result = false;

  if (setup_.disableCopyBufferToImage_) {
@@ -550,6 +577,9 @@ bool DmaBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory
 bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                               const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                               const amd::Coord3D& size, bool entire) const {
+  // HSA copy functionality with a possible async operation, hence make sure GPU is done
+  gpu().releaseGpuMemoryFence();
+
  bool result = false;

  if (setup_.disableCopyImage_) {
@@ -1626,6 +1656,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,

  if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
    if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
+      // CPU read ahead, hence release GPU memory
+      gpu().releaseGpuMemoryFence();
      void* src = srcMemory.owner()->getSvmPtr();
      std::memcpy(dstHost, src, size[0]);
      // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
@@ -1729,11 +1761,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo

  if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
    if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
+      // CPU write ahead, hence release GPU memory
+      gpu().releaseGpuMemoryFence();
      void* dst = dstMemory.owner()->getSvmPtr();
      std::memcpy(dst, srcHost, size[0]);
-      // Set HASPENDINGDISPATCH_ FLAG. Then synchronize() will use barrier to invalidate cache
+      // Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
      gpu().hasPendingDispatch();
-      synchronize();
+      gpu().releaseGpuMemoryFence();
      return true;
    }
  }
@@ -1972,6 +2006,7 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
    result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters, nullptr);
    releaseArguments(parameters);
  } else {
+    //printf("rocm!\n");
    result = DmaBlitManager::copyBuffer(srcMemory, dstMemory, srcOrigin, dstOrigin, sizeIn, entire);
  }

@@ -922,9 +922,6 @@ void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  size_t offset = 0;
@@ -1030,9 +1027,6 @@ void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  size_t offset = 0;
@@ -1229,9 +1223,6 @@ void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  cl_command_type type = cmd.type();
@@ -1249,9 +1240,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // in-order semantics: previous commands need to be done before we start
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);
  // no op for FGS supported device
  if (!dev().isFineGrainedSystem(true)) {
@@ -1286,6 +1274,9 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
    if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space
        dev().forceFineGrain(srcMem) ||
        dev().forceFineGrain(dstMem)) {
+      // Wait on a kernel if one is outstanding
+      releaseGpuMemoryFence();
+
      // If these are from different contexts, then one of them could be in the device memory
      // This is fine, since spec doesn't allow for copies with pointers from different contexts
      amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
@@ -1328,9 +1319,6 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  Memory* srcDevMem = static_cast<roc::Memory*>(
@@ -1424,9 +1412,6 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  // no op for FGS supported device
@@ -1447,6 +1432,7 @@ void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
          LogError("submitSVMMapMemory() - copy failed");
          cmd.setStatus(CL_MAP_FAILURE);
        }
+        // Wait on a kernel if one is outstanding
        releaseGpuMemoryFence();
        const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
        amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);
@@ -1463,9 +1449,6 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  // no op for FGS supported device
@@ -1476,6 +1459,8 @@ void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {

    if (memory->mapMemory() != nullptr) {
      if (writeMapInfo->isUnmapWrite()) {
+        // Wait on a kernel if one is outstanding
+        releaseGpuMemoryFence();
        amd::Coord3D srcOrigin(0, 0, 0);
        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());

@@ -1503,9 +1488,6 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  //! @todo add multi-devices synchronization when supported.
@@ -1563,8 +1545,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
        result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
                                      cmd.isEntireMemory());
        void* svmPtr = devMemory->owner()->getSvmPtr();
-        if ((svmPtr != nullptr) &&
-            (hostPtr != svmPtr)) {
+        if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
+          // Wait on a kernel if one is outstanding
          releaseGpuMemoryFence();
          amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
        }
@@ -1608,8 +1590,7 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
    LogError("Unmap without map call");
    return;
  }
-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
+
  profilingBegin(cmd);

  // Force buffer write for IMAGE1D_BUFFER
@@ -1663,8 +1644,9 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {

          const void* svmPtr = devMemory->owner()->getSvmPtr();
          void* hostPtr = mapMemory->getHostMem();
-          if ((svmPtr != nullptr) &&
-              (hostPtr != svmPtr)) {
+          if ((svmPtr != nullptr) && (hostPtr != svmPtr)) {
+            // Wait on a kernel if one is outstanding
+            releaseGpuMemoryFence();
            amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
          }
          result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
@@ -1751,9 +1733,6 @@ void VirtualGPU::submitFillMemory(amd::FillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  if (!fillMemory(cmd.type(), &cmd.memory(), cmd.pattern(), cmd.patternSize(), cmd.origin(),
@@ -1767,9 +1746,6 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // in-order semantics: previous commands need to be done before we start
-  releaseGpuMemoryFence();
-
  profilingBegin(cmd);

  amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
@@ -1811,9 +1787,6 @@ void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());

-  // Wait on a kernel if one is outstanding
-  releaseGpuMemoryFence();
-
  profilingBegin(vcmd);

  for (auto itr : vcmd.memObjects()) {