From 9c2c23a3a3344fcd025e45ca30bf85b23ca98193 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 16 Aug 2018 13:59:59 -0400
Subject: [PATCH] P4 to Git Change 1594574 by vsytchen@vsytchen-win10 on
 2018/08/16 13:43:33

	SWDEV-159881 - [OCL][ROCm] Add SVM coarse-grain buffer support with device memory (Part 2)

	1. Implement clEnqueueSVMMap/clEnqueueSVMUnmap using a staging buffer
	2. Enable device-memory coarse-grain SVM for OCL, but only for single-device contexts.

	ReviewBoardURL = http://ocltc.amd.com/reviews/r/15616/diff/

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#94 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmemory.cpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#296 edit


[ROCm/clr commit: 8dcc948d3784a96727256f76638b7d162a17c848]
---
 .../rocclr/runtime/device/rocm/rocdevice.cpp  |   4 +-
 .../rocclr/runtime/device/rocm/rocdevice.hpp  |   7 +-
 .../rocclr/runtime/device/rocm/rocmemory.cpp  |  27 ++--
 .../rocclr/runtime/device/rocm/rocvirtual.cpp | 132 ++++++++++++++----
 .../rocclr/runtime/device/rocm/rocvirtual.hpp |   6 +-
 projects/clr/rocclr/runtime/utils/flags.hpp   |   4 +-
 6 files changed, 136 insertions(+), 44 deletions(-)

diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
index 281fe5b4af..98df526728 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -1454,7 +1454,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
   }
 
   // Prepin sysmem buffer for possible data synchronization between CPU and GPU
-  if (!memory->isHostMemDirectAccess() && (owner.getHostMem() != nullptr)) {
+  if (!memory->isHostMemDirectAccess() &&
+      (owner.getHostMem() != nullptr) &&
+      (owner.getSvmPtr() == nullptr)) {
     memory->pinSystemMemory(owner.getHostMem(), owner.getSize());
   }
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
index 3d934fcb88..7a2c811a93 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -147,6 +147,11 @@ class NullDevice : public amd::Device {
     return;
   }
 
+  //! Returns true if SVM must be fine-grain (coarse-grain SVM disabled or multi-device context); a null memory never forces it
+  bool forceFineGrain(amd::Memory* memory) const {
+    return !settings().enableCoarseGrainSVM_ || ((memory != nullptr) && (memory->getContext().devices().size() > 1));
+  }
+
   //! Acquire external graphics API object in the host thread
   //! Needed for OpenGL objects on CPU device
 
@@ -285,7 +290,7 @@ class Device : public NullDevice {
                              device::Sampler** sampler   //!< device sampler object
                              ) const {
     //! \todo HSA team has to implement sampler allocation.
-    //! Currently allocate the base device class 
+    //! Currently allocate the base device class
     *sampler = new device::Sampler();
     if (*sampler == nullptr) {
       return false;
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
index 7f8096bc3a..45b57a4ada 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocmemory.cpp
@@ -105,11 +105,6 @@ void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& reg
   if (IsPersistentDirectMap()) {
     return (static_cast<char*>(persistent_host_ptr_) + origin[0]);
   }
-  // Otherwise, check for host memory.
-  void* hostMem = owner()->getHostMem();
-  if (hostMem != nullptr) {
-    return (static_cast<char*>(hostMem) + origin[0]);
-  }
 
   // Allocate one if needed.
   if (indirectMapCount_ == 1) {
@@ -124,7 +119,17 @@ void* Memory::allocMapTarget(const amd::Coord3D& origin, const amd::Coord3D& reg
       return nullptr;
     }
   }
-  return reinterpret_cast<address>(mapMemory_->getHostMem()) + origin[0];
+
+  void* mappedMemory = nullptr;
+
+  if (owner()->getSvmPtr() != nullptr) {
+    owner()->commitSvmMemory();
+    mappedMemory = owner()->getSvmPtr();
+  } else {
+    mappedMemory = reinterpret_cast<address>(mapMemory_->getHostMem()) + origin[0];
+  }
+
+  return mappedMemory;
 }
 
 void Memory::decIndMapCount() {
@@ -584,7 +589,7 @@ void Buffer::destroy() {
   cl_mem_flags memFlags = owner()->getMemFlags();
 
   if (owner()->getSvmPtr() != nullptr) {
-    if (!dev().settings().enableCoarseGrainSVM_) {
+    if (dev().forceFineGrain(owner())) {
       memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
     }
     const bool isFineGrain = memFlags & CL_MEM_SVM_FINE_GRAIN_BUFFER;
@@ -652,7 +657,7 @@ bool Buffer::create() {
   cl_mem_flags memFlags = owner()->getMemFlags();
 
   if (owner()->getSvmPtr() != nullptr) {
-    if (!dev().settings().enableCoarseGrainSVM_) {
+    if (dev().forceFineGrain(owner())) {
       memFlags |= CL_MEM_SVM_FINE_GRAIN_BUFFER;
       flags_ |= HostMemoryDirectAccess;
     }
@@ -669,6 +674,12 @@ bool Buffer::create() {
       deviceMemory_ = owner()->getSvmPtr();
     }
 
+    if (!isFineGrain &&
+        (owner()->parent() != nullptr) &&
+        (owner()->parent()->getSvmPtr() != nullptr)) {
+      owner()->parent()->commitSvmMemory();
+    }
+
     if (dev().settings().apuSystem_ || !isFineGrain) {
       const_cast<Device&>(dev()).updateFreeMemory(size(), false);
     }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index d35628a3f8..f57797333d 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1215,8 +1215,7 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
 
   profilingBegin(cmd);
   // no op for FGS supported device
-  if (!dev().isFineGrainedSystem() &&
-      dev().settings().enableCoarseGrainSVM_) {
+  if (!dev().isFineGrainedSystem()) {
     amd::Coord3D srcOrigin(0, 0, 0);
     amd::Coord3D dstOrigin(0, 0, 0);
     amd::Coord3D size(cmd.srcSize(), 1, 1);
@@ -1229,7 +1228,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
 
     device::Memory::SyncFlags syncFlags;
     if (nullptr != srcMem) {
-      srcMem->commitSvmMemory();
       srcOrigin.c[0] =
           static_cast<const_address>(cmd.src()) - static_cast<address>(srcMem->getSvmPtr());
       if (!(srcMem->validateRegion(srcOrigin, size))) {
@@ -1238,7 +1236,6 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
       }
     }
     if (nullptr != dstMem) {
-      dstMem->commitSvmMemory();
       dstOrigin.c[0] =
           static_cast<const_address>(cmd.dst()) - static_cast<address>(dstMem->getSvmPtr());
       if (!(dstMem->validateRegion(dstOrigin, size))) {
@@ -1247,7 +1244,11 @@ void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd) {
       }
     }
 
-    if (nullptr == srcMem && nullptr == dstMem) { // both not in svm space
+    if ((nullptr == srcMem && nullptr == dstMem) || // both not in svm space
+        dev().forceFineGrain(srcMem) ||
+        dev().forceFineGrain(dstMem)) {
+      // If these are from different contexts, then one of them could be in the device memory
+      // This is fine, since spec doesn't allow for copies with pointers from different contexts
       amd::Os::fastMemcpy(cmd.dst(), cmd.src(), cmd.srcSize());
       result = true;
     } else if (nullptr == srcMem && nullptr != dstMem) {  // src not in svm space
@@ -1367,18 +1368,75 @@ void VirtualGPU::submitCopyMemoryP2P(amd::CopyMemoryP2PCommand& cmd) {
 }
 
 void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) {
-  // No fence is needed since this is a no-op: the
-  // command will be completed only after all the
-  // previous commands are complete
+  // SVM map entry point: first wait on a kernel if one is outstanding
+  releaseGpuMemoryFence();
+
   profilingBegin(cmd);
+
+  // No-op on fine-grained-system devices and for allocations that are forced to fine grain
+  if (!dev().isFineGrainedSystem() &&
+      !dev().forceFineGrain(cmd.getSvmMem())) {
+    // Make sure we have memory for the command execution
+    Memory* memory = dev().getRocMemory(cmd.getSvmMem());
+
+    memory->saveMapInfo(cmd.svmPtr(), cmd.origin(), cmd.size(), cmd.mapFlags(),
+                        cmd.isEntireMemory());
+
+    if (memory->mapMemory() != nullptr) {
+      if (cmd.mapFlags() & (CL_MAP_READ | CL_MAP_WRITE)) {  // skipped when neither flag is set
+        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());  // device object of the map (staging) buffer
+
+        if (!blitMgr().copyBuffer(*memory, *hsaMapMemory, cmd.origin(), cmd.origin(),
+                                  cmd.size(), cmd.isEntireMemory())) {
+          LogError("submitSVMMapMemory() - copy failed");
+          cmd.setStatus(CL_MAP_FAILURE);  // NOTE(review): no early exit, the host copy below still runs after a failed blit
+        }
+        releaseGpuMemoryFence();  // presumably waits for the blit before the host reads the staging buffer - confirm
+        const void* mappedPtr = hsaMapMemory->owner()->getHostMem();
+        amd::Os::fastMemcpy(cmd.svmPtr(), mappedPtr, cmd.size()[0]);  // NOTE(review): reads from offset 0 although the blit wrote at cmd.origin() - confirm
+      }
+    } else {
+      LogError("Unhandled svm map!");
+    }
+  }
+
   profilingEnd(cmd);
 }
 
 void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) {
-  // No fence is needed since this is a no-op: the
-  // command will be completed only after all the
-  // previous commands are complete
+  // SVM unmap entry point: first wait on a kernel if one is outstanding
+  releaseGpuMemoryFence();
+
   profilingBegin(cmd);
+
+  // No-op on fine-grained-system devices and for allocations that are forced to fine grain
+  if (!dev().isFineGrainedSystem() &&
+      !dev().forceFineGrain(cmd.getSvmMem())) {
+    Memory* memory = dev().getRocMemory(cmd.getSvmMem());
+    const device::Memory::WriteMapInfo* writeMapInfo = memory->writeMapInfo(cmd.svmPtr());  // NOTE(review): dereferenced below without a null check - confirm it cannot be null
+
+    if (memory->mapMemory() != nullptr) {
+      if (writeMapInfo->isUnmapWrite()) {
+        amd::Coord3D srcOrigin(0, 0, 0);  // NOTE(review): not referenced anywhere else in this function
+        Memory* hsaMapMemory = dev().getRocMemory(memory->mapMemory());  // device object of the map (staging) buffer
+
+        void* mappedPtr = hsaMapMemory->owner()->getHostMem();
+        amd::Os::fastMemcpy(mappedPtr, cmd.svmPtr(), writeMapInfo->region_[0]);  // NOTE(review): writes at offset 0 although the blit reads at origin_ - confirm
+        // Copy the staged host data from the map buffer into the SVM allocation's device memory
+        if (!blitMgr().copyBuffer(*hsaMapMemory, *memory, writeMapInfo->origin_,
+                                  writeMapInfo->origin_, writeMapInfo->region_,
+                                  writeMapInfo->isEntire())) {
+          LogError("submitSvmUnmapMemory() - copy failed");
+          cmd.setStatus(CL_OUT_OF_RESOURCES);
+        }
+      }
+    } else {
+      LogError("Unhandled svm map!");
+    }
+
+    memory->clearUnmapInfo(cmd.svmPtr());  // runs for every coarse-grain unmap, whether or not a write-back happened
+  }
+
   profilingEnd(cmd);
 }
 
@@ -1408,7 +1466,8 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
 
   // Sync to the map target.
   // If we have host memory, use it
-  if (devMemory->owner()->getHostMem() != nullptr) {
+  if ((devMemory->owner()->getHostMem() != nullptr) &&
+      (devMemory->owner()->getSvmPtr() == nullptr)) {
     // Target is the backing store, so just ensure that owner is up-to-date
     devMemory->owner()->cacheWriteBack();
 
@@ -1441,6 +1500,12 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
             static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
         result = blitMgr().copyBuffer(*hsaMemory, *hsaMapMemory, origin, dstOrigin, size,
                                       cmd.isEntireMemory());
+        void* svmPtr = devMemory->owner()->getSvmPtr();
+        if ((svmPtr != nullptr) &&
+            (hostPtr != svmPtr)) {
+          releaseGpuMemoryFence();
+          amd::Os::fastMemcpy(svmPtr, hostPtr, size[0]);
+        }
       } else {
         result = blitMgr().readBuffer(*hsaMemory, static_cast<char*>(hostPtr) + origin[0], origin,
                                       size, cmd.isEntireMemory());
@@ -1448,10 +1513,10 @@ void VirtualGPU::submitMapMemory(amd::MapMemoryCommand& cmd) {
     } else if (type == CL_COMMAND_MAP_IMAGE) {
       amd::Image* image = cmd.memory().asImage();
       if (mapMemory != nullptr) {
-        roc::Memory* mapMemory =
-            static_cast<roc::Memory*>(devMemory->mapMemory()->getDeviceMemory(dev(), false));
+        roc::Memory* hsaMapMemory =
+            static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
         result =
-            blitMgr().copyImageToBuffer(*hsaMemory, *mapMemory, cmd.origin(), amd::Coord3D(0, 0, 0),
+            blitMgr().copyImageToBuffer(*hsaMemory, *hsaMapMemory, cmd.origin(), amd::Coord3D(0, 0, 0),
                                         cmd.size(), cmd.isEntireMemory());
       } else {
         result = blitMgr().readImage(*hsaMemory, hostPtr, amd::Coord3D(0), image->getRegion(),
@@ -1486,7 +1551,8 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
   bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
 
   // We used host memory
-  if (devMemory->owner()->getHostMem() != nullptr) {
+  if ((devMemory->owner()->getHostMem() != nullptr) &&
+      (devMemory->owner()->getSvmPtr() == nullptr)) {
     if (mapInfo->isUnmapWrite()) {
       // Target is the backing store, so sync
       devMemory->owner()->signalWrite(nullptr);
@@ -1503,14 +1569,14 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
     if (!devMemory->isHostMemDirectAccess()) {
       bool result = false;
 
+      amd::Memory* mapMemory = devMemory->mapMemory();
       if (cmd.memory().asImage() && !imageBuffer) {
         amd::Image* image = cmd.memory().asImage();
-        amd::Memory* mapMemory = devMemory->mapMemory();
-        if (devMemory->mapMemory() != nullptr) {
-          roc::Memory* mapMemory =
-              static_cast<roc::Memory*>(devMemory->mapMemory()->getDeviceMemory(dev(), false));
+        if (mapMemory != nullptr) {
+          roc::Memory* hsaMapMemory =
+              static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
           result =
-              blitMgr().copyBufferToImage(*mapMemory, *devMemory, amd::Coord3D(0, 0, 0),
+              blitMgr().copyBufferToImage(*hsaMapMemory, *devMemory, amd::Coord3D(0, 0, 0),
                                           mapInfo->origin_, mapInfo->region_, mapInfo->isEntire());
         } else {
           void* hostPtr = devMemory->owner()->getHostMem();
@@ -1526,11 +1592,17 @@ void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand& cmd) {
           origin.c[0] *= elemSize;
           size.c[0] *= elemSize;
         }
-        if (devMemory->mapMemory() != nullptr) {
-          roc::Memory* mapMemory =
-              static_cast<roc::Memory*>(devMemory->mapMemory()->getDeviceMemory(dev(), false));
+        if (mapMemory != nullptr) {
+          roc::Memory* hsaMapMemory =
+              static_cast<roc::Memory*>(mapMemory->getDeviceMemory(dev(), false));
 
-          result = blitMgr().copyBuffer(*mapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
+          const void* svmPtr = devMemory->owner()->getSvmPtr();
+          void* hostPtr = mapMemory->getHostMem();
+          if ((svmPtr != nullptr) &&
+              (hostPtr != svmPtr)) {
+            amd::Os::fastMemcpy(hostPtr, svmPtr, size[0]);
+          }
+          result = blitMgr().copyBuffer(*hsaMapMemory, *devMemory, mapInfo->origin_, mapInfo->origin_,
                                         mapInfo->region_, mapInfo->isEntire());
         } else {
           result = blitMgr().writeBuffer(cmd.mapPtr(), *devMemory, origin, size);
@@ -1626,12 +1698,14 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
 
   profilingBegin(cmd);
 
-  if (!dev().isFineGrainedSystem() &&
-      dev().settings().enableCoarseGrainSVM_) {
+  amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
+
+  if (!dev().isFineGrainedSystem() ||
+      ((dstMemory != nullptr) &&
+       !dev().forceFineGrain(dstMemory))) {
     size_t patternSize = cmd.patternSize();
     size_t fillSize = patternSize * cmd.times();
-    amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(cmd.dst());
-    assert(dstMemory && "No svm Buffer to fill with!");
+
     size_t offset = reinterpret_cast<uintptr_t>(cmd.dst()) -
         reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
 
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
index 32b4237403..0b9a0b2da1 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -182,6 +182,8 @@ class VirtualGPU : public device::VirtualDevice {
   void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
   void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
   void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
+  void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
+  void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
 
   // { roc OpenCL integration
   // Added these stub (no-ops) implementation of pure virtual methods,
@@ -191,8 +193,6 @@ class VirtualGPU : public device::VirtualDevice {
   virtual void submitSignal(amd::SignalCommand& cmd) {}
   virtual void submitMakeBuffersResident(amd::MakeBuffersResidentCommand& cmd) {}
 
-  virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
-  virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
   virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
 
   void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd) {}
@@ -219,7 +219,7 @@ class VirtualGPU : public device::VirtualDevice {
   //! Detects memory dependency for HSAIL kernels and uses appropriate AQL header
   bool processMemObjects(const amd::Kernel& kernel,  //!< AMD kernel object for execution
                          const_address params,       //!< Pointer to the param's store
-			 size_t& ldsAddress          //!< LDS usage
+                         size_t& ldsAddress          //!< LDS usage
                          );
   // Retun the virtual gpu unique index
   uint index() const { return index_; }
diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp
index 3a81e74c3d..d7f7f45714 100644
--- a/projects/clr/rocclr/runtime/utils/flags.hpp
+++ b/projects/clr/rocclr/runtime/utils/flags.hpp
@@ -145,8 +145,8 @@ release(uint, HSA_SIGNAL_POOL_SIZE, 16,                                       \
         "Signal object pool size")                                            \
 release(bool, HSA_ENABLE_ATOMICS_32B, false,                                  \
         "1 = Enable SVM atomics in 32 bits (HSA backend-only). Any other value keeps then disabled.") \
-release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, IS_HIP,                            \
-        "Enable device memory for coarse grain SVM allocations") \
+release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
+        "Enable device memory for coarse grain SVM allocations")              \
 release(bool, GPU_IFH_MODE, false,                                            \
         "1 = Enable GPU IFH (infinitely fast hardware) mode. Any other value keeps setting disabled.") \
 release(bool, GPU_MIPMAP, true,                                               \