SWDEV-347670 - GPU StreamWait and StreamWrite support in Windows PAL backend

Change-Id: Ic4881305b6332e217f3d3127dce7e9d9d0a7df11 [ROCm/clr commit: 373a7d1195]
2022-07-21 18:05:05 +00:00
@@ -193,6 +193,24 @@ class BlitManager : public amd::HeapObject {
                         bool entire = false          //!< Entire buffer will be updated
                         ) const = 0;

+
+  //! Stream memory write operation - Write a 'value' at 'memory'.
+  virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes
+  ) const = 0;
+
+
+  //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
+  virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes,
+                             uint64_t flags,
+                             uint64_t mask
+  ) const = 0;
+
  //! Enables synchronization on blit operations
  void enableSynchronization() { syncOperation_ = true; }

@@ -983,7 +983,9 @@ void CalcRowSlicePitches(uint64_t* pitch, const int32_t* copySize, size_t rowPit
  }
 }

-static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const void* value) {
+inline void KernelBlitManager::setArgument(amd::Kernel* kernel, size_t index, size_t size,
+                        const void* value, size_t offset,
+                        const device::Memory* dev_mem, bool writeVAImmediate) const {
  const amd::KernelParameterDescriptor& desc = kernel->signature().at(index);

  void* param = kernel->parameters().values() + desc.offset_;
@@ -1003,7 +1005,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
    } else {
      // convert cl_mem to amd::Memory*, return false if invalid.
      LP64_SWITCH(uint32_value, uint64_value) =
-          static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress());
+          static_cast<uintptr_t>((*static_cast<Memory* const*>(value))->virtualAddress()) + offset;
      reinterpret_cast<Memory**>(kernel->parameters().values() +
                                 kernel->parameters().memoryObjOffset())[desc.info_.arrayIndex_] =
          *static_cast<Memory* const*>(value);
@@ -1036,7 +1038,7 @@ static void setArgument(amd::Kernel* kernel, size_t index, size_t size, const vo
        break;
    }

-  switch (argSize) {
+  switch (desc.size_) {
    case sizeof(uint32_t):
      *static_cast<uint32_t*>(param) = uint32_value;
      break;
@@ -2426,6 +2428,85 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
  return result;
 }

+// ================================================================================================
+bool KernelBlitManager::streamOpsWrite(device::Memory& memory, uint64_t value,
+                                       size_t offset, size_t sizeBytes) const {
+  amd::ScopedLock k(lockXferOps_);
+  bool result = false;
+  uint blitType = StreamOpsWrite;
+  size_t dim = 1;
+  size_t globalWorkOffset[1] = { 0 };
+  size_t globalWorkSize[1] = { 1 };
+  size_t localWorkSize[1] = { 1 };
+  // Program kernels arguments for the write operation
+  Memory* mem = &gpuMem(memory);
+  bool is32BitWrite = (sizeBytes == sizeof(uint32_t)) ? true : false;
+  // Program kernels arguments for the write operation
+  if (is32BitWrite) {
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
+    setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
+  } else {
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
+    setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
+  }
+  setArgument(kernels_[blitType], 3, sizeof(size_t), &sizeBytes);
+  // Create ND range object for the kernel's execution
+  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
+  // Execute the blit
+  address parameters = kernels_[blitType]->parameters().values();
+  result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+  synchronize();
+  return result;
+}
+
+// ================================================================================================
+bool KernelBlitManager::streamOpsWait(device::Memory& memory, uint64_t value, size_t offset,
+                                      size_t sizeBytes, uint64_t flags, uint64_t mask) const {
+  amd::ScopedLock k(lockXferOps_);
+  bool result = false;
+  uint blitType = StreamOpsWait;
+  size_t dim = 1;
+
+  size_t globalWorkOffset[1] = { 0 };
+  size_t globalWorkSize[1] = { 1 };
+  size_t localWorkSize[1] = { 1 };
+
+  // Program kernels arguments for the wait operation
+  Memory* mem = &gpuMem(memory);
+  bool is32BitWait = (sizeBytes == sizeof(uint32_t)) ? true : false;
+  // Program kernels arguments for the wait operation
+  if (is32BitWait) {
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), &mem, offset);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), nullptr);
+    setArgument(kernels_[blitType], 2, sizeof(uint32_t), &value);
+    setArgument(kernels_[blitType], 3, sizeof(uint32_t), &flags);
+    setArgument(kernels_[blitType], 4, sizeof(uint32_t), &mask);
+  } else {
+    setArgument(kernels_[blitType], 0, sizeof(cl_mem), nullptr);
+    setArgument(kernels_[blitType], 1, sizeof(cl_mem), &mem, offset);
+    setArgument(kernels_[blitType], 2, sizeof(uint64_t), &value);
+    setArgument(kernels_[blitType], 3, sizeof(uint64_t), &flags);
+    setArgument(kernels_[blitType], 4, sizeof(uint64_t), &mask);
+  }
+
+  // Create ND range object for the kernel's execution
+  amd::NDRangeContainer ndrange(dim, globalWorkOffset, globalWorkSize, localWorkSize);
+
+  // Execute the blit
+  address parameters = kernels_[blitType]->parameters().values();
+  result = gpu().submitKernelInternal(ndrange, *kernels_[blitType], parameters);
+  synchronize();
+
+  return result;
+}
+
+
+
+
+
+
 bool KernelBlitManager::runScheduler(device::Memory& vqueue, device::Memory& params, uint paramIdx,
                                     uint threads) const {
  amd::ScopedLock k(lockXferOps_);
@@ -155,6 +155,30 @@ class DmaBlitManager : public device::HostBlitManager {
                         bool entire = false             //!< Entire buffer will be updated
                         ) const;

+  //! Stream memory write operation - Write a 'value' at 'memory'.
+  virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes
+  ) const {
+    assert(!"Unimplemented");
+    return false;
+  };
+
+  //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
+  virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes,
+                             uint64_t flags,
+                             uint64_t mask
+  ) const {
+    assert(!"Unimplemented");
+    return false;
+  };
+
+
+
 protected:
  static constexpr uint MaxPinnedBuffers = 4;

@@ -223,7 +247,9 @@ class KernelBlitManager : public DmaBlitManager {
    FillImage,
    Scheduler,
    GwsInit,
-    BlitTotal
+    StreamOpsWrite,
+    StreamOpsWait,
+    BlitTotal,
  };

  //! Constructor
@@ -378,6 +404,23 @@ class KernelBlitManager : public DmaBlitManager {

  virtual amd::Monitor* lockXfer() const { return &lockXferOps_; }

+  //! Stream memory write operation - Write a 'value' at 'memory'.
+  virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes
+  ) const;
+
+  //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
+  virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes,
+                             uint64_t flags,
+                             uint64_t mask
+  ) const;
+
+
 private:
  static constexpr size_t MaxXferBuffers = 2;
  static constexpr uint TransferSplitSize = 3;
@@ -408,6 +451,12 @@ class KernelBlitManager : public DmaBlitManager {
  bool createProgram(Device& device  //!< Device object
  );

+  inline void setArgument(amd::Kernel* kernel, size_t index,
+                          size_t size, const void* value, size_t offset = 0,
+                          const device::Memory* dev_mem = nullptr,
+                          bool writeVAImmediate = false) const;
+
+
  //! Creates a view memory object
  Memory* createView(const Memory& parent,         //!< Parent memory object
                     const cl_image_format format  //!< The new format for a view
@@ -432,7 +481,8 @@ static const char* BlitName[KernelBlitManager::BlitTotal] = {
    "__amd_rocclr_copyBufferRectAligned", "__amd_rocclr_copyBuffer",
    "__amd_rocclr_copyBufferAligned", "__amd_rocclr_fillBufferAligned",
    "__amd_rocclr_fillImage", "__amd_rocclr_scheduler",
-    "__amd_rocclr_gwsInit"
+    "__amd_rocclr_gwsInit", "__amd_rocclr_streamOpsWrite",
+    "__amd_rocclr_streamOpsWait"
 };

 /*@}*/  // namespace pal
@@ -22,6 +22,25 @@ namespace pal {

 #define BLIT_KERNEL(...) #__VA_ARGS__

+const char* palBlitLinearSourceCode = BLIT_KERNEL(
+\n
+extern void __amd_streamOpsWrite(__global uint*, __global ulong*, ulong, ulong);
+\n
+extern void __amd_streamOpsWait(__global uint*,__global ulong*, ulong, ulong, ulong);
+\n
+__kernel void __amd_rocclr_streamOpsWrite(__global uint* ptrInt, __global ulong* ptrUlong,
+                                          ulong value, ulong sizeBytes) {
+  __amd_streamOpsWrite(ptrInt, ptrUlong, value, sizeBytes);
+}
+\n
+__kernel void __amd_rocclr_streamOpsWait(__global uint* ptrInt, __global ulong* ptrUlong,
+                                         ulong value, ulong flags, ulong mask) {
+  __amd_streamOpsWait(ptrInt, ptrUlong, value, flags, mask);
+}
+\n);
+
+
+
 const char* SchedulerSourceCode = BLIT_KERNEL(
 \n
 extern void __amd_scheduler(__global void*, __global void*, uint);
@@ -821,6 +821,8 @@ Device::~Device() {
 extern const char* SchedulerSourceCode;
 extern const char* SchedulerSourceCode20;
 extern const char* GwsInitSourceCode;
+extern const char* palBlitLinearSourceCode;
+
 Pal::IDevice* gDeviceList[Pal::MaxDevices] = {};
 uint32_t gStartDevice = 0;
 uint32_t gNumDevices = 0;
@@ -2469,8 +2471,9 @@ bool Device::createBlitProgram() {
  std::string extraBlits;
  std::string ocl20;
  if (amd::IS_HIP) {
+    extraBlits = palBlitLinearSourceCode;
    if (info().cooperativeGroups_) {
-      extraBlits = GwsInitSourceCode;
+      extraBlits.append(GwsInitSourceCode);
    }
  }
  else {
@@ -2134,6 +2134,48 @@ void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& vcmd) {
  profilingEnd(vcmd);
 }

+void VirtualGPU::submitStreamOperation(amd::StreamOperationCommand& cmd) {
+  // Make sure VirtualGPU has an exclusive access to the resources
+  amd::ScopedLock lock(execution());
+  profilingBegin(cmd);
+
+  const cl_command_type type = cmd.type();
+  const uint64_t value = cmd.value();
+  const uint64_t mask = cmd.mask();
+  const unsigned int flags = cmd.flags();
+  const size_t sizeBytes = cmd.sizeBytes();
+  const size_t offset = cmd.offset();
+
+  amd::Memory* amdMemory = &cmd.memory();
+  Memory* memory = dev().getGpuMemory(amdMemory);
+
+  if (type == ROCCLR_COMMAND_STREAM_WAIT_VALUE) {
+
+    // Use a blit kernel to perform the wait operation
+    // mask is applied on value before performing
+    // the comparision defined by 'condition'
+    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWait(*memory, value, offset,
+                                                                            sizeBytes, flags, mask);
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Waiting for value: 0x%lx."
+            " Flags: 0x%lx mask: 0x%lx", value, flags, mask);
+    if (!result) {
+      LogError("submitStreamOperation: Wait failed!");
+    }
+  } else if (type == ROCCLR_COMMAND_STREAM_WRITE_VALUE) {
+    bool result = static_cast<KernelBlitManager&>(blitMgr()).streamOpsWrite(*memory, value,
+                                                                            offset, sizeBytes);
+    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Writing value: 0x%lx", value);
+    if (!result) {
+      LogError("submitStreamOperation: Write failed!");
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+  profilingEnd(cmd);
+}
+
+
+
 void VirtualGPU::submitVirtualMap(amd::VirtualMapCommand& vcmd) {
  // Make sure VirtualGPU has an exclusive access to the resources
  amd::ScopedLock lock(execution());
@@ -349,7 +349,7 @@ class VirtualGPU : public device::VirtualDevice {
  virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
  virtual void submitTransferBufferFromFile(amd::TransferBufferFileCommand& cmd);
  virtual void submitVirtualMap(amd::VirtualMapCommand& cmd);
-
+  virtual void submitStreamOperation(amd::StreamOperationCommand& cmd);
  void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd);

  void releaseMemory(GpuMemoryReference* mem);
@@ -157,6 +157,30 @@ class DmaBlitManager : public device::HostBlitManager {
                         bool entire = false             //!< Entire buffer will be updated
                         ) const;

+  //! Stream memory write operation - Write a 'value' at 'memory'.
+  virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes
+  ) const {
+    assert(!"Unimplemented");
+    return false;
+  }
+
+  //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
+  virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
+                             uint64_t value,
+                             size_t offset,
+                             size_t sizeBytes,
+                             uint64_t flags,
+                             uint64_t mask
+  ) const {
+    assert(!"Unimplemented");
+    return false;
+  }
+
+
+
 protected:
  static constexpr uint MaxPinnedBuffers = 4;
  static constexpr size_t kMaxH2dMemcpySize = 8 * Ki;
@@ -427,14 +451,14 @@ class KernelBlitManager : public DmaBlitManager {
                  ) const;

  //! Stream memory write operation - Write a 'value' at 'memory'.
-  bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
+  virtual bool streamOpsWrite(device::Memory& memory, //!< Memory to write the 'value'
                             uint64_t value,
                             size_t offset,
                             size_t sizeBytes
  ) const;

  //! Stream memory ops- Waits for a 'value' at 'memory' and wait is released based on compare op.
-  bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
+  virtual bool streamOpsWait(device::Memory& memory, //!< Memory contents to compare the 'value' against
                             uint64_t value,
                             size_t offset,
                             size_t sizeBytes,
@@ -19,6 +19,7 @@
 THE SOFTWARE. */

 #include "platform/activity.hpp"
+#include "platform/command_utils.hpp"

 ACTIVITY_PROF_INSTANCES();

@@ -59,6 +60,8 @@ const char* getOclCommandKindString(uint32_t op) {
    CASE_STRING(CL_COMMAND_SVM_MEMFILL, SvmMemFill)
    CASE_STRING(CL_COMMAND_SVM_MAP, SvmMap)
    CASE_STRING(CL_COMMAND_SVM_UNMAP, SvmUnmap)
+    CASE_STRING(ROCCLR_COMMAND_STREAM_WAIT_VALUE, StreamWait)
+    CASE_STRING(ROCCLR_COMMAND_STREAM_WRITE_VALUE, StreamWrite)
    default: case_string = "Unknown command type";
  };
  return case_string;