From 14aa72890ecee449bb3bd8cdc832b5acfb87a37c Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Fri, 27 Mar 2020 15:43:06 -0400
Subject: [PATCH] SWDEV-184709 - support hipLaunchCooperativeKernel()

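Add ROCr cooperative queue allocation: in HIP mode the internal device
queue (the one created with a null amd::CommandQueue) is now allocated as
an HSA_QUEUE_TYPE_COOPERATIVE queue. Cooperative queues are not placed in
the hardware queue pool, so releaseQueue() destroys them directly.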

Change-Id: I1384482692f4080d31255b09e0f68a21ccad3da8


[ROCm/clr commit: 7ef8dfdfe793fe9298dc9606bac680eb566e444d]
---
 projects/clr/rocclr/device/rocm/rocdevice.cpp | 67 ++++++++++++-------
 projects/clr/rocclr/device/rocm/rocdevice.hpp |  2 +-
 .../clr/rocclr/device/rocm/rocvirtual.cpp     | 11 +--
 .../clr/rocclr/device/rocm/rocvirtual.hpp     | 20 +++---
 4 files changed, 63 insertions(+), 37 deletions(-)
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp
index 6ab4be5070..edddf47316 100755
--- a/projects/clr/rocclr/device/rocm/rocdevice.cpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp
@@ -1457,14 +1457,20 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   amd::ScopedLock lock(vgpusAccess());
 
   bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
+  bool cooperative = false;
 
-  profiling |= (queue == nullptr) ? true : false;
+  // If amd command queue is null, then it's an internal device queue
+  if (queue == nullptr) {
+    // In HIP mode the device queue will be allocated for the cooperative launches only
+    cooperative = amd::IS_HIP;
+    profiling = amd::IS_HIP;
+  }
 
   // Initialization of heap and other resources occur during the command
   // queue creation time.
-  VirtualGPU* virtualDevice = new VirtualGPU(*this);
+  VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative);
 
-  if (!virtualDevice->create(profiling)) {
+  if (!virtualDevice->create()) {
     delete virtualDevice;
     return nullptr;
   }
@@ -1874,12 +1880,13 @@ VirtualGPU* Device::xferQueue() const {
   return xferQueue_;
 }
 
-bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+  cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
   bool result = true;
   return result;
 }
 
-hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
+hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue) {
   assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
   ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
           queuePool_.size(), GPU_MAX_HW_QUEUES);
@@ -1907,8 +1914,15 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
   }
   auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint;
 
-  hsa_queue_t *queue;
-  while (hsa_queue_create(_bkendDevice, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
+  hsa_queue_t* queue;
+  auto queue_type = HSA_QUEUE_TYPE_MULTI;
+
+  // Enable cooperative queue for the device queue
+  if (coop_queue) {
+    queue_type = HSA_QUEUE_TYPE_COOPERATIVE;
+  }
+
+  while (hsa_queue_create(_bkendDevice, queue_size, queue_type, nullptr, nullptr,
                           std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
                           &queue) != HSA_STATUS_SUCCESS) {
     queue_size >>= 1;
@@ -1919,6 +1933,11 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
   ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
                 queue, queue_size);
   hsa_amd_profiling_set_profiler_enabled(queue, 1);
+  if (coop_queue) {
+    // Skip queue recycling for cooperative queues, since it should be just one
+    // per device.
+    return queue;
+  }
   auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
   assert(result.second && "QueueInfo already exists");
   auto &qInfo = result.first->second;
@@ -1928,26 +1947,26 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
 
 void Device::releaseQueue(hsa_queue_t* queue) {
   auto qIter = queuePool_.find(queue);
-  assert(qIter != queuePool_.end());
+  if (qIter != queuePool_.end()) {
+    auto &qInfo = qIter->second;
+    assert(qInfo.refCount > 0);
+    qInfo.refCount--;
+    if (qInfo.refCount != 0) {
+        return;
+    }
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
 
-  auto &qInfo = qIter->second;
-  assert(qInfo.refCount > 0);
-  qInfo.refCount--;
-  if (qInfo.refCount != 0) {
-      return;
+    if (qInfo.hostcallBuffer_) {
+      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
+              qInfo.hostcallBuffer_, queue);
+      disableHostcalls(qInfo.hostcallBuffer_, queue);
+      context().svmFree(qInfo.hostcallBuffer_);
+    }
+
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
+    queuePool_.erase(qIter);
   }
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
-
-  if (qInfo.hostcallBuffer_) {
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
-            qInfo.hostcallBuffer_, queue);
-    disableHostcalls(qInfo.hostcallBuffer_, queue);
-    context().svmFree(qInfo.hostcallBuffer_);
-  }
-
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
   hsa_queue_destroy(queue);
-  queuePool_.erase(qIter);
 }
 
 void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp
index aadd1cb463..24c8b71de1 100644
--- a/projects/clr/rocclr/device/rocm/rocdevice.hpp
+++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp
@@ -432,7 +432,7 @@ class Device : public NullDevice {
 
   //! Acquire HSA queue. This method can create a new HSA queue or
   //! share previously created
-  hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
+  hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false);
 
   //! Release HSA queue
   void releaseQueue(hsa_queue_t*);
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
index 2a18f97d8e..68b563dce6 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.cpp
@@ -584,8 +584,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
   return true;
 }
 
-VirtualGPU::VirtualGPU(Device& device)
+VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative)
     : device::VirtualDevice(device),
+      state_(0),
       gpu_queue_(nullptr),
       roc_device_(device),
       virtualQueue_(nullptr),
@@ -603,6 +604,8 @@ VirtualGPU::VirtualGPU(Device& device)
   // Initialize the last signal and dispatch flags
   timestamp_ = nullptr;
   hasPendingDispatch_ = false;
+  profiling_ = profiling;
+  cooperative_ = cooperative;
 
   kernarg_pool_base_ = nullptr;
   kernarg_pool_size_ = 0;
@@ -690,7 +693,7 @@ VirtualGPU::~VirtualGPU() {
   }
 }
 
-bool VirtualGPU::create(bool profilingEna) {
+bool VirtualGPU::create() {
   // Checking Virtual gpu unique index for ROCm backend
   if (index() > device().settings().commandQueues_) {
     return false;
@@ -698,10 +701,10 @@ bool VirtualGPU::create(bool profilingEna) {
 
   // Pick a reasonable queue size
   uint32_t queue_size = 1024;
-  gpu_queue_ = roc_device_.acquireQueue(queue_size);
+  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_);
   if (!gpu_queue_) return false;
 
-  if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
+  if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
     LogError("Couldn't allocate arguments/signals for the queue");
     return false;
   }
diff --git a/projects/clr/rocclr/device/rocm/rocvirtual.hpp b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
index cfadcc397d..811d445a97 100644
--- a/projects/clr/rocclr/device/rocm/rocvirtual.hpp
+++ b/projects/clr/rocclr/device/rocm/rocvirtual.hpp
@@ -163,10 +163,10 @@ class VirtualGPU : public device::VirtualDevice {
     size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
   };
 
-  VirtualGPU(Device& device);
+  VirtualGPU(Device& device, bool profiling = false, bool cooperative = false);
   ~VirtualGPU();
 
-  bool create(bool profilingEna);
+  bool create();
   bool terminate() { return true; }
   const Device& dev() const { return roc_device_; }
 
@@ -311,13 +311,17 @@ class VirtualGPU : public device::VirtualDevice {
   std::vector<Memory*> xferWriteBuffers_;  //!< Stage write buffers
   std::vector<amd::Memory*> pinnedMems_;   //!< Pinned memory list
 
-  /**
-   * @brief Indicates if a kernel dispatch is outstanding. This flag is
-   * used to synchronized on kernel outputs.
-   */
-  bool hasPendingDispatch_;
+  //! Queue state flags
+  union {
+    struct {
+      uint32_t hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
+      uint32_t imageBufferWrtBack_ : 1; //!< Image buffer write back is required
+      uint32_t profiling_          : 1; //!< Profiling is enabled
+      uint32_t cooperative_        : 1; //!< Cooperative launch is enabled
+    };
+    uint32_t  state_;
+  };
 
-  bool imageBufferWrtBack_;  //!< Enable image buffer write back
   std::vector<device::Memory*> wrtBackImageBuffer_;  //!< Array of images for write back
 
   Timestamp* timestamp_;