SWDEV-184709 - support hipLaunchCooperativeKernel()

Add ROCr cooperative queue allocation

Change-Id: I1384482692f4080d31255b09e0f68a21ccad3da8
[ROCm/clr commit: 7ef8dfdfe7]
2020-03-27 15:43:06 -04:00
parent 782a76511b
commit 14aa72890e
4 changed files with 63 additions and 37 deletions
@@ -1457,14 +1457,20 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
  amd::ScopedLock lock(vgpusAccess());

  bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
+  bool cooperative = false;

-  profiling |= (queue == nullptr) ? true : false;
+  // If amd command queue is null, then it's an internal device queue
+  if (queue == nullptr) {
+    // In HIP mode the device queue will be allocated for the cooperative launches only
+    cooperative = amd::IS_HIP;
+    profiling = amd::IS_HIP;
+  }

  // Initialization of heap and other resources occur during the command
  // queue creation time.
-  VirtualGPU* virtualDevice = new VirtualGPU(*this);
+  VirtualGPU* virtualDevice = new VirtualGPU(*this, profiling, cooperative);

-  if (!virtualDevice->create(profiling)) {
+  if (!virtualDevice->create()) {
    delete virtualDevice;
    return nullptr;
  }
@@ -1874,12 +1880,13 @@ VirtualGPU* Device::xferQueue() const {
  return xferQueue_;
 }

-bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput, cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
+bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeInput,
+  cl_set_device_clock_mode_output_amd* pSetClockModeOutput) {
  bool result = true;
  return result;
 }

-hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
+hsa_queue_t* Device::acquireQueue(uint32_t queue_size_hint, bool coop_queue) {
  assert(queuePool_.size() <= GPU_MAX_HW_QUEUES);
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "number of allocated hardware queues: %d, maximum: %d",
          queuePool_.size(), GPU_MAX_HW_QUEUES);
@@ -1907,8 +1914,15 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
  }
  auto queue_size = (queue_max_packets < queue_size_hint) ? queue_max_packets : queue_size_hint;

-  hsa_queue_t *queue;
-  while (hsa_queue_create(_bkendDevice, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
+  hsa_queue_t* queue;
+  auto queue_type = HSA_QUEUE_TYPE_MULTI;
+
+  // Enable cooperative queue for the device queue
+  if (coop_queue) {
+    queue_type = HSA_QUEUE_TYPE_COOPERATIVE;
+  }
+
+  while (hsa_queue_create(_bkendDevice, queue_size, queue_type, nullptr, nullptr,
                          std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
                          &queue) != HSA_STATUS_SUCCESS) {
    queue_size >>= 1;
@@ -1919,6 +1933,11 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {
  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "created hardware queue %p with size %d",
                queue, queue_size);
  hsa_amd_profiling_set_profiler_enabled(queue, 1);
+  if (coop_queue) {
+    // Skip queue recycling for cooperative queues, since it should be just one
+    // per device.
+    return queue;
+  }
  auto result = queuePool_.emplace(std::make_pair(queue, QueueInfo()));
  assert(result.second && "QueueInfo already exists");
  auto &qInfo = result.first->second;
@@ -1928,26 +1947,26 @@ hsa_queue_t *Device::acquireQueue(uint32_t queue_size_hint) {

 void Device::releaseQueue(hsa_queue_t* queue) {
  auto qIter = queuePool_.find(queue);
-  assert(qIter != queuePool_.end());
+  if (qIter != queuePool_.end()) {
+    auto &qInfo = qIter->second;
+    assert(qInfo.refCount > 0);
+    qInfo.refCount--;
+    if (qInfo.refCount != 0) {
+        return;
+    }
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);

-  auto &qInfo = qIter->second;
-  assert(qInfo.refCount > 0);
-  qInfo.refCount--;
-  if (qInfo.refCount != 0) {
-      return;
+    if (qInfo.hostcallBuffer_) {
+      ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
+              qInfo.hostcallBuffer_, queue);
+      disableHostcalls(qInfo.hostcallBuffer_, queue);
+      context().svmFree(qInfo.hostcallBuffer_);
+    }
+
+    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
+    queuePool_.erase(qIter);
  }
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
-
-  if (qInfo.hostcallBuffer_) {
-    ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hostcall buffer %p for hardware queue %p",
-            qInfo.hostcallBuffer_, queue);
-    disableHostcalls(qInfo.hostcallBuffer_, queue);
-    context().svmFree(qInfo.hostcallBuffer_);
-  }
-
-  ClPrint(amd::LOG_INFO, amd::LOG_QUEUE, "deleting hardware queue %p with refCount 0", queue);
  hsa_queue_destroy(queue);
-  queuePool_.erase(qIter);
 }

 void* Device::getOrCreateHostcallBuffer(hsa_queue_t* queue) {
@@ -432,7 +432,7 @@ class Device : public NullDevice {

  //! Acquire HSA queue. This method can create a new HSA queue or
  //! share previously created
-  hsa_queue_t* acquireQueue(uint32_t queue_size_hint);
+  hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false);

  //! Release HSA queue
  void releaseQueue(hsa_queue_t*);
@@ -584,8 +584,9 @@ bool VirtualGPU::releaseGpuMemoryFence() {
  return true;
 }

-VirtualGPU::VirtualGPU(Device& device)
+VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative)
    : device::VirtualDevice(device),
+      state_(0),
      gpu_queue_(nullptr),
      roc_device_(device),
      virtualQueue_(nullptr),
@@ -603,6 +604,8 @@ VirtualGPU::VirtualGPU(Device& device)
  // Initialize the last signal and dispatch flags
  timestamp_ = nullptr;
  hasPendingDispatch_ = false;
+  profiling_ = profiling;
+  cooperative_ = cooperative;

  kernarg_pool_base_ = nullptr;
  kernarg_pool_size_ = 0;
@@ -690,7 +693,7 @@ VirtualGPU::~VirtualGPU() {
  }
 }

-bool VirtualGPU::create(bool profilingEna) {
+bool VirtualGPU::create() {
  // Checking Virtual gpu unique index for ROCm backend
  if (index() > device().settings().commandQueues_) {
    return false;
@@ -698,10 +701,10 @@ bool VirtualGPU::create(bool profilingEna) {

  // Pick a reasonable queue size
  uint32_t queue_size = 1024;
-  gpu_queue_ = roc_device_.acquireQueue(queue_size);
+  gpu_queue_ = roc_device_.acquireQueue(queue_size, cooperative_);
  if (!gpu_queue_) return false;

-  if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
+  if (!initPool(dev().settings().kernargPoolSize_, (profiling_) ? queue_size : 0)) {
    LogError("Couldn't allocate arguments/signals for the queue");
    return false;
  }
@@ -163,10 +163,10 @@ class VirtualGPU : public device::VirtualDevice {
    size_t maxMemObjectsInQueue_;     //!< Maximum number of mem objects in the queue
  };

-  VirtualGPU(Device& device);
+  VirtualGPU(Device& device, bool profiling = false, bool cooperative = false);
  ~VirtualGPU();

-  bool create(bool profilingEna);
+  bool create();
  bool terminate() { return true; }
  const Device& dev() const { return roc_device_; }

@@ -311,13 +311,17 @@ class VirtualGPU : public device::VirtualDevice {
  std::vector<Memory*> xferWriteBuffers_;  //!< Stage write buffers
  std::vector<amd::Memory*> pinnedMems_;   //!< Pinned memory list

-  /**
-   * @brief Indicates if a kernel dispatch is outstanding. This flag is
-   * used to synchronized on kernel outputs.
-   */
-  bool hasPendingDispatch_;
+  //! Queue state flags
+  union {
+    struct {
+      uint32_t hasPendingDispatch_ : 1; //!< A kernel dispatch is outstanding
+      uint32_t imageBufferWrtBack_ : 1; //!< Image buffer write back is required
+      uint32_t profiling_          : 1; //!< Profiling is enabled
+      uint32_t cooperative_        : 1; //!< Cooperative launch is enabled
+    };
+    uint32_t  state_;
+  };

-  bool imageBufferWrtBack_;  //!< Enable image buffer write back
  std::vector<device::Memory*> wrtBackImageBuffer_;  //!< Array of images for write back

  Timestamp* timestamp_;