From 950dccb283cc038bd78795ffd547b83f2fa9dd2d Mon Sep 17 00:00:00 2001
From: foreman <dl.swbuild@amd.com>
Date: Thu, 1 Aug 2019 16:24:33 -0400
Subject: [PATCH] P4 to Git Change 1977224 by gandryey@gera-hip-lnx on
 2019/08/01 16:17:51

	SWDEV-193423 - HIP/VDI - Support for lazy hsa queue creation
	- Add queue pool support for HSA HW queues. GPU_MAX_HW_QUEUES controls the pool size. The current default value is 4 (the number of active pipes on GPU).

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#132 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#81 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#314 edit
---
 rocclr/runtime/device/rocm/rocdevice.cpp  |   4 -
 rocclr/runtime/device/rocm/rocdevice.hpp  |   3 +
 rocclr/runtime/device/rocm/rocvirtual.cpp | 103 ++++++++++++++++------
 rocclr/runtime/device/rocm/rocvirtual.hpp |   6 +-
 rocclr/runtime/utils/flags.hpp            |   4 +-
 5 files changed, 83 insertions(+), 37 deletions(-)
diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp
index 35f935bfa8..8c4f2e2bf4 100644
--- a/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -1411,10 +1411,6 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
     return nullptr;
   }
 
-  if (profiling) {
-    hsa_amd_profiling_set_profiler_enabled(virtualDevice->gpu_queue(), 1);
-  }
-
   return virtualDevice;
 }
 
diff --git a/rocclr/runtime/device/rocm/rocdevice.hpp b/rocclr/runtime/device/rocm/rocdevice.hpp
index 7b7b8fb58c..2ec5b65273 100644
--- a/rocclr/runtime/device/rocm/rocdevice.hpp
+++ b/rocclr/runtime/device/rocm/rocdevice.hpp
@@ -406,6 +406,8 @@ class Device : public NullDevice {
 
   VirtualGPU* xferQueue() const;
 
+  std::map<hsa_queue_t*, int>& QueuePool() { return queue_pool_; }
+
  private:
   static hsa_ven_amd_loader_1_00_pfn_t amd_loader_ext_table;
 
@@ -436,6 +438,7 @@ class Device : public NullDevice {
   std::atomic<size_t> freeMem_;   //!< Total of free memory available
   mutable amd::Monitor vgpusAccess_;     //!< Lock to serialise virtual gpu list access
   bool hsa_exclusive_gpu_access_;  //!< TRUE if current device was moved into exclusive GPU access mode
+  std::map<hsa_queue_t*, int> queue_pool_;  //!< Pool of HSA queues for recycling
 
  public:
   amd::Atomic<uint> numOfVgpus_;  //!< Virtual gpu unique index
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index f6af73f13f..950bbce25d 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -48,6 +48,9 @@ namespace roc {
 // (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes
 // L2
 
+static const uint16_t kInvalidAql =
+    (HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE);
+
 static const uint16_t kDispatchPacketHeaderNoSync =
     (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
     (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
@@ -387,13 +390,18 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
   return true;
 }
 
+// Writes header (bits 0-15) | rest (bits 16-31) to *packet as a single atomic release store
+static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
+  __atomic_store_n(packet, header | (static_cast<uint32_t>(rest) << 16), __ATOMIC_RELEASE);
+}
 template <typename AqlPacket>
-bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size_t size) {
+bool VirtualGPU::dispatchGenericAqlPacket(
+  AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
 
   // Check for queue full and wait if needed.
-  uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
+  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
   hsa_signal_t signal;
 
@@ -412,7 +420,11 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size
     timestamp_->setAgent(gpu_device_);
   }
 
-  if (blocking || (index - read) == queueMask) {
+  // Make sure the slot is free for usage
+  while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
+
+  // Request a completion signal if blocking, or if the queue looked full at reservation time
+  if (blocking || (index - read) >= queueMask) {
     if (packet->completion_signal.handle == 0) {
       packet->completion_signal = barrier_signal_;
     }
@@ -426,10 +438,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size
   // NOTE: need multiple packets to dispatch the performance counter
   //       packet blob of the legacy devices (gfx8)
   for (uint i = 0; i < size; i++, index++, packet++) {
-    ((AqlPacket*)(gpu_queue_->base_address))[index & queueMask] = *packet;
+    AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask]; 
+    *aql_loc = *packet;
+    if (header != 0) {
+      packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
+    }
   }
-  hsa_queue_store_write_index_release(gpu_queue_, index);
-  hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index-1);
+  // The write index was already advanced by hsa_queue_add_write_index_screlease() above
+  hsa_signal_store_release(gpu_queue_->doorbell_signal, index - 1);
 
   // Wait on signal ?
   if (blocking) {
@@ -446,12 +462,14 @@ bool VirtualGPU::dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size
   return true;
 }
 
-bool VirtualGPU::dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking) {
-  return dispatchGenericAqlPacket(packet, blocking);
+bool VirtualGPU::dispatchAqlPacket(
+  hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
+  return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }
 
-bool VirtualGPU::dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking) {
-  return dispatchGenericAqlPacket(packet, blocking);
+bool VirtualGPU::dispatchAqlPacket(
+  hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking) {
+  return dispatchGenericAqlPacket(packet, header, rest, blocking);
 }
 
 bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
@@ -467,13 +485,13 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
       { // Create legacy devices PM4 data
         hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP];
         extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast<void*>(&pm4Packet[0]));
-        return dispatchGenericAqlPacket(&pm4Packet[0], blocking, SLOT_PM4_SIZE_AQLP);
+        return dispatchGenericAqlPacket(&pm4Packet[0], 0, 0, blocking, SLOT_PM4_SIZE_AQLP);
       }
       break;
     case PerfCounter::ROC_GFX9:
       {
         packet->header = HSA_PACKET_TYPE_VENDOR_SPECIFIC << HSA_PACKET_HEADER_TYPE;
-        return dispatchGenericAqlPacket(packet, blocking);
+        return dispatchGenericAqlPacket(packet, 0, 0, blocking);
       }
       break;
   }
@@ -485,13 +503,16 @@ void VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet) {
   assert(packet->completion_signal.handle != 0);
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
+  // Reserve a slot, wait until it is free, copy the packet, then publish its header with release
 
-  uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
-  ((hsa_barrier_and_packet_t*)(gpu_queue_->base_address))[index & queueMask] = *packet;
+  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
+  while ((index - hsa_queue_load_read_index_scacquire(gpu_queue_)) >= queueMask);
+  hsa_barrier_and_packet_t* aql_loc =
+    &(reinterpret_cast<hsa_barrier_and_packet_t*>(gpu_queue_->base_address))[index & queueMask];
+  *aql_loc = *packet;
+  __atomic_store_n(reinterpret_cast<uint32_t*>(aql_loc), kBarrierPacketHeader, __ATOMIC_RELEASE);
 
-  hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1);
-
-  hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
+  hsa_signal_store_release(gpu_queue_->doorbell_signal, index);
 }
 
 /**
@@ -569,8 +590,6 @@ VirtualGPU::~VirtualGPU() {
   // Release the resources of signal
   releaseGpuMemoryFence();
 
-  hsa_status_t err = hsa_queue_destroy(gpu_queue_);
-
   if (barrier_signal_.handle != 0) {
     hsa_signal_destroy(barrier_signal_);
   }
@@ -613,6 +632,13 @@ VirtualGPU::~VirtualGPU() {
   for (uint idx = index(); idx < roc_device_.vgpus().size(); ++idx) {
     roc_device_.vgpus()[idx]->index_--;
   }
+  // Drop this virtual GPU's reference to its pooled HSA queue. find() rather than operator[]
+  // keeps a queue that was never registered in the pool from being inserted as a bogus entry
+  auto poolEntry = roc_device_.QueuePool().find(gpu_queue_);
+  if ((poolEntry != roc_device_.QueuePool().end()) && (--poolEntry->second == 0)) {
+    hsa_queue_destroy(gpu_queue_);
+    roc_device_.QueuePool().erase(poolEntry);
+  }
 }
 
 bool VirtualGPU::create(bool profilingEna) {
@@ -630,13 +656,28 @@ bool VirtualGPU::create(bool profilingEna) {
   // Pick a reasonable queue size
   uint32_t queue_size = 1024;
   queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size;
-  while (hsa_queue_create(gpu_device_, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
+  if (roc_device_.QueuePool().empty() || roc_device_.QueuePool().size() < GPU_MAX_HW_QUEUES) {
+    while (hsa_queue_create(gpu_device_, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr, nullptr,
                           std::numeric_limits<uint>::max(), std::numeric_limits<uint>::max(),
                           &gpu_queue_) != HSA_STATUS_SUCCESS) {
-    queue_size >>= 1;
-    if (queue_size < 64) {
-      return false;
+      queue_size >>= 1;
+      if (queue_size < 64) {
+        return false;
+      }
     }
+    hsa_amd_profiling_set_profiler_enabled(gpu_queue(), 1);
+    roc_device_.QueuePool().insert({gpu_queue_, 1});
+  } else {
+    // The pool is full (and never empty here): share the HSA queue that has the fewest users
+    int usage = std::numeric_limits<int>::max();
+    for (const auto& it : roc_device_.QueuePool()) {
+      if (it.second < usage) {
+        gpu_queue_ = it.first;
+        usage = it.second;
+      }
+    }
+    // Count this virtual GPU as one more user of the selected queue
+    roc_device_.QueuePool()[gpu_queue_]++;
   }
 
   if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
@@ -660,7 +701,7 @@ bool VirtualGPU::create(bool profilingEna) {
 
   // Initialize barrier packet.
   memset(&barrier_packet_, 0, sizeof(barrier_packet_));
-  barrier_packet_.header = kBarrierPacketHeader;
+  barrier_packet_.header = kInvalidAql;
   barrier_packet_.completion_signal = barrier_signal_;
 
   // Create a object of PrintfDbg
@@ -2100,10 +2141,11 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     hsa_kernel_dispatch_packet_t dispatchPacket;
     memset(&dispatchPacket, 0, sizeof(dispatchPacket));
 
+    dispatchPacket.header = kInvalidAql;
     dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
 
-    dispatchPacket.header = aqlHeader_;
-    dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    // aqlHeader_ and the setup (dimensions) field are not set here: they are passed to
+    // dispatchAqlPacket() below, which stores them after the packet body has been copied
     dispatchPacket.grid_size_x = sizes.dimensions() > 0 ? newGlobalSize[0] : 1;
     dispatchPacket.grid_size_y = sizes.dimensions() > 1 ? newGlobalSize[1] : 1;
     dispatchPacket.grid_size_z = sizes.dimensions() > 2 ? newGlobalSize[2] : 1;
@@ -2119,7 +2161,10 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
     dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
 
     // Dispatch the packet
-    if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)) {
+    if (!dispatchAqlPacket(
+            &dispatchPacket, aqlHeader_,
+            (sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS),
+            GPU_FLUSH_ON_EXECUTION)) {
       return false;
     }
   }
@@ -2219,7 +2264,7 @@ void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd) {
 
   profilingBegin(vcmd);
   auto fence = kBarrierAcquirePacket;
-  dispatchAqlPacket(&fence, false);
+  dispatchAqlPacket(&fence, 0, 0, false);
   profilingEnd(vcmd);
 }
 
@@ -2228,7 +2273,7 @@ void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd) {
   amd::ScopedLock lock(execution());
   profilingBegin(vcmd);
   auto fence = kBarrierReleasePacket;
-  dispatchAqlPacket(&fence, false);
+  dispatchAqlPacket(&fence, 0, 0, false);
   profilingEnd(vcmd);
 }
 
diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp
index ea84bf1764..97153fbaeb 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -249,9 +249,9 @@ class VirtualGPU : public device::VirtualDevice {
 
   // } roc OpenCL integration
  private:
-  bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, bool blocking = true);
-  bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, bool blocking = true);
-  template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, bool blocking, size_t size = 1);
+  bool dispatchAqlPacket(hsa_kernel_dispatch_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true);
+  bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest, bool blocking = true);
+  template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size = 1);
   void dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet);
   bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion, bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
   void initializeDispatchPacket(hsa_kernel_dispatch_packet_t* packet, amd::NDRangeContainer& sizes);
diff --git a/rocclr/runtime/utils/flags.hpp b/rocclr/runtime/utils/flags.hpp
index 85e815ca95..549ee5a54b 100644
--- a/rocclr/runtime/utils/flags.hpp
+++ b/rocclr/runtime/utils/flags.hpp
@@ -115,7 +115,7 @@ release(uint, OPENCL_VERSION, (IS_BRAHMA ? 120 : 200),                        \
         "Force GPU opencl verison")                                           \
 release(bool, HSA_LOCAL_MEMORY_ENABLE, true,                                  \
         "Enable HSA device local memory usage")                               \
-release(uint, HSA_KERNARG_POOL_SIZE, 2 * 1024 * 1024,                         \
+release(uint, HSA_KERNARG_POOL_SIZE, 512 * 1024,                              \
         "Kernarg pool size")                                                  \
 release(bool, HSA_ENABLE_COARSE_GRAIN_SVM, true,                              \
         "Enable device memory for coarse grain SVM allocations")              \
@@ -177,6 +177,8 @@ release(bool, GPU_ENABLE_COOP_GROUPS, false,                                  \
          "Enables cooperative group launch")                                  \
 release(uint, GPU_MAX_COMMAND_BUFFERS, 8,                                     \
          "The maximum number of command buffers allocated per queue")         \
+release(uint, GPU_MAX_HW_QUEUES, 4,                                           \
+         "The maximum number of HW queues allocated per device")              \
 release(bool, GPU_IMAGE_BUFFER_WAR, true,                                     \
         "Enables image buffer workaround")                                    \
 release(cstring, HIP_VISIBLE_DEVICES, "",                                     \