From e844aa939d42291d3c87ea99f4fd5ae4920ea088 Mon Sep 17 00:00:00 2001
From: Sean Keely <Sean.Keely@amd.com>
Date: Wed, 31 Jan 2018 22:48:00 -0600
Subject: [PATCH] Defer creation of internal queues and blits until first
 needed.

Change-Id: I2e61d7e102f38389d806d9eb24beda910573157b


[ROCm/ROCR-Runtime commit: bd5dd47ca1fd20727180285f28cf35a9d5677bcf]
---
 .../hsa-runtime/core/inc/amd_gpu_agent.h      |  21 ++-
 .../core/runtime/amd_gpu_agent.cpp            | 127 ++++++++----------
 .../core/runtime/amd_memory_region.cpp        |   9 +-
 .../runtime/hsa-runtime/core/util/lazy_ptr.h  | 125 +++++++++++++++++
 4 files changed, 194 insertions(+), 88 deletions(-)
 create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
index 58a70a2cb9..fd5c8f8d56 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h
@@ -56,6 +56,7 @@
 #include "core/inc/cache.h"
 #include "core/util/small_heap.h"
 #include "core/util/locks.h"
+#include "core/util/lazy_ptr.h"
 
 namespace amd {
 class MemoryRegion;
@@ -75,10 +76,8 @@ class GpuAgentInt : public core::Agent {
   GpuAgentInt(uint32_t node_id)
       : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
 
-  // @brief Initialize DMA queue.
-  //
-  // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
-  virtual void InitDma() = 0;
+  // @brief Ensure blits are ready (performance hint).
+  virtual void PreloadBlits(){};
 
   // @brief Initialization hook invoked after tools library has loaded,
   // to allow tools interception of interface functions.
@@ -185,8 +184,8 @@ class GpuAgent : public GpuAgentInt {
   // @brief GPU agent destructor.
   ~GpuAgent();
 
-  // @brief Override from core::Agent.
-  void InitDma() override;
+  // @brief Ensure blits are ready (performance hint).
+  void PreloadBlits() override;
 
   // @brief Override from core::Agent.
   hsa_status_t PostToolsInit() override;
@@ -376,7 +375,7 @@ class GpuAgent : public GpuAgentInt {
   // @brief Blit interfaces for each data path.
   enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
 
-  core::Blit* blits_[BlitCount];
+  lazy_ptr<core::Blit> blits_[BlitCount];
 
   // @brief AQL queues for cache management and blit compute usage.
   enum QueueEnum {
@@ -385,7 +384,7 @@ class GpuAgent : public GpuAgentInt {
     QueueCount
   };
 
-  core::Queue* queues_[QueueCount];
+  lazy_ptr<core::Queue> queues_[QueueCount];
 
   // @brief Mutex to protect the update to coherency type.
   KernelMutex coherency_lock_;
@@ -443,6 +442,9 @@ class GpuAgent : public GpuAgentInt {
   // @brief Query the driver to get the cache properties.
   void InitCacheList();
 
+  // @brief Create internal queues and blits.
+  void InitDma();
+
   // @brief Initialize memory pool for end timestamp object.
   // @retval True if the memory pool for end timestamp object is initialized.
   bool InitEndTsPool();
@@ -453,9 +455,6 @@ class GpuAgent : public GpuAgentInt {
   // @brief Alternative aperture size. Only on KV.
   size_t ape1_size_;
 
-  // @brief True if blit objects are initialized.
-  std::atomic<bool> blit_initialized_;
-
   // Each end ts is 32 bytes.
   static const size_t kTsSize = 32;
 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
index 1a92a9a92d..624380b589 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp
@@ -83,7 +83,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
       memory_max_frequency_(0),
       ape1_base_(0),
       ape1_size_(0),
-      blit_initialized_(false),
       end_ts_pool_size_(0),
       end_ts_pool_counter_(0),
       end_ts_base_addr_(NULL) {
@@ -131,17 +130,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
 
 GpuAgent::~GpuAgent() {
   for (int i = 0; i < BlitCount; ++i) {
-    if (blits_[i] != NULL) {
+    if (blits_[i] != nullptr) {
       hsa_status_t status = blits_[i]->Destroy(*this);
       assert(status == HSA_STATUS_SUCCESS);
-      delete blits_[i];
     }
   }
 
-  for (int i = 0; i < QueueCount; ++i) {
-    delete queues_[i];
-  }
-
   if (end_ts_base_addr_ != NULL) {
     core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
   }
@@ -552,68 +546,54 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
 }
 
 void GpuAgent::InitDma() {
-  // This provides the ability to lazy init the blit objects on places that
-  // could give indication of DMA usage in the future. E.g.:
-  // 1. Call to allow access API.
-  // 2. Call to memory lock API.
-  if (!blit_initialized_.load(std::memory_order_acquire)) {
-    ScopedAcquire<KernelMutex> lock(&blit_lock_);
-    if (!blit_initialized_.load(std::memory_order_relaxed)) {
-      // Try create SDMA blit first.
-      // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
-      if ((isa_->GetMajorVersion() != 8) &&
-          core::Runtime::runtime_singleton_->flag().enable_sdma() &&
-          (HSA_PROFILE_BASE == profile_)) {
-        blits_[BlitHostToDev] = CreateBlitSdma();
-        blits_[BlitDevToHost] = CreateBlitSdma();
+  // Setup lazy init pointers on queues and blits.
+  auto queue_lambda = [this]() {
+    auto ret = CreateInterceptibleQueue();
+    if (ret == nullptr)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
+                               "Internal queue creation failed.");
+    return ret;
+  };
+  // Dedicated compute queue for host-to-device blits.
+  queues_[QueueBlitOnly].reset(queue_lambda);
+  // Share utility queue with device-to-host blits.
+  queues_[QueueUtility].reset(queue_lambda);
 
-        if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) {
-          blit_initialized_.store(true, std::memory_order_release);
-          return;
-        }
-      }
-
-      // Fall back to blit kernel if SDMA is unavailable.
-      if (blits_[BlitHostToDev] == NULL) {
-        // Create a dedicated compute queue for host-to-device blits.
-        queues_[QueueBlitOnly] = CreateInterceptibleQueue();
-        assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed");
-
-        blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]);
-        assert(blits_[BlitHostToDev] != NULL && "Blit creation failed");
-      }
-
-      if (blits_[BlitDevToHost] == NULL) {
-        // Share utility queue with device-to-host blits.
-        if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
-        blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]);
-        assert(blits_[BlitDevToHost] != NULL && "Blit creation failed");
-      }
-
-      blit_initialized_.store(true, std::memory_order_release);
+  // Blits, try create SDMA blit first.
+  // Disable SDMA on specific ISA targets until they are fully qualified.
+  auto blit_lambda = [this](lazy_ptr<core::Queue>& queue) {
+    if ((isa_->GetMajorVersion() != 8) && core::Runtime::runtime_singleton_->flag().enable_sdma() &&
+        (HSA_PROFILE_BASE == profile_)) {
+      auto ret = CreateBlitSdma();
+      if (ret != nullptr) return ret;
     }
-  }
+    auto ret = CreateBlitKernel((*queue).get());
+    if (ret == nullptr)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
+    return ret;
+  };
+
+  blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueBlitOnly]); });
+  blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueUtility]); });
+  blits_[BlitDevToDev].reset([this]() {
+    auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
+    if (ret == nullptr)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
+    return ret;
+  });
+}
+
+void GpuAgent::PreloadBlits() {
+  blits_[BlitHostToDev].touch();
+  blits_[BlitDevToHost].touch();
+  blits_[BlitDevToDev].touch();
 }
 
 hsa_status_t GpuAgent::PostToolsInit() {
   // Defer memory allocation until agents have been discovered.
   InitScratchPool();
   BindTrapHandler();
-
-  // Defer utility queue creation to allow tools to intercept.
-  if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
-
-  if (queues_[QueueUtility] == NULL) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
-  // Share utility queue with device-to-device blits.
-  if (blits_[BlitDevToDev] == nullptr)
-    blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]);
-
-  if (blits_[BlitDevToDev] == NULL) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
+  InitDma();
 
   return HSA_STATUS_SUCCESS;
 }
@@ -627,18 +607,14 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
                                size_t size,
                                std::vector<core::Signal*>& dep_signals,
                                core::Signal& out_signal) {
-  core::Blit* blit =
-      (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
-       dst_agent.device_type() == core::Agent::kAmdGpuDevice)
-          ? blits_[BlitHostToDev]
-          : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
-             dst_agent.device_type() == core::Agent::kAmdCpuDevice)
-                ? blits_[BlitDevToHost]
-                : blits_[BlitDevToDev];
-
-  if (blit == NULL) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
+  lazy_ptr<core::Blit>& blit =
+    (src_agent.device_type() == core::Agent::kAmdCpuDevice &&
+     dst_agent.device_type() == core::Agent::kAmdGpuDevice)
+       ? blits_[BlitHostToDev]
+       : (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
+          dst_agent.device_type() == core::Agent::kAmdCpuDevice)
+            ? blits_[BlitDevToHost]
+            : blits_[BlitDevToDev];
 
   if (profiling_enabled()) {
     // Track the agent so we could translate the resulting timestamp to system
@@ -925,6 +901,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
     }
   }
 
+  // Ensure utility queue has been created.
+  // Deferring longer risks exhausting queue count before ISA upload and invalidation capability is
+  // ensured.
+  queues_[QueueUtility].touch();
+
   // Create an HW AQL queue
   *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_);
   scratchGuard.Dismiss();
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
index 78e749be09..39814839e5 100644
--- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp
@@ -525,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
   lock.Release();
 
   for (GpuAgentInt* gpu : whitelist_gpus) {
-    gpu->InitDma();
+    gpu->PreloadBlits();
   }
 
   return HSA_STATUS_SUCCESS;
@@ -574,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
 
       if (agent->device_type() == core::Agent::kAmdGpuDevice) {
         whitelist_nodes.push_back(agent->node_id());
-        whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent));
+        whitelist_gpus.insert(agent);
       }
     }
   }
@@ -597,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
       } else {
         *agent_ptr = host_ptr;
       }
-      for (core::Agent* gpu : whitelist_gpus) {
-        reinterpret_cast<GpuAgentInt*>(gpu)->InitDma();
+
+      for (auto gpu : whitelist_gpus) {
+        static_cast<GpuAgentInt*>(gpu)->PreloadBlits();
       }
 
       return HSA_STATUS_SUCCESS;
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
new file mode 100644
index 0000000000..7837200d89
--- /dev/null
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h
@@ -0,0 +1,125 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
+
+#include <memory>
+#include <utility>
+#include <functional>
+
+#include "core/util/utils.h"
+
+/*
+ * Wrapper for a std::unique_ptr that builds its object via a stored factory at first use.
+ */
+template <typename T> class lazy_ptr {
+ public:
+  lazy_ptr() {}
+
+  explicit lazy_ptr(std::function<T*()> Constructor) : func(std::move(Constructor)) {}
+
+  void reset(std::function<T*()> Constructor = nullptr) {  // Drop object, install new factory.
+    obj.reset();
+    func = Constructor;
+  }
+
+  void reset(T* ptr) {  // Adopt ptr directly; no factory remains.
+    obj.reset(ptr);
+    func = nullptr;
+  }
+
+  bool operator==(T* rhs) const { return obj.get() == rhs; }  // Does not trigger construction.
+  bool operator!=(T* rhs) const { return obj.get() != rhs; }  // Does not trigger construction.
+
+  const std::unique_ptr<T>& operator->() const {
+    make(true);
+    return obj;
+  }
+
+  std::unique_ptr<T>& operator*() {
+    make(true);
+    return obj;
+  }
+
+  const std::unique_ptr<T>& operator*() const {
+    make(true);
+    return obj;
+  }
+
+  /*
+   * Ensures that the object is created or is being created (uses lock.Try(), not Acquire()).
+   * This is useful when early construction of the object is required.
+   */
+  void touch() const { make(false); }
+
+ private:
+  mutable std::unique_ptr<T> obj;
+  mutable std::function<T*(void)> func;
+  mutable KernelMutex lock;
+
+  // Separated from make to improve inlining.
+  void make_body(bool block) const {
+    if (block) {
+      lock.Acquire();
+    } else if (!lock.Try()) {
+      return;
+    }
+    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
+    if (obj != nullptr) return;  // Re-check under the lock: object already exists.
+    T* ptr = func();
+    std::atomic_thread_fence(std::memory_order_release);
+    obj.reset(ptr);
+    func = nullptr;
+  }
+
+  __forceinline void make(bool block) const {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    if (obj == nullptr) {
+      make_body(block);
+    }
+  }
+
+  DISALLOW_COPY_AND_ASSIGN(lazy_ptr);
+};
+
+#endif  // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_