From e844aa939d42291d3c87ea99f4fd5ae4920ea088 Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Wed, 31 Jan 2018 22:48:00 -0600 Subject: [PATCH] Defer creation of internal queues and blits until first needed. Change-Id: I2e61d7e102f38389d806d9eb24beda910573157b [ROCm/ROCR-Runtime commit: bd5dd47ca1fd20727180285f28cf35a9d5677bcf] --- .../hsa-runtime/core/inc/amd_gpu_agent.h | 21 ++- .../core/runtime/amd_gpu_agent.cpp | 127 ++++++++---------- .../core/runtime/amd_memory_region.cpp | 9 +- .../runtime/hsa-runtime/core/util/lazy_ptr.h | 125 +++++++++++++++++ 4 files changed, 194 insertions(+), 88 deletions(-) create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 58a70a2cb9..fd5c8f8d56 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -56,6 +56,7 @@ #include "core/inc/cache.h" #include "core/util/small_heap.h" #include "core/util/locks.h" +#include "core/util/lazy_ptr.h" namespace amd { class MemoryRegion; @@ -75,10 +76,8 @@ class GpuAgentInt : public core::Agent { GpuAgentInt(uint32_t node_id) : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} - // @brief Initialize DMA queue. - // - // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. - virtual void InitDma() = 0; + // @brief Ensure blits are ready (performance hint). + virtual void PreloadBlits(){}; // @brief Initialization hook invoked after tools library has loaded, // to allow tools interception of interface functions. @@ -185,8 +184,8 @@ class GpuAgent : public GpuAgentInt { // @brief GPU agent destructor. ~GpuAgent(); - // @brief Override from core::Agent. - void InitDma() override; + // @brief Ensure blits are ready (performance hint). 
+ void PreloadBlits() override; // @brief Override from core::Agent. hsa_status_t PostToolsInit() override; @@ -376,7 +375,7 @@ class GpuAgent : public GpuAgentInt { // @brief Blit interfaces for each data path. enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount }; - core::Blit* blits_[BlitCount]; + lazy_ptr blits_[BlitCount]; // @brief AQL queues for cache management and blit compute usage. enum QueueEnum { @@ -385,7 +384,7 @@ class GpuAgent : public GpuAgentInt { QueueCount }; - core::Queue* queues_[QueueCount]; + lazy_ptr queues_[QueueCount]; // @brief Mutex to protect the update to coherency type. KernelMutex coherency_lock_; @@ -443,6 +442,9 @@ class GpuAgent : public GpuAgentInt { // @brief Query the driver to get the cache properties. void InitCacheList(); + // @brief Create internal queues and blits. + void InitDma(); + // @brief Initialize memory pool for end timestamp object. // @retval True if the memory pool for end timestamp object is initialized. bool InitEndTsPool(); @@ -453,9 +455,6 @@ class GpuAgent : public GpuAgentInt { // @brief Alternative aperture size. Only on KV. size_t ape1_size_; - // @brief True if blit objects are initialized. - std::atomic blit_initialized_; - // Each end ts is 32 bytes. 
static const size_t kTsSize = 32; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 1a92a9a92d..624380b589 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -83,7 +83,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) memory_max_frequency_(0), ape1_base_(0), ape1_size_(0), - blit_initialized_(false), end_ts_pool_size_(0), end_ts_pool_counter_(0), end_ts_base_addr_(NULL) { @@ -131,17 +130,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { for (int i = 0; i < BlitCount; ++i) { - if (blits_[i] != NULL) { + if (blits_[i] != nullptr) { hsa_status_t status = blits_[i]->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); - delete blits_[i]; } } - for (int i = 0; i < QueueCount; ++i) { - delete queues_[i]; - } - if (end_ts_base_addr_ != NULL) { core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_); } @@ -552,68 +546,54 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) { } void GpuAgent::InitDma() { - // This provides the ability to lazy init the blit objects on places that - // could give indication of DMA usage in the future. E.g.: - // 1. Call to allow access API. - // 2. Call to memory lock API. - if (!blit_initialized_.load(std::memory_order_acquire)) { - ScopedAcquire lock(&blit_lock_); - if (!blit_initialized_.load(std::memory_order_relaxed)) { - // Try create SDMA blit first. - // TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified. 
- if ((isa_->GetMajorVersion() != 8) && - core::Runtime::runtime_singleton_->flag().enable_sdma() && - (HSA_PROFILE_BASE == profile_)) { - blits_[BlitHostToDev] = CreateBlitSdma(); - blits_[BlitDevToHost] = CreateBlitSdma(); + // Setup lazy init pointers on queues and blits. + auto queue_lambda = [this]() { + auto ret = CreateInterceptibleQueue(); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, + "Internal queue creation failed."); + return ret; + }; + // Dedicated compute queue for host-to-device blits. + queues_[QueueBlitOnly].reset(queue_lambda); + // Share utility queue with device-to-host blits. + queues_[QueueUtility].reset(queue_lambda); - if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) { - blit_initialized_.store(true, std::memory_order_release); - return; - } - } - - // Fall back to blit kernel if SDMA is unavailable. - if (blits_[BlitHostToDev] == NULL) { - // Create a dedicated compute queue for host-to-device blits. - queues_[QueueBlitOnly] = CreateInterceptibleQueue(); - assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed"); - - blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]); - assert(blits_[BlitHostToDev] != NULL && "Blit creation failed"); - } - - if (blits_[BlitDevToHost] == NULL) { - // Share utility queue with device-to-host blits. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]); - assert(blits_[BlitDevToHost] != NULL && "Blit creation failed"); - } - - blit_initialized_.store(true, std::memory_order_release); + // Blits, try create SDMA blit first. + // Disable SDMA on specific ISA targets until they are fully qualified. 
+ auto blit_lambda = [this](lazy_ptr& queue) { + if ((isa_->GetMajorVersion() != 8) && core::Runtime::runtime_singleton_->flag().enable_sdma() && + (HSA_PROFILE_BASE == profile_)) { + auto ret = CreateBlitSdma(); + if (ret != nullptr) return ret; } - } + auto ret = CreateBlitKernel((*queue).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }; + + blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueBlitOnly]); }); + blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueUtility]); }); + blits_[BlitDevToDev].reset([this]() { + auto ret = CreateBlitKernel((*queues_[QueueUtility]).get()); + if (ret == nullptr) + throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); + return ret; + }); +} + +void GpuAgent::PreloadBlits() { + blits_[BlitHostToDev].touch(); + blits_[BlitDevToHost].touch(); + blits_[BlitDevToDev].touch(); } hsa_status_t GpuAgent::PostToolsInit() { // Defer memory allocation until agents have been discovered. InitScratchPool(); BindTrapHandler(); - - // Defer utility queue creation to allow tools to intercept. - if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue(); - - if (queues_[QueueUtility] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - // Share utility queue with device-to-device blits. - if (blits_[BlitDevToDev] == nullptr) - blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]); - - if (blits_[BlitDevToDev] == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + InitDma(); return HSA_STATUS_SUCCESS; } @@ -627,18 +607,14 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { - core::Blit* blit = - (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? 
blits_[BlitHostToDev] - : (src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdCpuDevice) - ? blits_[BlitDevToHost] - : blits_[BlitDevToDev]; - - if (blit == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + lazy_ptr& blit = + (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice) + ? blits_[BlitHostToDev] + : (src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdCpuDevice) + ? blits_[BlitDevToHost] + : blits_[BlitDevToDev]; if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system @@ -925,6 +901,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, } } + // Ensure utility queue has been created. + // Deferring longer risks exhausting queue count before ISA upload and invalidation capability is + // ensured. + queues_[QueueUtility].touch(); + // Create an HW AQL queue *queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_); scratchGuard.Dismiss(); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp index 78e749be09..39814839e5 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_memory_region.cpp @@ -525,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents, lock.Release(); for (GpuAgentInt* gpu : whitelist_gpus) { - gpu->InitDma(); + gpu->PreloadBlits(); } return HSA_STATUS_SUCCESS; @@ -574,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, if (agent->device_type() == core::Agent::kAmdGpuDevice) { whitelist_nodes.push_back(agent->node_id()); - whitelist_gpus.insert(reinterpret_cast(agent)); + 
whitelist_gpus.insert(agent); } } } @@ -597,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents, } else { *agent_ptr = host_ptr; } - for (core::Agent* gpu : whitelist_gpus) { - reinterpret_cast(gpu)->InitDma(); + + for (auto gpu : whitelist_gpus) { + static_cast(gpu)->PreloadBlits(); } return HSA_STATUS_SUCCESS; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h new file mode 100644 index 0000000000..7837200d89 --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/util/lazy_ptr.h @@ -0,0 +1,125 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ +#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_ + +#include +#include +#include + +#include "core/util/utils.h" + +/* + * Wrapper for a std::unique_ptr that initializes its object at first use. + */ +template class lazy_ptr { + public: + lazy_ptr() {} + + explicit lazy_ptr(std::function Constructor) { Init(Constructor); } + + void reset(std::function Constructor = nullptr) { + obj.reset(); + func = Constructor; + } + + void reset(T* ptr) { + obj.reset(ptr); + func = nullptr; + } + + bool operator==(T* rhs) const { return obj.get() == rhs; } + bool operator!=(T* rhs) const { return obj.get() != rhs; } + + const std::unique_ptr& operator->() const { + make(true); + return obj; + } + + std::unique_ptr& operator*() { + make(true); + return obj; + } + + const std::unique_ptr& operator*() const { + make(true); + return obj; + } + + /* + * Ensures that the object is created or is being created. + * This is useful when early construction of the object is required. 
+ */ + void touch() const { make(false); } + + private: + mutable std::unique_ptr obj; + mutable std::function func; + mutable KernelMutex lock; + + // Separated from make to improve inlining. + void make_body(bool block) const { + if (block) { + lock.Acquire(); + } else if (!lock.Try()) { + return; + } + MAKE_SCOPE_GUARD([&]() { lock.Release(); }); + if (obj != nullptr) return; + T* ptr = func(); + std::atomic_thread_fence(std::memory_order_release); + obj.reset(ptr); + func = nullptr; + } + + __forceinline void make(bool block) const { + std::atomic_thread_fence(std::memory_order_acquire); + if (obj == nullptr) { + make_body(block); + } + } + + DISALLOW_COPY_AND_ASSIGN(lazy_ptr); +}; + +#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_