Defer creation of internal queues and blits until first needed.

Change-Id: I2e61d7e102f38389d806d9eb24beda910573157b


[ROCm/ROCR-Runtime commit: bd5dd47ca1]
Αυτή η υποβολή περιλαμβάνεται σε:
Sean Keely
2018-01-31 22:48:00 -06:00
γονέας 27ce54a0aa
υποβολή e844aa939d
4 αρχεία άλλαξαν με 194 προσθήκες και 88 διαγραφές
@@ -56,6 +56,7 @@
#include "core/inc/cache.h"
#include "core/util/small_heap.h"
#include "core/util/locks.h"
#include "core/util/lazy_ptr.h"
namespace amd {
class MemoryRegion;
@@ -75,10 +76,8 @@ class GpuAgentInt : public core::Agent {
GpuAgentInt(uint32_t node_id)
: core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
// @brief Initialize DMA queue.
//
// @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
virtual void InitDma() = 0;
// @brief Ensure blits are ready (performance hint).
virtual void PreloadBlits(){};
// @brief Initialization hook invoked after tools library has loaded,
// to allow tools interception of interface functions.
@@ -185,8 +184,8 @@ class GpuAgent : public GpuAgentInt {
// @brief GPU agent destructor.
~GpuAgent();
// @brief Override from core::Agent.
void InitDma() override;
// @brief Ensure blits are ready (performance hint).
void PreloadBlits() override;
// @brief Override from core::Agent.
hsa_status_t PostToolsInit() override;
@@ -376,7 +375,7 @@ class GpuAgent : public GpuAgentInt {
// @brief Blit interfaces for each data path.
enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
core::Blit* blits_[BlitCount];
lazy_ptr<core::Blit> blits_[BlitCount];
// @brief AQL queues for cache management and blit compute usage.
enum QueueEnum {
@@ -385,7 +384,7 @@ class GpuAgent : public GpuAgentInt {
QueueCount
};
core::Queue* queues_[QueueCount];
lazy_ptr<core::Queue> queues_[QueueCount];
// @brief Mutex to protect the update to coherency type.
KernelMutex coherency_lock_;
@@ -443,6 +442,9 @@ class GpuAgent : public GpuAgentInt {
// @brief Query the driver to get the cache properties.
void InitCacheList();
// @brief Create internal queues and blits.
void InitDma();
// @brief Initialize memory pool for end timestamp object.
// @retval True if the memory pool for end timestamp object is initialized.
bool InitEndTsPool();
@@ -453,9 +455,6 @@ class GpuAgent : public GpuAgentInt {
// @brief Alternative aperture size. Only on KV.
size_t ape1_size_;
// @brief True if blit objects are initialized.
std::atomic<bool> blit_initialized_;
// Each end ts is 32 bytes.
static const size_t kTsSize = 32;
@@ -83,7 +83,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
memory_max_frequency_(0),
ape1_base_(0),
ape1_size_(0),
blit_initialized_(false),
end_ts_pool_size_(0),
end_ts_pool_counter_(0),
end_ts_base_addr_(NULL) {
@@ -131,17 +130,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
GpuAgent::~GpuAgent() {
for (int i = 0; i < BlitCount; ++i) {
if (blits_[i] != NULL) {
if (blits_[i] != nullptr) {
hsa_status_t status = blits_[i]->Destroy(*this);
assert(status == HSA_STATUS_SUCCESS);
delete blits_[i];
}
}
for (int i = 0; i < QueueCount; ++i) {
delete queues_[i];
}
if (end_ts_base_addr_ != NULL) {
core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
}
@@ -552,68 +546,54 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
}
void GpuAgent::InitDma() {
// This provides the ability to lazy init the blit objects on places that
// could give indication of DMA usage in the future. E.g.:
// 1. Call to allow access API.
// 2. Call to memory lock API.
if (!blit_initialized_.load(std::memory_order_acquire)) {
ScopedAcquire<KernelMutex> lock(&blit_lock_);
if (!blit_initialized_.load(std::memory_order_relaxed)) {
// Try create SDMA blit first.
// TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
if ((isa_->GetMajorVersion() != 8) &&
core::Runtime::runtime_singleton_->flag().enable_sdma() &&
(HSA_PROFILE_BASE == profile_)) {
blits_[BlitHostToDev] = CreateBlitSdma();
blits_[BlitDevToHost] = CreateBlitSdma();
// Setup lazy init pointers on queues and blits.
auto queue_lambda = [this]() {
auto ret = CreateInterceptibleQueue();
if (ret == nullptr)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
"Internal queue creation failed.");
return ret;
};
// Dedicated compute queue for host-to-device blits.
queues_[QueueBlitOnly].reset(queue_lambda);
// Share utility queue with device-to-host blits.
queues_[QueueUtility].reset(queue_lambda);
if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) {
blit_initialized_.store(true, std::memory_order_release);
return;
}
}
// Fall back to blit kernel if SDMA is unavailable.
if (blits_[BlitHostToDev] == NULL) {
// Create a dedicated compute queue for host-to-device blits.
queues_[QueueBlitOnly] = CreateInterceptibleQueue();
assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed");
blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]);
assert(blits_[BlitHostToDev] != NULL && "Blit creation failed");
}
if (blits_[BlitDevToHost] == NULL) {
// Share utility queue with device-to-host blits.
if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]);
assert(blits_[BlitDevToHost] != NULL && "Blit creation failed");
}
blit_initialized_.store(true, std::memory_order_release);
// Blits, try create SDMA blit first.
// Disable SDMA on specific ISA targets until they are fully qualified.
auto blit_lambda = [this](lazy_ptr<core::Queue>& queue) {
if ((isa_->GetMajorVersion() != 8) && core::Runtime::runtime_singleton_->flag().enable_sdma() &&
(HSA_PROFILE_BASE == profile_)) {
auto ret = CreateBlitSdma();
if (ret != nullptr) return ret;
}
}
auto ret = CreateBlitKernel((*queue).get());
if (ret == nullptr)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
return ret;
};
blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueBlitOnly]); });
blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueUtility]); });
blits_[BlitDevToDev].reset([this]() {
auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
if (ret == nullptr)
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
return ret;
});
}
void GpuAgent::PreloadBlits() {
blits_[BlitHostToDev].touch();
blits_[BlitDevToHost].touch();
blits_[BlitDevToDev].touch();
}
hsa_status_t GpuAgent::PostToolsInit() {
// Defer memory allocation until agents have been discovered.
InitScratchPool();
BindTrapHandler();
// Defer utility queue creation to allow tools to intercept.
if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
if (queues_[QueueUtility] == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// Share utility queue with device-to-device blits.
if (blits_[BlitDevToDev] == nullptr)
blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]);
if (blits_[BlitDevToDev] == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
InitDma();
return HSA_STATUS_SUCCESS;
}
@@ -627,18 +607,14 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
size_t size,
std::vector<core::Signal*>& dep_signals,
core::Signal& out_signal) {
core::Blit* blit =
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
? blits_[BlitHostToDev]
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
? blits_[BlitDevToHost]
: blits_[BlitDevToDev];
if (blit == NULL) {
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
lazy_ptr<core::Blit>& blit =
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
? blits_[BlitHostToDev]
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
? blits_[BlitDevToHost]
: blits_[BlitDevToDev];
if (profiling_enabled()) {
// Track the agent so we could translate the resulting timestamp to system
@@ -925,6 +901,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
}
}
// Ensure utility queue has been created.
// Deferring longer risks exhausting queue count before ISA upload and invalidation capability is
// ensured.
queues_[QueueUtility].touch();
// Create an HW AQL queue
*queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_);
scratchGuard.Dismiss();
@@ -525,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
lock.Release();
for (GpuAgentInt* gpu : whitelist_gpus) {
gpu->InitDma();
gpu->PreloadBlits();
}
return HSA_STATUS_SUCCESS;
@@ -574,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
if (agent->device_type() == core::Agent::kAmdGpuDevice) {
whitelist_nodes.push_back(agent->node_id());
whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent));
whitelist_gpus.insert(agent);
}
}
}
@@ -597,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
} else {
*agent_ptr = host_ptr;
}
for (core::Agent* gpu : whitelist_gpus) {
reinterpret_cast<GpuAgentInt*>(gpu)->InitDma();
for (auto gpu : whitelist_gpus) {
static_cast<GpuAgentInt*>(gpu)->PreloadBlits();
}
return HSA_STATUS_SUCCESS;
@@ -0,0 +1,125 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
#include <atomic>
#include <functional>
#include <memory>
#include <utility>
#include "core/util/utils.h"
/*
 * Wrapper for a std::unique_ptr that initializes its object at first use.
 *
 * A factory callable (returning a raw T*) is stored instead of the object.
 * The factory is invoked the first time the pointer is dereferenced
 * (operator->, operator*) or, opportunistically, when touch() is called.
 * Ownership of the returned pointer is taken by the internal unique_ptr.
 *
 * Behavior worth knowing:
 *  - If the factory throws, the exception propagates to the caller, no object
 *    is stored and the factory is kept, so a later access runs it again.
 *  - If the factory returns nullptr, the factory is still cleared; a later
 *    dereference then invokes an empty std::function, which throws
 *    std::bad_function_call.
 *  - operator== / operator!= and both reset() overloads never trigger
 *    creation and do not take the internal lock.
 *
 * NOTE(review): make() reads `obj` outside the lock, relying on the
 * acquire/release fences for ordering.  `obj` is not an atomic object, so
 * this unlocked read is not formally race-free under the C++ memory model -
 * consider publishing through an atomic flag or pointer.
 */
template <typename T> class lazy_ptr {
 public:
  // Creates an empty lazy pointer with no object and no factory.
  lazy_ptr() {}

  // Creates a lazy pointer whose object will be produced by Constructor on
  // first use.  The factory is stored directly; the previous implementation
  // forwarded to an Init() member that does not exist, which made this
  // constructor fail to compile whenever it was instantiated.
  explicit lazy_ptr(std::function<T*()> Constructor) : func(std::move(Constructor)) {}

  // Destroys any held object and installs a new factory (or none, if omitted).
  void reset(std::function<T*()> Constructor = nullptr) {
    obj.reset();
    func = std::move(Constructor);
  }

  // Destroys any held object, takes ownership of ptr and drops the factory.
  void reset(T* ptr) {
    obj.reset(ptr);
    func = nullptr;
  }

  // Compare against the currently held pointer.  These do not create the
  // object, so a not-yet-created lazy_ptr compares equal to nullptr.
  bool operator==(T* rhs) const { return obj.get() == rhs; }
  bool operator!=(T* rhs) const { return obj.get() != rhs; }

  // Member access; creates the object first if needed (blocking).
  const std::unique_ptr<T>& operator->() const {
    make(true);
    return obj;
  }

  // Dereference to the owning unique_ptr; creates the object first if needed
  // (blocking).
  std::unique_ptr<T>& operator*() {
    make(true);
    return obj;
  }

  const std::unique_ptr<T>& operator*() const {
    make(true);
    return obj;
  }

  /*
   * Ensures that the object is created or is being created.
   * This is useful when early construction of the object is required.
   * Does not wait: if the lock cannot be taken immediately the call returns
   * without creating anything (presumably another thread is already creating
   * the object - verify against KernelMutex::Try semantics).
   */
  void touch() const { make(false); }

 private:
  // State is mutable so that const accessors can perform the lazy creation.
  mutable std::unique_ptr<T> obj;
  mutable std::function<T*(void)> func;
  mutable KernelMutex lock;

  // Slow path of make(): runs the factory under the lock.
  // Separated from make to improve inlining.
  void make_body(bool block) const {
    if (block) {
      lock.Acquire();
    } else if (!lock.Try()) {
      return;
    }
    // Release the lock on every exit path, including a throwing factory.
    MAKE_SCOPE_GUARD([&]() { lock.Release(); });
    // Re-check under the lock: the object may have been created while this
    // thread was waiting.
    if (obj != nullptr) return;
    T* ptr = func();
    // Order the factory's writes before the pointer is stored.
    std::atomic_thread_fence(std::memory_order_release);
    obj.reset(ptr);
    func = nullptr;
  }

  // Fast path: only enters make_body when no object is held yet.
  __forceinline void make(bool block) const {
    std::atomic_thread_fence(std::memory_order_acquire);
    if (obj == nullptr) {
      make_body(block);
    }
  }

  DISALLOW_COPY_AND_ASSIGN(lazy_ptr);
};
#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_