Defer creation of internal queues and blits until first needed.
Change-Id: I2e61d7e102f38389d806d9eb24beda910573157b
[ROCm/ROCR-Runtime commit: bd5dd47ca1]
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -56,6 +56,7 @@
|
||||
#include "core/inc/cache.h"
|
||||
#include "core/util/small_heap.h"
|
||||
#include "core/util/locks.h"
|
||||
#include "core/util/lazy_ptr.h"
|
||||
|
||||
namespace amd {
|
||||
class MemoryRegion;
|
||||
@@ -75,10 +76,8 @@ class GpuAgentInt : public core::Agent {
|
||||
GpuAgentInt(uint32_t node_id)
|
||||
: core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
|
||||
|
||||
// @brief Initialize DMA queue.
|
||||
//
|
||||
// @retval HSA_STATUS_SUCCESS DMA queue initialization is successful.
|
||||
virtual void InitDma() = 0;
|
||||
// @brief Ensure blits are ready (performance hint).
|
||||
virtual void PreloadBlits(){};
|
||||
|
||||
// @brief Initialization hook invoked after tools library has loaded,
|
||||
// to allow tools interception of interface functions.
|
||||
@@ -185,8 +184,8 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief GPU agent destructor.
|
||||
~GpuAgent();
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
void InitDma() override;
|
||||
// @brief Ensure blits are ready (performance hint).
|
||||
void PreloadBlits() override;
|
||||
|
||||
// @brief Override from core::Agent.
|
||||
hsa_status_t PostToolsInit() override;
|
||||
@@ -376,7 +375,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Blit interfaces for each data path.
|
||||
enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
|
||||
|
||||
core::Blit* blits_[BlitCount];
|
||||
lazy_ptr<core::Blit> blits_[BlitCount];
|
||||
|
||||
// @brief AQL queues for cache management and blit compute usage.
|
||||
enum QueueEnum {
|
||||
@@ -385,7 +384,7 @@ class GpuAgent : public GpuAgentInt {
|
||||
QueueCount
|
||||
};
|
||||
|
||||
core::Queue* queues_[QueueCount];
|
||||
lazy_ptr<core::Queue> queues_[QueueCount];
|
||||
|
||||
// @brief Mutex to protect the update to coherency type.
|
||||
KernelMutex coherency_lock_;
|
||||
@@ -443,6 +442,9 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Query the driver to get the cache properties.
|
||||
void InitCacheList();
|
||||
|
||||
// @brief Create internal queues and blits.
|
||||
void InitDma();
|
||||
|
||||
// @brief Initialize memory pool for end timestamp object.
|
||||
// @retval True if the memory pool for end timestamp object is initialized.
|
||||
bool InitEndTsPool();
|
||||
@@ -453,9 +455,6 @@ class GpuAgent : public GpuAgentInt {
|
||||
// @brief Alternative aperture size. Only on KV.
|
||||
size_t ape1_size_;
|
||||
|
||||
// @brief True if blit objects are initialized.
|
||||
std::atomic<bool> blit_initialized_;
|
||||
|
||||
// Each end ts is 32 bytes.
|
||||
static const size_t kTsSize = 32;
|
||||
|
||||
|
||||
@@ -83,7 +83,6 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
memory_max_frequency_(0),
|
||||
ape1_base_(0),
|
||||
ape1_size_(0),
|
||||
blit_initialized_(false),
|
||||
end_ts_pool_size_(0),
|
||||
end_ts_pool_counter_(0),
|
||||
end_ts_base_addr_(NULL) {
|
||||
@@ -131,17 +130,12 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)
|
||||
|
||||
GpuAgent::~GpuAgent() {
|
||||
for (int i = 0; i < BlitCount; ++i) {
|
||||
if (blits_[i] != NULL) {
|
||||
if (blits_[i] != nullptr) {
|
||||
hsa_status_t status = blits_[i]->Destroy(*this);
|
||||
assert(status == HSA_STATUS_SUCCESS);
|
||||
delete blits_[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < QueueCount; ++i) {
|
||||
delete queues_[i];
|
||||
}
|
||||
|
||||
if (end_ts_base_addr_ != NULL) {
|
||||
core::Runtime::runtime_singleton_->FreeMemory(end_ts_base_addr_);
|
||||
}
|
||||
@@ -552,68 +546,54 @@ core::Blit* GpuAgent::CreateBlitKernel(core::Queue* queue) {
|
||||
}
|
||||
|
||||
void GpuAgent::InitDma() {
|
||||
// This provides the ability to lazy init the blit objects on places that
|
||||
// could give indication of DMA usage in the future. E.g.:
|
||||
// 1. Call to allow access API.
|
||||
// 2. Call to memory lock API.
|
||||
if (!blit_initialized_.load(std::memory_order_acquire)) {
|
||||
ScopedAcquire<KernelMutex> lock(&blit_lock_);
|
||||
if (!blit_initialized_.load(std::memory_order_relaxed)) {
|
||||
// Try create SDMA blit first.
|
||||
// TODO: Temporarily disable SDMA on specific ISA targets until they are fully qualified.
|
||||
if ((isa_->GetMajorVersion() != 8) &&
|
||||
core::Runtime::runtime_singleton_->flag().enable_sdma() &&
|
||||
(HSA_PROFILE_BASE == profile_)) {
|
||||
blits_[BlitHostToDev] = CreateBlitSdma();
|
||||
blits_[BlitDevToHost] = CreateBlitSdma();
|
||||
// Setup lazy init pointers on queues and blits.
|
||||
auto queue_lambda = [this]() {
|
||||
auto ret = CreateInterceptibleQueue();
|
||||
if (ret == nullptr)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES,
|
||||
"Internal queue creation failed.");
|
||||
return ret;
|
||||
};
|
||||
// Dedicated compute queue for host-to-device blits.
|
||||
queues_[QueueBlitOnly].reset(queue_lambda);
|
||||
// Share utility queue with device-to-host blits.
|
||||
queues_[QueueUtility].reset(queue_lambda);
|
||||
|
||||
if (blits_[BlitHostToDev] != NULL && blits_[BlitDevToHost] != NULL) {
|
||||
blit_initialized_.store(true, std::memory_order_release);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to blit kernel if SDMA is unavailable.
|
||||
if (blits_[BlitHostToDev] == NULL) {
|
||||
// Create a dedicated compute queue for host-to-device blits.
|
||||
queues_[QueueBlitOnly] = CreateInterceptibleQueue();
|
||||
assert(queues_[QueueBlitOnly] != NULL && "Queue creation failed");
|
||||
|
||||
blits_[BlitHostToDev] = CreateBlitKernel(queues_[QueueBlitOnly]);
|
||||
assert(blits_[BlitHostToDev] != NULL && "Blit creation failed");
|
||||
}
|
||||
|
||||
if (blits_[BlitDevToHost] == NULL) {
|
||||
// Share utility queue with device-to-host blits.
|
||||
if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
|
||||
blits_[BlitDevToHost] = CreateBlitKernel(queues_[QueueUtility]);
|
||||
assert(blits_[BlitDevToHost] != NULL && "Blit creation failed");
|
||||
}
|
||||
|
||||
blit_initialized_.store(true, std::memory_order_release);
|
||||
// Blits, try create SDMA blit first.
|
||||
// Disable SDMA on specific ISA targets until they are fully qualified.
|
||||
auto blit_lambda = [this](lazy_ptr<core::Queue>& queue) {
|
||||
if ((isa_->GetMajorVersion() != 8) && core::Runtime::runtime_singleton_->flag().enable_sdma() &&
|
||||
(HSA_PROFILE_BASE == profile_)) {
|
||||
auto ret = CreateBlitSdma();
|
||||
if (ret != nullptr) return ret;
|
||||
}
|
||||
}
|
||||
auto ret = CreateBlitKernel((*queue).get());
|
||||
if (ret == nullptr)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
|
||||
return ret;
|
||||
};
|
||||
|
||||
blits_[BlitHostToDev].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueBlitOnly]); });
|
||||
blits_[BlitDevToHost].reset([blit_lambda, this]() { return blit_lambda(queues_[QueueUtility]); });
|
||||
blits_[BlitDevToDev].reset([this]() {
|
||||
auto ret = CreateBlitKernel((*queues_[QueueUtility]).get());
|
||||
if (ret == nullptr)
|
||||
throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed.");
|
||||
return ret;
|
||||
});
|
||||
}
|
||||
|
||||
void GpuAgent::PreloadBlits() {
|
||||
blits_[BlitHostToDev].touch();
|
||||
blits_[BlitDevToHost].touch();
|
||||
blits_[BlitDevToDev].touch();
|
||||
}
|
||||
|
||||
hsa_status_t GpuAgent::PostToolsInit() {
|
||||
// Defer memory allocation until agents have been discovered.
|
||||
InitScratchPool();
|
||||
BindTrapHandler();
|
||||
|
||||
// Defer utility queue creation to allow tools to intercept.
|
||||
if (queues_[QueueUtility] == nullptr) queues_[QueueUtility] = CreateInterceptibleQueue();
|
||||
|
||||
if (queues_[QueueUtility] == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
// Share utility queue with device-to-device blits.
|
||||
if (blits_[BlitDevToDev] == nullptr)
|
||||
blits_[BlitDevToDev] = CreateBlitKernel(queues_[QueueUtility]);
|
||||
|
||||
if (blits_[BlitDevToDev] == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
InitDma();
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -627,18 +607,14 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
|
||||
size_t size,
|
||||
std::vector<core::Signal*>& dep_signals,
|
||||
core::Signal& out_signal) {
|
||||
core::Blit* blit =
|
||||
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
|
||||
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
|
||||
? blits_[BlitHostToDev]
|
||||
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
|
||||
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
|
||||
? blits_[BlitDevToHost]
|
||||
: blits_[BlitDevToDev];
|
||||
|
||||
if (blit == NULL) {
|
||||
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
|
||||
}
|
||||
lazy_ptr<core::Blit>& blit =
|
||||
(src_agent.device_type() == core::Agent::kAmdCpuDevice &&
|
||||
dst_agent.device_type() == core::Agent::kAmdGpuDevice)
|
||||
? blits_[BlitHostToDev]
|
||||
: (src_agent.device_type() == core::Agent::kAmdGpuDevice &&
|
||||
dst_agent.device_type() == core::Agent::kAmdCpuDevice)
|
||||
? blits_[BlitDevToHost]
|
||||
: blits_[BlitDevToDev];
|
||||
|
||||
if (profiling_enabled()) {
|
||||
// Track the agent so we could translate the resulting timestamp to system
|
||||
@@ -925,6 +901,11 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type,
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure utility queue has been created.
|
||||
// Deferring longer risks exhausting queue count before ISA upload and invalidation capability is
|
||||
// ensured.
|
||||
queues_[QueueUtility].touch();
|
||||
|
||||
// Create an HW AQL queue
|
||||
*queue = new AqlQueue(this, size, node_id(), scratch, event_callback, data, is_kv_device_);
|
||||
scratchGuard.Dismiss();
|
||||
|
||||
@@ -525,7 +525,7 @@ hsa_status_t MemoryRegion::AllowAccess(uint32_t num_agents,
|
||||
lock.Release();
|
||||
|
||||
for (GpuAgentInt* gpu : whitelist_gpus) {
|
||||
gpu->InitDma();
|
||||
gpu->PreloadBlits();
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -574,7 +574,7 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
|
||||
|
||||
if (agent->device_type() == core::Agent::kAmdGpuDevice) {
|
||||
whitelist_nodes.push_back(agent->node_id());
|
||||
whitelist_gpus.insert(reinterpret_cast<GpuAgentInt*>(agent));
|
||||
whitelist_gpus.insert(agent);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -597,8 +597,9 @@ hsa_status_t MemoryRegion::Lock(uint32_t num_agents, const hsa_agent_t* agents,
|
||||
} else {
|
||||
*agent_ptr = host_ptr;
|
||||
}
|
||||
for (core::Agent* gpu : whitelist_gpus) {
|
||||
reinterpret_cast<GpuAgentInt*>(gpu)->InitDma();
|
||||
|
||||
for (auto gpu : whitelist_gpus) {
|
||||
static_cast<GpuAgentInt*>(gpu)->PreloadBlits();
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
|
||||
@@ -0,0 +1,125 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
|
||||
#define HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <functional>
|
||||
|
||||
#include "core/util/utils.h"
|
||||
|
||||
/*
|
||||
* Wrapper for a std::unique_ptr that initializes its object at first use.
|
||||
*/
|
||||
template <typename T> class lazy_ptr {
|
||||
public:
|
||||
lazy_ptr() {}
|
||||
|
||||
explicit lazy_ptr(std::function<T*()> Constructor) { Init(Constructor); }
|
||||
|
||||
void reset(std::function<T*()> Constructor = nullptr) {
|
||||
obj.reset();
|
||||
func = Constructor;
|
||||
}
|
||||
|
||||
void reset(T* ptr) {
|
||||
obj.reset(ptr);
|
||||
func = nullptr;
|
||||
}
|
||||
|
||||
bool operator==(T* rhs) const { return obj.get() == rhs; }
|
||||
bool operator!=(T* rhs) const { return obj.get() != rhs; }
|
||||
|
||||
const std::unique_ptr<T>& operator->() const {
|
||||
make(true);
|
||||
return obj;
|
||||
}
|
||||
|
||||
std::unique_ptr<T>& operator*() {
|
||||
make(true);
|
||||
return obj;
|
||||
}
|
||||
|
||||
const std::unique_ptr<T>& operator*() const {
|
||||
make(true);
|
||||
return obj;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensures that the object is created or is being created.
|
||||
* This is useful when early consruction of the object is required.
|
||||
*/
|
||||
void touch() const { make(false); }
|
||||
|
||||
private:
|
||||
mutable std::unique_ptr<T> obj;
|
||||
mutable std::function<T*(void)> func;
|
||||
mutable KernelMutex lock;
|
||||
|
||||
// Separated from make to improve inlining.
|
||||
void make_body(bool block) const {
|
||||
if (block) {
|
||||
lock.Acquire();
|
||||
} else if (!lock.Try()) {
|
||||
return;
|
||||
}
|
||||
MAKE_SCOPE_GUARD([&]() { lock.Release(); });
|
||||
if (obj != nullptr) return;
|
||||
T* ptr = func();
|
||||
std::atomic_thread_fence(std::memory_order_release);
|
||||
obj.reset(ptr);
|
||||
func = nullptr;
|
||||
}
|
||||
|
||||
__forceinline void make(bool block) const {
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
if (obj == nullptr) {
|
||||
make_body(block);
|
||||
}
|
||||
}
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(lazy_ptr);
|
||||
};
|
||||
|
||||
#endif // HSA_RUNTIME_CORE_UTIL_LAZY_PTR_H_
|
||||
Αναφορά σε νέο ζήτημα
Block a user