From 7b50eacba55cbf5fe4dfc43ac2b5ebd110fcb274 Mon Sep 17 00:00:00 2001 From: Besar Wicaksono Date: Tue, 31 May 2016 13:38:45 -0500 Subject: [PATCH] Blit SDMA support for gfx70x Change-Id: Ie6f215890553ef41c3f36b349fc9cc39c2d38747 [ROCm/ROCR-Runtime commit: 103cd04236f9b10e487ec1b9caede0b277ec920f] --- .../hsa-runtime/core/inc/amd_blit_sdma.h | 3 ++ .../hsa-runtime/core/inc/amd_gpu_agent.h | 11 ++++-- .../core/runtime/amd_blit_sdma.cpp | 39 +++++++++++++++++-- .../hsa-runtime/core/runtime/amd_topology.cpp | 6 --- .../hsa-runtime/core/runtime/runtime.cpp | 9 +++++ 5 files changed, 54 insertions(+), 14 deletions(-) diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 063e1d2f6a..670bbd4b6a 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -227,6 +227,9 @@ class BlitSdma : public core::Blit { /// Max total fill count supported by the queue. size_t max_total_fill_size_; + + /// True if platform atomic is supported. + bool platform_atomic_support_; }; } // namespace amd diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index c503b50d16..7b32464653 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -79,6 +79,11 @@ class GpuAgentInt : public core::Agent { // @retval HSA_STATUS_SUCCESS DMA queue initialization is successful. virtual void InitDma() = 0; + // @brief Initialize blit kernel object based on AQL queue. + // + // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful. + virtual hsa_status_t InitBlitKernel() = 0; + // @brief Invoke the user provided callback for each region accessible by // this agent. 
// @@ -178,10 +183,8 @@ class GpuAgent : public GpuAgentInt { // @brief Override from core::Agent. void InitDma() override; - // @brief Initialize blit kernel object based on AQL queue. - // - // @retval HSA_STATUS_SUCCESS blit kernel object initialization is successful. - hsa_status_t InitBlitKernel(); + // @brief Override from core::Agent. + hsa_status_t InitBlitKernel() override; uint16_t GetMicrocodeVersion() const; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index ab1cadcf66..ece6a91bd3 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -365,7 +365,8 @@ BlitSdma::BlitSdma() fence_pool_size_(0), fence_pool_counter_(0), cached_reserve_offset_(0), - cached_commit_offset_(0) { + cached_commit_offset_(0), + platform_atomic_support_(true) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -418,6 +419,10 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) { return HSA_STATUS_ERROR; } + if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) { + platform_atomic_support_ = false; + } + // Allocate queue buffer. queue_size_ = kQueueSize; @@ -568,7 +573,8 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; - // In case the user disable or enable the profiling in the middle of the call. + // Load the profiling state early in case the user disable or enable the + // profiling in the middle of the call. 
const bool profiling_enabled = agent_->profiling_enabled(); uint64_t* end_ts_addr = NULL; @@ -589,8 +595,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( (2 * timestamp_command_size_) + linear_copy_command_size_; } + // On agent that does not support platform atomic, we replace it with + // one or two fence packet(s) to update the signal value. The reason fence + // is used and not write packet is because the SDMA engine may overlap a + // serial copy/write packets. + const uint64_t completion_signal_value = + static_cast<uint64_t>(out_signal.LoadRelaxed() - 1); + const size_t sync_command_size = (platform_atomic_support_) + ? atomic_command_size_ + : (completion_signal_value > UINT32_MAX) + ? 2 * fence_command_size_ + : fence_command_size_; + const uint32_t total_command_size = - total_poll_command_size + total_copy_command_size + atomic_command_size_ + + total_poll_command_size + total_copy_command_size + sync_command_size + total_timestamp_command_size; char* command_addr = AcquireWriteAddress(total_command_size); @@ -635,7 +653,20 @@ hsa_status_t BlitSdma::SubmitLinearCopyCommand( } // After transfer is completed, decrement the signal. 
- BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + if (platform_atomic_support_) { + BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + } else { + uint32_t* signal_value_location = + reinterpret_cast<uint32_t*>(out_signal.ValueLocation()); + if (completion_signal_value > UINT32_MAX) { + BuildFenceCommand(command_addr, signal_value_location + 1, + static_cast<uint32_t>(completion_signal_value >> 32)); + command_addr += fence_command_size_; + } + + BuildFenceCommand(command_addr, signal_value_location, + static_cast<uint32_t>(completion_signal_value)); + } ReleaseWriteAddress(command_addr_temp, total_command_size); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp index e008261ee4..b54292554d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -78,12 +78,6 @@ GpuAgent* DiscoverGpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { GpuAgent* gpu = new GpuAgent(node_id, node_prop); core::Runtime::runtime_singleton_->RegisterAgent(gpu); - if (HSA_STATUS_SUCCESS != gpu->InitBlitKernel()) { - assert(false && "Fail init blit"); - delete gpu; - gpu = NULL; - } - return gpu; } diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index 240fbfd035..9abca46d08 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -430,6 +430,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, core::Agent& dst_agent, [](void* dst, const void* src, size_t size, std::vector<core::Signal*> dep_signals, core::Signal* completion_signal, bool profiling_enabled) { + for (core::Signal* dep : dep_signals) { dep->WaitRelaxed(HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); @@ 
-820,6 +821,14 @@ void Runtime::Load() { // Load tools libraries LoadTools(); + + // Initialize blit kernel object after tools is initialized to allow tools + // to overload blit kernel. + for (core::Agent* agent : gpu_agents_) { + const hsa_status_t stat = + reinterpret_cast<amd::GpuAgentInt*>(agent)->InitBlitKernel(); + assert(HSA_STATUS_SUCCESS == stat); + } } void Runtime::Unload() {