From e0839ab27e360fd08f48f231ca1e331dbdc86c2c Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Thu, 7 Jun 2018 12:14:01 -0500 Subject: [PATCH] Implement SDMA copy rect for gfx9. Fix pitch overflow due to small element detection. Add wide pitch 2D copy handling. Cleanup code duplication. Change-Id: I93b1584aba8e5964957eb7ab3544df806ca3e2f9 --- .../core/common/hsa_table_interface.cpp | 11 + runtime/hsa-runtime/core/inc/amd_blit_sdma.h | 23 + runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 6 + .../hsa-runtime/core/inc/hsa_ext_amd_impl.h | 7 + runtime/hsa-runtime/core/inc/sdma_registers.h | 499 +++++++++++ .../core/runtime/amd_blit_sdma.cpp | 820 ++++++++---------- .../core/runtime/amd_gpu_agent.cpp | 25 + .../core/runtime/hsa_api_trace.cpp | 1 + .../hsa-runtime/core/runtime/hsa_ext_amd.cpp | 47 + runtime/hsa-runtime/core/util/utils.h | 10 + runtime/hsa-runtime/hsacore.so.def | 1 + runtime/hsa-runtime/inc/hsa_api_trace.h | 1 + runtime/hsa-runtime/inc/hsa_ext_amd.h | 37 + 13 files changed, 1015 insertions(+), 473 deletions(-) create mode 100644 runtime/hsa-runtime/core/inc/sdma_registers.h diff --git a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp index 7e48e0cbbe..7fcd1d1723 100644 --- a/runtime/hsa-runtime/core/common/hsa_table_interface.cpp +++ b/runtime/hsa-runtime/core/common/hsa_table_interface.cpp @@ -963,6 +963,17 @@ hsa_status_t HSA_API num_dep_signals, dep_signals, completion_signal); } +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + return amdExtTable->hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range, + copy_agent, dir, num_dep_signals, 
+ dep_signals, completion_signal); +} + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, diff --git a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 8bf05d006e..c29cfbf140 100644 --- a/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ b/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -45,6 +45,7 @@ #include #include +#include #include "hsakmt.h" @@ -55,6 +56,7 @@ #include "core/util/utils.h" namespace amd { + class BlitSdmaBase : public core::Blit { public: static const size_t kQueueSize; @@ -62,6 +64,12 @@ class BlitSdmaBase : public core::Blit { static const size_t kMaxSingleCopySize; static const size_t kMaxSingleFillSize; virtual bool isSDMA() const override { return true; } + virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst, + const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, + std::vector& dep_signals, + core::Signal& out_signal) = 0; }; // RingIndexTy: 32/64-bit monotonic ring index, counting in bytes. @@ -116,6 +124,13 @@ class BlitSdma : public BlitSdmaBase { std::vector& dep_signals, core::Signal& out_signal) override; + virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst, + const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, + std::vector& dep_signals, + core::Signal& out_signal) override; + /// @brief Submit a linear fill command to the queue buffer /// /// @param ptr Memory address of the fill destination. 
@@ -181,6 +196,11 @@ class BlitSdma : public BlitSdmaBase { void BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst, const void* src, size_t size); + void BuildCopyRectCommand(const std::function& append, + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, + const hsa_dim3_t* range); + void BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference); void BuildAtomicDecrementCommand(char* cmd_addr, void* addr); @@ -189,6 +209,9 @@ class BlitSdma : public BlitSdmaBase { void BuildTrapCommand(char* cmd_addr); + hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size, + std::vector& dep_signals, core::Signal& out_signal); + // Agent object owning the SDMA engine. GpuAgent* agent_; diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 2de2fd9a61..cef9cc5ed9 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -240,6 +240,12 @@ class GpuAgent : public GpuAgentInt { std::vector& dep_signals, core::Signal& out_signal) override; + // @brief Override from core::Agent. + hsa_status_t DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, + const hsa_dim3_t* range, hsa_amd_copy_direction_t dir, + std::vector& dep_signals, core::Signal& out_signal); + // @brief Override from core::Agent. 
hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override; diff --git a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h index 2d30331d48..fcbd2eff33 100644 --- a/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h +++ b/runtime/hsa-runtime/core/inc/hsa_ext_amd_impl.h @@ -140,6 +140,13 @@ hsa_status_t HSA_API const hsa_signal_t* dep_signals, hsa_signal_t completion_signal); +// Mirrors Amd Extension Apis +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + // Mirrors Amd Extension Apis hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info( hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool, diff --git a/runtime/hsa-runtime/core/inc/sdma_registers.h b/runtime/hsa-runtime/core/inc/sdma_registers.h new file mode 100644 index 0000000000..735ba465ae --- /dev/null +++ b/runtime/hsa-runtime/core/inc/sdma_registers.h @@ -0,0 +1,499 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_ +#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_ + +namespace amd { + +// SDMA packet for VI device. 
+// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt + +const unsigned int SDMA_OP_COPY = 1; +const unsigned int SDMA_OP_FENCE = 5; +const unsigned int SDMA_OP_TRAP = 6; +const unsigned int SDMA_OP_POLL_REGMEM = 8; +const unsigned int SDMA_OP_ATOMIC = 10; +const unsigned int SDMA_OP_CONST_FILL = 11; +const unsigned int SDMA_OP_TIMESTAMP = 13; +const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; +const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4; +const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2; +const unsigned int SDMA_ATOMIC_ADD64 = 47; + +typedef struct SDMA_PKT_COPY_LINEAR_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int extra_info : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int count : 22; + unsigned int reserved_0 : 10; + }; + unsigned int DW_1_DATA; + } COUNT_UNION; + + union { + struct { + unsigned int reserved_0 : 16; + unsigned int dst_swap : 2; + unsigned int reserved_1 : 6; + unsigned int src_swap : 2; + unsigned int reserved_2 : 6; + }; + unsigned int DW_2_DATA; + } PARAMETER_UNION; + + union { + struct { + unsigned int src_addr_31_0 : 32; + }; + unsigned int DW_3_DATA; + } SRC_ADDR_LO_UNION; + + union { + struct { + unsigned int src_addr_63_32 : 32; + }; + unsigned int DW_4_DATA; + } SRC_ADDR_HI_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_5_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_6_DATA; + } DST_ADDR_HI_UNION; + + static const size_t kMaxSize_ = 0x3fffe0; +} SDMA_PKT_COPY_LINEAR; + +// linear sub-window +typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { + static const unsigned int pitch_bits = 19; + static const unsigned int slice_bits = 28; + static const unsigned int rect_xy_bits = 14; + static const unsigned int rect_z_bits = 11; + + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved : 13; + 
unsigned int element : 3; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int src_addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } SRC_ADDR_LO_UNION; + + union { + struct { + unsigned int src_addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } SRC_ADDR_HI_UNION; + + union { + struct { + unsigned int src_offset_x : 14; + unsigned int reserved_1 : 2; + unsigned int src_offset_y : 14; + unsigned int reserved_2 : 2; + }; + unsigned int DW_3_DATA; + } SRC_PARAMETER_1_UNION; + + union { + struct { + unsigned int src_offset_z : 11; + unsigned int reserved_1 : 2; + unsigned int src_pitch : pitch_bits; + }; + unsigned int DW_4_DATA; + } SRC_PARAMETER_2_UNION; + + union { + struct { + unsigned int src_slice_pitch : slice_bits; + unsigned int reserved_1 : 4; + }; + unsigned int DW_5_DATA; + } SRC_PARAMETER_3_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_6_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_7_DATA; + } DST_ADDR_HI_UNION; + + union { + struct { + unsigned int dst_offset_x : 14; + unsigned int reserved_1 : 2; + unsigned int dst_offset_y : 14; + unsigned int reserved_2 : 2; + }; + unsigned int DW_8_DATA; + } DST_PARAMETER_1_UNION; + + union { + struct { + unsigned int dst_offset_z : 11; + unsigned int reserved_1 : 2; + unsigned int dst_pitch : pitch_bits; + }; + unsigned int DW_9_DATA; + } DST_PARAMETER_2_UNION; + + union { + struct { + unsigned int dst_slice_pitch : slice_bits; + unsigned int reserved_1 : 4; + }; + unsigned int DW_10_DATA; + } DST_PARAMETER_3_UNION; + + union { + struct { + unsigned int rect_x : rect_xy_bits; + unsigned int reserved_1 : 2; + unsigned int rect_y : rect_xy_bits; + unsigned int reserved_2 : 2; + }; + unsigned int DW_11_DATA; + } RECT_PARAMETER_1_UNION; + + union { + struct { + unsigned int rect_z : rect_z_bits; + unsigned int reserved_1 : 5; + unsigned int dst_swap : 2; + unsigned int reserved_2 
: 6; + unsigned int src_swap : 2; + unsigned int reserved_3 : 6; + }; + unsigned int DW_12_DATA; + } RECT_PARAMETER_2_UNION; + + // static const unsigned int pitch_bits = 19; +} SDMA_PKT_COPY_LINEAR_RECT; + +typedef struct SDMA_PKT_CONSTANT_FILL_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int sw : 2; + unsigned int reserved_0 : 12; + unsigned int fillsize : 2; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int dst_addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } DST_ADDR_LO_UNION; + + union { + struct { + unsigned int dst_addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } DST_ADDR_HI_UNION; + + union { + struct { + unsigned int src_data_31_0 : 32; + }; + unsigned int DW_3_DATA; + } DATA_UNION; + + union { + struct { + unsigned int count : 22; + unsigned int reserved_0 : 10; + }; + unsigned int DW_4_DATA; + } COUNT_UNION; + + static const size_t kMaxSize_ = 0x3fffe0; +} SDMA_PKT_CONSTANT_FILL; + +typedef struct SDMA_PKT_FENCE_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int data : 32; + }; + unsigned int DW_3_DATA; + } DATA_UNION; +} SDMA_PKT_FENCE; + +typedef struct SDMA_PKT_POLL_REGMEM_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 10; + unsigned int hdp_flush : 1; + unsigned int reserved_1 : 1; + unsigned int func : 3; + unsigned int mem_poll : 1; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int 
DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int value : 32; + }; + unsigned int DW_3_DATA; + } VALUE_UNION; + + union { + struct { + unsigned int mask : 32; + }; + unsigned int DW_4_DATA; + } MASK_UNION; + + union { + struct { + unsigned int interval : 16; + unsigned int retry_count : 12; + unsigned int reserved_0 : 4; + }; + unsigned int DW_5_DATA; + } DW5_UNION; +} SDMA_PKT_POLL_REGMEM; + +typedef struct SDMA_PKT_ATOMIC_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int l : 1; + unsigned int reserved_0 : 8; + unsigned int operation : 7; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + + union { + struct { + unsigned int src_data_31_0 : 32; + }; + unsigned int DW_3_DATA; + } SRC_DATA_LO_UNION; + + union { + struct { + unsigned int src_data_63_32 : 32; + }; + unsigned int DW_4_DATA; + } SRC_DATA_HI_UNION; + + union { + struct { + unsigned int cmp_data_31_0 : 32; + }; + unsigned int DW_5_DATA; + } CMP_DATA_LO_UNION; + + union { + struct { + unsigned int cmp_data_63_32 : 32; + }; + unsigned int DW_6_DATA; + } CMP_DATA_HI_UNION; + + union { + struct { + unsigned int loop_interval : 13; + unsigned int reserved_0 : 19; + }; + unsigned int DW_7_DATA; + } LOOP_UNION; +} SDMA_PKT_ATOMIC; + +typedef struct SDMA_PKT_TIMESTAMP_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int addr_31_0 : 32; + }; + unsigned int DW_1_DATA; + } ADDR_LO_UNION; + + union { + struct { + unsigned int addr_63_32 : 32; + }; + unsigned int DW_2_DATA; + } ADDR_HI_UNION; + +} SDMA_PKT_TIMESTAMP; + +typedef struct SDMA_PKT_TRAP_TAG { + union { + struct { + unsigned int op : 8; + unsigned int 
sub_op : 8; + unsigned int reserved_0 : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int int_ctx : 28; + unsigned int reserved_1 : 4; + }; + unsigned int DW_1_DATA; + } INT_CONTEXT_UNION; +} SDMA_PKT_TRAP; + +// HDP flush packet, no parameters. +typedef struct SDMA_PKT_HDP_FLUSH_TAG { + unsigned int DW_0_DATA; + unsigned int DW_1_DATA; + unsigned int DW_2_DATA; + unsigned int DW_3_DATA; + unsigned int DW_4_DATA; + unsigned int DW_5_DATA; + + // Version of gfx9 sDMA microcode introducing SDMA_PKT_HDP_FLUSH + static const uint16_t kMinVersion_ = 0x1A5; +} SDMA_PKT_HDP_FLUSH; +static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0}; + +} // namespace amd + +#endif // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_ diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index a75988c510..2c10404cd8 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -51,328 +51,10 @@ #include "core/inc/amd_gpu_agent.h" #include "core/inc/amd_memory_region.h" #include "core/inc/runtime.h" +#include "core/inc/sdma_registers.h" #include "core/inc/signal.h" namespace amd { -// SDMA packet for VI device. 
-// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt - -const unsigned int SDMA_OP_COPY = 1; -const unsigned int SDMA_OP_FENCE = 5; -const unsigned int SDMA_OP_TRAP = 6; -const unsigned int SDMA_OP_POLL_REGMEM = 8; -const unsigned int SDMA_OP_ATOMIC = 10; -const unsigned int SDMA_OP_CONST_FILL = 11; -const unsigned int SDMA_OP_TIMESTAMP = 13; -const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; -const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2; -const unsigned int SDMA_ATOMIC_ADD64 = 47; - -typedef struct SDMA_PKT_COPY_LINEAR_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int extra_info : 16; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int count : 22; - unsigned int reserved_0 : 10; - }; - unsigned int DW_1_DATA; - } COUNT_UNION; - - union { - struct { - unsigned int reserved_0 : 16; - unsigned int dst_swap : 2; - unsigned int reserved_1 : 6; - unsigned int src_swap : 2; - unsigned int reserved_2 : 6; - }; - unsigned int DW_2_DATA; - } PARAMETER_UNION; - - union { - struct { - unsigned int src_addr_31_0 : 32; - }; - unsigned int DW_3_DATA; - } SRC_ADDR_LO_UNION; - - union { - struct { - unsigned int src_addr_63_32 : 32; - }; - unsigned int DW_4_DATA; - } SRC_ADDR_HI_UNION; - - union { - struct { - unsigned int dst_addr_31_0 : 32; - }; - unsigned int DW_5_DATA; - } DST_ADDR_LO_UNION; - - union { - struct { - unsigned int dst_addr_63_32 : 32; - }; - unsigned int DW_6_DATA; - } DST_ADDR_HI_UNION; -} SDMA_PKT_COPY_LINEAR; - -typedef struct SDMA_PKT_CONSTANT_FILL_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int sw : 2; - unsigned int reserved_0 : 12; - unsigned int fillsize : 2; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int dst_addr_31_0 : 32; - }; - unsigned int DW_1_DATA; - } DST_ADDR_LO_UNION; - - union { - struct { - unsigned int dst_addr_63_32 : 32; - }; - unsigned int DW_2_DATA; - } 
DST_ADDR_HI_UNION; - - union { - struct { - unsigned int src_data_31_0 : 32; - }; - unsigned int DW_3_DATA; - } DATA_UNION; - - union { - struct { - unsigned int count : 22; - unsigned int reserved_0 : 10; - }; - unsigned int DW_4_DATA; - } COUNT_UNION; -} SDMA_PKT_CONSTANT_FILL; - -typedef struct SDMA_PKT_FENCE_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int reserved_0 : 16; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int addr_31_0 : 32; - }; - unsigned int DW_1_DATA; - } ADDR_LO_UNION; - - union { - struct { - unsigned int addr_63_32 : 32; - }; - unsigned int DW_2_DATA; - } ADDR_HI_UNION; - - union { - struct { - unsigned int data : 32; - }; - unsigned int DW_3_DATA; - } DATA_UNION; -} SDMA_PKT_FENCE; - -typedef struct SDMA_PKT_POLL_REGMEM_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int reserved_0 : 10; - unsigned int hdp_flush : 1; - unsigned int reserved_1 : 1; - unsigned int func : 3; - unsigned int mem_poll : 1; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int addr_31_0 : 32; - }; - unsigned int DW_1_DATA; - } ADDR_LO_UNION; - - union { - struct { - unsigned int addr_63_32 : 32; - }; - unsigned int DW_2_DATA; - } ADDR_HI_UNION; - - union { - struct { - unsigned int value : 32; - }; - unsigned int DW_3_DATA; - } VALUE_UNION; - - union { - struct { - unsigned int mask : 32; - }; - unsigned int DW_4_DATA; - } MASK_UNION; - - union { - struct { - unsigned int interval : 16; - unsigned int retry_count : 12; - unsigned int reserved_0 : 4; - }; - unsigned int DW_5_DATA; - } DW5_UNION; -} SDMA_PKT_POLL_REGMEM; - -typedef struct SDMA_PKT_ATOMIC_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int l : 1; - unsigned int reserved_0 : 8; - unsigned int operation : 7; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int addr_31_0 : 32; - 
}; - unsigned int DW_1_DATA; - } ADDR_LO_UNION; - - union { - struct { - unsigned int addr_63_32 : 32; - }; - unsigned int DW_2_DATA; - } ADDR_HI_UNION; - - union { - struct { - unsigned int src_data_31_0 : 32; - }; - unsigned int DW_3_DATA; - } SRC_DATA_LO_UNION; - - union { - struct { - unsigned int src_data_63_32 : 32; - }; - unsigned int DW_4_DATA; - } SRC_DATA_HI_UNION; - - union { - struct { - unsigned int cmp_data_31_0 : 32; - }; - unsigned int DW_5_DATA; - } CMP_DATA_LO_UNION; - - union { - struct { - unsigned int cmp_data_63_32 : 32; - }; - unsigned int DW_6_DATA; - } CMP_DATA_HI_UNION; - - union { - struct { - unsigned int loop_interval : 13; - unsigned int reserved_0 : 19; - }; - unsigned int DW_7_DATA; - } LOOP_UNION; -} SDMA_PKT_ATOMIC; - -typedef struct SDMA_PKT_TIMESTAMP_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int reserved_0 : 16; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int addr_31_0 : 32; - }; - unsigned int DW_1_DATA; - } ADDR_LO_UNION; - - union { - struct { - unsigned int addr_63_32 : 32; - }; - unsigned int DW_2_DATA; - } ADDR_HI_UNION; - -} SDMA_PKT_TIMESTAMP; - -typedef struct SDMA_PKT_TRAP_TAG { - union { - struct { - unsigned int op : 8; - unsigned int sub_op : 8; - unsigned int reserved_0 : 16; - }; - unsigned int DW_0_DATA; - } HEADER_UNION; - - union { - struct { - unsigned int int_ctx : 28; - unsigned int reserved_1 : 4; - }; - unsigned int DW_1_DATA; - } INT_CONTEXT_UNION; -} SDMA_PKT_TRAP; - -// Initialize Hdp flush packet for use on sDMA of devices -// from Gfx9 or new family -static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ { - { SDMA_OP_POLL_REGMEM }, - { 0x00 }, - { 0x80000000 }, - { 0x00 }, - { 0x00 }, - { 0x00 }, -}; - -// Version of sDMA microcode supporting Hdp flush -static const uint16_t sdma_version_ = 0x01A5; inline uint32_t ptrlow32(const void* p) { return static_cast(reinterpret_cast(p)); @@ -388,8 +70,8 @@ inline uint32_t 
ptrhigh32(const void* p) { const size_t BlitSdmaBase::kQueueSize = 1024 * 1024; const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR); -const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0; // From HW documentation -const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0; +const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_; +const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_; // Initialize size of various sDMA commands use by this module template @@ -437,8 +119,6 @@ BlitSdma::~BlitSdma() {} template hsa_status_t BlitSdma::Initialize( const core::Agent& agent) { - agent_ = reinterpret_cast(&const_cast(agent)); - if (queue_start_addr_ != NULL) { // Already initialized. return HSA_STATUS_SUCCESS; @@ -448,24 +128,23 @@ hsa_status_t BlitSdma::Initial return HSA_STATUS_ERROR; } - const amd::GpuAgentInt& amd_gpu_agent = - static_cast(agent); + agent_ = reinterpret_cast(&const_cast(agent)); - if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) { + if (HSA_PROFILE_FULL == agent_->profile()) { assert(false && "Only support SDMA for dgpu currently"); return HSA_STATUS_ERROR; } - if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) { + if (agent_->isa()->version() == core::Isa::Version(7, 0, 1)) { platform_atomic_support_ = false; } else { const core::Runtime::LinkInfo& link = core::Runtime::runtime_singleton_->GetLinkInfo( - amd_gpu_agent.node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id()); + agent_->node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id()); platform_atomic_support_ = link.info.atomic_support_64bit; } // Determine if sDMA microcode supports HDP flush command - if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) { + if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) { hdp_flush_support_ = true; } @@ -483,7 +162,7 @@ hsa_status_t BlitSdma::Initial // This call binds user mode queue object to underlying 
compute // device. const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA; - if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100, + if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100, HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_, kQueueSize, NULL, &queue_resource_)) { Destroy(agent); @@ -539,6 +218,159 @@ hsa_status_t BlitSdma::Destroy return HSA_STATUS_SUCCESS; } +template +hsa_status_t BlitSdma::SubmitCommand( + const void* cmd, size_t cmd_size, std::vector& dep_signals, + core::Signal& out_signal) { + // The signal is 64 bit value, and poll checks for 32 bit value. So we + // need to use two poll operations per dependent signal. + const uint32_t num_poll_command = + static_cast(2 * dep_signals.size()); + const uint32_t total_poll_command_size = + (num_poll_command * poll_command_size_); + + // Load the profiling state early in case the user disable or enable the + // profiling in the middle of the call. + const bool profiling_enabled = agent_->profiling_enabled(); + + uint64_t* end_ts_addr = NULL; + uint32_t total_timestamp_command_size = 0; + + if (profiling_enabled) { + // SDMA timestamp packet requires 32 byte of aligned memory, but + // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to + // read from a 32 byte aligned bounce buffer is required to avoid changing + // the amd_signal_t ABI. + + end_ts_addr = agent_->ObtainEndTsObject(); + if (end_ts_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + total_timestamp_command_size = + (2 * timestamp_command_size_) + linear_copy_command_size_; + } + + // On agent that does not support platform atomic, we replace it with + // one or two fence packet(s) to update the signal value. The reason fence + // is used and not write packet is because the SDMA engine may overlap a + // serial copy/write packets. 
+ const uint64_t completion_signal_value = + static_cast(out_signal.LoadRelaxed() - 1); + const size_t sync_command_size = (platform_atomic_support_) + ? atomic_command_size_ + : (completion_signal_value > UINT32_MAX) + ? 2 * fence_command_size_ + : fence_command_size_; + + // If the signal is an interrupt signal, we also need to make SDMA engine to + // send interrupt packet to IH. + const size_t interrupt_command_size = + (out_signal.signal_.event_mailbox_ptr != 0) + ? (fence_command_size_ + trap_command_size_) + : 0; + + // Add space for acquire or release Hdp flush command + uint32_t flush_cmd_size = 0; + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_)) { + flush_cmd_size = flush_command_size_; + } + } + + const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size + + total_timestamp_command_size + interrupt_command_size + flush_cmd_size; + + RingIndexTy curr_index; + char* command_addr = AcquireWriteAddress(total_command_size, curr_index); + + if (command_addr == NULL) { + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } + + for (size_t i = 0; i < dep_signals.size(); ++i) { + uint32_t* signal_addr = + reinterpret_cast(dep_signals[i]->ValueLocation()); + // Wait for the higher 32 bits of the 64-bit signal value to become 0. + BuildPollCommand(command_addr, &signal_addr[1], 0); + command_addr += poll_command_size_; + // Then wait for the lower 32 bits to become 0. 
+ BuildPollCommand(command_addr, &signal_addr[0], 0); + command_addr += poll_command_size_; + } + + if (profiling_enabled) { + BuildGetGlobalTimestampCommand( + command_addr, reinterpret_cast(&out_signal.signal_.start_ts)); + command_addr += timestamp_command_size_; + } + + // Determine if a Hdp flush cmd is required at the top of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + + // Do the command after all polls are satisfied. + memcpy(command_addr, cmd, cmd_size); + command_addr += cmd_size; + + // Determine if a Hdp flush cmd is required at the end of cmd stream + if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { + if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) { + BuildHdpFlushCommand(command_addr); + command_addr += flush_command_size_; + } + } + + if (profiling_enabled) { + assert(IsMultipleOf(end_ts_addr, 32)); + BuildGetGlobalTimestampCommand(command_addr, + reinterpret_cast(end_ts_addr)); + command_addr += timestamp_command_size_; + + BuildCopyCommand(command_addr, 1, + reinterpret_cast(&out_signal.signal_.end_ts), + reinterpret_cast(end_ts_addr), sizeof(uint64_t)); + command_addr += linear_copy_command_size_; + } + + // After transfer is completed, decrement the signal value. 
+ if (platform_atomic_support_) { + BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); + command_addr += atomic_command_size_; + + } else { + uint32_t* signal_value_location = reinterpret_cast(out_signal.ValueLocation()); + if (completion_signal_value > UINT32_MAX) { + BuildFenceCommand(command_addr, signal_value_location + 1, + static_cast(completion_signal_value >> 32)); + command_addr += fence_command_size_; + } + + BuildFenceCommand(command_addr, signal_value_location, + static_cast(completion_signal_value)); + + command_addr += fence_command_size_; + } + + // Update mailbox event and send interrupt to IH. + if (out_signal.signal_.event_mailbox_ptr != 0) { + BuildFenceCommand(command_addr, + reinterpret_cast(out_signal.signal_.event_mailbox_ptr), + static_cast(out_signal.signal_.event_id)); + command_addr += fence_command_size_; + + BuildTrapCommand(command_addr); + } + + ReleaseWriteAddress(curr_index, total_command_size); + + return HSA_STATUS_SUCCESS; +} + template hsa_status_t BlitSdma::SubmitLinearCopyCommand( void* dst, const void* src, size_t size) { @@ -546,8 +378,7 @@ hsa_status_t BlitSdma::SubmitL // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; - const uint32_t total_copy_command_size = - num_copy_command * linear_copy_command_size_; + const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_; // Add space for acquire or release Hdp flush command uint32_t flush_cmd_size = 0; @@ -603,161 +434,79 @@ template hsa_status_t BlitSdma::SubmitLinearCopyCommand( void* dst, const void* src, size_t size, std::vector& dep_signals, core::Signal& out_signal) { - // The signal is 64 bit value, and poll checks for 32 bit value. So we - // need to use two poll operations per dependent signal. 
- const uint32_t num_poll_command = - static_cast(2 * dep_signals.size()); - const uint32_t total_poll_command_size = - (num_poll_command * poll_command_size_); - - // Break the copy into multiple copy operation incase the copy size exceeds + // Break the copy into multiple copy operations when the copy size exceeds // the SDMA linear copy limit. const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize; - const uint32_t total_copy_command_size = - num_copy_command * linear_copy_command_size_; - // Load the profiling state early in case the user disable or enable the - // profiling in the middle of the call. - const bool profiling_enabled = agent_->profiling_enabled(); + // Assemble copy packets. + std::vector buff(num_copy_command); + BuildCopyCommand(reinterpret_cast(&buff[0]), num_copy_command, dst, src, size); - uint64_t* end_ts_addr = NULL; - uint32_t total_timestamp_command_size = 0; + return SubmitCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), dep_signals, + out_signal); +} - if (profiling_enabled) { - // SDMA timestamp packet requires 32 byte of aligned memory, but - // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to - // read from a 32 byte aligned bounce buffer is required to avoid changing - // the amd_signal_t ABI. +template +hsa_status_t BlitSdma::SubmitCopyRectCommand( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector& dep_signals, + core::Signal& out_signal) { + // Hardware requires DWORD alignment for base address, pitches + // Also confirm that we have a geometric rect (copied block does not wrap an edge). 
+ if (((uintptr_t)dst->base) % 4 != 0 || ((uintptr_t)src->base) % 4 != 0) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Copy rect base address not aligned."); + if (((uintptr_t)dst->pitch) % 4 != 0 || ((uintptr_t)src->pitch) % 4 != 0) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch not aligned."); + if (((uintptr_t)dst->slice) % 4 != 0 || ((uintptr_t)src->slice) % 4 != 0) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice not aligned."); + if (uint64_t(src_offset->x) + range->x > src->pitch || + uint64_t(dst_offset->x) + range->x > dst->pitch) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect width out of range."); + if ((src->slice != 0) && (uint64_t(src_offset->y) + range->y) > src->slice / src->pitch) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range."); + if ((dst->slice != 0) && (uint64_t(dst_offset->y) + range->y) > dst->slice / dst->pitch) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range."); + if (range->z > 1 && (src->slice == 0 || dst->slice == 0)) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed."); - end_ts_addr = agent_->ObtainEndTsObject(); - if (end_ts_addr == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } + const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; - total_timestamp_command_size = - (2 * timestamp_command_size_) + linear_copy_command_size_; - } + std::vector pkts; + auto append = [&](size_t size) { + assert(size == sizeof(SDMA_PKT_COPY_LINEAR_RECT) && "SDMA packet size missmatch"); + pkts.emplace_back(SDMA_PKT_COPY_LINEAR_RECT()); + return &pkts.back(); + }; - // On agent that does not support platform atomic, we replace it with - // one or two fence packet(s) to update the signal value. 
The reason fence - // is used and not write packet is because the SDMA engine may overlap a - // serial copy/write packets. - const uint64_t completion_signal_value = - static_cast(out_signal.LoadRelaxed() - 1); - const size_t sync_command_size = (platform_atomic_support_) - ? atomic_command_size_ - : (completion_signal_value > UINT32_MAX) - ? 2 * fence_command_size_ - : fence_command_size_; + // Do wide pitch 2D copies along X-Z + if (range->z == 1 && (src->pitch > max_pitch || dst->pitch > max_pitch)) { + hsa_pitched_ptr_t Src = *src; + hsa_pitched_ptr_t Dst = *dst; + hsa_dim3_t Soff = *src_offset; + hsa_dim3_t Doff = *dst_offset; + hsa_dim3_t Range = *range; - // If the signal is an interrupt signal, we also need to make SDMA engine to - // send interrupt packet to IH. - const size_t interrupt_command_size = - (out_signal.signal_.event_mailbox_ptr != 0) - ? (fence_command_size_ + trap_command_size_) - : 0; + Src.base += Soff.z * Src.slice + Soff.y * Src.pitch; + Dst.base += Doff.z * Dst.slice + Doff.y * Dst.pitch; + Soff.y = Soff.z = 0; + Doff.y = Doff.z = 0; - // Add space for acquire or release Hdp flush command - uint32_t flush_cmd_size = 0; - if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { - if ((HwIndexMonotonic) && (hdp_flush_support_)) { - flush_cmd_size = flush_command_size_; - } - } + Src.slice = Src.pitch; + Src.pitch = 0; + Dst.slice = Dst.pitch; + Dst.pitch = 0; - const uint32_t total_command_size = - total_poll_command_size + total_copy_command_size + sync_command_size + - total_timestamp_command_size + interrupt_command_size + flush_cmd_size; - - RingIndexTy curr_index; - char* command_addr = AcquireWriteAddress(total_command_size, curr_index); - - if (command_addr == NULL) { - return HSA_STATUS_ERROR_OUT_OF_RESOURCES; - } - - for (size_t i = 0; i < dep_signals.size(); ++i) { - uint32_t* signal_addr = - reinterpret_cast(dep_signals[i]->ValueLocation()); - // Wait for the higher 64 bit to 0. 
- BuildPollCommand(command_addr, &signal_addr[1], 0); - command_addr += poll_command_size_; - // Then wait for the lower 64 bit to 0. - BuildPollCommand(command_addr, &signal_addr[0], 0); - command_addr += poll_command_size_; - } - - if (profiling_enabled) { - BuildGetGlobalTimestampCommand( - command_addr, reinterpret_cast(&out_signal.signal_.start_ts)); - command_addr += timestamp_command_size_; - } - - // Determine if a Hdp flush cmd is required at the top of cmd stream - if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { - if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) { - BuildHdpFlushCommand(command_addr); - command_addr += flush_command_size_; - } - } - - // Do the transfer after all polls are satisfied. - BuildCopyCommand(command_addr, num_copy_command, dst, src, size); - command_addr += total_copy_command_size; - - // Determine if a Hdp flush cmd is required at the end of cmd stream - if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) { - if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) { - BuildHdpFlushCommand(command_addr); - command_addr += flush_command_size_; - } - } - - if (profiling_enabled) { - assert(IsMultipleOf(end_ts_addr, 32)); - BuildGetGlobalTimestampCommand(command_addr, - reinterpret_cast(end_ts_addr)); - command_addr += timestamp_command_size_; - - BuildCopyCommand(command_addr, 1, - reinterpret_cast(&out_signal.signal_.end_ts), - reinterpret_cast(end_ts_addr), sizeof(uint64_t)); - command_addr += linear_copy_command_size_; - } - - // After transfer is completed, decrement the signal value. 
- if (platform_atomic_support_) { - BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation()); - command_addr += atomic_command_size_; + Range.z = Range.y; + Range.y = 1; + BuildCopyRectCommand(append, &Dst, &Doff, &Src, &Soff, &Range); } else { - uint32_t* signal_value_location = - reinterpret_cast(out_signal.ValueLocation()); - if (completion_signal_value > UINT32_MAX) { - BuildFenceCommand(command_addr, signal_value_location + 1, - static_cast(completion_signal_value >> 32)); - command_addr += fence_command_size_; - } - - BuildFenceCommand(command_addr, signal_value_location, - static_cast(completion_signal_value)); - - command_addr += fence_command_size_; + BuildCopyRectCommand(append, dst, dst_offset, src, src_offset, range); } - // Update mailbox event and send interrupt to IH. - if (out_signal.signal_.event_mailbox_ptr != 0) { - BuildFenceCommand(command_addr, reinterpret_cast( - out_signal.signal_.event_mailbox_ptr), - static_cast(out_signal.signal_.event_id)); - command_addr += fence_command_size_; - - BuildTrapCommand(command_addr); - } - - ReleaseWriteAddress(curr_index, total_command_size); - - return HSA_STATUS_SUCCESS; + return SubmitCommand(&pkts[0], pkts.size() * sizeof(SDMA_PKT_COPY_LINEAR_RECT), dep_signals, + out_signal); } template @@ -1057,6 +806,131 @@ void BlitSdma::BuildCopyComman assert(cur_size == size); } +/* +Copies are done in terms of elements (1, 2, 4, 8, or 16 bytes) and have alignment restrictions. +Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byte, 4=16 byte). +This routine breaks a large rect into tiles that can be handled by hardware. Pitches and offsets +must be representable in terms of elements in all tiles of the copy. 
+*/ +template +void BlitSdma::BuildCopyRectCommand( + const std::function& append, const hsa_pitched_ptr_t* dst, + const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, + const hsa_dim3_t* range) { + // Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides + // width), the largest element that perfectly covers width. + // width | 16 ensures that we don't return a higher element than is supported and avoids + // issues with 0. + auto maxAlignedElement = [](size_t width) { + return __builtin_ctz(width | 16); + }; + + // Limits in terms of element count + const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits; + const uint max_slice = 1 << SDMA_PKT_COPY_LINEAR_RECT::slice_bits; + const uint max_x = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; + const uint max_y = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits; + const uint max_z = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits; + + // Find maximum element that describes the pitch and slice. + // Pitch and slice must both be represented in units of elements. No element larger than this + // may be used in any tile as the pitches would not be exactly represented. + int max_ele = Min(maxAlignedElement(src->pitch), maxAlignedElement(dst->pitch)); + if (range->z != 1) // Only need to consider slice if HW will copy along Z. + max_ele = Min(max_ele, maxAlignedElement(src->slice), maxAlignedElement(dst->slice)); + + /* + Find the minimum element size that will be needed for any tile. + + No subdivision of a range admits a larger element size for the smallest element in any subdivision + than the element size that covers the whole range, though some can be worse (this is easily model + checked). Subdividing with any element larger than the covering element won't change the covering + element of the remainder + ( Range%Element = (Range-N*LargerElement)%Element since LargerElement%Element=0 ). + Ex. 
range->x=71, assume max range is 16 elements: We can break at 64 giving tiles: + [0,63], [64-70] (width 64 & 7). 64 is covered by element 4 (16B) and 7 is covered by element 0 + (1B). Exactly covering 71 requires using element 0. + + Base addresses in each tile must be DWORD aligned, if not then the offset from an aligned address + must be represented in elements. This may reduce the size of the element, but since elements are + integer multiples of each other this is harmless. + + src and dst base has already been checked for DWORD alignment so we only need to consider the + offset here. + */ + int min_ele = Min(max_ele, maxAlignedElement(range->x), maxAlignedElement(src_offset->x % 4), + maxAlignedElement(dst_offset->x % 4)); + + // Check that pitch and slice can be represented in the tile with the smallest element + if ((src->pitch >> min_ele) > max_pitch || (dst->pitch >> min_ele) > max_pitch) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch out of limits.\n"); + if (range->z != 1) { // Only need to consider slice if HW will copy along Z. + if ((src->slice >> min_ele) > max_slice || (dst->slice >> min_ele) > max_slice) + throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, + "Copy rect slice out of limits.\n"); + } + + // Break copy into tiles + for (uint64_t z = 0; z < range->z; z += max_z) { + for (uint64_t y = 0; y < range->y; y += max_y) { + uint64_t x = 0; + while (x < range->x) { + uint64_t width = range->x - x; + + // Get largest element which describes the start of this tile after its base address has + // been aligned. Base addresses must be DWORD (4 byte) aligned. 
+ int aligned_ele = Min(maxAlignedElement((src_offset->x + x) % 4), + maxAlignedElement((dst_offset->x + x) % 4), max_ele); + + // Get largest permissible element which exactly covers width + int element = Min(maxAlignedElement(width), aligned_ele); + int xcount = width >> element; + + // If width is too large then width is at least max_x bytes (bigger than any element) so + // drop the width restriction and clip element count to max_x. + if (xcount > max_x) { + element = aligned_ele; + xcount = Min(width >> element, max_x); + } + + // Get base addresses and offsets for this tile. + uintptr_t sbase = (uintptr_t)src->base + src_offset->x + x + + (src_offset->y + y) * src->pitch + (src_offset->z + z) * src->slice; + uintptr_t dbase = (uintptr_t)dst->base + dst_offset->x + x + + (dst_offset->y + y) * dst->pitch + (dst_offset->z + z) * dst->slice; + uint soff = (sbase % 4) >> element; + uint doff = (dbase % 4) >> element; + sbase &= ~3ull; + dbase &= ~3ull; + + x += xcount << element; + + SDMA_PKT_COPY_LINEAR_RECT* pkt = + (SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT)); + *pkt = {}; + pkt->HEADER_UNION.op = SDMA_OP_COPY; + pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT; + pkt->HEADER_UNION.element = element; + pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase; + pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32; + pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff; + pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1; + pkt->SRC_PARAMETER_3_UNION.src_slice_pitch = + (range->z == 1) ? 0 : (src->slice >> element) - 1; + pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase; + pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32; + pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff; + pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1; + pkt->DST_PARAMETER_3_UNION.dst_slice_pitch = + (range->z == 1) ? 
0 : (dst->slice >> element) - 1; + pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1; + pkt->RECT_PARAMETER_1_UNION.rect_y = Min(range->y - y, max_y) - 1; + pkt->RECT_PARAMETER_2_UNION.rect_z = Min(range->z - z, max_z) - 1; + } + } + } +} + template void BlitSdma::BuildPollCommand( char* cmd_addr, void* addr, uint32_t reference) { @@ -1126,7 +1000,7 @@ void BlitSdma::BuildHdpFlushCo char* cmd_addr) { assert(cmd_addr != NULL); SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast(cmd_addr); - memcpy(addr, &hdp_flush_cmd_, flush_command_size_); + memcpy(addr, &hdp_flush_cmd, flush_command_size_); } template class BlitSdma; diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 7b8878ad45..29a1010899 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -634,6 +634,31 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, return stat; } +hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, + const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, + const hsa_dim3_t* range, hsa_amd_copy_direction_t dir, + std::vector& dep_signals, + core::Signal& out_signal) { + if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT; + + lazy_ptr& blit = + (dir == hsaHostToDevice) ? blits_[BlitHostToDev] : blits_[BlitDevToHost]; + + if (!blit->isSDMA()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + if (profiling_enabled()) { + // Track the agent so we could translate the resulting timestamp to system + // domain correctly. 
+ out_signal.async_copy_agent(core::Agent::Convert(this->public_handle())); + } + + BlitSdmaBase* sdmaBlit = static_cast((*blit).get()); + hsa_status_t stat = sdmaBlit->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range, + dep_signals, out_signal); + + return stat; +} + hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) { return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count); } diff --git a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp index 75332351e4..100d102f21 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_api_trace.cpp @@ -381,6 +381,7 @@ void HsaApiTable::UpdateAmdExts() { amd_ext_api.hsa_amd_queue_intercept_create_fn = AMD::hsa_amd_queue_intercept_create; amd_ext_api.hsa_amd_queue_intercept_register_fn = AMD::hsa_amd_queue_intercept_register; amd_ext_api.hsa_amd_queue_set_priority_fn = AMD::hsa_amd_queue_set_priority; + amd_ext_api.hsa_amd_memory_async_copy_rect_fn = AMD::hsa_amd_memory_async_copy_rect; } class Init { diff --git a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index 1ef79a3b8c..16bcb9c2cf 100644 --- a/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -47,6 +47,7 @@ #include #include #include +#include #include "core/inc/runtime.h" #include "core/inc/agent.h" @@ -262,6 +263,52 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle, CATCH; } +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) { + TRY; + if (dst == nullptr || src == nullptr || 
dst_offset == nullptr || src_offset == nullptr || + range == nullptr) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if ((num_dep_signals == 0 && dep_signals != NULL) || + (num_dep_signals > 0 && dep_signals == NULL)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + if (dir == hsaHostToHost) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + core::Agent* base_agent = core::Agent::Convert(copy_agent); + IS_VALID(base_agent); + if (base_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) + return HSA_STATUS_ERROR_INVALID_AGENT; + amd::GpuAgent* agent = static_cast(base_agent); + + std::vector dep_signal_list(num_dep_signals); + if (num_dep_signals > 0) { + for (size_t i = 0; i < num_dep_signals; ++i) { + core::Signal* dep_signal_obj = core::Signal::Convert(dep_signals[i]); + IS_VALID(dep_signal_obj); + dep_signal_list[i] = dep_signal_obj; + } + } + + core::Signal* out_signal_obj = core::Signal::Convert(completion_signal); + IS_VALID(out_signal_obj); + + if ((range->x != 0) && (range->y != 0) && (range->z != 0)) { + return agent->DmaCopyRect(dst, dst_offset, src, src_offset, range, dir, dep_signal_list, + *out_signal_obj); + } + + return HSA_STATUS_SUCCESS; + CATCH; +} + + hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) { TRY; IS_OPEN(); diff --git a/runtime/hsa-runtime/core/util/utils.h b/runtime/hsa-runtime/core/util/utils.h index 2132e277e1..8a18b06ce5 100755 --- a/runtime/hsa-runtime/core/util/utils.h +++ b/runtime/hsa-runtime/core/util/utils.h @@ -176,6 +176,11 @@ static __forceinline T Min(const T& a, const T& b) { return (a > b) ? b : a; } +template +static __forceinline T Min(const T& a, const T& b, Arg... args) { + return Min(a, Min(b, args...)); +} + /// @brief: Find out the max one of two inputs, input must support ">" operator. /// @param: a(Input), a reference to type T. /// @param: b(Input), a reference to type T. 
@@ -185,6 +190,11 @@ static __forceinline T Max(const T& a, const T& b) { return (b > a) ? b : a; } +template +static __forceinline T Max(const T& a, const T& b, Arg... args) { + return Max(a, Max(b, args...)); +} + /// @brief: Free the memory space which is newed previously. /// @param: ptr(Input), a pointer to memory space. Can't be NULL. /// @return: void. diff --git a/runtime/hsa-runtime/hsacore.so.def b/runtime/hsa-runtime/hsacore.so.def index 128fd35f28..cce468b985 100644 --- a/runtime/hsa-runtime/hsacore.so.def +++ b/runtime/hsa-runtime/hsacore.so.def @@ -216,6 +216,7 @@ global: hsa_amd_ipc_signal_attach; hsa_amd_register_system_event_handler; hsa_amd_queue_set_priority; + hsa_amd_memory_async_copy_rect; local: *; diff --git a/runtime/hsa-runtime/inc/hsa_api_trace.h b/runtime/hsa-runtime/inc/hsa_api_trace.h index c7efec8626..73c7ad17d7 100644 --- a/runtime/hsa-runtime/inc/hsa_api_trace.h +++ b/runtime/hsa-runtime/inc/hsa_api_trace.h @@ -172,6 +172,7 @@ struct AmdExtTable { decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn; + decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; }; // Table to export HSA Core Runtime Apis diff --git a/runtime/hsa-runtime/inc/hsa_ext_amd.h b/runtime/hsa-runtime/inc/hsa_ext_amd.h index 8033050d9b..bcde6ba390 100644 --- a/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -882,6 +882,43 @@ hsa_status_t HSA_API const hsa_signal_t* dep_signals, hsa_signal_t completion_signal); +/* +[Provisional API] +Pitched memory descriptor. +All elements must be 4 byte aligned. Pitch and slice are in bytes. +*/ +typedef struct hsa_pitched_ptr_s { + void* base; + size_t pitch; + size_t slice; +} hsa_pitched_ptr_t; + +/* +[Provisional API] +Copy direction flag. 
+*/ +typedef enum { + hsaHostToHost = 0, + hsaHostToDevice = 1, + hsaDeviceToHost = 2, + hsaDeviceToDevice = 3 +} hsa_amd_copy_direction_t; + +/* +[Provisional API] +SDMA 3D memory copy API. The same requirements must be met by src and dst as in +hsa_amd_memory_async_copy. +Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects +must not overlap. +CPU agents are not supported. API requires SDMA and will return an error if SDMA is not available. +Offsets and range carry x in bytes, y and z in rows and layers. +*/ +hsa_status_t HSA_API hsa_amd_memory_async_copy_rect( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + /** * @brief Type of accesses to a memory pool from a given agent. */