Implement SDMA copy rect for gfx9.

Fix pitch overflow due to small element detection. Add wide pitch 2D copy handling. Cleanup code duplication. Change-Id: I93b1584aba8e5964957eb7ab3544df806ca3e2f9
2018-06-07 12:14:01 -05:00
@@ -963,6 +963,17 @@ hsa_status_t HSA_API
                                     num_dep_signals, dep_signals, completion_signal);
 }

+// Mirrors Amd Extension Apis: forwards all arguments unchanged to the amdExtTable entry.
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal) {
+  return amdExtTable->hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, src_offset, range,
+                                                        copy_agent, dir, num_dep_signals,
+                                                        dep_signals, completion_signal);
+}
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
@@ -45,6 +45,7 @@

 #include <mutex>
 #include <stdint.h>
+#include <vector>

 #include "hsakmt.h"

@@ -55,6 +56,7 @@
 #include "core/util/utils.h"

 namespace amd {
+
 class BlitSdmaBase : public core::Blit {
 public:
  static const size_t kQueueSize;
@@ -62,6 +64,12 @@ class BlitSdmaBase : public core::Blit {
  static const size_t kMaxSingleCopySize;
  static const size_t kMaxSingleFillSize;
  virtual bool isSDMA() const override { return true; }
+  virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
+                                             const hsa_dim3_t* dst_offset,
+                                             const hsa_pitched_ptr_t* src,
+                                             const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
+                                             std::vector<core::Signal*>& dep_signals,
+                                             core::Signal& out_signal) = 0;
 };

 // RingIndexTy: 32/64-bit monotonic ring index, counting in bytes.
@@ -116,6 +124,13 @@ class BlitSdma : public BlitSdmaBase {
      std::vector<core::Signal*>& dep_signals,
      core::Signal& out_signal) override;

+  virtual hsa_status_t SubmitCopyRectCommand(const hsa_pitched_ptr_t* dst,
+                                             const hsa_dim3_t* dst_offset,
+                                             const hsa_pitched_ptr_t* src,
+                                             const hsa_dim3_t* src_offset, const hsa_dim3_t* range,
+                                             std::vector<core::Signal*>& dep_signals,
+                                             core::Signal& out_signal) override;
+
  /// @brief Submit a linear fill command to the queue buffer
  ///
  /// @param ptr Memory address of the fill destination.
@@ -181,6 +196,11 @@ class BlitSdma : public BlitSdmaBase {
  void BuildCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst,
                        const void* src, size_t size);

+  void BuildCopyRectCommand(const std::function<void*(size_t)>& append,
+                            const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
+                            const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
+                            const hsa_dim3_t* range);  // NOTE(review): confirm <functional> is included.
+
  void BuildPollCommand(char* cmd_addr, void* addr, uint32_t reference);

  void BuildAtomicDecrementCommand(char* cmd_addr, void* addr);
@@ -189,6 +209,9 @@ class BlitSdma : public BlitSdmaBase {

  void BuildTrapCommand(char* cmd_addr);

+  hsa_status_t SubmitCommand(const void* cmds, size_t cmd_size,
+                             std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
+
  // Agent object owning the SDMA engine.
  GpuAgent* agent_;

@@ -240,6 +240,12 @@ class GpuAgent : public GpuAgentInt {
                       std::vector<core::Signal*>& dep_signals,
                       core::Signal& out_signal) override;

+  // @brief Rect copy. NOTE(review): not marked `override` - confirm core::Agent declares it.
+  hsa_status_t DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
+                           const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
+                           const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
+                           std::vector<core::Signal*>& dep_signals, core::Signal& out_signal);
+
  // @brief Override from core::Agent.
  hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;

@@ -140,6 +140,13 @@ hsa_status_t HSA_API
                              const hsa_signal_t* dep_signals,
                              hsa_signal_t completion_signal);

+// Mirrors Amd Extension Apis
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal);
+
 // Mirrors Amd Extension Apis
 hsa_status_t HSA_API hsa_amd_agent_memory_pool_get_info(
    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
@@ -0,0 +1,499 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
+#define HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
+
+namespace amd {
+
+// SDMA packet for VI device.
+// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
+// NOTE(review): size_t/uint16_t are used below but nothing is #included here - verify.
+const unsigned int SDMA_OP_COPY = 1;  // SDMA_OP_*: presumably values for a header `op` field.
+const unsigned int SDMA_OP_FENCE = 5;
+const unsigned int SDMA_OP_TRAP = 6;
+const unsigned int SDMA_OP_POLL_REGMEM = 8;
+const unsigned int SDMA_OP_ATOMIC = 10;
+const unsigned int SDMA_OP_CONST_FILL = 11;
+const unsigned int SDMA_OP_TIMESTAMP = 13;
+const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;  // SDMA_SUBOP_*: presumably `sub_op` values.
+const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
+const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
+const unsigned int SDMA_ATOMIC_ADD64 = 47;  // Presumably an SDMA_PKT_ATOMIC `operation` value.
+
+typedef struct SDMA_PKT_COPY_LINEAR_TAG {  // Linear copy packet layout: 7 dwords (DW_0..DW_6).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int extra_info : 16;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int count : 22;  // 22-bit field; kMaxSize_ (0x3fffe0) fits within it.
+      unsigned int reserved_0 : 10;
+    };
+    unsigned int DW_1_DATA;
+  } COUNT_UNION;
+
+  union {
+    struct {
+      unsigned int reserved_0 : 16;
+      unsigned int dst_swap : 2;
+      unsigned int reserved_1 : 6;
+      unsigned int src_swap : 2;
+      unsigned int reserved_2 : 6;
+    };
+    unsigned int DW_2_DATA;
+  } PARAMETER_UNION;
+
+  union {
+    struct {
+      unsigned int src_addr_31_0 : 32;  // Source address, low 32 bits.
+    };
+    unsigned int DW_3_DATA;
+  } SRC_ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int src_addr_63_32 : 32;  // Source address, high 32 bits.
+    };
+    unsigned int DW_4_DATA;
+  } SRC_ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_31_0 : 32;  // Destination address, low 32 bits.
+    };
+    unsigned int DW_5_DATA;
+  } DST_ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_63_32 : 32;  // Destination address, high 32 bits.
+    };
+    unsigned int DW_6_DATA;
+  } DST_ADDR_HI_UNION;
+
+  static const size_t kMaxSize_ = 0x3fffe0;  // Max size of one linear copy (from HW documentation).
+} SDMA_PKT_COPY_LINEAR;
+
+// Linear sub-window (rectangular) copy packet layout: 13 dwords (DW_0..DW_12).
+typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
+  static const unsigned int pitch_bits = 19;    // Width of src_pitch / dst_pitch.
+  static const unsigned int slice_bits = 28;    // Width of src_slice_pitch / dst_slice_pitch.
+  static const unsigned int rect_xy_bits = 14;  // Width of rect_x / rect_y.
+  static const unsigned int rect_z_bits = 11;   // Width of rect_z.
+
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved : 13;
+      unsigned int element : 3;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int src_addr_31_0 : 32;
+    };
+    unsigned int DW_1_DATA;
+  } SRC_ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int src_addr_63_32 : 32;
+    };
+    unsigned int DW_2_DATA;
+  } SRC_ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int src_offset_x : 14;
+      unsigned int reserved_1 : 2;
+      unsigned int src_offset_y : 14;
+      unsigned int reserved_2 : 2;
+    };
+    unsigned int DW_3_DATA;
+  } SRC_PARAMETER_1_UNION;
+
+  union {
+    struct {
+      unsigned int src_offset_z : 11;
+      unsigned int reserved_1 : 2;
+      unsigned int src_pitch : pitch_bits;
+    };
+    unsigned int DW_4_DATA;
+  } SRC_PARAMETER_2_UNION;
+
+  union {
+    struct {
+      unsigned int src_slice_pitch : slice_bits;
+      unsigned int reserved_1 : 4;
+    };
+    unsigned int DW_5_DATA;
+  } SRC_PARAMETER_3_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_31_0 : 32;
+    };
+    unsigned int DW_6_DATA;
+  } DST_ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_63_32 : 32;
+    };
+    unsigned int DW_7_DATA;
+  } DST_ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int dst_offset_x : 14;
+      unsigned int reserved_1 : 2;
+      unsigned int dst_offset_y : 14;
+      unsigned int reserved_2 : 2;
+    };
+    unsigned int DW_8_DATA;
+  } DST_PARAMETER_1_UNION;
+
+  union {
+    struct {
+      unsigned int dst_offset_z : 11;
+      unsigned int reserved_1 : 2;
+      unsigned int dst_pitch : pitch_bits;
+    };
+    unsigned int DW_9_DATA;
+  } DST_PARAMETER_2_UNION;
+
+  union {
+    struct {
+      unsigned int dst_slice_pitch : slice_bits;
+      unsigned int reserved_1 : 4;
+    };
+    unsigned int DW_10_DATA;
+  } DST_PARAMETER_3_UNION;
+
+  union {
+    struct {
+      unsigned int rect_x : rect_xy_bits;
+      unsigned int reserved_1 : 2;
+      unsigned int rect_y : rect_xy_bits;
+      unsigned int reserved_2 : 2;
+    };
+    unsigned int DW_11_DATA;
+  } RECT_PARAMETER_1_UNION;
+
+  union {
+    struct {
+      unsigned int rect_z : rect_z_bits;
+      unsigned int reserved_1 : 5;
+      unsigned int dst_swap : 2;
+      unsigned int reserved_2 : 6;
+      unsigned int src_swap : 2;
+      unsigned int reserved_3 : 6;
+    };
+    unsigned int DW_12_DATA;
+  } RECT_PARAMETER_2_UNION;
+
+  // The field-width constants (pitch_bits etc.) are declared at the top of this struct.
+} SDMA_PKT_COPY_LINEAR_RECT;
+
+typedef struct SDMA_PKT_CONSTANT_FILL_TAG {  // Constant fill packet layout: 5 dwords (DW_0..DW_4).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int sw : 2;
+      unsigned int reserved_0 : 12;
+      unsigned int fillsize : 2;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_31_0 : 32;  // Destination address, low 32 bits.
+    };
+    unsigned int DW_1_DATA;
+  } DST_ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int dst_addr_63_32 : 32;  // Destination address, high 32 bits.
+    };
+    unsigned int DW_2_DATA;
+  } DST_ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int src_data_31_0 : 32;
+    };
+    unsigned int DW_3_DATA;
+  } DATA_UNION;
+
+  union {
+    struct {
+      unsigned int count : 22;  // 22-bit field; kMaxSize_ (0x3fffe0) fits within it.
+      unsigned int reserved_0 : 10;
+    };
+    unsigned int DW_4_DATA;
+  } COUNT_UNION;
+
+  static const size_t kMaxSize_ = 0x3fffe0;  // Value used for BlitSdmaBase::kMaxSingleFillSize.
+} SDMA_PKT_CONSTANT_FILL;
+
+typedef struct SDMA_PKT_FENCE_TAG {  // Fence packet layout: 4 dwords (DW_0..DW_3).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 16;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int addr_31_0 : 32;  // Address, low 32 bits.
+    };
+    unsigned int DW_1_DATA;
+  } ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int addr_63_32 : 32;  // Address, high 32 bits.
+    };
+    unsigned int DW_2_DATA;
+  } ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int data : 32;  // 32-bit payload.
+    };
+    unsigned int DW_3_DATA;
+  } DATA_UNION;
+} SDMA_PKT_FENCE;
+
+typedef struct SDMA_PKT_POLL_REGMEM_TAG {  // Poll (reg/mem) packet layout: 6 dwords (DW_0..DW_5).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 10;
+      unsigned int hdp_flush : 1;
+      unsigned int reserved_1 : 1;
+      unsigned int func : 3;
+      unsigned int mem_poll : 1;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int addr_31_0 : 32;  // Address, low 32 bits.
+    };
+    unsigned int DW_1_DATA;
+  } ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int addr_63_32 : 32;  // Address, high 32 bits.
+    };
+    unsigned int DW_2_DATA;
+  } ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int value : 32;
+    };
+    unsigned int DW_3_DATA;
+  } VALUE_UNION;
+
+  union {
+    struct {
+      unsigned int mask : 32;
+    };
+    unsigned int DW_4_DATA;
+  } MASK_UNION;
+
+  union {
+    struct {
+      unsigned int interval : 16;
+      unsigned int retry_count : 12;
+      unsigned int reserved_0 : 4;
+    };
+    unsigned int DW_5_DATA;
+  } DW5_UNION;
+} SDMA_PKT_POLL_REGMEM;
+
+typedef struct SDMA_PKT_ATOMIC_TAG {  // Atomic packet layout: 8 dwords (DW_0..DW_7).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int l : 1;
+      unsigned int reserved_0 : 8;
+      unsigned int operation : 7;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int addr_31_0 : 32;  // Address, low 32 bits.
+    };
+    unsigned int DW_1_DATA;
+  } ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int addr_63_32 : 32;  // Address, high 32 bits.
+    };
+    unsigned int DW_2_DATA;
+  } ADDR_HI_UNION;
+
+  union {
+    struct {
+      unsigned int src_data_31_0 : 32;  // Source data, low 32 bits.
+    };
+    unsigned int DW_3_DATA;
+  } SRC_DATA_LO_UNION;
+
+  union {
+    struct {
+      unsigned int src_data_63_32 : 32;  // Source data, high 32 bits.
+    };
+    unsigned int DW_4_DATA;
+  } SRC_DATA_HI_UNION;
+
+  union {
+    struct {
+      unsigned int cmp_data_31_0 : 32;  // Compare data, low 32 bits.
+    };
+    unsigned int DW_5_DATA;
+  } CMP_DATA_LO_UNION;
+
+  union {
+    struct {
+      unsigned int cmp_data_63_32 : 32;  // Compare data, high 32 bits.
+    };
+    unsigned int DW_6_DATA;
+  } CMP_DATA_HI_UNION;
+
+  union {
+    struct {
+      unsigned int loop_interval : 13;
+      unsigned int reserved_0 : 19;
+    };
+    unsigned int DW_7_DATA;
+  } LOOP_UNION;
+} SDMA_PKT_ATOMIC;
+
+typedef struct SDMA_PKT_TIMESTAMP_TAG {  // Timestamp packet layout: 3 dwords (DW_0..DW_2).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 16;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int addr_31_0 : 32;  // Address, low 32 bits.
+    };
+    unsigned int DW_1_DATA;
+  } ADDR_LO_UNION;
+
+  union {
+    struct {
+      unsigned int addr_63_32 : 32;  // Address, high 32 bits.
+    };
+    unsigned int DW_2_DATA;
+  } ADDR_HI_UNION;
+
+} SDMA_PKT_TIMESTAMP;
+
+typedef struct SDMA_PKT_TRAP_TAG {  // Trap packet layout: 2 dwords (DW_0..DW_1).
+  union {
+    struct {
+      unsigned int op : 8;
+      unsigned int sub_op : 8;
+      unsigned int reserved_0 : 16;
+    };
+    unsigned int DW_0_DATA;
+  } HEADER_UNION;
+
+  union {
+    struct {
+      unsigned int int_ctx : 28;  // 28-bit interrupt context field.
+      unsigned int reserved_1 : 4;
+    };
+    unsigned int DW_1_DATA;
+  } INT_CONTEXT_UNION;
+} SDMA_PKT_TRAP;
+
+// HDP flush packet, no parameters: six raw dwords, pre-built below as hdp_flush_cmd.
+typedef struct SDMA_PKT_HDP_FLUSH_TAG {
+  unsigned int DW_0_DATA;
+  unsigned int DW_1_DATA;
+  unsigned int DW_2_DATA;
+  unsigned int DW_3_DATA;
+  unsigned int DW_4_DATA;
+  unsigned int DW_5_DATA;
+
+  // Minimum gfx9 sDMA microcode version that introduced SDMA_PKT_HDP_FLUSH.
+  static const uint16_t kMinVersion_ = 0x1A5;
+} SDMA_PKT_HDP_FLUSH;
+static const SDMA_PKT_HDP_FLUSH hdp_flush_cmd = {0x8, 0x0, 0x80000000, 0x0, 0x0, 0x0};
+
+}  // namespace amd
+
+#endif  // HSA_RUNTIME_CORE_INC_SDMA_REGISTERS_H_
@@ -51,328 +51,10 @@
 #include "core/inc/amd_gpu_agent.h"
 #include "core/inc/amd_memory_region.h"
 #include "core/inc/runtime.h"
+#include "core/inc/sdma_registers.h"
 #include "core/inc/signal.h"

 namespace amd {
-// SDMA packet for VI device.
-// Reference: http://people.freedesktop.org/~agd5f/dma_packets.txt
-
-const unsigned int SDMA_OP_COPY = 1;
-const unsigned int SDMA_OP_FENCE = 5;
-const unsigned int SDMA_OP_TRAP = 6;
-const unsigned int SDMA_OP_POLL_REGMEM = 8;
-const unsigned int SDMA_OP_ATOMIC = 10;
-const unsigned int SDMA_OP_CONST_FILL = 11;
-const unsigned int SDMA_OP_TIMESTAMP = 13;
-const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
-const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
-const unsigned int SDMA_ATOMIC_ADD64 = 47;
-
-typedef struct SDMA_PKT_COPY_LINEAR_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int extra_info : 16;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int count : 22;
-      unsigned int reserved_0 : 10;
-    };
-    unsigned int DW_1_DATA;
-  } COUNT_UNION;
-
-  union {
-    struct {
-      unsigned int reserved_0 : 16;
-      unsigned int dst_swap : 2;
-      unsigned int reserved_1 : 6;
-      unsigned int src_swap : 2;
-      unsigned int reserved_2 : 6;
-    };
-    unsigned int DW_2_DATA;
-  } PARAMETER_UNION;
-
-  union {
-    struct {
-      unsigned int src_addr_31_0 : 32;
-    };
-    unsigned int DW_3_DATA;
-  } SRC_ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int src_addr_63_32 : 32;
-    };
-    unsigned int DW_4_DATA;
-  } SRC_ADDR_HI_UNION;
-
-  union {
-    struct {
-      unsigned int dst_addr_31_0 : 32;
-    };
-    unsigned int DW_5_DATA;
-  } DST_ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int dst_addr_63_32 : 32;
-    };
-    unsigned int DW_6_DATA;
-  } DST_ADDR_HI_UNION;
-} SDMA_PKT_COPY_LINEAR;
-
-typedef struct SDMA_PKT_CONSTANT_FILL_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int sw : 2;
-      unsigned int reserved_0 : 12;
-      unsigned int fillsize : 2;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int dst_addr_31_0 : 32;
-    };
-    unsigned int DW_1_DATA;
-  } DST_ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int dst_addr_63_32 : 32;
-    };
-    unsigned int DW_2_DATA;
-  } DST_ADDR_HI_UNION;
-
-  union {
-    struct {
-      unsigned int src_data_31_0 : 32;
-    };
-    unsigned int DW_3_DATA;
-  } DATA_UNION;
-
-  union {
-    struct {
-      unsigned int count : 22;
-      unsigned int reserved_0 : 10;
-    };
-    unsigned int DW_4_DATA;
-  } COUNT_UNION;
-} SDMA_PKT_CONSTANT_FILL;
-
-typedef struct SDMA_PKT_FENCE_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int reserved_0 : 16;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int addr_31_0 : 32;
-    };
-    unsigned int DW_1_DATA;
-  } ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int addr_63_32 : 32;
-    };
-    unsigned int DW_2_DATA;
-  } ADDR_HI_UNION;
-
-  union {
-    struct {
-      unsigned int data : 32;
-    };
-    unsigned int DW_3_DATA;
-  } DATA_UNION;
-} SDMA_PKT_FENCE;
-
-typedef struct SDMA_PKT_POLL_REGMEM_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int reserved_0 : 10;
-      unsigned int hdp_flush : 1;
-      unsigned int reserved_1 : 1;
-      unsigned int func : 3;
-      unsigned int mem_poll : 1;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int addr_31_0 : 32;
-    };
-    unsigned int DW_1_DATA;
-  } ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int addr_63_32 : 32;
-    };
-    unsigned int DW_2_DATA;
-  } ADDR_HI_UNION;
-
-  union {
-    struct {
-      unsigned int value : 32;
-    };
-    unsigned int DW_3_DATA;
-  } VALUE_UNION;
-
-  union {
-    struct {
-      unsigned int mask : 32;
-    };
-    unsigned int DW_4_DATA;
-  } MASK_UNION;
-
-  union {
-    struct {
-      unsigned int interval : 16;
-      unsigned int retry_count : 12;
-      unsigned int reserved_0 : 4;
-    };
-    unsigned int DW_5_DATA;
-  } DW5_UNION;
-} SDMA_PKT_POLL_REGMEM;
-
-typedef struct SDMA_PKT_ATOMIC_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int l : 1;
-      unsigned int reserved_0 : 8;
-      unsigned int operation : 7;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int addr_31_0 : 32;
-    };
-    unsigned int DW_1_DATA;
-  } ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int addr_63_32 : 32;
-    };
-    unsigned int DW_2_DATA;
-  } ADDR_HI_UNION;
-
-  union {
-    struct {
-      unsigned int src_data_31_0 : 32;
-    };
-    unsigned int DW_3_DATA;
-  } SRC_DATA_LO_UNION;
-
-  union {
-    struct {
-      unsigned int src_data_63_32 : 32;
-    };
-    unsigned int DW_4_DATA;
-  } SRC_DATA_HI_UNION;
-
-  union {
-    struct {
-      unsigned int cmp_data_31_0 : 32;
-    };
-    unsigned int DW_5_DATA;
-  } CMP_DATA_LO_UNION;
-
-  union {
-    struct {
-      unsigned int cmp_data_63_32 : 32;
-    };
-    unsigned int DW_6_DATA;
-  } CMP_DATA_HI_UNION;
-
-  union {
-    struct {
-      unsigned int loop_interval : 13;
-      unsigned int reserved_0 : 19;
-    };
-    unsigned int DW_7_DATA;
-  } LOOP_UNION;
-} SDMA_PKT_ATOMIC;
-
-typedef struct SDMA_PKT_TIMESTAMP_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int reserved_0 : 16;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int addr_31_0 : 32;
-    };
-    unsigned int DW_1_DATA;
-  } ADDR_LO_UNION;
-
-  union {
-    struct {
-      unsigned int addr_63_32 : 32;
-    };
-    unsigned int DW_2_DATA;
-  } ADDR_HI_UNION;
-
-} SDMA_PKT_TIMESTAMP;
-
-typedef struct SDMA_PKT_TRAP_TAG {
-  union {
-    struct {
-      unsigned int op : 8;
-      unsigned int sub_op : 8;
-      unsigned int reserved_0 : 16;
-    };
-    unsigned int DW_0_DATA;
-  } HEADER_UNION;
-
-  union {
-    struct {
-      unsigned int int_ctx : 28;
-      unsigned int reserved_1 : 4;
-    };
-    unsigned int DW_1_DATA;
-  } INT_CONTEXT_UNION;
-} SDMA_PKT_TRAP;
-
-// Initialize Hdp flush packet for use on sDMA of devices
-// from Gfx9 or new  family
-static const SDMA_PKT_POLL_REGMEM hdp_flush_cmd_ {
-                                        { SDMA_OP_POLL_REGMEM },
-                                        { 0x00 },
-                                        { 0x80000000 },
-                                        { 0x00 },
-                                        { 0x00 },
-                                        { 0x00 },
-};
-
-// Version of sDMA microcode supporting Hdp flush
-static const uint16_t sdma_version_ = 0x01A5;

 inline uint32_t ptrlow32(const void* p) {
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
@@ -388,8 +70,8 @@ inline uint32_t ptrhigh32(const void* p) {

 const size_t BlitSdmaBase::kQueueSize = 1024 * 1024;
 const size_t BlitSdmaBase::kCopyPacketSize = sizeof(SDMA_PKT_COPY_LINEAR);
-const size_t BlitSdmaBase::kMaxSingleCopySize = 0x3fffe0;  // From HW documentation
-const size_t BlitSdmaBase::kMaxSingleFillSize = 0x3fffe0;
+const size_t BlitSdmaBase::kMaxSingleCopySize = SDMA_PKT_COPY_LINEAR::kMaxSize_;
+const size_t BlitSdmaBase::kMaxSingleFillSize = SDMA_PKT_CONSTANT_FILL::kMaxSize_;

 // Initialize size of various sDMA commands use by this module
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
@@ -437,8 +119,6 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::~BlitSdma() {}
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initialize(
    const core::Agent& agent) {
-  agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));
-
  if (queue_start_addr_ != NULL) {
    // Already initialized.
    return HSA_STATUS_SUCCESS;
@@ -448,24 +128,23 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
    return HSA_STATUS_ERROR;
  }

-  const amd::GpuAgentInt& amd_gpu_agent =
-      static_cast<const amd::GpuAgentInt&>(agent);
+  agent_ = reinterpret_cast<amd::GpuAgent*>(&const_cast<core::Agent&>(agent));

-  if (HSA_PROFILE_FULL == amd_gpu_agent.profile()) {
+  if (HSA_PROFILE_FULL == agent_->profile()) {
    assert(false && "Only support SDMA for dgpu currently");
    return HSA_STATUS_ERROR;
  }

-  if (amd_gpu_agent.isa()->version() == core::Isa::Version(7, 0, 1)) {
+  if (agent_->isa()->version() == core::Isa::Version(7, 0, 1)) {
    platform_atomic_support_ = false;
  } else {
    const core::Runtime::LinkInfo& link = core::Runtime::runtime_singleton_->GetLinkInfo(
-        amd_gpu_agent.node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
+        agent_->node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
    platform_atomic_support_ = link.info.atomic_support_64bit;
  }

  // Determine if sDMA microcode supports HDP flush command
-  if (agent_->GetSdmaMicrocodeVersion() >= sdma_version_) {
+  if (agent_->GetSdmaMicrocodeVersion() >= SDMA_PKT_HDP_FLUSH::kMinVersion_) {
    hdp_flush_support_ = true;
  }

@@ -483,7 +162,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Initial
  // This call binds user mode queue object to underlying compute
  // device.
  const HSA_QUEUE_TYPE kQueueType_ = HSA_QUEUE_SDMA;
-  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(amd_gpu_agent.node_id(), kQueueType_, 100,
+  if (HSAKMT_STATUS_SUCCESS != hsaKmtCreateQueue(agent_->node_id(), kQueueType_, 100,
                                                 HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
                                                 kQueueSize, NULL, &queue_resource_)) {
    Destroy(agent);
@@ -539,6 +218,159 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::Destroy
  return HSA_STATUS_SUCCESS;
 }

+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCommand(
+    const void* cmd, size_t cmd_size, std::vector<core::Signal*>& dep_signals,
+    core::Signal& out_signal) {  // Queue `cmd` after dep_signals polls; then update out_signal.
+  // A signal is a 64-bit value but a poll checks a 32-bit value, so two
+  // poll operations are needed per dependent signal.
+  const uint32_t num_poll_command =
+      static_cast<uint32_t>(2 * dep_signals.size());
+  const uint32_t total_poll_command_size =
+      (num_poll_command * poll_command_size_);
+
+  // Load the profiling state early in case the user disables or enables
+  // profiling in the middle of the call.
+  const bool profiling_enabled = agent_->profiling_enabled();
+
+  uint64_t* end_ts_addr = NULL;
+  uint32_t total_timestamp_command_size = 0;
+
+  if (profiling_enabled) {
+    // The SDMA timestamp packet requires 32-byte aligned memory, but
+    // amd_signal_t::end_ts is not 32-byte aligned. So an extra copy packet to
+    // read from a 32-byte aligned bounce buffer is required to avoid changing
+    // the amd_signal_t ABI.
+
+    end_ts_addr = agent_->ObtainEndTsObject();
+    if (end_ts_addr == NULL) {
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+
+    total_timestamp_command_size =
+        (2 * timestamp_command_size_) + linear_copy_command_size_;
+  }
+
+  // On an agent that does not support platform atomics, the atomic is replaced
+  // with one or two fence packet(s) to update the signal value. A fence is used
+  // rather than a write packet because the SDMA engine may overlap serial
+  // copy/write packets.
+  const uint64_t completion_signal_value =
+      static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
+  const size_t sync_command_size = (platform_atomic_support_)
+                                       ? atomic_command_size_
+                                       : (completion_signal_value > UINT32_MAX)
+                                             ? 2 * fence_command_size_
+                                             : fence_command_size_;
+
+  // If the signal is an interrupt signal, the SDMA engine must also send an
+  // interrupt packet to IH.
+  const size_t interrupt_command_size =
+      (out_signal.signal_.event_mailbox_ptr != 0)
+          ? (fence_command_size_ + trap_command_size_)
+          : 0;
+
+  // Reserve space for one Hdp flush command (emitted before or after `cmd`).
+  uint32_t flush_cmd_size = 0;
+  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
+    if ((HwIndexMonotonic) && (hdp_flush_support_)) {
+      flush_cmd_size = flush_command_size_;
+    }
+  }
+  // Total bytes to reserve. NOTE(review): size_t operands are narrowed to uint32_t here.
+  const uint32_t total_command_size = total_poll_command_size + cmd_size + sync_command_size +
+      total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
+
+  RingIndexTy curr_index;
+  char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
+
+  if (command_addr == NULL) {
+    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+  }
+
+  for (size_t i = 0; i < dep_signals.size(); ++i) {
+    uint32_t* signal_addr =
+        reinterpret_cast<uint32_t*>(dep_signals[i]->ValueLocation());
+    // Wait for the higher 32 bits to be 0.
+    BuildPollCommand(command_addr, &signal_addr[1], 0);
+    command_addr += poll_command_size_;
+    // Then wait for the lower 32 bits to be 0.
+    BuildPollCommand(command_addr, &signal_addr[0], 0);
+    command_addr += poll_command_size_;
+  }
+
+  if (profiling_enabled) {
+    BuildGetGlobalTimestampCommand(
+        command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
+    command_addr += timestamp_command_size_;
+  }
+
+  // Emit the Hdp flush ahead of the command when sdma_h2d_ is false.
+  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
+    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
+      BuildHdpFlushCommand(command_addr);
+      command_addr += flush_command_size_;
+    }
+  }
+
+  // Place the caller's pre-built command bytes after the polls (and any timestamp/flush).
+  memcpy(command_addr, cmd, cmd_size);
+  command_addr += cmd_size;
+
+  // Emit the Hdp flush after the command when sdma_h2d_ is true.
+  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
+    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
+      BuildHdpFlushCommand(command_addr);
+      command_addr += flush_command_size_;
+    }
+  }
+
+  if (profiling_enabled) {
+    assert(IsMultipleOf(end_ts_addr, 32));
+    BuildGetGlobalTimestampCommand(command_addr,
+                                   reinterpret_cast<void*>(end_ts_addr));
+    command_addr += timestamp_command_size_;
+    // Copy the timestamp from the bounce buffer into out_signal's end_ts.
+    BuildCopyCommand(command_addr, 1,
+                     reinterpret_cast<void*>(&out_signal.signal_.end_ts),
+                     reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
+    command_addr += linear_copy_command_size_;
+  }
+
+  // After transfer is completed, decrement the signal value.
+  if (platform_atomic_support_) {
+    BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
+    command_addr += atomic_command_size_;
+
+  } else {  // No platform atomics: write the precomputed (value - 1) with fence packet(s).
+    uint32_t* signal_value_location = reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
+    if (completion_signal_value > UINT32_MAX) {
+      BuildFenceCommand(command_addr, signal_value_location + 1,
+                        static_cast<uint32_t>(completion_signal_value >> 32));
+      command_addr += fence_command_size_;
+    }
+
+    BuildFenceCommand(command_addr, signal_value_location,
+                      static_cast<uint32_t>(completion_signal_value));
+
+    command_addr += fence_command_size_;
+  }
+
+  // Update mailbox event and send interrupt to IH.
+  if (out_signal.signal_.event_mailbox_ptr != 0) {
+    BuildFenceCommand(command_addr,
+                      reinterpret_cast<uint32_t*>(out_signal.signal_.event_mailbox_ptr),
+                      static_cast<uint32_t>(out_signal.signal_.event_id));
+    command_addr += fence_command_size_;
+
+    BuildTrapCommand(command_addr);  // Last packet, so command_addr is not advanced.
+  }
+
+  ReleaseWriteAddress(curr_index, total_command_size);
+
+  return HSA_STATUS_SUCCESS;
+}
+
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
    void* dst, const void* src, size_t size) {
@@ -546,8 +378,7 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitL
  // the SDMA linear copy limit.
  const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;

-  const uint32_t total_copy_command_size =
-      num_copy_command * linear_copy_command_size_;
+  const uint32_t total_copy_command_size = num_copy_command * linear_copy_command_size_;

  // Add space for acquire or release Hdp flush command
  uint32_t flush_cmd_size = 0;
@@ -603,161 +434,79 @@ template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitLinearCopyCommand(
    void* dst, const void* src, size_t size, std::vector<core::Signal*>& dep_signals,
    core::Signal& out_signal) {
-  // The signal is 64 bit value, and poll checks for 32 bit value. So we
-  // need to use two poll operations per dependent signal.
-  const uint32_t num_poll_command =
-      static_cast<uint32_t>(2 * dep_signals.size());
-  const uint32_t total_poll_command_size =
-      (num_poll_command * poll_command_size_);
-
-  // Break the copy into multiple copy operation incase the copy size exceeds
+  // Break the copy into multiple copy operations when the copy size exceeds
  // the SDMA linear copy limit.
  const uint32_t num_copy_command = (size + kMaxSingleCopySize - 1) / kMaxSingleCopySize;
-  const uint32_t total_copy_command_size =
-      num_copy_command * linear_copy_command_size_;

-  // Load the profiling state early in case the user disable or enable the
-  // profiling in the middle of the call.
-  const bool profiling_enabled = agent_->profiling_enabled();
+  // Assemble copy packets.
+  std::vector<SDMA_PKT_COPY_LINEAR> buff(num_copy_command);
+  BuildCopyCommand(reinterpret_cast<char*>(&buff[0]), num_copy_command, dst, src, size);

-  uint64_t* end_ts_addr = NULL;
-  uint32_t total_timestamp_command_size = 0;
+  return SubmitCommand(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR), dep_signals,
+                       out_signal);
+}

-  if (profiling_enabled) {
-    // SDMA timestamp packet requires 32 byte of aligned memory, but
-    // amd_signal_t::end_ts is not 32 byte aligned. So an extra copy packet to
-    // read from a 32 byte aligned bounce buffer is required to avoid changing
-    // the amd_signal_t ABI.
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::SubmitCopyRectCommand(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, std::vector<core::Signal*>& dep_signals,
+    core::Signal& out_signal) {
+  // Hardware requires DWORD (4 byte) alignment for base addresses, pitches and slices.
+  // Also confirm that we have a geometric rect (copied block does not wrap an edge).
+  if (((uintptr_t)dst->base) % 4 != 0 || ((uintptr_t)src->base) % 4 != 0)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                             "Copy rect base address not aligned.");
+  if (((uintptr_t)dst->pitch) % 4 != 0 || ((uintptr_t)src->pitch) % 4 != 0)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch not aligned.");
+  if (((uintptr_t)dst->slice) % 4 != 0 || ((uintptr_t)src->slice) % 4 != 0)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice not aligned.");
+  if (uint64_t(src_offset->x) + range->x > src->pitch ||
+      uint64_t(dst_offset->x) + range->x > dst->pitch)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect width out of range.");
+  if ((src->slice != 0) && (uint64_t(src_offset->y) + range->y) > src->slice / src->pitch)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
+  if ((dst->slice != 0) && (uint64_t(dst_offset->y) + range->y) > dst->slice / dst->pitch)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect height out of range.");
+  if (range->z > 1 && (src->slice == 0 || dst->slice == 0))
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed.");

-    end_ts_addr = agent_->ObtainEndTsObject();
-    if (end_ts_addr == NULL) {
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
+  const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;

-    total_timestamp_command_size =
-        (2 * timestamp_command_size_) + linear_copy_command_size_;
-  }
+  std::vector<SDMA_PKT_COPY_LINEAR_RECT> pkts;
+  auto append = [&](size_t size) {
+    assert(size == sizeof(SDMA_PKT_COPY_LINEAR_RECT) && "SDMA packet size missmatch");
+    pkts.emplace_back(SDMA_PKT_COPY_LINEAR_RECT());
+    return &pkts.back();
+  };

-  // On agent that does not support platform atomic, we replace it with
-  // one or two fence packet(s) to update the signal value. The reason fence
-  // is used and not write packet is because the SDMA engine may overlap a
-  // serial copy/write packets.
-  const uint64_t completion_signal_value =
-      static_cast<uint64_t>(out_signal.LoadRelaxed() - 1);
-  const size_t sync_command_size = (platform_atomic_support_)
-                                       ? atomic_command_size_
-                                       : (completion_signal_value > UINT32_MAX)
-                                             ? 2 * fence_command_size_
-                                             : fence_command_size_;
+  // Do wide pitch 2D copies along X-Z
+  if (range->z == 1 && (src->pitch > max_pitch || dst->pitch > max_pitch)) {
+    hsa_pitched_ptr_t Src = *src;
+    hsa_pitched_ptr_t Dst = *dst;
+    hsa_dim3_t Soff = *src_offset;
+    hsa_dim3_t Doff = *dst_offset;
+    hsa_dim3_t Range = *range;

-  // If the signal is an interrupt signal, we also need to make SDMA engine to
-  // send interrupt packet to IH.
-  const size_t interrupt_command_size =
-      (out_signal.signal_.event_mailbox_ptr != 0)
-          ? (fence_command_size_ + trap_command_size_)
-          : 0;
+    Src.base = static_cast<char*>(Src.base) + Soff.z * Src.slice + Soff.y * Src.pitch;
+    Dst.base = static_cast<char*>(Dst.base) + Doff.z * Dst.slice + Doff.y * Dst.pitch;
+    Soff.y = Soff.z = 0;
+    Doff.y = Doff.z = 0;

-  // Add space for acquire or release Hdp flush command
-  uint32_t flush_cmd_size = 0;
-  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
-    if ((HwIndexMonotonic) && (hdp_flush_support_)) {
-      flush_cmd_size = flush_command_size_;
-    }
-  }
+    Src.slice = Src.pitch;
+    Src.pitch = 0;
+    Dst.slice = Dst.pitch;
+    Dst.pitch = 0;

-  const uint32_t total_command_size =
-      total_poll_command_size + total_copy_command_size + sync_command_size +
-      total_timestamp_command_size + interrupt_command_size + flush_cmd_size;
-
-  RingIndexTy curr_index;
-  char* command_addr = AcquireWriteAddress(total_command_size, curr_index);
-
-  if (command_addr == NULL) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
-  for (size_t i = 0; i < dep_signals.size(); ++i) {
-    uint32_t* signal_addr =
-        reinterpret_cast<uint32_t*>(dep_signals[i]->ValueLocation());
-    // Wait for the higher 64 bit to 0.
-    BuildPollCommand(command_addr, &signal_addr[1], 0);
-    command_addr += poll_command_size_;
-    // Then wait for the lower 64 bit to 0.
-    BuildPollCommand(command_addr, &signal_addr[0], 0);
-    command_addr += poll_command_size_;
-  }
-
-  if (profiling_enabled) {
-    BuildGetGlobalTimestampCommand(
-        command_addr, reinterpret_cast<void*>(&out_signal.signal_.start_ts));
-    command_addr += timestamp_command_size_;
-  }
-
-  // Determine if a Hdp flush cmd is required at the top of cmd stream
-  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
-    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_ == false)) {
-      BuildHdpFlushCommand(command_addr);
-      command_addr += flush_command_size_;
-    }
-  }
-
-  // Do the transfer after all polls are satisfied.
-  BuildCopyCommand(command_addr, num_copy_command, dst, src, size);
-  command_addr += total_copy_command_size;
-
-  // Determine if a Hdp flush cmd is required at the end of cmd stream
-  if (core::Runtime::runtime_singleton_->flag().enable_sdma_hdp_flush()) {
-    if ((HwIndexMonotonic) && (hdp_flush_support_) && (sdma_h2d_)) {
-      BuildHdpFlushCommand(command_addr);
-      command_addr += flush_command_size_;
-    }
-  }
-
-  if (profiling_enabled) {
-    assert(IsMultipleOf(end_ts_addr, 32));
-    BuildGetGlobalTimestampCommand(command_addr,
-                                   reinterpret_cast<void*>(end_ts_addr));
-    command_addr += timestamp_command_size_;
-
-    BuildCopyCommand(command_addr, 1,
-                     reinterpret_cast<void*>(&out_signal.signal_.end_ts),
-                     reinterpret_cast<void*>(end_ts_addr), sizeof(uint64_t));
-    command_addr += linear_copy_command_size_;
-  }
-
-  // After transfer is completed, decrement the signal value.
-  if (platform_atomic_support_) {
-    BuildAtomicDecrementCommand(command_addr, out_signal.ValueLocation());
-    command_addr += atomic_command_size_;
+    Range.z = Range.y;
+    Range.y = 1;

+    BuildCopyRectCommand(append, &Dst, &Doff, &Src, &Soff, &Range);
  } else {
-    uint32_t* signal_value_location =
-        reinterpret_cast<uint32_t*>(out_signal.ValueLocation());
-    if (completion_signal_value > UINT32_MAX) {
-      BuildFenceCommand(command_addr, signal_value_location + 1,
-                        static_cast<uint32_t>(completion_signal_value >> 32));
-      command_addr += fence_command_size_;
-    }
-
-    BuildFenceCommand(command_addr, signal_value_location,
-                      static_cast<uint32_t>(completion_signal_value));
-
-    command_addr += fence_command_size_;
+    BuildCopyRectCommand(append, dst, dst_offset, src, src_offset, range);
  }

-  // Update mailbox event and send interrupt to IH.
-  if (out_signal.signal_.event_mailbox_ptr != 0) {
-    BuildFenceCommand(command_addr, reinterpret_cast<uint32_t*>(
-                                        out_signal.signal_.event_mailbox_ptr),
-                      static_cast<uint32_t>(out_signal.signal_.event_id));
-    command_addr += fence_command_size_;
-
-    BuildTrapCommand(command_addr);
-  }
-
-  ReleaseWriteAddress(curr_index, total_command_size);
-
-  return HSA_STATUS_SUCCESS;
+  return SubmitCommand(pkts.data(), pkts.size() * sizeof(SDMA_PKT_COPY_LINEAR_RECT), dep_signals,
+                       out_signal);
 }

 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
@@ -1057,6 +806,131 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyComman
  assert(cur_size == size);
 }

+/*
+Copies are done in terms of elements (1, 2, 4, 8, or 16 bytes) and have alignment restrictions.
+Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byte, 4=16 byte).
+This routine breaks a large rect into tiles that can be handled by hardware.  Pitches and offsets
+must be representable in terms of elements in all tiles of the copy.
+*/
+template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
+void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildCopyRectCommand(
+    const std::function<void*(size_t)>& append, const hsa_pitched_ptr_t* dst,
+    const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
+    const hsa_dim3_t* range) {
+  // Returns the index of the first set bit (ie log2 of the largest power of 2 that evenly divides
+  // width), the largest element that perfectly covers width.
+  // width | 16 ensures that we don't return a higher element than is supported and avoids
+  // issues with 0.
+  auto maxAlignedElement = [](size_t width) {
+    return __builtin_ctz(width | 16);
+  };
+
+  // Limits in terms of element count
+  const uint max_pitch = 1 << SDMA_PKT_COPY_LINEAR_RECT::pitch_bits;
+  const uint max_slice = 1 << SDMA_PKT_COPY_LINEAR_RECT::slice_bits;
+  const uint max_x = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
+  const uint max_y = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_xy_bits;
+  const uint max_z = 1 << SDMA_PKT_COPY_LINEAR_RECT::rect_z_bits;
+
+  // Find maximum element that describes the pitch and slice.
+  // Pitch and slice must both be represented in units of elements.  No element larger than this
+  // may be used in any tile as the pitches would not be exactly represented.
+  int max_ele = Min(maxAlignedElement(src->pitch), maxAlignedElement(dst->pitch));
+  if (range->z != 1)  // Only need to consider slice if HW will copy along Z.
+    max_ele = Min(max_ele, maxAlignedElement(src->slice), maxAlignedElement(dst->slice));
+
+  /*
+  Find the minimum element size that will be needed for any tile.
+
+  No subdivision of a range admits a larger element size for the smallest element in any subdivision
+  than the element size that covers the whole range, though some can be worse (this is easily model
+  checked).  Subdividing with any element larger than the covering element won't change the covering
+  element of the remainder
+  ( Range%Element = (Range-N*LargerElement)%Element since LargerElement%Element=0 ).
+    Ex. range->x=71, assume max range is 16 elements:  We can break at 64 giving tiles:
+    [0,63], [64-70] (width 64 & 7).  64 is covered by element 4 (16B) and 7 is covered by element 0
+    (1B).  Exactly covering 71 requires using element 0.
+
+  Base addresses in each tile must be DWORD aligned, if not then the offset from an aligned address
+  must be represented in elements.  This may reduce the size of the element, but since elements are
+  integer multiples of each other this is harmless.
+
+  src and dst base has already been checked for DWORD alignment so we only need to consider the
+  offset here.
+  */
+  int min_ele = Min(max_ele, maxAlignedElement(range->x), maxAlignedElement(src_offset->x % 4),
+                    maxAlignedElement(dst_offset->x % 4));
+
+  // Check that pitch and slice can be represented in the tile with the smallest element
+  if ((src->pitch >> min_ele) > max_pitch || (dst->pitch >> min_ele) > max_pitch)
+    throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect pitch out of limits.\n");
+  if (range->z != 1) {  // Only need to consider slice if HW will copy along Z.
+    if ((src->slice >> min_ele) > max_slice || (dst->slice >> min_ele) > max_slice)
+      throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT,
+                               "Copy rect slice out of limits.\n");
+  }
+
+  // Break copy into tiles
+  for (uint64_t z = 0; z < range->z; z += max_z) {
+    for (uint64_t y = 0; y < range->y; y += max_y) {
+      uint64_t x = 0;
+      while (x < range->x) {
+        uint64_t width = range->x - x;
+
+        // Get largest element which describes the start of this tile after its base address has
+        // been aligned.  Base addresses must be DWORD (4 byte) aligned.
+        int aligned_ele = Min(maxAlignedElement((src_offset->x + x) % 4),
+                              maxAlignedElement((dst_offset->x + x) % 4), max_ele);
+
+        // Get largest permissible element which exactly covers width
+        int element = Min(maxAlignedElement(width), aligned_ele);
+        uint64_t xcount = width >> element;
+
+        // If width is too large then width is at least max_x bytes (bigger than any element) so
+        // drop the width restriction and clip element count to max_x (Min needs matching types).
+        if (xcount > max_x) {
+          element = aligned_ele;
+          xcount = Min<uint64_t>(width >> element, max_x);
+        }
+
+        // Get base addresses and offsets for this tile.
+        uintptr_t sbase = (uintptr_t)src->base + src_offset->x + x +
+            (src_offset->y + y) * src->pitch + (src_offset->z + z) * src->slice;
+        uintptr_t dbase = (uintptr_t)dst->base + dst_offset->x + x +
+            (dst_offset->y + y) * dst->pitch + (dst_offset->z + z) * dst->slice;
+        uint soff = (sbase % 4) >> element;
+        uint doff = (dbase % 4) >> element;
+        sbase &= ~3ull;
+        dbase &= ~3ull;
+
+        x += xcount << element;
+
+        SDMA_PKT_COPY_LINEAR_RECT* pkt =
+            (SDMA_PKT_COPY_LINEAR_RECT*)append(sizeof(SDMA_PKT_COPY_LINEAR_RECT));
+        *pkt = {};
+        pkt->HEADER_UNION.op = SDMA_OP_COPY;
+        pkt->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_LINEAR_RECT;
+        pkt->HEADER_UNION.element = element;
+        pkt->SRC_ADDR_LO_UNION.src_addr_31_0 = sbase;
+        pkt->SRC_ADDR_HI_UNION.src_addr_63_32 = sbase >> 32;
+        pkt->SRC_PARAMETER_1_UNION.src_offset_x = soff;
+        pkt->SRC_PARAMETER_2_UNION.src_pitch = (src->pitch >> element) - 1;
+        pkt->SRC_PARAMETER_3_UNION.src_slice_pitch =
+            (range->z == 1) ? 0 : (src->slice >> element) - 1;
+        pkt->DST_ADDR_LO_UNION.dst_addr_31_0 = dbase;
+        pkt->DST_ADDR_HI_UNION.dst_addr_63_32 = dbase >> 32;
+        pkt->DST_PARAMETER_1_UNION.dst_offset_x = doff;
+        pkt->DST_PARAMETER_2_UNION.dst_pitch = (dst->pitch >> element) - 1;
+        pkt->DST_PARAMETER_3_UNION.dst_slice_pitch =
+            (range->z == 1) ? 0 : (dst->slice >> element) - 1;
+        pkt->RECT_PARAMETER_1_UNION.rect_x = xcount - 1;
+        pkt->RECT_PARAMETER_1_UNION.rect_y = Min<uint64_t>(range->y - y, max_y) - 1;
+        pkt->RECT_PARAMETER_2_UNION.rect_z = Min<uint64_t>(range->z - z, max_z) - 1;
+      }
+    }
+  }
+}
+
 template <typename RingIndexTy, bool HwIndexMonotonic, int SizeToCountOffset>
 void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildPollCommand(
    char* cmd_addr, void* addr, uint32_t reference) {
@@ -1126,7 +1000,7 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset>::BuildHdpFlushCo
    char* cmd_addr) {
  assert(cmd_addr != NULL);
  SDMA_PKT_POLL_REGMEM* addr = reinterpret_cast<SDMA_PKT_POLL_REGMEM*>(cmd_addr);
-  memcpy(addr, &hdp_flush_cmd_, flush_command_size_);
+  memcpy(addr, &hdp_flush_cmd, flush_command_size_);
 }

 template class BlitSdma<uint32_t, false, 0>;
@@ -634,6 +634,31 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent,
  return stat;
 }

+hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
+                                   const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
+                                   const hsa_dim3_t* range, hsa_amd_copy_direction_t dir,
+                                   std::vector<core::Signal*>& dep_signals,
+                                   core::Signal& out_signal) {
+  if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT;
+
+  lazy_ptr<core::Blit>& blit =
+      (dir == hsaHostToDevice) ? blits_[BlitHostToDev] : blits_[BlitDevToHost];
+
+  if (!blit->isSDMA()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+
+  if (profiling_enabled()) {
+    // Track the agent so we could translate the resulting timestamp to system
+    // domain correctly.
+    out_signal.async_copy_agent(core::Agent::Convert(this->public_handle()));
+  }
+
+  BlitSdmaBase* sdmaBlit = static_cast<BlitSdmaBase*>((*blit).get());
+  hsa_status_t stat = sdmaBlit->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range,
+                                                      dep_signals, out_signal);
+
+  return stat;
+}
+
 hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) {
  return blits_[BlitDevToDev]->SubmitLinearFillCommand(ptr, value, count);
 }
@@ -381,6 +381,7 @@ void HsaApiTable::UpdateAmdExts() {
  amd_ext_api.hsa_amd_queue_intercept_create_fn = AMD::hsa_amd_queue_intercept_create;
  amd_ext_api.hsa_amd_queue_intercept_register_fn = AMD::hsa_amd_queue_intercept_register;
  amd_ext_api.hsa_amd_queue_set_priority_fn = AMD::hsa_amd_queue_set_priority;
+  amd_ext_api.hsa_amd_memory_async_copy_rect_fn = AMD::hsa_amd_memory_async_copy_rect;
 }

 class Init {
@@ -47,6 +47,7 @@
 #include <utility>
 #include <memory>
 #include <map>
+#include <vector>

 #include "core/inc/runtime.h"
 #include "core/inc/agent.h"
@@ -262,6 +263,52 @@ hsa_status_t hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent_handle,
  CATCH;
 }

+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal) {
+  TRY;
+  if (dst == nullptr || src == nullptr || dst_offset == nullptr || src_offset == nullptr ||
+      range == nullptr) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  if ((num_dep_signals == 0 && dep_signals != NULL) ||
+      (num_dep_signals > 0 && dep_signals == NULL)) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (dir == hsaHostToHost) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+
+  core::Agent* base_agent = core::Agent::Convert(copy_agent);
+  IS_VALID(base_agent);
+  if (base_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice)
+    return HSA_STATUS_ERROR_INVALID_AGENT;
+  amd::GpuAgent* agent = static_cast<amd::GpuAgent*>(base_agent);
+
+  std::vector<core::Signal*> dep_signal_list(num_dep_signals);
+  if (num_dep_signals > 0) {
+    for (size_t i = 0; i < num_dep_signals; ++i) {
+      core::Signal* dep_signal_obj = core::Signal::Convert(dep_signals[i]);
+      IS_VALID(dep_signal_obj);
+      dep_signal_list[i] = dep_signal_obj;
+    }
+  }
+
+  core::Signal* out_signal_obj = core::Signal::Convert(completion_signal);
+  IS_VALID(out_signal_obj);
+
+  if ((range->x != 0) && (range->y != 0) && (range->z != 0)) {
+    return agent->DmaCopyRect(dst, dst_offset, src, src_offset, range, dir, dep_signal_list,
+                              *out_signal_obj);
+  }
+
+  return HSA_STATUS_SUCCESS;
+  CATCH;
+}
+
+
 hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t* queue, int enable) {
  TRY;
  IS_OPEN();
@@ -176,6 +176,11 @@ static __forceinline T Min(const T& a, const T& b) {
  return (a > b) ? b : a;
 }

+template <class T, class... Arg>
+static __forceinline T Min(const T& a, const T& b, Arg... args) {
+  return Min(a, Min(b, args...));
+}
+
 /// @brief: Find out the max one of two inputs, input must support ">" operator.
 /// @param: a(Input), a reference to type T.
 /// @param: b(Input), a reference to type T.
@@ -185,6 +190,11 @@ static __forceinline T Max(const T& a, const T& b) {
  return (b > a) ? b : a;
 }

+template <class T, class... Arg>
+static __forceinline T Max(const T& a, const T& b, Arg... args) {
+  return Max(a, Max(b, args...));
+}
+
 /// @brief: Free the memory space which is newed previously.
 /// @param: ptr(Input), a pointer to memory space. Can't be NULL.
 /// @return: void.
@@ -216,6 +216,7 @@ global:
 	hsa_amd_ipc_signal_attach;
 	hsa_amd_register_system_event_handler;
 	hsa_amd_queue_set_priority;
+	hsa_amd_memory_async_copy_rect;

 local:
    *;
@@ -172,6 +172,7 @@ struct AmdExtTable {
  decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
  decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
  decltype(hsa_amd_queue_set_priority)* hsa_amd_queue_set_priority_fn;
+  decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
 };

 // Table to export HSA Core Runtime Apis
@@ -882,6 +882,43 @@ hsa_status_t HSA_API
                              const hsa_signal_t* dep_signals,
                              hsa_signal_t completion_signal);

+/*
+[Provisional API]
+Pitched memory descriptor.
+All elements must be 4 byte aligned.  Pitch and slice are in bytes.
+*/
+typedef struct hsa_pitched_ptr_s {
+  void* base;
+  size_t pitch;
+  size_t slice;
+} hsa_pitched_ptr_t;
+
+/*
+[Provisional API]
+Copy direction flag.
+*/
+typedef enum {
+  hsaHostToHost = 0,
+  hsaHostToDevice = 1,
+  hsaDeviceToHost = 2,
+  hsaDeviceToDevice = 3
+} hsa_amd_copy_direction_t;
+
+/*
+[Provisional API]
+SDMA 3D memory copy API.  The same requirements must be met by src and dst as in
+hsa_amd_memory_async_copy.
+Both src and dst must be directly accessible to the copy_agent during the copy, src and dst rects
+must not overlap.
+CPU agents are not supported.  API requires SDMA and will return an error if SDMA is not available.
+Offsets and range carry x in bytes, y and z in rows and layers.
+*/
+hsa_status_t HSA_API hsa_amd_memory_async_copy_rect(
+    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
+    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
+    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal);
+
 /**
 * @brief Type of accesses to a memory pool from a given agent.
 */