Implement optimized blit/fill kernels

Replace HSAIL kernels with SP3 shaders. Support all alignment variations efficiently. Change-Id: Icf7f5471f3ba68389f55484d82f2805dd9bc3827
2016-04-27 16:48:32 -05:00
@@ -43,6 +43,7 @@
 #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
 #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_

+#include <map>
 #include <stdint.h>

 #include "core/inc/blit.h"
@@ -66,8 +67,10 @@ class BlitKernel : public core::Blit {
  ///
  /// @note: The call will block until all AQL packets have been executed.
  ///
+  /// @param agent Agent passed to Initialize.
+  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy() override;
+  virtual hsa_status_t Destroy(const core::Agent& agent) override;

  /// @brief Submit an AQL packet to perform vector copy. The call is blocking
  /// until the command execution is finished.
@@ -104,17 +107,36 @@ class BlitKernel : public core::Blit {

 private:
  union KernelArgs {
-    struct __ALIGNED__(16) KernelCopyArgs {
-      const void* src;
-      void* dst;
-      uint64_t size;
-      uint32_t use_vector;
-    } copy;
+    struct __ALIGNED__(16) {
+      uint64_t phase1_src_start;
+      uint64_t phase1_dst_start;
+      uint64_t phase2_src_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase3_src_start;
+      uint64_t phase3_dst_start;
+      uint64_t phase4_src_start;
+      uint64_t phase4_dst_start;
+      uint64_t phase4_src_end;
+      uint64_t phase4_dst_end;
+      uint32_t num_workitems;
+    } copy_aligned;

-    struct __ALIGNED__(16) KernelFillArgs {
-      void* ptr;
-      uint64_t num;
-      uint32_t value;
+    struct __ALIGNED__(16) {
+      uint64_t phase1_src_start;
+      uint64_t phase1_dst_start;
+      uint64_t phase2_src_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase2_src_end;
+      uint64_t phase2_dst_end;
+      uint32_t num_workitems;
+    } copy_misaligned;
+
+    struct __ALIGNED__(16) {
+      uint64_t phase1_dst_start;
+      uint64_t phase2_dst_start;
+      uint64_t phase2_dst_end;
+      uint32_t fill_value;
+      uint32_t num_workitems;
    } fill;
  };

@@ -136,14 +158,19 @@ class BlitKernel : public core::Blit {

  KernelArgs* ObtainAsyncKernelCopyArg();

-  /// Handles to the vector copy kernel.
-  uint64_t copy_code_handle_;
+  /// AQL code object and size for each kernel.
+  enum class KernelType {
+    CopyAligned,
+    CopyMisaligned,
+    Fill,
+  };

-  /// Handles to the vector copy aligned kernel.
-  uint64_t copy_aligned_code_handle_;
+  struct KernelCode {
+    void* code_buf_;
+    size_t code_buf_size_;
+  };

-  /// Handles to the fill memory kernel.
-  uint64_t fill_code_handle_;
+  std::map<KernelType, KernelCode> kernels_;

  /// AQL queue for submitting the vector copy kernel.
  hsa_queue_t* queue_;
@@ -163,12 +190,8 @@ class BlitKernel : public core::Blit {
  /// Lock to synchronize access to kernarg_ and completion_signal_
  std::mutex lock_;

-  /// Pointer to memory containing the ISA and argument buffer.
-  void* code_arg_buffer_;
-
-  static const size_t kMaxCopyCount;
-  static const size_t kMaxFillCount;
-  static const uint32_t kGroupSize;
+  /// Number of CUs on the underlying agent.
+  int num_cus_;
 };
 }  // namespace amd

@@ -1,479 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-// 
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-// 
-// Developed by:
-// 
-//                 AMD Research and AMD HSA Software Development
-// 
-//                 Advanced Micro Devices, Inc.
-// 
-//                 www.amd.com
-// 
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-// 
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_
-#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_KV_H_
-
-#include <stddef.h>
-
-#define HSA_VECTOR_COPY_KV_AKC_SIZE 368
-#define HSA_VECTOR_COPY_KV_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kVectorCopyRawKv.
-module &m:1:0:$full:$large:$default;
-
-prog kernel &__vector_copy_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size)
-{
-  @__vector_copy_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @BB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_u8	$s0, [$d0];
-  st_global_u8	$s0, [$d2];
-
-  @BB0_2:
-  // %return
-  ret;
-};
-*/
-
-static char kVectorCopyRawKv[] = {
-    127,  69,   76,   70,   2,   1,   1,   64,   0,    0,    0,   0,    0,
-    0,    0,    0,    1,    0,   -32, 0,   1,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   64,  0,    0,    0,    0,   0,    0,
-    0,    -104, 3,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    64,   0,    56,   0,    1,   0,   64,  0,    6,    0,    5,   0,    3,
-    0,    0,    96,   6,    0,   0,   0,   0,    1,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   112, 1,   0,    0,    0,    0,   0,    0,
-    112,  1,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    1,   0,   0,   0,    0,    0,    0,   0,    0,
-    1,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   65,  0,   -116, 0,    -112, 0,   0,    0,
-    11,   0,    10,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    24,   0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    11,   0,    5,   0,   5,   0,    0,    0,    9,   0,    0,
-    0,    0,    0,    0,    0,   3,   0,   0,    6,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   1,   5,   0,    -64,  127,  0,   -116, -65,
-    0,    -1,   -128, -109, 0,   0,   16,  0,    0,    8,    0,   -109, 0,
-    0,    0,    74,   4,    7,   64,  -64, -128, 2,    2,    126, 127,  0,
-    -116, -65,  0,    0,    -56, 125, 106, 36,   -128, -66,  15,  0,    -120,
-    -65,  0,    7,    -126, -64, 127, 0,   -116, -65,  4,    0,   2,    74,
-    5,    2,    4,    126,  2,   106, 80,  -46,  2,    1,    -87, 1,    0,
-    0,    32,   -36,  1,    0,   0,   1,   6,    0,    6,    74,  7,    2,
-    4,    126,  4,    106,  80,  -46, 2,   1,    -87,  1,    112, 0,    -116,
-    -65,  0,    0,    96,   -36, 3,   1,   0,    0,    0,    0,   -127, -65,
-    3,    0,    0,    0,    8,   0,   0,   0,    1,    0,    0,   0,    65,
-    77,   68,   0,    1,    0,   0,   0,   0,    0,    0,    0,   3,    0,
-    0,    0,    12,   0,    0,   0,   2,   0,    0,    0,    65,  77,   68,
-    0,    1,    0,    0,    0,   0,   0,   0,    0,    1,    1,   1,    0,
-    3,    0,    0,    0,    28,  0,   0,   0,    3,    0,    0,   0,    65,
-    77,   68,   0,    4,    0,   7,   0,   7,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   65,  77,   68,   0,    65,  77,   68,
-    71,   80,   85,   0,    0,   3,   0,   0,    0,    40,   0,   0,    0,
-    4,    0,    0,    0,    65,  77,  68,  0,    26,   0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   65,   77,   68,   32,  72,   83,
-    65,   32,   82,   117,  110, 116, 105, 109,  101,  32,   70,  105,  110,
-    97,   108,  105,  122,  101, 114, 0,   0,    0,    38,   95,  95,   118,
-    101,  99,   116,  111,  114, 95,  99,  111,  112,  121,  95,  107,  101,
-    114,  110,  101,  108,  0,   95,  95,  104,  115,  97,   95,  115,  101,
-    99,   116,  105,  111,  110, 46,  104, 115,  97,   116,  101, 120,  116,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    26,   0,   1,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    22,   0,   0,   0,   3,    0,    1,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    46,   104,  115, 97,  116, 101,  120,  116,  0,   46,   110,
-    111,  116,  101,  0,    46,  115, 116, 114,  116,  97,   98,  0,    46,
-    115,  121,  109,  116,  97,  98,  0,   46,   115,  104,  115, 116,  114,
-    116,  97,   98,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    1,    0,   0,    0,
-    1,    0,    0,    0,    7,   0,   -64, 0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    1,    0,    0,   0,    0,
-    0,    0,    112,  1,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   1,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    10,   0,    0,   0,    7,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   112, 2,    0,    0,    0,   0,    0,
-    0,    -104, 0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    4,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   16,   0,    0,    0,   3,    0,
-    0,    0,    32,   0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   8,   3,   0,    0,    0,    0,   0,    0,
-    44,   0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    1,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    0,   0,   24,  0,    0,    0,    2,   0,    0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,    56,  3,   0,   0,    0,    0,    0,   0,    48,
-    0,    0,    0,    0,    0,   0,   0,   3,    0,    0,    0,   0,    0,
-    0,    0,    8,    0,    0,   0,   0,   0,    0,    0,    24,  0,    0,
-    0,    0,    0,    0,    0,   32,  0,   0,    0,    3,    0,   0,    0,
-    32,   0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    104,  3,   0,   0,   0,    0,    0,    0,   42,   0,
-    0,    0,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    1,    0,    0,    0,   0,   0,   0,    0,    0,    0,   0,    0,
-    0,    0,    0,    0,
-};
-extern char* const kVectorCopyKvObject = &kVectorCopyRawKv[0];
-extern size_t const kVectorCopyKvObjectSize = sizeof(kVectorCopyRawKv);
-
-#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE 436
-#define HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawKv.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-
-prog kernel &__copy_buffer_aligned_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size,
-  kernarg_u32 %use_vector)
-{
-  @__copy_buffer_aligned_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_4;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%use_vector];
-  cmp_ne_b1_s32	$c0, $s0, 1;
-  cbr_b1	$c0, @LBB0_3;
-  // BB#2:                                // %if.then2
-  shl_u64	$d0, $d0, 4;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_v4_global_align(16)_const_u32	($s0, $s1, $s2, $s3), [$d0];
-  st_v4_global_align(16)_u32	($s0, $s1, $s2, $s3), [$d2];
-  br	@LBB0_4;
-
-  @LBB0_3:
-  // %if.else
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_align(4)_const_u32	$s0, [$d0];
-  st_global_align(4)_u32	$s0, [$d2];
-
-  @LBB0_4:
-  // %if.end6
-  ret;
-};
-*/
-
-static char kVectorCopyAlignedRawKv[] = {
-    127,  69,   76,   70,   2,   1,   1,    64,   0,    0,    0,    0,    0,
-    0,    0,    0,    1,    0,   -32, 0,    1,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   64,   0,    0,    0,    0,    0,    0,
-    0,    -8,   3,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    64,   0,    56,   0,    1,   0,   64,   0,    6,    0,    5,    0,    3,
-    0,    0,    96,   6,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   -76, 1,    0,    0,    0,    0,    0,    0,
-    -76,  1,    0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    1,    0,    0,    0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   65,  0,    -84,  0,    -112, 0,    0,    0,
-    11,   0,    10,   0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    32,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    11,   0,    7,   0,   7,    0,    0,    0,    9,    0,    0,
-    0,    0,    0,    0,    0,   4,   4,    4,    6,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   1,   5,    0,    -64,  127,  0,    -116, -65,
-    0,    -1,   -128, -109, 0,   0,   16,   0,    0,    8,    0,    -109, 0,
-    0,    0,    74,   4,    7,   64,  -64,  -128, 2,    2,    126,  127,  0,
-    -116, -65,  0,    0,    -56, 125, 106,  36,   -128, -66,  32,   0,    -120,
-    -65,  6,    7,    1,    -64, 0,   7,    -126, -64,  127,  0,    -116, -65,
-    2,    -127, 0,    -65,  14,  0,   -124, -65,  0,    0,    -62,  -46,  0,
-    9,    1,    0,    4,    0,   4,   74,   5,    2,    6,    126,  3,    3,
-    6,    80,   0,    0,    56,  -36, 2,    0,    0,    2,    6,    0,    0,
-    74,   7,    2,    12,   126, 6,   3,    2,    80,   112,  0,    -116, -65,
-    0,    0,    120,  -36,  0,   2,   0,    0,    13,   0,    -126, -65,  0,
-    0,    -62,  -46,  0,    5,   1,   0,    4,    0,    4,    74,   5,    2,
-    6,    126,  3,    3,    6,   80,  0,    0,    48,   -36,  2,    0,    0,
-    2,    6,    0,    0,    74,  7,   2,    6,    126,  3,    3,    2,    80,
-    112,  0,    -116, -65,  0,   0,   112,  -36,  0,    2,    0,    0,    0,
-    0,    -127, -65,  0,    0,   0,   0,    4,    0,    0,    0,    8,    0,
-    0,    0,    1,    0,    0,   0,   65,   77,   68,   0,    1,    0,    0,
-    0,    0,    0,    0,    0,   4,   0,    0,    0,    12,   0,    0,    0,
-    2,    0,    0,    0,    65,  77,  68,   0,    1,    0,    0,    0,    0,
-    0,    0,    0,    1,    1,   1,   0,    4,    0,    0,    0,    25,   0,
-    0,    0,    5,    0,    0,   0,   65,   77,   68,   0,    22,   0,    45,
-    104,  115,  97,   95,   99,  97,  108,  108,  95,   99,   111,  110,  118,
-    101,  110,  116,  105,  111, 110, 61,   0,    0,    0,    0,    0,    4,
-    0,    0,    0,    30,   0,   0,   0,    3,    0,    0,    0,    65,   77,
-    68,   0,    4,    0,    7,   0,   7,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   65,  77,   68,   0,    65,   77,   68,   71,
-    80,   85,   0,    0,    0,   0,   0,    0,    4,    0,    0,    0,    8,
-    0,    0,    0,    4,    0,   0,   0,    65,   77,   68,   0,    -32,  101,
-    -118, -12,  -1,   127,  0,   0,   38,   95,   95,   99,   111,  112,  121,
-    95,   98,   117,  102,  102, 101, 114,  95,   97,   108,  105,  103,  110,
-    101,  100,  95,   107,  101, 114, 110,  101,  108,  0,    95,   95,   104,
-    115,  97,   95,   115,  101, 99,  116,  105,  111,  110,  46,   104,  115,
-    97,   116,  101,  120,  116, 0,   0,    0,    0,    0,    0,    0,    0,
-    0,    26,   0,    1,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    -76,  1,    0,    0,    0,   0,   0,    0,    30,   0,    0,    0,    3,
-    0,    1,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    46,   104,  115,  97,   116,  101,
-    120,  116,  0,    46,   110, 111, 116,  101,  0,    46,   115,  116,  114,
-    116,  97,   98,   0,    46,  115, 121,  109,  116,  97,   98,   0,    46,
-    115,  104,  115,  116,  114, 116, 97,   98,   0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    1,    0,    0,    0,   1,   0,    0,    0,    7,    0,    -64,  0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,    0,    0,    0,    0,   0,   0,    -76,  1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    1,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    10,   0,    0,    0,    7,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    -72,  2,
-    0,    0,    0,    0,    0,   0,   -88,  0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    8,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    16,
-    0,    0,    0,    3,    0,   0,   0,    32,   0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    96,   3,    0,
-    0,    0,    0,    0,    0,   52,  0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    0,    0,    24,   0,
-    0,    0,    2,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,    -104, 3,    0,    0,
-    0,    0,    0,    0,    48,  0,   0,    0,    0,    0,    0,    0,    3,
-    0,    0,    0,    0,    0,   0,   0,    8,    0,    0,    0,    0,    0,
-    0,    0,    24,   0,    0,   0,   0,    0,    0,    0,    32,   0,    0,
-    0,    3,    0,    0,    0,   32,  0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    -56,  3,    0,    0,    0,
-    0,    0,    0,    42,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   1,    0,    0,    0,    0,    0,    0,
-    0,    0,    0,    0,    0,   0,   0,    0,    0,
-};
-extern char* const kVectorCopyAlignedKvObject = &kVectorCopyAlignedRawKv[0];
-extern size_t const kVectorCopyAlignedKvObjectSize =
-    sizeof(kVectorCopyAlignedRawKv);
-
-#define HSA_FILL_MEMORY_KV_AKC_SIZE 352
-#define HSA_FILL_MEMORY_KV_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kFillMemoryRawKv.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-
-prog kernel &__fill_memory_kernel(
-kernarg_u64 %ptr,
-kernarg_u64 %num,
-kernarg_u32 %value)
-{
-@__fill_memory_kernel_entry:
-// BB#0:                                // %entry
-workitemabsid_u32	$s0, 0;
-cvt_u64_u32	$d0, $s0;
-ld_kernarg_align(8)_width(all)_u64	$d1, [%num];
-cmp_ge_b1_u64	$c0, $d0, $d1;
-cbr_b1	$c0, @LBB0_2;
-// BB#1:                                // %if.end
-ld_kernarg_align(8)_width(all)_u64	$d1, [%ptr];
-ld_kernarg_align(4)_width(all)_u32	$s0, [%value];
-shl_u64	$d0, $d0, 2;
-add_u64	$d0, $d1, $d0;
-st_global_align(4)_u32	$s0, [$d0];
-
-@LBB0_2:
-// %return
-ret;
-};
-*/
-
-static char kFillMemoryRawKv[] = {
-    127,  69,   76,  70,  2,    1,    1,   64,  0,    0,   0,   0,   0,    0,
-    0,    0,    1,   0,   -32,  0,    1,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   64,   0,    0,   0,   0,    0,   0,   0,   -104, 3,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   64,  0,   56,   0,
-    1,    0,    64,  0,   6,    0,    5,   0,   3,    0,   0,   96,  6,    0,
-    0,    0,    0,   1,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   96,   1,
-    0,    0,    0,   0,   0,    0,    96,  1,   0,    0,   0,   0,   0,    0,
-    0,    1,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   1,    0,    0,   0,   1,    0,   0,   0,   1,    0,
-    0,    0,    0,   0,   0,    0,    0,   1,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   64,  0,   -84,  0,
-    -112, 0,    0,   0,   11,   0,    10,  0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    32,  0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   11,   0,    3,   0,   3,    0,   0,   0,   9,    0,
-    0,    0,    0,   0,   0,    0,    4,   4,   4,    6,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   1,    5,   0,   -64, 127,  0,
-    -116, -65,  0,   -1,  -128, -109, 0,   0,   16,   0,   0,   8,   0,    -109,
-    0,    0,    0,   74,  2,    7,    64,  -64, -128, 2,   2,   126, 127,  0,
-    -116, -65,  0,   0,   -56,  125,  106, 36,  -128, -66, 11,  0,   -120, -65,
-    0,    7,    65,  -64, 4,    7,    2,   -64, 0,    0,   -62, -46, 0,    5,
-    1,    0,    127, 0,   -116, -65,  2,   0,   0,    74,  3,   2,   4,    126,
-    2,    3,    2,   80,  4,    2,    4,   126, 0,    0,   112, -36, 0,    2,
-    0,    0,    0,   0,   -127, -65,  4,   0,   0,    0,   8,   0,   0,    0,
-    1,    0,    0,   0,   65,   77,   68,  0,   1,    0,   0,   0,   0,    0,
-    0,    0,    4,   0,   0,    0,    12,  0,   0,    0,   2,   0,   0,    0,
-    65,   77,   68,  0,   1,    0,    0,   0,   0,    0,   0,   0,   1,    1,
-    1,    0,    4,   0,   0,    0,    25,  0,   0,    0,   5,   0,   0,    0,
-    65,   77,   68,  0,   22,   0,    45,  104, 115,  97,  95,  99,  97,   108,
-    108,  95,   99,  111, 110,  118,  101, 110, 116,  105, 111, 110, 61,   0,
-    0,    0,    0,   0,   4,    0,    0,   0,   30,   0,   0,   0,   3,    0,
-    0,    0,    65,  77,  68,   0,    4,   0,   7,    0,   7,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   65,   77,  68,  0,   65,   77,
-    68,   71,   80,  85,  0,    0,    0,   0,   0,    0,   4,   0,   0,    0,
-    8,    0,    0,   0,   4,    0,    0,   0,   65,   77,  68,  0,   48,   123,
-    44,   -103, -4,  127, 0,    0,    38,  95,  95,   102, 105, 108, 108,  95,
-    109,  101,  109, 111, 114,  121,  95,  107, 101,  114, 110, 101, 108,  0,
-    95,   95,   104, 115, 97,   95,   115, 101, 99,   116, 105, 111, 110,  46,
-    104,  115,  97,  116, 101,  120,  116, 0,   0,    0,   0,   0,   0,    0,
-    0,    0,    26,  0,   1,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    96,   1,    0,   0,   0,    0,    0,   0,   22,   0,   0,   0,   3,    0,
-    1,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    46,   104, 115, 97,   116, 101, 120, 116,  0,
-    46,   110,  111, 116, 101,  0,    46,  115, 116,  114, 116, 97,  98,   0,
-    46,   115,  121, 109, 116,  97,   98,  0,   46,   115, 104, 115, 116,  114,
-    116,  97,   98,  0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   1,    0,    0,   0,   1,    0,   0,   0,   7,    0,
-    -64,  0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    1,    0,   0,   0,    0,    0,   0,   96,   1,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   1,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   10,   0,
-    0,    0,    7,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   96,   2,   0,   0,   0,    0,
-    0,    0,    -88, 0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   8,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    16,  0,   0,    0,   3,   0,   0,    0,
-    32,   0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    8,   3,   0,    0,    0,   0,   0,    0,   44,  0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   1,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    24,   0,    0,   0,   2,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   56,  3,   0,    0,
-    0,    0,    0,   0,   48,   0,    0,   0,   0,    0,   0,   0,   3,    0,
-    0,    0,    0,   0,   0,    0,    8,   0,   0,    0,   0,   0,   0,    0,
-    24,   0,    0,   0,   0,    0,    0,   0,   32,   0,   0,   0,   3,    0,
-    0,    0,    32,  0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,    0,   0,   104,  3,    0,   0,   0,    0,   0,   0,   42,   0,
-    0,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    1,    0,    0,   0,   0,    0,    0,   0,   0,    0,   0,   0,   0,    0,
-    0,    0,
-};
-
-extern char* const kFillMemoryKvObject = &kFillMemoryRawKv[0];
-extern size_t const kFillMemoryKvObjectSize = sizeof(kFillMemoryRawKv);
-#endif  // header guard
@@ -1,490 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-// 
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-// 
-// Developed by:
-// 
-//                 AMD Research and AMD HSA Software Development
-// 
-//                 Advanced Micro Devices, Inc.
-// 
-//                 www.amd.com
-// 
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-// 
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_
-#define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_VI_H_
-
-#include <stddef.h>
-
-#define HSA_VECTOR_COPY_VI_AKC_SIZE 380
-#define HSA_VECTOR_COPY_VI_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kVectorCopyRawVi.
-module &m:1:0:$full:$large:$default;
-
-prog kernel &__vector_copy_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size)
-{
-  @__vector_copy_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @BB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_u8	$s0, [$d0];
-  st_global_u8	$s0, [$d2];
-
-  @BB0_2:
-  // %return
-  ret;
-};
-*/
-
-static char kVectorCopyRawVi[] = {
-    127, 69,  76,   70,   2,   1,   1,    64,   0,    0,    0,    0,    0,
-    0,   0,   0,    1,    0,   -32, 0,    1,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   64,   0,    0,    0,    0,    0,    0,
-    0,   -72, 3,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    64,  0,   56,   0,    1,   0,   64,   0,    6,    0,    5,    0,    3,
-    0,   0,   96,   6,    0,   0,   0,    0,    1,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   124, 1,    0,    0,    0,    0,    0,    0,
-    124, 1,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    1,   0,   0,    0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    1,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   -63, 2,    -84,  0,    -112, 0,    0,    0,
-    11,  0,   10,   0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    32,   0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   96,   0,    5,   0,   5,    0,    0,    0,    9,    0,    0,
-    0,   0,   0,    0,    0,   4,   4,    4,    6,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   2,   0,    2,    -64,  4,    0,    0,    0,
-    127, 0,   -116, -65,  0,   -1,  -128, -110, 0,    0,    16,   0,    0,
-    8,   0,   -110, 0,    0,   0,   50,   3,    0,    6,    -64,  16,   0,
-    0,   0,   -128, 2,    2,   126, 127,  0,    -116, -65,  0,    0,    -40,
-    125, 106, 32,   -128, -66, 16,  0,    -120, -65,  3,    1,    10,   -64,
-    0,   0,   0,    0,    127, 0,   -116, -65,  4,    0,    2,    50,   5,
-    2,   4,   126,  2,    106, 28,  -47,  2,    1,    -87,  1,    0,    0,
-    64,  -36, 1,    0,    0,   1,   6,    0,    6,    50,   7,    2,    4,
-    126, 4,   106,  28,   -47, 2,   1,    -87,  1,    112,  0,    -116, -65,
-    0,   0,   96,   -36,  3,   1,   0,    0,    0,    0,    -127, -65,  0,
-    0,   0,   0,    4,    0,   0,   0,    8,    0,    0,    0,    1,    0,
-    0,   0,   65,   77,   68,  0,   1,    0,    0,    0,    0,    0,    0,
-    0,   4,   0,    0,    0,   12,  0,    0,    0,    2,    0,    0,    0,
-    65,  77,  68,   0,    1,   0,   0,    0,    0,    0,    0,    0,    1,
-    1,   1,   0,    4,    0,   0,   0,    25,   0,    0,    0,    5,    0,
-    0,   0,   65,   77,   68,  0,   22,   0,    45,   104,  115,  97,   95,
-    99,  97,  108,  108,  95,  99,  111,  110,  118,  101,  110,  116,  105,
-    111, 110, 61,   0,    0,   0,   0,    0,    4,    0,    0,    0,    30,
-    0,   0,   0,    3,    0,   0,   0,    65,   77,   68,   0,    4,    0,
-    7,   0,   8,    0,    0,   0,   0,    0,    0,    0,    2,    0,    0,
-    0,   65,  77,   68,   0,   65,  77,   68,   71,   80,   85,   0,    0,
-    0,   0,   0,    0,    4,   0,   0,    0,    8,    0,    0,    0,    4,
-    0,   0,   0,    65,   77,  68,  0,    32,   103,  -72,  81,   -3,   127,
-    0,   0,   38,   95,   95,  118, 101,  99,   116,  111,  114,  95,   99,
-    111, 112, 121,  95,   107, 101, 114,  110,  101,  108,  0,    95,   95,
-    104, 115, 97,   95,   115, 101, 99,   116,  105,  111,  110,  46,   104,
-    115, 97,  116,  101,  120, 116, 0,    0,    0,    0,    0,    0,    0,
-    0,   0,   26,   0,    1,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   124, 1,    0,    0,   0,   0,    0,    0,    22,   0,    0,    0,
-    3,   0,   1,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    46,   104,  115,  97,   116,
-    101, 120, 116,  0,    46,  110, 111,  116,  101,  0,    46,   115,  116,
-    114, 116, 97,   98,   0,   46,  115,  121,  109,  116,  97,   98,   0,
-    46,  115, 104,  115,  116, 114, 116,  97,   98,   0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   1,    0,    0,   0,   1,    0,    0,    0,    7,    0,    -64,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   1,   0,    0,    0,   0,   0,    0,    124,  1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    1,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   10,  0,    0,    0,   7,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    -128,
-    2,   0,   0,    0,    0,   0,   0,    -88,  0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    8,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    16,  0,   0,    0,    3,   0,   0,    0,    32,   0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    40,   3,
-    0,   0,   0,    0,    0,   0,   44,   0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    1,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    0,    0,    24,
-    0,   0,   0,    2,    0,   0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,    88,   3,    0,
-    0,   0,   0,    0,    0,   48,  0,    0,    0,    0,    0,    0,    0,
-    3,   0,   0,    0,    0,   0,   0,    0,    8,    0,    0,    0,    0,
-    0,   0,   0,    24,   0,   0,   0,    0,    0,    0,    0,    32,   0,
-    0,   0,   3,    0,    0,   0,   32,   0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    -120, 3,    0,    0,
-    0,   0,   0,    0,    42,  0,   0,    0,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    1,    0,    0,    0,    0,    0,
-    0,   0,   0,    0,    0,   0,   0,    0,    0,    0,
-};
-extern char* const kVectorCopyViObject = &kVectorCopyRawVi[0];
-extern size_t const kVectorCopyViObjectSize = sizeof(kVectorCopyRawVi);
-
-#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE 452
-#define HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kVectorCopyAlignedRawVi.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-
-prog kernel &__copy_buffer_aligned_kernel(
-  kernarg_u64 %src,
-  kernarg_u64 %dst,
-  kernarg_u64 %size,
-  kernarg_u32 %use_vector)
-{
-  @__copy_buffer_aligned_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%size];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_4;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d2, [%dst];
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%src];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%use_vector];
-  cmp_ne_b1_s32	$c0, $s0, 1;
-  cbr_b1	$c0, @LBB0_3;
-  // BB#2:                                // %if.then2
-  shl_u64	$d0, $d0, 4;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_v4_global_align(16)_const_u32	($s0, $s1, $s2, $s3), [$d0];
-  st_v4_global_align(16)_u32	($s0, $s1, $s2, $s3), [$d2];
-  br	@LBB0_4;
-
-  @LBB0_3:
-  // %if.else
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d2, $d2, $d0;
-  add_u64	$d0, $d1, $d0;
-  ld_global_align(4)_const_u32	$s0, [$d0];
-  st_global_align(4)_u32	$s0, [$d2];
-
-  @LBB0_4:
-  // %if.end6
-  ret;
-};
-*/
-
-static char kVectorCopyAlignedRawVi[] = {
-    127,  69,   76,   70,   2,    1,    1,    64,   0,    0,    0,   0,   0,
-    0,    0,    0,    1,    0,    -32,  0,    1,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    64,   0,    0,    0,    0,   0,   0,
-    0,    8,    4,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    64,   0,    56,   0,    1,    0,    64,   0,    6,    0,    5,   0,   3,
-    0,    0,    96,   6,    0,    0,    0,    0,    1,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    -60,  1,    0,    0,    0,    0,   0,   0,
-    -60,  1,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    1,    0,    0,    0,    1,    0,    0,    0,    0,    0,    0,   0,   0,
-    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    65,   0,    -84,  0,    -112, 0,   0,   0,
-    11,   0,    74,   0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    32,   0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    16,   0,    8,    0,    8,    0,    0,    0,    12,  0,   0,
-    0,    0,    0,    0,    0,    4,    4,    4,    6,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    2,    0,    2,    -64,  4,    0,   0,   0,
-    127,  0,    -116, -65,  0,    -1,   -128, -110, 0,    0,    16,  0,   0,
-    8,    0,    -110, 0,    0,    0,    50,   3,    0,    6,    -64, 16,  0,
-    0,    0,    -128, 2,    2,    126,  127,  0,    -116, -65,  0,   0,   -40,
-    125,  106,  32,   -128, -66,  34,   0,    -120, -65,  -125, 0,   2,   -64,
-    24,   0,    0,    0,    3,    2,    10,   -64,  0,    0,    0,   0,   127,
-    0,    -116, -65,  2,    -127, 0,    -65,  14,   0,    -124, -65, 0,   0,
-    -113, -46,  -124, 0,    2,    0,    8,    0,    4,    50,   9,   2,   6,
-    126,  3,    3,    6,    56,   0,    0,    92,   -36,  2,    0,   0,   4,
-    10,   0,    0,    50,   11,   2,    4,    126,  2,    3,    2,   56,  112,
-    0,    -116, -65,  0,    0,    124,  -36,  0,    4,    0,    0,   13,  0,
-    -126, -65,  0,    0,    -113, -46,  -126, 0,    2,    0,    8,   0,   4,
-    50,   9,    2,    6,    126,  3,    3,    6,    56,   0,    0,   80,  -36,
-    2,    0,    0,    4,    10,   0,    0,    50,   11,   2,    4,   126, 2,
-    3,    2,    56,   112,  0,    -116, -65,  0,    0,    112,  -36, 0,   4,
-    0,    0,    0,    0,    -127, -65,  0,    0,    0,    0,    4,   0,   0,
-    0,    8,    0,    0,    0,    1,    0,    0,    0,    65,   77,  68,  0,
-    1,    0,    0,    0,    0,    0,    0,    0,    4,    0,    0,   0,   12,
-    0,    0,    0,    2,    0,    0,    0,    65,   77,   68,   0,   1,   0,
-    0,    0,    0,    0,    0,    0,    1,    1,    1,    0,    4,   0,   0,
-    0,    25,   0,    0,    0,    5,    0,    0,    0,    65,   77,  68,  0,
-    22,   0,    45,   104,  115,  97,   95,   99,   97,   108,  108, 95,  99,
-    111,  110,  118,  101,  110,  116,  105,  111,  110,  61,   0,   0,   0,
-    0,    0,    4,    0,    0,    0,    30,   0,    0,    0,    3,   0,   0,
-    0,    65,   77,   68,   0,    4,    0,    7,    0,    8,    0,   0,   0,
-    0,    0,    0,    0,    1,    0,    0,    0,    65,   77,   68,  0,   65,
-    77,   68,   71,   80,   85,   0,    0,    0,    0,    0,    0,   4,   0,
-    0,    0,    8,    0,    0,    0,    4,    0,    0,    0,    65,  77,  68,
-    0,    96,   62,   -27,  85,   -1,   127,  0,    0,    38,   95,  95,  99,
-    111,  112,  121,  95,   98,   117,  102,  102,  101,  114,  95,  97,  108,
-    105,  103,  110,  101,  100,  95,   107,  101,  114,  110,  101, 108, 0,
-    95,   95,   104,  115,  97,   95,   115,  101,  99,   116,  105, 111, 110,
-    46,   104,  115,  97,   116,  101,  120,  116,  0,    0,    0,   0,   0,
-    0,    0,    0,    0,    26,   0,    1,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    -60,  1,    0,    0,    0,    0,    0,    0,   30,  0,
-    0,    0,    3,    0,    1,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    46,  104, 115,
-    97,   116,  101,  120,  116,  0,    46,   110,  111,  116,  101, 0,   46,
-    115,  116,  114,  116,  97,   98,   0,    46,   115,  121,  109, 116, 97,
-    98,   0,    46,   115,  104,  115,  116,  114,  116,  97,   98,  0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    1,    0,    0,    0,    1,    0,    0,   0,   7,
-    0,    -64,  0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    1,    0,    0,    0,    0,    0,    0,    -60, 1,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    10,   0,    0,    0,    7,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    -56,  2,    0,    0,    0,    0,    0,    0,    -88,  0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   8,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    16,   0,    0,    0,    3,    0,    0,    0,    32,  0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    112,  3,    0,    0,    0,    0,    0,    0,    52,   0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   1,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,
-    0,    24,   0,    0,    0,    2,    0,    0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,   -88,
-    3,    0,    0,    0,    0,    0,    0,    48,   0,    0,    0,   0,   0,
-    0,    0,    3,    0,    0,    0,    0,    0,    0,    0,    8,   0,   0,
-    0,    0,    0,    0,    0,    24,   0,    0,    0,    0,    0,   0,   0,
-    32,   0,    0,    0,    3,    0,    0,    0,    32,   0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   -40, 3,
-    0,    0,    0,    0,    0,    0,    42,   0,    0,    0,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    0,   0,   0,
-    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0,
-};
-extern char* const kVectorCopyAlignedViObject = &kVectorCopyAlignedRawVi[0];
-extern size_t const kVectorCopyAlignedViObjectSize =
-    sizeof(kVectorCopyAlignedRawVi);
-
-#define HSA_FILL_MEMORY_VI_AKC_SIZE 368
-#define HSA_FILL_MEMORY_VI_AKC_OFFSET 256
-
-/*****HSAIL code of the ISA in ::kFillMemoryRawVi.
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-
-prog kernel &__fill_memory_kernel(
-  kernarg_u64 %ptr,
-  kernarg_u64 %num,
-  kernarg_u32 %value)
-{
-  @__fill_memory_kernel_entry:
-  // BB#0:                                // %entry
-  workitemabsid_u32	$s0, 0;
-  cvt_u64_u32	$d0, $s0;
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%num];
-  cmp_ge_b1_u64	$c0, $d0, $d1;
-  cbr_b1	$c0, @LBB0_2;
-  // BB#1:                                // %if.end
-  ld_kernarg_align(8)_width(all)_u64	$d1, [%ptr];
-  ld_kernarg_align(4)_width(all)_u32	$s0, [%value];
-  shl_u64	$d0, $d0, 2;
-  add_u64	$d0, $d1, $d0;
-  st_global_align(4)_u32	$s0, [$d0];
-
-  @LBB0_2:
-  // %return
-  ret;
-};
-*/
-
-static char kFillMemoryRawVi[] = {
-    127, 69,   76,   70,   2,   1,   1,    64,   0,    0,    0,   0,    0,
-    0,   0,    0,    1,    0,   -32, 0,    1,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   64,   0,    0,    0,    0,   0,    0,
-    0,   -88,  3,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    64,  0,    56,   0,    1,   0,   64,   0,    6,    0,    5,   0,    3,
-    0,   0,    96,   6,    0,   0,   0,    0,    1,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   112, 1,    0,    0,    0,    0,   0,    0,
-    112, 1,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    1,   0,    0,    0,    1,   0,   0,    0,    0,    0,    0,   0,    0,
-    1,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   64,  0,    -84,  0,    -112, 0,   0,    0,
-    11,  0,    74,   0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    32,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    13,   0,    3,   0,   3,    0,    0,    0,    9,   0,    0,
-    0,   0,    0,    0,    0,   4,   4,    4,    6,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   2,   0,    2,    -64,  4,    0,   0,    0,
-    127, 0,    -116, -65,  0,   -1,  -128, -110, 0,    0,    16,  0,    0,
-    8,   0,    -110, 0,    0,   0,   50,   3,    0,    6,    -64, 8,    0,
-    0,   0,    -128, 2,    2,   126, 127,  0,    -116, -65,  0,   0,    -40,
-    125, 106,  32,   -128, -66, 13,  0,    -120, -65,  -125, 0,   6,    -64,
-    0,   0,    0,    0,    3,   1,   2,    -64,  16,   0,    0,   0,    0,
-    0,   -113, -46,  -126, 0,   2,   0,    127,  0,    -116, -65, 2,    0,
-    0,   50,   3,    2,    4,   126, 2,    3,    2,    56,   4,   2,    4,
-    126, 0,    0,    112,  -36, 0,   2,    0,    0,    0,    0,   -127, -65,
-    4,   0,    0,    0,    8,   0,   0,    0,    1,    0,    0,   0,    65,
-    77,  68,   0,    1,    0,   0,   0,    0,    0,    0,    0,   4,    0,
-    0,   0,    12,   0,    0,   0,   2,    0,    0,    0,    65,  77,   68,
-    0,   1,    0,    0,    0,   0,   0,    0,    0,    1,    1,   1,    0,
-    4,   0,    0,    0,    25,  0,   0,    0,    5,    0,    0,   0,    65,
-    77,  68,   0,    22,   0,   45,  104,  115,  97,   95,   99,  97,   108,
-    108, 95,   99,   111,  110, 118, 101,  110,  116,  105,  111, 110,  61,
-    0,   0,    0,    0,    0,   4,   0,    0,    0,    30,   0,   0,    0,
-    3,   0,    0,    0,    65,  77,  68,   0,    4,    0,    7,   0,    8,
-    0,   0,    0,    0,    0,   0,   0,    1,    0,    0,    0,   65,   77,
-    68,  0,    65,   77,   68,  71,  80,   85,   0,    0,    0,   0,    0,
-    0,   4,    0,    0,    0,   8,   0,    0,    0,    4,    0,   0,    0,
-    65,  77,   68,   0,    16,  -20, 88,   97,   -4,   127,  0,   0,    38,
-    95,  95,   102,  105,  108, 108, 95,   109,  101,  109,  111, 114,  121,
-    95,  107,  101,  114,  110, 101, 108,  0,    95,   95,   104, 115,  97,
-    95,  115,  101,  99,   116, 105, 111,  110,  46,   104,  115, 97,   116,
-    101, 120,  116,  0,    0,   0,   0,    0,    0,    0,    0,   0,    26,
-    0,   1,    0,    0,    0,   0,   0,    0,    0,    0,    0,   112,  1,
-    0,   0,    0,    0,    0,   0,   22,   0,    0,    0,    3,   0,    1,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   46,  104,  115,  97,   116,  101, 120,  116,
-    0,   46,   110,  111,  116, 101, 0,    46,   115,  116,  114, 116,  97,
-    98,  0,    46,   115,  121, 109, 116,  97,   98,   0,    46,  115,  104,
-    115, 116,  114,  116,  97,  98,  0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    1,
-    0,   0,    0,    1,    0,   0,   0,    7,    0,    -64,  0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   1,    0,
-    0,   0,    0,    0,    0,   112, 1,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    1,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   10,   0,
-    0,   0,    7,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    112,  2,   0,    0,
-    0,   0,    0,    0,    -88, 0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    8,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    0,    16,  0,    0,
-    0,   3,    0,    0,    0,   32,  0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    24,   3,    0,   0,    0,
-    0,   0,    0,    44,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   1,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    0,    0,    24,   0,   0,    0,
-    2,   0,    0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,    72,   3,    0,    0,   0,    0,
-    0,   0,    48,   0,    0,   0,   0,    0,    0,    0,    3,   0,    0,
-    0,   0,    0,    0,    0,   8,   0,    0,    0,    0,    0,   0,    0,
-    24,  0,    0,    0,    0,   0,   0,    0,    32,   0,    0,   0,    3,
-    0,   0,    0,    32,   0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   120,  3,    0,    0,    0,   0,    0,
-    0,   42,   0,    0,    0,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    1,   0,   0,    0,    0,    0,    0,   0,    0,
-    0,   0,    0,    0,    0,   0,   0,
-};
-
-extern char* const kFillMemoryViObject = &kFillMemoryRawVi[0];
-extern size_t const kFillMemoryViObjectSize = sizeof(kFillMemoryRawVi);
-#endif  // header guard
@@ -73,8 +73,10 @@ class BlitSdma : public core::Blit {
  ///
  /// @note: The call will block until all packets have executed.
  ///
+  /// @param agent Agent passed to Initialize.
+  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy() override;
+  virtual hsa_status_t Destroy(const core::Agent& agent) override;

  /// @brief Submit a linear copy command to the queue buffer.
  ///
@@ -165,20 +165,24 @@ class GpuAgent : public GpuAgentInt {

  uint16_t GetMicrocodeVersion() const;

-  // @brief Assembles SP3 shader source into executable code.
+  // @brief Assembles SP3 shader source into ISA or AQL code object.
  //
  // @param [in] src_sp3 SP3 shader source text representation.
  // @param [in] func_name Name of the SP3 function to assemble.
-  // @param [out] code_buf Executable code buffer.
-  // @param [out] code_buf_size Size of executable code buffer in bytes.
-  void AssembleShader(const char* src_sp3, const char* func_name,
-                      void*& code_buf, size_t& code_buf_size);
+  // @param [in] assemble_target ISA or AQL assembly target.
+  // @param [out] code_buf Code object buffer.
+  // @param [out] code_buf_size Size of code object buffer in bytes.
+  enum class AssembleTarget { ISA, AQL };

-  // @brief Frees executable code created by AssembleShader.
+  void AssembleShader(const char* src_sp3, const char* func_name,
+                      AssembleTarget assemble_target, void*& code_buf,
+                      size_t& code_buf_size) const;
+
+  // @brief Frees code object created by AssembleShader.
  //
-  // @param [in] code_buf Executable code buffer.
-  // @param [in] code_buf_size Size of executable code buffer in bytes.
-  void ReleaseShader(void* code_buf, size_t code_buf_size);
+  // @param [in] code_buf Code object buffer.
+  // @param [in] code_buf_size Size of code object buffer in bytes.
+  void ReleaseShader(void* code_buf, size_t code_buf_size) const;

  // @brief Override from core::Agent.
  hsa_status_t VisitRegion(bool include_peer,
@@ -66,8 +66,10 @@ class Blit {
  ///
  /// @note: The call will block until all commands have executed.
  ///
+  /// @param agent Agent passed to Initialize.
+  ///
  /// @return hsa_status_t
-  virtual hsa_status_t Destroy() = 0;
+  virtual hsa_status_t Destroy(const core::Agent& agent) = 0;

  /// @brief Submit a linear copy command to the the underlying compute device's
  /// control block. The call is blocking until the command execution is
@@ -43,40 +43,486 @@
 #include "core/inc/amd_blit_kernel.h"

 #include <algorithm>
-#include <climits>
-#include <cmath>
-#include <cstring>
+#include <sstream>
+#include <string>

-#if defined(_WIN32) || defined(_WIN64)
-#define NOMINMAX
-#include <windows.h>
-#else
-#include <sys/mman.h>
-#endif
-
-#include "core/inc/amd_blit_kernel_kv.h"
-#include "core/inc/amd_blit_kernel_vi.h"
 #include "core/inc/amd_gpu_agent.h"
 #include "core/inc/hsa_internal.h"
 #include "core/util/utils.h"

 namespace amd {
-const uint32_t BlitKernel::kGroupSize = 256;
-const size_t BlitKernel::kMaxCopyCount = AlignDown(UINT32_MAX, kGroupSize);
-const size_t BlitKernel::kMaxFillCount = AlignDown(UINT32_MAX, kGroupSize);
-
 static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;

+static std::string kBlitKernelSource(R"(
+  // Compatibility function for GFXIP 7.
+
+  function s_load_dword_offset(byte_offset)
+    if kGFXIPVersion == 7
+      return byte_offset / 4
+    else
+      return byte_offset
+    end
+  end
+
+  // Memory copy for all cases except:
+  //  (src_addr & 0x3) != (dst_addr & 0x3)
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 src start address
+  //   [DW  2, 3]  Phase 1 dst start address
+  //   [DW  4, 5]  Phase 2 src start address
+  //   [DW  6, 7]  Phase 2 dst start address
+  //   [DW  8, 9]  Phase 3 src start address
+  //   [DW 10,11]  Phase 3 dst start address
+  //   [DW 12,13]  Phase 4 src start address
+  //   [DW 14,15]  Phase 4 dst start address
+  //   [DW 16,17]  Phase 4 src end address
+  //   [DW 18,19]  Phase 4 dst end address
+  //   [DW 20   ]  Total number of workitems
+
+  var kCopyAlignedVecWidth = 4
+  var kCopyAlignedUnroll = 1
+
+  shader CopyAligned
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(32)
+    vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
+
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
+    s_load_dwordx4          s[16:19], s[0:1], s_load_dword_offset(0x30)
+    s_load_dwordx4          s[20:23], s[0:1], s_load_dword_offset(0x40)
+    s_load_dword            s24, s[0:1], s_load_dword_offset(0x50)
+    s_waitcnt               lgkmcnt(0)
+
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+
+    // =====================================================
+    // Phase 1: Byte copy up to 0x100 destination alignment.
+    // =====================================================
+
+    // Compute phase source address.
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v0, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s7
+    v_add_u32               v4, vcc, v0, s6
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+  L_COPY_ALIGNED_PHASE_1_LOOP:
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_1_DONE
+    s_and_b64               exec, exec, vcc
+
+    // Load from/advance the source address.
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+    v_add_u32               v2, vcc, v2, s24
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Write to/advance the destination address.
+    flat_store_byte         v[4:5], v1
+    v_add_u32               v4, vcc, v4, s24
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_1_LOOP
+
+  L_COPY_ALIGNED_PHASE_1_DONE:
+    // Restore EXEC mask for all lanes.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+
+    // ========================================================
+    // Phase 2: Unrolled dword[x4] copy up to last whole block.
+    // ========================================================
+
+    // Compute unrolled dword[x4] stride across all threads.
+    if kCopyAlignedVecWidth == 4
+      s_lshl_b32            s25, s24, 0x4
+    else
+      s_lshl_b32            s25, s24, 0x2
+    end
+
+    // Compute phase source address.
+    if kCopyAlignedVecWidth == 4
+      v_lshlrev_b32         v1, 0x4, v0
+    else
+      v_lshlrev_b32         v1, 0x2, v0
+    end
+
+    v_mov_b32               v3, s9
+    v_add_u32               v2, vcc, v1, s8
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s11
+    v_add_u32               v4, vcc, v1, s10
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+  L_COPY_ALIGNED_PHASE_2_LOOP:
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_2_DONE
+
+    // Load from/advance the source address.
+    for var i = 0; i < kCopyAlignedUnroll; i ++
+      if kCopyAlignedVecWidth == 4
+        flat_load_dwordx4   v[8 + (i * 4)], v[2:3]
+      else
+        flat_load_dword     v[8 + i], v[2:3]
+      end
+
+      v_add_u32             v2, vcc, v2, s25
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+
+    // Write to/advance the destination address.
+    s_waitcnt               vmcnt(0)
+
+    for var i = 0; i < kCopyAlignedUnroll; i ++
+      if kCopyAlignedVecWidth == 4
+        flat_store_dwordx4  v[4:5], v[8 + (i * 4)]
+      else
+        flat_store_dword    v[4:5], v[8 + i]
+      end
+
+      v_add_u32             v4, vcc, v4, s25
+      v_addc_u32            v5, vcc, v5, 0x0, vcc
+    end
+
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_2_LOOP
+
+  L_COPY_ALIGNED_PHASE_2_DONE:
+
+    // ===========================================
+    // Phase 3: Dword copy up to last whole dword.
+    // ===========================================
+
+    // Compute dword stride across all threads.
+    s_lshl_b32              s25, s24, 0x2
+
+    // Compute phase source address.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s13
+    v_add_u32               v2, vcc, v1, s12
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s15
+    v_add_u32               v4, vcc, v1, s14
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+  L_COPY_ALIGNED_PHASE_3_LOOP:
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[16:17]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_3_DONE
+    s_and_b64               exec, exec, vcc
+
+    // Load from/advance the source address.
+    flat_load_dword         v1, v[2:3]
+    v_add_u32               v2, vcc, v2, s25
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    s_waitcnt               vmcnt(0)
+
+    // Write to/advance the destination address.
+    flat_store_dword        v[4:5], v1
+    v_add_u32               v4, vcc, v4, s25
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+    // Repeat until branched out.
+    s_branch                L_COPY_ALIGNED_PHASE_3_LOOP
+
+  L_COPY_ALIGNED_PHASE_3_DONE:
+    // Restore EXEC mask for all lanes.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+
+    // =============================
+    // Phase 4: Byte copy up to end.
+    // =============================
+
+    // Compute phase source address.
+    v_mov_b32               v3, s17
+    v_add_u32               v2, vcc, v0, s16
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s19
+    v_add_u32               v4, vcc, v0, s18
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[20:21]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_4_DONE
+    s_and_b64               exec, exec, vcc
+
+    // Load from the source address.
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+
+    // Write to the destination address.
+    flat_store_byte         v[4:5], v1
+
+  L_COPY_ALIGNED_PHASE_4_DONE:
+    s_endpgm
+  end
+
+  // Memory copy for this case:
+  //  (src_addr & 0x3) != (dst_addr & 0x3)
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 src start address
+  //   [DW  2, 3]  Phase 1 dst start address
+  //   [DW  4, 5]  Phase 2 src start address
+  //   [DW  6, 7]  Phase 2 dst start address
+  //   [DW  8, 9]  Phase 2 src end address
+  //   [DW 10,11]  Phase 2 dst end address
+  //   [DW 12   ]  Total number of workitems
+
+  var kCopyMisalignedUnroll = 4
+
+  shader CopyMisaligned
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(23)
+    vgpr_count(6 + kCopyMisalignedUnroll)
+
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
+    s_load_dword            s16, s[0:1], s_load_dword_offset(0x30)
+    s_waitcnt               lgkmcnt(0)
+
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+
+    // ===================================================
+    // Phase 1: Unrolled byte copy up to last whole block.
+    // ===================================================
+
+    // Compute phase source address.
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v0, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s7
+    v_add_u32               v4, vcc, v0, s6
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+  L_COPY_MISALIGNED_PHASE_1_LOOP:
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_1_DONE
+
+    // Load from/advance the source address.
+    for var i = 0; i < kCopyMisalignedUnroll; i ++
+      flat_load_ubyte       v[6 + i], v[2:3]
+      v_add_u32             v2, vcc, v2, s16
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+
+    // Write to/advance the destination address.
+    s_waitcnt               vmcnt(0)
+
+    for var i = 0; i < kCopyMisalignedUnroll; i ++
+      flat_store_byte       v[4:5], v[6 + i]
+      v_add_u32             v4, vcc, v4, s16
+      v_addc_u32            v5, vcc, v5, 0x0, vcc
+    end
+
+    // Repeat until branched out.
+    s_branch                L_COPY_MISALIGNED_PHASE_1_LOOP
+
+  L_COPY_MISALIGNED_PHASE_1_DONE:
+
+    // =============================
+    // Phase 2: Byte copy up to end.
+    // =============================
+
+    // Compute phase source address.
+    v_mov_b32               v3, s9
+    v_add_u32               v2, vcc, v0, s8
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Compute phase destination address.
+    v_mov_b32               v5, s11
+    v_add_u32               v4, vcc, v0, s10
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+  L_COPY_MISALIGNED_PHASE_2_LOOP:
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+
+    // Load from/advance the source address.
+    flat_load_ubyte         v1, v[2:3]
+    v_add_u32               v2, vcc, v2, s16
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+    s_waitcnt               vmcnt(0)
+
+    // Write to/advance the destination address.
+    flat_store_byte         v[4:5], v1
+    v_add_u32               v4, vcc, v4, s16
+    v_addc_u32              v5, vcc, v5, 0x0, vcc
+
+    // Repeat until branched out.
+    s_branch                L_COPY_MISALIGNED_PHASE_2_LOOP
+
+  L_COPY_MISALIGNED_PHASE_2_DONE:
+    s_endpgm
+  end
+
+  // Memory fill for dword-aligned region.
+  //
+  // Kernel argument buffer:
+  //   [DW  0, 1]  Phase 1 dst start address
+  //   [DW  2, 3]  Phase 2 dst start address
+  //   [DW  4, 5]  Phase 2 dst end address
+  //   [DW  6   ]  Value to fill memory with
+  //   [DW  7   ]  Total number of workitems
+
+  var kFillVecWidth = 4
+  var kFillUnroll = 1
+
+  shader Fill
+    type(CS)
+    user_sgpr_count(2)
+    sgpr_count(19)
+    vgpr_count(8)
+
+    // Retrieve kernel arguments.
+    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
+    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
+    s_waitcnt               lgkmcnt(0)
+
+    // Compute workitem id.
+    s_lshl_b32              s2, s2, 0x6
+    v_add_u32               v0, vcc, s2, v0
+
+    // Copy fill pattern into VGPRs.
+    for var i = 0; i < kFillVecWidth; i ++
+      v_mov_b32           v[4 + i], s10
+    end
+
+    // ========================================================
+    // Phase 1: Unrolled dword[x4] fill up to last whole block.
+    // ========================================================
+
+    // Compute unrolled dword[x4] stride across all threads.
+    if kFillVecWidth == 4
+      s_lshl_b32            s12, s11, 0x4
+    else
+      s_lshl_b32            s12, s11, 0x2
+    end
+
+    // Compute phase destination address.
+    if kFillVecWidth == 4
+      v_lshlrev_b32         v1, 0x4, v0
+    else
+      v_lshlrev_b32         v1, 0x2, v0
+    end
+
+    v_mov_b32               v3, s5
+    v_add_u32               v2, vcc, v1, s4
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+  L_FILL_PHASE_1_LOOP:
+    // Branch out after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[6:7]
+    s_cbranch_vccz          L_FILL_PHASE_1_DONE
+
+    // Write to/advance the destination address.
+    for var i = 0; i < kFillUnroll; i ++
+      if kFillVecWidth == 4
+        flat_store_dwordx4  v[2:3], v[4:7]
+      else
+        flat_store_dword    v[2:3], v4
+      end
+
+      v_add_u32             v2, vcc, v2, s12
+      v_addc_u32            v3, vcc, v3, 0x0, vcc
+    end
+
+    // Repeat until branched out.
+    s_branch                L_FILL_PHASE_1_LOOP
+
+  L_FILL_PHASE_1_DONE:
+
+    // ==============================
+    // Phase 2: Dword fill up to end.
+    // ==============================
+
+    // Compute dword stride across all threads.
+    s_lshl_b32              s12, s11, 0x2
+
+    // Compute phase destination address.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s7
+    v_add_u32               v2, vcc, v1, s6
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+  L_FILL_PHASE_2_LOOP:
+    // Mask off lanes (or branch out) after phase end.
+    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
+    s_cbranch_vccz          L_FILL_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+
+    // Write to/advance the destination address.
+    flat_store_dword        v[2:3], v4
+    v_add_u32               v2, vcc, v2, s12
+    v_addc_u32              v3, vcc, v3, 0x0, vcc
+
+    // Repeat until branched out.
+    s_branch                L_FILL_PHASE_2_LOOP
+
+  L_FILL_PHASE_2_DONE:
+    s_endpgm
+  end
+)");
+
+// Search kernel source for variable definition and return value.
+int GetKernelSourceParam(const char* paramName) {
+  std::stringstream paramDef;
+  paramDef << "var " << paramName << " = ";
+
+  std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str());
+  assert(paramDefLoc != std::string::npos);
+  std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size();
+  std::string::size_type paramEndLoc =
+      kBlitKernelSource.find('\n', paramDefLoc);
+  assert(paramDefLoc != std::string::npos);
+
+  std::string paramVal(&kBlitKernelSource[paramValLoc],
+                       &kBlitKernelSource[paramEndLoc]);
+  return std::stoi(paramVal);
+}
+
+static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth");
+static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll");
+static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll");
+static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth");
+static int kFillUnroll = GetKernelSourceParam("kFillUnroll");
+
 BlitKernel::BlitKernel()
    : core::Blit(),
-      copy_code_handle_(0),
-      fill_code_handle_(0),
      queue_(NULL),
      cached_index_(0),
      kernarg_async_(NULL),
      kernarg_async_mask_(0),
      kernarg_async_counter_(0),
-      code_arg_buffer_(NULL) {
+      num_cus_(0) {
  completion_signal_.handle = 0;
 }

@@ -96,26 +542,8 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
    return HSA_STATUS_ERROR;
  }

-  // Need queue buffer that can cover the max size of local memory.
-  const uint64_t kGpuVmVaSize = 1ULL << 40;
-  const uint32_t kRequiredQueueSize = NextPow2(static_cast<uint32_t>(
-      std::ceil(static_cast<double>(kGpuVmVaSize) / kMaxCopyCount)));
-
-  uint32_t max_queue_size = 0;
-  status = HSA::hsa_agent_get_info(agent_handle, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
-                                   &max_queue_size);
-
-  if (HSA_STATUS_SUCCESS != status) {
-    return status;
-  }
-
-  if (max_queue_size < kRequiredQueueSize) {
-    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-  }
-
-  status =
-      HSA::hsa_queue_create(agent_handle, kRequiredQueueSize,
-                            HSA_QUEUE_TYPE_MULTI, NULL, NULL, 0, 0, &queue_);
+  status = HSA::hsa_queue_create(agent_handle, 1024, HSA_QUEUE_TYPE_MULTI, NULL,
+                                 NULL, 0, 0, &queue_);

  if (HSA_STATUS_SUCCESS != status) {
    return status;
@@ -125,86 +553,6 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {

  cached_index_ = 0;

-  void* copy_raw_obj_mem = NULL;
-  size_t copy_akc_size = 0;
-  size_t copy_akc_offset = 0;
-
-  void* copy_aligned_raw_obj_mem = NULL;
-  size_t copy_aligned_akc_size = 0;
-  size_t copy_aligned_akc_offset = 0;
-
-  void* fill_raw_obj_mem = NULL;
-  size_t fill_akc_size = 0;
-  size_t fill_akc_offset = 0;
-
-  switch (agent.isa()->GetMajorVersion()) {
-    case 7:
-      copy_raw_obj_mem = kVectorCopyKvObject;
-      copy_akc_size = HSA_VECTOR_COPY_KV_AKC_SIZE;
-      copy_akc_offset = HSA_VECTOR_COPY_KV_AKC_OFFSET;
-
-      copy_aligned_raw_obj_mem = kVectorCopyAlignedKvObject;
-      copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_KV_AKC_SIZE;
-      copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_KV_AKC_OFFSET;
-
-      fill_raw_obj_mem = kFillMemoryKvObject;
-      fill_akc_size = HSA_FILL_MEMORY_KV_AKC_SIZE;
-      fill_akc_offset = HSA_FILL_MEMORY_KV_AKC_OFFSET;
-      break;
-    case 8:
-      copy_raw_obj_mem = kVectorCopyViObject;
-      copy_akc_size = HSA_VECTOR_COPY_VI_AKC_SIZE;
-      copy_akc_offset = HSA_VECTOR_COPY_VI_AKC_OFFSET;
-
-      copy_aligned_raw_obj_mem = kVectorCopyAlignedViObject;
-      copy_aligned_akc_size = HSA_VECTOR_COPY_ALIGNED_VI_AKC_SIZE;
-      copy_aligned_akc_offset = HSA_VECTOR_COPY_ALIGNED_VI_AKC_OFFSET;
-
-      fill_raw_obj_mem = kFillMemoryViObject;
-      fill_akc_size = HSA_FILL_MEMORY_VI_AKC_SIZE;
-      fill_akc_offset = HSA_FILL_MEMORY_VI_AKC_OFFSET;
-      break;
-    default:
-      assert(false && "Only gfx7 and gfx8 are supported");
-      break;
-  }
-
-  const size_t total_alloc_size = AlignUp(
-      AlignUp(copy_akc_size, 256) + AlignUp(copy_aligned_akc_size, 256) +
-          AlignUp(fill_akc_size, 256),
-      4096);
-
-  amd_kernel_code_t *code_ptr = nullptr;
-  code_arg_buffer_ = core::Runtime::runtime_singleton_->system_allocator()(
-      total_alloc_size, 4096);
-
-  char* akc_arg = reinterpret_cast<char*>(code_arg_buffer_);
-  memcpy(akc_arg,
-         reinterpret_cast<const char*>(copy_raw_obj_mem) + copy_akc_offset,
-         copy_akc_size);
-  copy_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(copy_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += copy_akc_size;
-
-  akc_arg = AlignUp(akc_arg, 256);
-  memcpy(akc_arg, reinterpret_cast<const char*>(copy_aligned_raw_obj_mem) +
-                      copy_aligned_akc_offset,
-         copy_aligned_akc_size);
-  copy_aligned_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(copy_aligned_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += copy_aligned_akc_size;
-
-  akc_arg = AlignUp(akc_arg, 256);
-  memcpy(akc_arg,
-         reinterpret_cast<const char*>(fill_raw_obj_mem) + fill_akc_offset,
-         fill_akc_size);
-  fill_code_handle_ = reinterpret_cast<uint64_t>(akc_arg);
-  code_ptr = (amd_kernel_code_t*)(fill_code_handle_);
-  code_ptr->runtime_loader_kernel_symbol = 0;
-  akc_arg += fill_akc_size;
-
  status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
  if (HSA_STATUS_SUCCESS != status) {
    return status;
@@ -212,34 +560,40 @@ hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {

  kernarg_async_ = reinterpret_cast<KernelArgs*>(
      core::Runtime::runtime_singleton_->system_allocator()(
-          kRequiredQueueSize * AlignUp(sizeof(KernelArgs), 16), 16));
+          queue_->size * AlignUp(sizeof(KernelArgs), 16), 16));

-  kernarg_async_mask_ = kRequiredQueueSize - 1;
+  kernarg_async_mask_ = queue_->size - 1;

-  // TODO: remove this code when execute permission level is not mandatory.
-  if (((amd::GpuAgent&)agent).profile() == HSA_PROFILE_FULL) {
-#if defined(_WIN32) || defined(_WIN64)
-#define NOMINMAX
-    DWORD old_protect = 0;
-    const DWORD new_protect = PAGE_EXECUTE_READWRITE;
-    if (!VirtualProtect(code_arg_buffer_, total_alloc_size, new_protect,
-                        &old_protect)) {
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-#else
-    if (0 != mprotect(code_arg_buffer_, total_alloc_size,
-                      PROT_READ | PROT_WRITE | PROT_EXEC)) {
-      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
-    }
-#endif
+  // Obtain the number of compute units in the underlying agent.
+  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
+  num_cus_ = gpuAgent.properties().NumFComputeCores / 4;
+
+  // Assemble shaders to AQL code objects.
+  std::map<KernelType, const char*> kernel_names = {
+      {KernelType::CopyAligned, "CopyAligned"},
+      {KernelType::CopyMisaligned, "CopyMisaligned"},
+      {KernelType::Fill, "Fill"}};
+
+  for (auto kernel_name : kernel_names) {
+    KernelCode& kernel = kernels_[kernel_name.first];
+    gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second,
+                            GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
+                            kernel.code_buf_size_);
  }

  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t BlitKernel::Destroy(void) {
+hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
  std::lock_guard<std::mutex> guard(lock_);

+  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
+
+  for (auto kernel_pair : kernels_) {
+    gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
+                           kernel_pair.second.code_buf_size_);
+  }
+
  if (queue_ != NULL) {
    HSA::hsa_queue_destroy(queue_);
  }
@@ -248,10 +602,6 @@ hsa_status_t BlitKernel::Destroy(void) {
    core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_);
  }

-  if (code_arg_buffer_ != NULL) {
-    core::Runtime::runtime_singleton_->system_deallocator()(code_arg_buffer_);
-  }
-
  if (completion_signal_.handle != 0) {
    HSA::hsa_signal_destroy(completion_signal_);
  }
@@ -259,11 +609,6 @@ hsa_status_t BlitKernel::Destroy(void) {
  return HSA_STATUS_SUCCESS;
 }

-static bool IsSystemMemory(void* address) {
-  static const uint64_t kLimitSystem = 1ULL << 48;
-  return (reinterpret_cast<uint64_t>(address) < kLimitSystem);
-}
-
 hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
                                                 size_t size) {
  // Protect completion_signal_.
@@ -294,35 +639,14 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
 hsa_status_t BlitKernel::SubmitLinearCopyCommand(
    void* dst, const void* src, size_t size,
    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
-  assert(copy_code_handle_ != 0);
-
-  const size_t kAlignmentChar = 1;
-  const size_t kAlignmentUin32 = 4;
-  const size_t kAlignmentVec4 = 16;
-  const size_t copy_granule =
-      (IsMultipleOf(dst, kAlignmentVec4) && IsMultipleOf(src, kAlignmentVec4) &&
-       IsMultipleOf(size, kAlignmentVec4))
-          ? kAlignmentVec4
-          : (IsMultipleOf(dst, kAlignmentUin32) &&
-             IsMultipleOf(src, kAlignmentUin32) &&
-             IsMultipleOf(size, kAlignmentUin32))
-                ? kAlignmentUin32
-                : kAlignmentChar;
-
-  size = size / copy_granule;
-
-  const uint32_t num_copy_packet = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(size) / kMaxCopyCount));
-
-  const uint32_t num_barrier_packet =
-      static_cast<uint32_t>(std::ceil(dep_signals.size() / 5.0f));
-
-  // Reserve write index for copy + fence packet.
-  const uint32_t total_num_packet = num_barrier_packet + num_copy_packet;
+  // Reserve write index for barrier(s) + dispatch packet.
+  const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
+  const uint32_t total_num_packet = num_barrier_packet + 1;

  uint64_t write_index = AcquireWriteIndex(total_num_packet);
  uint64_t write_index_temp = write_index;

+  // Insert barrier packets to handle dependent signals.
  const uint16_t kBarrierPacketHeader =
      (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
@@ -352,99 +676,116 @@ hsa_status_t BlitKernel::SubmitLinearCopyCommand(
    }
  }

-  const uint32_t last_copy_index = num_copy_packet - 1;
-  size_t total_copy_count = 0;
-  for (uint32_t i = 0; i < num_copy_packet; ++i) {
-    // Setup arguments.
-    const uint32_t copy_count = static_cast<uint32_t>(
-        std::min((size - total_copy_count), kMaxCopyCount));
+  // Insert dispatch packet for copy kernel.
+  KernelArgs* args = ObtainAsyncKernelCopyArg();
+  KernelCode* kernel_code = nullptr;
+  int num_workitems = 0;

-    void* cur_dst = static_cast<char*>(dst) + (total_copy_count * copy_granule);
-    const void* cur_src =
-        static_cast<const char*>(src) + (total_copy_count * copy_granule);
+  bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3));

-    KernelArgs* args = ObtainAsyncKernelCopyArg();
-    assert(args != NULL);
-    assert(IsMultipleOf(&args->copy, 16));
+  if (aligned) {
+    // Use dword-based aligned kernel.
+    kernel_code = &kernels_[KernelType::CopyAligned];

-    args->copy.src = cur_src;
-    args->copy.dst = cur_dst;
-    args->copy.size = copy_count;
-    args->copy.use_vector = (copy_granule == kAlignmentVec4) ? 1 : 0;
+    // Compute the size of each copy phase.
+    num_workitems = 64 * 4 * num_cus_;

-    const uint32_t grid_size_x =
-        AlignUp(static_cast<uint32_t>(copy_count), kGroupSize);
+    // Phase 1 (byte copy) ends when destination is 0x100-aligned.
+    uintptr_t src_start = uintptr_t(src);
+    uintptr_t dst_start = uintptr_t(dst);
+    uint64_t phase1_size =
+        std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF);

-    // This assert to make sure kMaxCopySize is not changed to a number that
-    // could cause overflow to packet.grid_size_x.
-    assert(grid_size_x >= copy_count);
+    // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits.
+    uint64_t phase2_block = num_workitems * sizeof(uint32_t) *
+                            kCopyAlignedUnroll * kCopyAlignedVecWidth;
+    uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block;

-    hsa_signal_t signal = {(i == last_copy_index)
-                               ? (core::Signal::Convert(&out_signal)).handle
-                               : 0};
-    PopulateQueue(write_index, ((copy_granule == kAlignmentChar)
-                                    ? copy_code_handle_
-                                    : copy_aligned_code_handle_),
-                  args, grid_size_x, signal);
+    // Phase 3 (dword copy) ends when last whole dword fits.
+    uint64_t phase3_size =
+        ((size - phase1_size - phase2_size) / sizeof(uint32_t)) *
+        sizeof(uint32_t);

-    ++write_index;
+    args->copy_aligned.phase1_src_start = src_start;
+    args->copy_aligned.phase1_dst_start = dst_start;
+    args->copy_aligned.phase2_src_start = src_start + phase1_size;
+    args->copy_aligned.phase2_dst_start = dst_start + phase1_size;
+    args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size;
+    args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size;
+    args->copy_aligned.phase4_src_start =
+        src_start + phase1_size + phase2_size + phase3_size;
+    args->copy_aligned.phase4_dst_start =
+        dst_start + phase1_size + phase2_size + phase3_size;
+    args->copy_aligned.phase4_src_end = src_start + size;
+    args->copy_aligned.phase4_dst_end = dst_start + size;
+    args->copy_aligned.num_workitems = num_workitems;
+  } else {
+    // Use byte-based misaligned kernel.
+    kernel_code = &kernels_[KernelType::CopyMisaligned];

-    total_copy_count += copy_count;
+    // Compute the size of each copy phase.
+    num_workitems = 64 * 4 * num_cus_;
+
+    // Phase 1 (unrolled byte copy) ends when last whole block fits.
+    uintptr_t src_start = uintptr_t(src);
+    uintptr_t dst_start = uintptr_t(dst);
+    uint64_t phase1_block =
+        num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll;
+    uint64_t phase1_size = (size / phase1_block) * phase1_block;
+
+    args->copy_misaligned.phase1_src_start = src_start;
+    args->copy_misaligned.phase1_dst_start = dst_start;
+    args->copy_misaligned.phase2_src_start = src_start + phase1_size;
+    args->copy_misaligned.phase2_dst_start = dst_start + phase1_size;
+    args->copy_misaligned.phase2_src_end = src_start + size;
+    args->copy_misaligned.phase2_dst_end = dst_start + size;
+    args->copy_misaligned.num_workitems = num_workitems;
  }

-  // Launch copy packet.
+  hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle};
+  PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args,
+                num_workitems, signal);
+
+  // Submit barrier(s) and dispatch packets.
  ReleaseWriteIndex(write_index_temp, total_num_packet);

  return HSA_STATUS_SUCCESS;
 }

 hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
-                                                 size_t num) {
-  assert(fill_code_handle_ != 0);
-
+                                                 size_t count) {
  std::lock_guard<std::mutex> guard(lock_);

-  HSA::hsa_signal_store_relaxed(completion_signal_, 1);
-
-  const uint32_t num_fill_packet = static_cast<uint32_t>(
-      std::ceil(static_cast<double>(num) / kMaxFillCount));
-
-  // Reserve write index for copy + fence packet.
-  uint64_t write_index = AcquireWriteIndex(num_fill_packet);
-
-  const uint32_t last_fill_index = num_fill_packet - 1;
-  size_t total_fill_count = 0;
-  for (uint32_t i = 0; i < num_fill_packet; ++i) {
-    // Setup arguments.
-    const uint32_t fill_count = static_cast<uint32_t>(
-        std::min((num - total_fill_count), kMaxFillCount));
-    void* cur_ptr = static_cast<char*>(ptr) + total_fill_count;
-
-    KernelArgs* args = ObtainAsyncKernelCopyArg();
-    assert(args != NULL);
-    assert(IsMultipleOf(&args->fill, 16));
-
-    args->fill.ptr = cur_ptr;
-    args->fill.num = fill_count;
-    args->fill.value = value;
-
-    const uint32_t grid_size_x =
-        AlignUp(static_cast<uint32_t>(fill_count), kGroupSize);
-
-    // This assert to make sure kMaxFillCount is not changed to a number that
-    // could cause overflow to packet.grid_size_x.
-    assert(grid_size_x >= fill_count);
-
-    hsa_signal_t signal = {(i == last_fill_index) ? completion_signal_.handle
-                                                  : 0};
-    PopulateQueue(write_index + i, fill_code_handle_, &args[i], grid_size_x,
-                  signal);
-
-    total_fill_count += fill_count;
+  // Reject misaligned base address.
+  if ((uintptr_t(ptr) & 0x3) != 0) {
+    return HSA_STATUS_ERROR;
  }

-  // Launch fill packet.
-  ReleaseWriteIndex(write_index, num_fill_packet);
+  // Compute the size of each fill phase.
+  int num_workitems = 64 * num_cus_;
+
+  // Phase 1 (unrolled dwordx4 copy) ends when last whole block fits.
+  uintptr_t dst_start = uintptr_t(ptr);
+  uint64_t fill_size = count * sizeof(uint32_t);
+
+  uint64_t phase1_block =
+      num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth;
+  uint64_t phase1_size = (fill_size / phase1_block) * phase1_block;
+
+  KernelArgs* args = ObtainAsyncKernelCopyArg();
+  args->fill.phase1_dst_start = dst_start;
+  args->fill.phase2_dst_start = dst_start + phase1_size;
+  args->fill.phase2_dst_end = dst_start + fill_size;
+  args->fill.fill_value = value;
+  args->fill.num_workitems = num_workitems;
+
+  // Submit dispatch packet.
+  HSA::hsa_signal_store_relaxed(completion_signal_, 1);
+
+  uint64_t write_index = AcquireWriteIndex(1);
+  PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_),
+                args, num_workitems, completion_signal_);
+  ReleaseWriteIndex(write_index, 1);

  // Wait for the packet to finish.
  if (HSA::hsa_signal_wait_acquire(completion_signal_, HSA_SIGNAL_CONDITION_LT,
@@ -556,9 +897,9 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
  // Setup working size.
  const int kNumDimension = 1;
  packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), kGroupSize);
+  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64);
  packet.grid_size_y = packet.grid_size_z = 1;
-  packet.workgroup_size_x = kGroupSize;
+  packet.workgroup_size_x = 64;
  packet.workgroup_size_y = packet.workgroup_size_z = 1;

  packet.completion_signal = completion_signal;
@@ -404,7 +404,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {

  if (err != HSAKMT_STATUS_SUCCESS) {
    assert(false && "AQL queue memory map failure.");
-    Destroy();
+    Destroy(agent);
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

@@ -419,7 +419,7 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
      hsaKmtCreateQueue(gpu_agent.node_id(), kQueueType_, 100,
                        HSA_QUEUE_PRIORITY_MAXIMUM, queue_start_addr_,
                        queue_size_, NULL, &queue_resource_)) {
-    Destroy();
+    Destroy(agent);
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

@@ -436,14 +436,14 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent) {
          fence_pool_size_ * sizeof(uint32_t), 256));

  if (fence_base_addr_ == NULL) {
-    Destroy();
+    Destroy(agent);
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  return HSA_STATUS_SUCCESS;
 }

-hsa_status_t BlitSdma::Destroy(void) {
+hsa_status_t BlitSdma::Destroy(const core::Agent& agent) {
  // Release all allocated resources and reset them to zero.

  if (queue_resource_.QueueId != 0) {
@@ -46,6 +46,7 @@
 #include <atomic>
 #include <cstring>
 #include <climits>
+#include <string>
 #include <vector>

 #include "core/inc/amd_aql_queue.h"
@@ -126,7 +127,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props)

 GpuAgent::~GpuAgent() {
  if (blit_h2d_ != NULL) {
-    hsa_status_t status = blit_h2d_->Destroy();
+    hsa_status_t status = blit_h2d_->Destroy(*this);
    assert(status == HSA_STATUS_SUCCESS);

    delete blit_h2d_;
@@ -134,7 +135,7 @@ GpuAgent::~GpuAgent() {
  }

  if (blit_d2h_ != NULL) {
-    hsa_status_t status = blit_d2h_->Destroy();
+    hsa_status_t status = blit_d2h_->Destroy(*this);
    assert(status == HSA_STATUS_SUCCESS);

    delete blit_d2h_;
@@ -158,33 +159,51 @@ GpuAgent::~GpuAgent() {
 }

 void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
-                              void*& code_buf, size_t& code_buf_size) {
+                              AssembleTarget assemble_target, void*& code_buf,
+                              size_t& code_buf_size) const {
 #ifdef __linux__  // No VS builds of libsp3 available right now
+  std::string src_sp3_unified(src_sp3);
+
+  if (isa_->GetMajorVersion() == 7) {
+    // On Gfx7 replace v_add_u32 with legacy equivalent v_add_i32.
+    std::string add_inst_gfx8("v_add_u32"), add_inst_gfx7("v_add_i32");
+
+    for (size_t instIdx = 0; (instIdx = src_sp3_unified.find(
+                                  add_inst_gfx8, instIdx)) != std::string::npos;
+         instIdx += add_inst_gfx8.size()) {
+      src_sp3_unified.replace(instIdx, add_inst_gfx7.size(), add_inst_gfx7);
+    }
+  }
+
  // Assemble source string with libsp3.
  sp3_context* sp3 = sp3_new();

  switch (isa_->GetMajorVersion()) {
    case 7:
      sp3_setasic(sp3, "CI");
+      sp3_set_param_int(sp3, "kGFXIPVersion", 7);
      break;
    case 8:
      sp3_setasic(sp3, "VI");
+      sp3_set_param_int(sp3, "kGFXIPVersion", 8);
      break;
    default:
      assert(false && "SP3 assembly not supported on this agent");
  }

-  sp3_parse_string(sp3, src_sp3);
+  sp3_parse_string(sp3, src_sp3_unified.c_str());
  sp3_shader* code_sp3_meta = sp3_compile(sp3, func_name);

-  // Allocate a GPU-visible buffer for the trap shader.
+  // Allocate a GPU-visible buffer for the shader.
  HsaMemFlags code_buf_flags = {0};
  code_buf_flags.ui32.HostAccess = 1;
  code_buf_flags.ui32.ExecuteAccess = 1;
  code_buf_flags.ui32.NoSubstitute = 1;

  size_t code_size = code_sp3_meta->size * sizeof(uint32_t);
-  code_buf_size = AlignUp(code_size, 0x1000);
+  size_t header_size =
+      (assemble_target == AssembleTarget::AQL ? sizeof(amd_kernel_code_t) : 0);
+  code_buf_size = AlignUp(header_size + code_size, 0x1000);

  HSAKMT_STATUS err =
      hsaKmtAllocMemory(node_id(), code_buf_size, code_buf_flags, &code_buf);
@@ -193,9 +212,38 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
  err = hsaKmtMapMemoryToGPU(code_buf, code_buf_size, NULL);
  assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtMapMemoryToGPU(Trap) failed");

-  // Copy trap handler code into the GPU-visible buffer.
  memset(code_buf, 0, code_buf_size);
-  memcpy(code_buf, code_sp3_meta->data, code_size);
+
+  // Populate optional code object header.
+  if (assemble_target == AssembleTarget::AQL) {
+    amd_kernel_code_t* header = reinterpret_cast<amd_kernel_code_t*>(code_buf);
+
+    int gran_sgprs = std::max(0, (int(code_sp3_meta->nsgprs) - 1) / 8);
+    int gran_vgprs = std::max(0, (int(code_sp3_meta->nvgprs) - 1) / 4);
+
+    header->kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
+    AMD_HSA_BITS_SET(header->kernel_code_properties,
+                     AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
+                     1);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT,
+                     gran_sgprs);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT,
+                     gran_vgprs);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 3);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc1,
+                     AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 1);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
+                     AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 2);
+    AMD_HSA_BITS_SET(header->compute_pgm_rsrc2,
+                     AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1);
+  }
+
+  // Copy trap handler code into the GPU-visible buffer.
+  memcpy((void*)(uintptr_t(code_buf) + header_size), code_sp3_meta->data,
+         code_size);

  // Release SP3 resources.
  sp3_free_shader(code_sp3_meta);
@@ -203,7 +251,7 @@ void GpuAgent::AssembleShader(const char* src_sp3, const char* func_name,
 #endif
 }

-void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) {
+void GpuAgent::ReleaseShader(void* code_buf, size_t code_buf_size) const {
  hsaKmtUnmapMemoryToGPU(code_buf);
  hsaKmtFreeMemory(code_buf, code_buf_size);
 }
@@ -377,7 +425,7 @@ core::Blit* GpuAgent::CreateBlitSdma() {
  BlitSdma* sdma = new BlitSdma();

  if (sdma->Initialize(*this) != HSA_STATUS_SUCCESS) {
-    sdma->Destroy();
+    sdma->Destroy(*this);
    delete sdma;
    sdma = NULL;
  }
@@ -389,7 +437,7 @@ core::Blit* GpuAgent::CreateBlitKernel() {
  BlitKernel* kernl = new BlitKernel();

  if (kernl->Initialize(*this) != HSA_STATUS_SUCCESS) {
-    kernl->Destroy();
+    kernl->Destroy(*this);
    delete kernl;
    kernl = NULL;
  }
@@ -904,7 +952,8 @@ void GpuAgent::BindTrapHandler() {
  }

  // Assemble the trap handler source code.
-  AssembleShader(src_sp3, "TrapHandler", trap_code_buf_, trap_code_buf_size_);
+  AssembleShader(src_sp3, "TrapHandler", AssembleTarget::ISA, trap_code_buf_,
+                 trap_code_buf_size_);

  // Bind the trap handler to this node.
  HSAKMT_STATUS err = hsaKmtSetTrapHandler(node_id(), trap_code_buf_,