Use LLVM compiler to build blit shaders

Generates the shader bytecode stream in amd_blit_shaders_v2.h at build time. Change-Id: I5228ec5442a78d074fd85ca9cd7f7a156dd84da3 [ROCm/ROCR-Runtime commit: 4e675ce730]
2023-08-22 16:44:07 -04:00
@@ -123,7 +123,8 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
  PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode
-  ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler)
+  ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler
+  ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/blit_shaders)


 ## ------------------------- Linux Compiler and Linker options -------------------------
@@ -202,6 +203,10 @@ target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${SRCS} )
 add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/trap_handler )
 add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 )

+## Depend on blit shader target.
+add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders )
+add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2)
+
 if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" )
  set ( IMAGE_SUPPORT ON )
 endif()
@@ -156,174 +156,6 @@ static const unsigned int kCodeFill8[] = {
    0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
 };

-static const unsigned int kCodeCopyAligned940[] = {
-    0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
-    0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050,
-    0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900,
-    0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05,
-    0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000,
-    0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103,
-    0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105,
-    0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02,
-    0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501,
-    0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000,
-    0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
-    0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
-    0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901,
-    0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05,
-    0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000,
-    0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
-    0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
-    0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03,
-    0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105,
-    0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002,
-    0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000,
-};
-
-static const unsigned int kCodeCopyMisaligned940[] = {
-    0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
-    0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205,
-    0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04,
-    0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032,
-    0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
-    0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
-    0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
-    0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
-    0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05,
-    0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05,
-    0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05,
-    0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05,
-    0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03,
-    0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105,
-    0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002,
-    0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000,
-    0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee,
-    0xbf810000, 0x00000000,
-};
-
-static const unsigned int kCodeFill940[] = {
-    0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602,
-    0x32000002, 0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b,
-    0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103,
-    0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02,
-    0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082,
-    0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a,
-    0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02,
-    0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000,
-};
-
-static const unsigned int kCodeCopyAligned10[] = {
-    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
-    0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
-    0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02,
-    0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006,
-    0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E,
-    0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03,
-    0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05,
-    0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209,
-    0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04,
-    0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E,
-    0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
-    0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05,
-    0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02,
-    0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E,
-    0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E,
-    0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
-    0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05,
-    0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010,
-    0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05,
-    0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000,
-    0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000,
-};
-
-static const unsigned int kCodeCopyMisaligned10[] = {
-    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
-    0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002,
-    0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207,
-    0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102,
-    0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
-    0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
-    0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
-    0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
-    0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810,
-    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810,
-    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810,
-    0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810,
-    0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008,
-    0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05,
-    0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
-    0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70,
-    0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105,
-    0xBF82FFEE, 0xBF810000,
-};
-
-static const unsigned int kCodeFill10[] = {
-    0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602,
-    0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
-    0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03,
-    0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402,
-    0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B,
-    0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103,
-    0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402,
-    0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
-};
-
-static const unsigned int kCodeCopyAligned11[] = {
-    0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
-    0xF4080400, 0xF8000030, 0xF4080500, 0xF8000040, 0xF4000600, 0xF8000050,
-    0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002, 0x7E060205, 0xD7006A02,
-    0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207, 0xD7006A04, 0x00000D00,
-    0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102, 0xBFA3000F, 0x8BFE6A7E,
-    0xDC400000, 0x017C0002, 0xBF8903FF, 0xD7006A02, 0x00003102, 0xD5206A03,
-    0x01A90103, 0xDC600000, 0x007C0104, 0xD7006A04, 0x00003104, 0xD5206A05,
-    0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x84198418, 0x30020084, 0x7E060209,
-    0xD7006A02, 0x00001101, 0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04,
-    0x00001501, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000E,
-    0xDC5C0000, 0x087C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
-    0xBF8903FF, 0xDC740000, 0x007C0804, 0xD7006A04, 0x00003304, 0xD5206A05,
-    0x01A90105, 0xBFA0FFEF, 0x84198218, 0x30020082, 0x7E06020D, 0xD7006A02,
-    0x00001901, 0xD5206A03, 0x01A90103, 0x7E0A020F, 0xD7006A04, 0x00001D01,
-    0xD5206A05, 0x01A90105, 0xD459006A, 0x00002102, 0xBFA3000F, 0x8BFE6A7E,
-    0xDC500000, 0x017C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
-    0xBF8903FF, 0xDC680000, 0x007C0104, 0xD7006A04, 0x00003304, 0xD5206A05,
-    0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x7E060211, 0xD7006A02, 0x00002100,
-    0xD5206A03, 0x01A90103, 0x7E0A0213, 0xD7006A04, 0x00002500, 0xD5206A05,
-    0x01A90105, 0xD459006A, 0x00002902, 0xBFA30006, 0x8BFE6A7E, 0xDC400000,
-    0x017C0002, 0xBF8903FF, 0xDC600000, 0x007C0104, 0xBFB00000,
-};
-
-static const unsigned int kCodeCopyMisaligned11[] = {
-    0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
-    0xF4000400, 0xF8000030, 0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002,
-    0x7E060205, 0xD7006A02, 0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207,
-    0xD7006A04, 0x00000D00, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102,
-    0xBFA30032, 0xDC400000, 0x067C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
-    0x01A90103, 0xDC400000, 0x077C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
-    0x01A90103, 0xDC400000, 0x087C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
-    0x01A90103, 0xDC400000, 0x097C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
-    0x01A90103, 0xBF8903FF, 0xDC600000, 0x007C0604, 0xD7006A04, 0x00002104,
-    0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0704, 0xD7006A04, 0x00002104,
-    0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0804, 0xD7006A04, 0x00002104,
-    0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0904, 0xD7006A04, 0x00002104,
-    0xD5206A05, 0x01A90105, 0xBFA0FFCB, 0x7E060209, 0xD7006A02, 0x00001100,
-    0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04, 0x00001500, 0xD5206A05,
-    0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000F, 0x8BFE6A7E, 0xDC400000,
-    0x017C0002, 0xD7006A02, 0x00002102, 0xD5206A03, 0x01A90103, 0xBF8903FF,
-    0xDC600000, 0x007C0104, 0xD7006A04, 0x00002104, 0xD5206A05, 0x01A90105,
-    0xBFA0FFEE, 0xBFB00000,
-};
-
-static const unsigned int kCodeFill11[] = {
-    0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xBF89FC0F, 0x84028602,
-    0xD7006A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
-    0x840C840B, 0x30020084, 0x7E060205, 0xD7006A02, 0x00000901, 0xD5206A03,
-    0x01A90103, 0xD459006A, 0x00000D02, 0xBFA30007, 0xDC740000, 0x007C0402,
-    0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF6, 0x840C820B,
-    0x30020082, 0x7E060207, 0xD7006A02, 0x00000D01, 0xD5206A03, 0x01A90103,
-    0xD459006A, 0x00001102, 0xBFA30008, 0x8BFE6A7E, 0xDC680000, 0x007C0402,
-    0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF5, 0xBFB00000,
-};
-
 }  // namespace AMD
 }  // namespace rocr

@@ -70,6 +70,7 @@
 #include "core/inc/amd_blit_shaders.h"
 // Generated header
 #include "amd_trap_handler_v2.h"
+#include "amd_blit_shaders_v2.h"

 #if defined(__linux__)
 // libdrm headers
@@ -257,63 +258,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
  std::map<std::string, CompiledShader> compiled_shaders = {
      {"TrapHandler",
       {
-           {NULL, 0, 0, 0},                                                 // gfx7
-           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},            // gfx8
-           {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},            // gfx9
-           {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4},        // gfx90a
-           {NULL, 0, 0, 0},                                                 // gfx940
-           {NULL, 0, 0, 0},                                                 // gfx942
-           {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},      // gfx1010
-           {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},          // gfx10
-           {NULL, 0, 0, 0},                                                 // gfx11
+           {NULL, 0, 0, 0},                                             // gfx7
+           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},        // gfx8
+           {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4},        // gfx9
+           {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4},    // gfx90a
+           {NULL, 0, 0, 0},                                             // gfx940
+           {NULL, 0, 0, 0},                                             // gfx942
+           {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4},  // gfx1010
+           {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4},      // gfx10
+           {NULL, 0, 0, 0},                                             // gfx11
       }},
      {"TrapHandlerKfdExceptions",
       {
-           {NULL, 0, 0, 0},                                                 // gfx7
-           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},            // gfx8
-           {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4},      // gfx9
-           {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4},      // gfx90a
-           {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4},  // gfx940
-           {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4},  // gfx942
-           {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010
-           {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4},    // gfx10
-           {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4},    // gfx11
+           {NULL, 0, 0, 0},                                                   // gfx7
+           {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4},              // gfx8
+           {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4},        // gfx9
+           {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4},        // gfx90a
+           {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4},    // gfx940
+           {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4},    // gfx942
+           {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},  // gfx1010
+           {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4},      // gfx10
+           {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4},      // gfx11
       }},
      {"CopyAligned",
       {
-           {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},          // gfx7
-           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},          // gfx8
-           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},          // gfx9
-           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},          // gfx90a
-           {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12},      // gfx940
-           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},          // gfx942
-           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},        // gfx1010
-           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},        // gfx10
-           {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12},        // gfx11
+           {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12},      // gfx7
+           {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12},      // gfx8
+           {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12},      // gfx9
+           {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12},      // gfx90a
+           {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12},  // gfx940
+           {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12},      // gfx942
+           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},    // gfx1010
+           {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12},    // gfx10
+           {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12},    // gfx11
       }},
      {"CopyMisaligned",
       {
-           {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},    // gfx7
-           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},    // gfx8
-           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},    // gfx9
-           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},    // gfx90a
-           {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940
-           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},    // gfx942
-           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},  // gfx1010
-           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},  // gfx10
-           {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10},  // gfx11
+           {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10},      // gfx7
+           {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10},      // gfx8
+           {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10},      // gfx9
+           {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10},      // gfx90a
+           {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},  // gfx940
+           {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10},      // gfx942
+           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},    // gfx1010
+           {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10},    // gfx10
+           {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10},    // gfx11
       }},
      {"Fill",
       {
-           {kCodeFill7, sizeof(kCodeFill7), 19, 8},                         // gfx7
-           {kCodeFill8, sizeof(kCodeFill8), 19, 8},                         // gfx8
-           {kCodeFill8, sizeof(kCodeFill8), 19, 8},                         // gfx9
-           {kCodeFill8, sizeof(kCodeFill8), 19, 8},                         // gfx90a
-           {kCodeFill940, sizeof(kCodeFill940), 19, 8},                     // gfx940
-           {kCodeFill8, sizeof(kCodeFill8), 19, 8},                         // gfx942
-           {kCodeFill10, sizeof(kCodeFill10), 19, 8},                       // gfx1010
-           {kCodeFill10, sizeof(kCodeFill10), 19, 8},                       // gfx10
-           {kCodeFill11, sizeof(kCodeFill11), 19, 8},                       // gfx11
+           {kCodeFill7, sizeof(kCodeFill7), 19, 8},      // gfx7
+           {kCodeFill8, sizeof(kCodeFill8), 19, 8},      // gfx8
+           {kCodeFill9, sizeof(kCodeFill9), 19, 8},      // gfx9
+           {kCodeFill9, sizeof(kCodeFill9), 19, 8},      // gfx90a
+           {kCodeFill940, sizeof(kCodeFill940), 19, 8},  // gfx940
+           {kCodeFill9, sizeof(kCodeFill9), 19, 8},      // gfx942
+           {kCodeFill10, sizeof(kCodeFill10), 19, 8},    // gfx1010
+           {kCodeFill10, sizeof(kCodeFill10), 19, 8},    // gfx10
+           {kCodeFill11, sizeof(kCodeFill11), 19, 8},    // gfx11
       }}};

  auto compiled_shader_it = compiled_shaders.find(func_name);
@@ -0,0 +1,169 @@
+################################################################################
+##
+## The University of Illinois/NCSA
+## Open Source License (NCSA)
+##
+## Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved.
+##
+## Developed by:
+##
+##                 AMD Research and AMD HSA Software Development
+##
+##                 Advanced Micro Devices, Inc.
+##
+##                 www.amd.com
+##
+## Permission is hereby granted, free of charge, to any person obtaining a copy
+## of this software and associated documentation files (the "Software"), to
+## deal with the Software without restriction, including without limitation
+## the rights to use, copy, modify, merge, publish, distribute, sublicense,
+## and/or sell copies of the Software, and to permit persons to whom the
+## Software is furnished to do so, subject to the following conditions:
+##
+##  - Redistributions of source code must retain the above copyright notice,
+##    this list of conditions and the following disclaimers.
+##  - Redistributions in binary form must reproduce the above copyright
+##    notice, this list of conditions and the following disclaimers in
+##    the documentation and/or other materials provided with the distribution.
+##  - Neither the names of Advanced Micro Devices, Inc,
+##    nor the names of its contributors may be used to endorse or promote
+##    products derived from this Software without specific prior written
+##    permission.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+## DEALINGS WITH THE SOFTWARE.
+##
+##
+################################################################################
+
+# Minimum required version of CMake
+cmake_minimum_required ( VERSION 3.7 )
+
+# Find the Clang and LLVM packages; ${CMAKE_PREFIX_PATH}/llvm is given as a hint and /opt/rocm/llvm as an additional search path
+find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm )
+find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm )
+
+# Target devices (passed as -mcpu) that every blit shader is assembled for
+set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100")
+# Postfix appended to each shader name; parallel to TARGET_DEVS (same index = same device), so both lists must stay the same length
+set (POSTFIX "9;940;1010;10;11")
+
+# If verbose output is enabled, print the tool paths and target devices at configure time
+if(${CMAKE_VERBOSE_MAKEFILE})
+	get_property(clang_path TARGET clang PROPERTY LOCATION)
+	get_property(objcopy_path TARGET llvm-objcopy PROPERTY LOCATION)
+	message("Using clang from: ${clang_path}")
+	message("Using llvm-objcopy from: ${objcopy_path}")
+	message("Blit Shaders assembled for: ${TARGET_DEVS}")
+endif()
+
+# Function gen_kernel_bc(TARGET_ID INPUT_FILE OUTPUT_FILE): assemble INPUT_FILE for -mcpu=TARGET_ID into ${OUTPUT_FILE}.hsaco, then dump that object's .text section into OUTPUT_FILE
+function(gen_kernel_bc TARGET_ID INPUT_FILE OUTPUT_FILE)
+	set(CODE_OBJECT "${OUTPUT_FILE}.hsaco")
+
+	# Split the clang command line into a CMake argument list; "-x assembler" makes clang treat the input as assembly source
+	separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-x assembler -target amdgcn-amd-amdhsa -mcode-object-version=5 -fPIC -mcpu=${TARGET_ID} -o ${CODE_OBJECT} ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE}")
+
+	# Add custom command that runs clang to produce the .hsaco code object
+	add_custom_command(OUTPUT ${CODE_OBJECT} COMMAND clang ${CLANG_ARG_LIST}
+	DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE} clang
+	COMMENT "BUILDING bitcode for ${OUTPUT_FILE}..."
+	VERBATIM)
+
+	separate_arguments(OBJCOPY_ARG_LIST UNIX_COMMAND "--dump-section=.text=${OUTPUT_FILE} ${CODE_OBJECT}")
+
+	# Add custom command that dumps the .text section of the code object into OUTPUT_FILE
+	add_custom_command(OUTPUT ${OUTPUT_FILE}
+	COMMAND llvm-objcopy ${OBJCOPY_ARG_LIST}
+	DEPENDS ${CODE_OBJECT} llvm-objcopy
+	COMMENT "Extracting binary for ${OUTPUT_FILE}..."
+	VERBATIM)
+
+	if(${CMAKE_VERBOSE_MAKEFILE})
+		message("     Blit Shader Source: " ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE})
+		message("     Blit Shader Binary: " ${OUTPUT_FILE})
+	endif()
+
+endfunction(gen_kernel_bc)
+
+# Function build_kernel(BLIT_SHADER_NAME BLIT_FILE TARGET_ID POSTFIX): build one shader for one target device and append ${BLIT_SHADER_NAME}${POSTFIX} to HSACO_TARG_LIST in the caller's scope. NOTE(review): no call to this function is visible in this file - confirm it is still needed
+function(build_kernel BLIT_SHADER_NAME BLIT_FILE TARGET_ID POSTFIX)
+	set(CODE_OBJECT_FILE "${BLIT_SHADER_NAME}${POSTFIX}")
+	gen_kernel_bc(${TARGET_ID} ${BLIT_FILE} ${CODE_OBJECT_FILE})
+	list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}")
+	set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE)
+
+endfunction(build_kernel)
+
+# Function build_kernels_for_devices(SHADER_NAMES SHADER_FILES): for every (shader, device) pair, register the build commands via gen_kernel_bc and collect the output names in HSACO_TARG_LIST, which is exported to the caller's scope. SHADER_NAMES/SHADER_FILES are parallel lists; TARGET_DEVS/POSTFIX are read from the enclosing scope
+function(build_kernels_for_devices SHADER_NAMES SHADER_FILES)
+	set(HSACO_TARG_LIST "")
+
+	list(LENGTH TARGET_DEVS num_target_devices)
+	math(EXPR num_target_devices "${num_target_devices} - 1")
+	list(LENGTH SHADER_NAMES num_shader_names)
+	math(EXPR num_shader_names "${num_shader_names} - 1")
+
+	foreach(shader_index RANGE ${num_shader_names})
+		list(GET SHADER_NAMES ${shader_index} shader_name)
+		list(GET SHADER_FILES ${shader_index} shader_file)
+		foreach(device_index RANGE ${num_target_devices})
+			# Get the device at this index from the list of target devices
+			list(GET TARGET_DEVS ${device_index} target_device)
+			# Get the postfix at the same index from the list of postfixes
+			list(GET POSTFIX ${device_index} postfix)
+			if(${CMAKE_VERBOSE_MAKEFILE})
+				message("\n  Generating: ${target_device} for ${shader_name} ...")
+			endif()
+
+			# Output name is the shader name followed by the device postfix
+			set(CODE_OBJECT_FILE "${shader_name}${postfix}")
+
+			# Register the assemble and extract commands for the current device and shader
+			gen_kernel_bc(${target_device} ${shader_file} ${CODE_OBJECT_FILE})
+			# Append the output name to the list
+			list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}")
+		endforeach(device_index)
+	endforeach(shader_index)
+
+	# Make the list of output files available in the parent scope
+	set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE)
+
+endfunction(build_kernels_for_devices)
+
+
+# Function generate_bytecodeStrm(HeaderFILE): run create_blit_shader_header.sh over the files in HSACO_TARG_LIST (read from the enclosing scope) to produce ${HeaderFILE}.h in the build directory, and define a custom target named ${HeaderFILE} for it
+function(generate_bytecodeStrm HeaderFILE)
+	set(ARG_LIST "${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h")
+
+	# Copy the shell script to the build directory
+	configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create_blit_shader_header.sh
+		${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh
+		COPYONLY)
+
+	# Add a custom command that passes the header path followed by every shader binary to the script
+	add_custom_command(OUTPUT ${HeaderFILE}.h
+		COMMAND ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh ${ARG_LIST} ${HSACO_TARG_LIST}
+		COMMENT "Collating blit shaders..."
+		DEPENDS ${HSACO_TARG_LIST} ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh)
+
+	# Add a custom target named after the header so other targets can depend on its generation
+	add_custom_target(${HeaderFILE} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h)
+
+endfunction(generate_bytecodeStrm)
+
+
+# Build kernels for all target devices and blit shaders (shader names and source files are parallel lists)
+build_kernels_for_devices("kCodeCopyAligned;kCodeCopyMisaligned;kCodeFill" "blit_copyAligned.s;blit_copyMisaligned.s;blit_fill.s")
+
+# Generate the bytecode stream header amd_blit_shaders_v2.h
+generate_bytecodeStrm("amd_blit_shaders_v2")
+
+
+
+
@@ -0,0 +1,257 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+///////////////////////////////////////////////////////////////////////////////////////
+
+.text
+
+// V_ADD_CO_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add spelled for the gfx generation being assembled:
+// v_add_u32 before gfx9, v_add_co_u32 from gfx9 on. The second operand
+// (presumably the carry-out destination) is vcc before gfx10 and vcc_lo from
+// gfx10 on.
+.macro V_ADD_CO_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_add_u32           \vdst, vcc, \src0, \vsrc1
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_add_co_u32        \vdst, vcc, \src0, \vsrc1
+	.else
+		v_add_co_u32        \vdst, vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+
+// V_ADD_CO_CI_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add that also takes vcc/vcc_lo as a trailing input
+// operand (presumably the carry-in), spelled for the gfx generation being
+// assembled: v_addc_u32 before gfx9, v_addc_co_u32 on gfx9 and
+// v_add_co_ci_u32 with vcc_lo from gfx10 on.
+.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_addc_u32          \vdst, vcc, \src0, \vsrc1, vcc
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_addc_co_u32       \vdst, vcc, \src0, \vsrc1, vcc
+	.else
+		v_add_co_ci_u32     \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
+	.endif
+.endm
+
+// V_CMP_LT_U64 src0, vsrc1
+// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to vcc
+// before gfx10 and to vcc_lo from gfx10 on.
+.macro V_CMP_LT_U64 src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 10)
+		v_cmp_lt_u64        vcc, \src0, \vsrc1
+	.else
+		v_cmp_lt_u64        vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+
+// CopyAligned: memory copy kernel that runs four consecutive phases.
+//
+// Arguments are read from the block addressed by s[0:1] (presumably the
+// kernarg segment pointer, given enable_sgpr_kernarg_segment_ptr = 1):
+//   0x00  s[4:5]    phase 1 source start
+//   0x08  s[6:7]    phase 1 destination start
+//   0x10  s[8:9]    phase 1 source end   / phase 2 source start
+//   0x18  s[10:11]  phase 2 destination start
+//   0x20  s[12:13]  phase 2 source end   / phase 3 source start
+//   0x28  s[14:15]  phase 3 destination start
+//   0x30  s[16:17]  phase 3 source end   / phase 4 source start
+//   0x38  s[18:19]  phase 4 destination start
+//   0x40  s[20:21]  phase 4 source end
+//   0x48  s[22:23]  loaded but not referenced by this kernel
+//   0x50  s24       per-iteration stride, counted in elements of the phase
+//
+// Phase 1 and phase 4 move single bytes, phase 2 moves kCopyAlignedVecWidth
+// dwords per lane (dwordx4 when the width is 4), phase 3 moves single dwords.
+// Register use: v0 lane index, v1 scratch/byte offset, v[2:3] source address,
+// v[4:5] destination address, v8 and up phase 2 data.
+.p2align 8
+
+CopyAligned:
+.set kCopyAlignedVecWidth, 4
+compute_pgm_rsrc2_user_sgpr = 2
+compute_pgm_rsrc2_tgid_x_en = 1
+enable_sgpr_kernarg_segment_ptr = 1
+
+// Register budget: 8 fixed VGPRs plus the phase 2 data registers.
+.set kCopyAlignedUnroll, 1
+.set kCopyAlignedNumSGPRs, 32
+.set kCopyAlignedNumVGPRs, (8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
+.set CopyAlignedRsrc1SGPRs, (kCopyAlignedNumSGPRs - 1)/8
+.set CopyAlignedRsrc1VGPRs, (kCopyAlignedNumVGPRs - 1)/4
+
+compute_pgm_rsrc1_sgprs = CopyAlignedRsrc1SGPRs
+compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs
+
+
+// Fetch all arguments, then wait for the scalar loads to complete.
+  s_load_dwordx4  s[4:7], s[0:1], 0x0
+  s_load_dwordx4  s[8:11], s[0:1], 0x10
+  s_load_dwordx4  s[12:15], s[0:1], 0x20
+  s_load_dwordx4  s[16:19], s[0:1], 0x30
+  s_load_dwordx4  s[20:23], s[0:1], 0x40
+  s_load_dword    s24, s[0:1], 0x50
+  s_waitcnt                lgkmcnt(0)
+
+
+// v0 = s2 * 64 + v0. Presumably workgroup id * 64 + work-item id, i.e. a
+// flat lane index (compute_pgm_rsrc2_tgid_x_en = 1) - confirm against the
+// dispatch setup.
+    s_lshl_b32              s2, s2, 0x6
+    V_ADD_CO_U32            v0, s2, v0
+
+// Phase 1 source address: v[2:3] = s[4:5] + v0 (64-bit add through vcc).
+    v_mov_b32               v3, s5
+    V_ADD_CO_U32            v2, v0, s4
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+
+// Phase 1 destination address: v[4:5] = s[6:7] + v0.
+    v_mov_b32               v5, s7
+    V_ADD_CO_U32            v4, v0, s6
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+// Phase 1: byte copy while the source address is below s[8:9].
+  L_COPY_ALIGNED_PHASE_1_LOOP:
+
+// Leave when no lane has work left; otherwise disable the finished lanes.
+    V_CMP_LT_U64            v[2:3], s[8:9]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_1_DONE
+    s_and_b64               exec, exec, vcc
+
+
+// The adds below overwrite vcc; exec has already captured the comparison.
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+    V_ADD_CO_U32            v2, v2, s24
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+
+    flat_store_byte         v[4:5], v1
+    V_ADD_CO_U32            v4, v4, s24
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+    s_branch                L_COPY_ALIGNED_PHASE_1_LOOP
+
+  L_COPY_ALIGNED_PHASE_1_DONE:
+
+// Re-enable every lane that phase 1 masked off.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+
+// s25 = phase 2 stride in bytes: s24 * 16 for dwordx4, s24 * 4 for dword.
+.if kCopyAlignedVecWidth == 4
+      s_lshl_b32            s25, s24, 0x4
+  .else
+      s_lshl_b32            s25, s24, 0x2
+  .endif
+
+// v1 = this lane's byte offset, scaled the same way as the stride.
+  .if kCopyAlignedVecWidth == 4
+    v_lshlrev_b32          v1, 0x4, v0
+  .else
+    v_lshlrev_b32          v1, 0x2, v0
+  .endif
+
+
+// Phase 2 addresses: v[2:3] = s[8:9] + v1, v[4:5] = s[10:11] + v1.
+    v_mov_b32               v3, s9
+    V_ADD_CO_U32            v2, v1, s8
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+    v_mov_b32               v5, s11
+    V_ADD_CO_U32            v4, v1, s10
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+// Phase 2: vector copy while the source address is below s[12:13].
+// exec is not narrowed here, so all lanes copy as long as any lane is in
+// range. NOTE(review): presumably the host sizes this phase so that every
+// lane stays in range together - confirm against the caller.
+  L_COPY_ALIGNED_PHASE_2_LOOP:
+
+    V_CMP_LT_U64            v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_2_DONE
+
+// Recursive macro: one load per unroll step into the registers starting at
+// v8, advancing the source address by s25 after each load.
+.macro mCopyAlignedPhase2Load iter iter_end
+    .if kCopyAlignedVecWidth == 4
+      flat_load_dwordx4    v[8 + (\iter * 4):8 + (\iter * 4) + 3], v[2:3]
+    .else
+      flat_load_dword      v[8 + \iter], v[2:3]
+    .endif
+
+    V_ADD_CO_U32           v2, v2, s25
+    V_ADD_CO_CI_U32        v3, v3, 0x0
+
+    .if (\iter_end - \iter)
+      mCopyAlignedPhase2Load (\iter + 1), \iter_end
+    .endif
+.endm
+
+mCopyAlignedPhase2Load 0, (kCopyAlignedUnroll - 1)
+
+// All loads must have landed before the stores read the data registers.
+  s_waitcnt                vmcnt(0)
+
+// Recursive macro: one store per unroll step from the registers starting at
+// v8, advancing the destination address by s25 after each store.
+.macro mCopyAlignedPhase2Store iter iter_end
+    .if kCopyAlignedVecWidth == 4
+      flat_store_dwordx4   v[4:5], v[8 + (\iter * 4):8 + (\iter * 4) + 3]
+    .else
+      flat_store_dword     v[4:5], v[8 + \iter]
+    .endif
+
+	V_ADD_CO_U32         v4, v4, s25
+	V_ADD_CO_CI_U32      v5, v5, 0x0
+
+
+    .if (\iter_end - \iter)
+      mCopyAlignedPhase2Store (\iter + 1), \iter_end
+    .endif
+.endm
+
+mCopyAlignedPhase2Store 0, (kCopyAlignedUnroll - 1)
+
+  s_branch                L_COPY_ALIGNED_PHASE_2_LOOP
+
+  L_COPY_ALIGNED_PHASE_2_DONE:
+
+// Phase 3 setup: dword granularity, so stride and lane offset are scaled by 4.
+    s_lshl_b32              s25, s24, 0x2
+
+// v[2:3] = s[12:13] + v0 * 4, v[4:5] = s[14:15] + v0 * 4.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s13
+    V_ADD_CO_U32            v2, v1, s12
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+    v_mov_b32               v5, s15
+    V_ADD_CO_U32            v4, v1, s14
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+// Phase 3: dword copy while the source address is below s[16:17]; finished
+// lanes are disabled through exec.
+  L_COPY_ALIGNED_PHASE_3_LOOP:
+
+    V_CMP_LT_U64            v[2:3], s[16:17]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_3_DONE
+    s_and_b64               exec, exec, vcc
+
+
+// Unlike phase 1, the source address is advanced before waiting on the load.
+    flat_load_dword         v1, v[2:3]
+    V_ADD_CO_U32            v2, v2, s25
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+    s_waitcnt               vmcnt(0)
+
+
+    flat_store_dword        v[4:5], v1
+    V_ADD_CO_U32            v4, v4, s25
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+    s_branch                L_COPY_ALIGNED_PHASE_3_LOOP
+
+  L_COPY_ALIGNED_PHASE_3_DONE:
+
+// Re-enable every lane that phase 3 masked off.
+    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF
+
+// Phase 4 addresses: v[2:3] = s[16:17] + v0, v[4:5] = s[18:19] + v0.
+    v_mov_b32               v3, s17
+    V_ADD_CO_U32            v2, v0, s16
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+    v_mov_b32               v5, s19
+    V_ADD_CO_U32            v4, v0, s18
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+// Phase 4: a single, non-looping byte copy for lanes whose source address
+// is below s[20:21].
+    V_CMP_LT_U64            v[2:3], s[20:21]
+    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_4_DONE
+    s_and_b64               exec, exec, vcc
+
+    flat_load_ubyte         v1, v[2:3]
+    s_waitcnt               vmcnt(0)
+
+    flat_store_byte         v[4:5], v1
+
+  L_COPY_ALIGNED_PHASE_4_DONE:
+    s_endpgm
+
@@ -0,0 +1,179 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//   	AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//     www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////////
+
+.text
+
+// V_ADD_CO_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add spelled for the gfx generation being assembled:
+// v_add_u32 before gfx9, v_add_co_u32 from gfx9 on. The second operand
+// (presumably the carry-out destination) is vcc before gfx10 and vcc_lo from
+// gfx10 on.
+.macro V_ADD_CO_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_add_u32           \vdst, vcc, \src0, \vsrc1
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_add_co_u32        \vdst, vcc, \src0, \vsrc1
+	.else
+		v_add_co_u32        \vdst, vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+
+// V_ADD_CO_CI_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add that also takes vcc/vcc_lo as a trailing input
+// operand (presumably the carry-in), spelled for the gfx generation being
+// assembled: v_addc_u32 before gfx9, v_addc_co_u32 on gfx9 and
+// v_add_co_ci_u32 with vcc_lo from gfx10 on.
+.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_addc_u32          \vdst, vcc, \src0, \vsrc1, vcc
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_addc_co_u32       \vdst, vcc, \src0, \vsrc1, vcc
+	.else
+		v_add_co_ci_u32     \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
+	.endif
+.endm
+
+// V_CMP_LT_U64 src0, vsrc1
+// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to vcc
+// before gfx10 and to vcc_lo from gfx10 on.
+.macro V_CMP_LT_U64 src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 10)
+		v_cmp_lt_u64        vcc, \src0, \vsrc1
+	.else
+		v_cmp_lt_u64        vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+// CopyMisaligned: byte-granular memory copy kernel that runs two phases.
+//
+// Arguments are read from the block addressed by s[0:1] (presumably the
+// kernarg segment pointer, given enable_sgpr_kernarg_segment_ptr = 1):
+//   0x00  s[4:5]    phase 1 source start
+//   0x08  s[6:7]    phase 1 destination start
+//   0x10  s[8:9]    phase 1 source end   / phase 2 source start
+//   0x18  s[10:11]  phase 2 destination start
+//   0x20  s[12:13]  phase 2 source end
+//   0x28  s[14:15]  loaded but not referenced by this kernel
+//   0x30  s16       per-iteration stride in bytes
+//
+// Register use: v0 lane index, v1 phase 2 data, v[2:3] source address,
+// v[4:5] destination address, v6 and up phase 1 data (one per unroll step).
+.set kCopyMisalignedUnroll, 4
+.set kCopyMisalignedNumSGPRs, 17
+.set kCopyMisalignedNumVGPRs, 6 + kCopyMisalignedUnroll
+.set CopyMisalignedRsrc1SGPRs , (kCopyMisalignedNumSGPRs - 1) / 8
+
+// Clamp to zero; with the constants above the value is already non-negative.
+.if CopyMisalignedRsrc1SGPRs  < 0
+    .set CopyMisalignedRsrc1SGPRs , 0
+.endif
+
+.set CopyMisalignedRsrc1VGPRs , (kCopyMisalignedNumVGPRs - 1) / 4
+.if CopyMisalignedRsrc1VGPRs  < 0
+    .set CopyMisalignedRsrc1VGPRs , 0
+.endif
+
+.p2align 8
+
+CopyMisaligned:
+  compute_pgm_rsrc1_sgprs = CopyMisalignedRsrc1SGPRs
+  compute_pgm_rsrc1_vgprs = CopyMisalignedRsrc1VGPRs
+  compute_pgm_rsrc2_user_sgpr = 2
+  compute_pgm_rsrc2_tgid_x_en = 1
+  enable_sgpr_kernarg_segment_ptr = 1
+
+// Fetch all arguments, then wait for the scalar loads to complete.
+  s_load_dwordx4  s[4:7], s[0:1], 0x0
+  s_load_dwordx4  s[8:11], s[0:1], 0x10
+  s_load_dwordx4  s[12:15], s[0:1], 0x20
+  s_load_dword    s16, s[0:1], 0x30
+  s_waitcnt             lgkmcnt(0)
+
+// v0 = s2 * 64 + v0. Presumably workgroup id * 64 + work-item id, i.e. a
+// flat lane index - confirm against the dispatch setup.
+  s_lshl_b32            s2, s2, 0x6
+  V_ADD_CO_U32          v0, s2, v0
+
+// Phase 1 addresses: v[2:3] = s[4:5] + v0, v[4:5] = s[6:7] + v0.
+  v_mov_b32             v3, s5
+  V_ADD_CO_U32          v2, v0, s4
+  V_ADD_CO_CI_U32       v3, v3, 0x0
+
+  v_mov_b32              v5, s7
+  V_ADD_CO_U32           v4, v0, s6
+  V_ADD_CO_CI_U32        v5, v5, 0x0
+
+// Phase 1: unrolled byte copy while the source address is below s[8:9].
+// exec is not narrowed here, so all lanes copy as long as any lane is in
+// range. NOTE(review): presumably the host sizes this phase so that every
+// lane stays in range together - confirm against the caller.
+  L_COPY_MISALIGNED_PHASE_1_LOOP:
+
+  V_CMP_LT_U64          v[2:3], s[8:9]
+  s_cbranch_vccz        L_COPY_MISALIGNED_PHASE_1_DONE
+
+
+// Recursive macro: one byte load per unroll step into v6, v7, ...,
+// advancing the source address by s16 after each load.
+  .macro mCopyMisalignedPhase1Load iter iter_end
+    flat_load_ubyte     v[6 + \iter], v[2:3]
+    V_ADD_CO_U32        v2, v2, s16
+    V_ADD_CO_CI_U32     v3, v3, 0x0
+
+    .if (\iter_end - \iter)
+      mCopyMisalignedPhase1Load (\iter + 1), \iter_end
+    .endif
+  .endm
+
+  mCopyMisalignedPhase1Load 0, (kCopyMisalignedUnroll - 1)
+
+// All loads must have landed before the stores read the data registers.
+  s_waitcnt                vmcnt(0)
+
+// Recursive macro: one byte store per unroll step from v6, v7, ...,
+// advancing the destination address by s16 after each store.
+  .macro mCopyMisalignedPhase1Store iter iter_end
+    flat_store_byte        v[4:5], v[6 + \iter]
+    V_ADD_CO_U32           v4, v4, s16
+    V_ADD_CO_CI_U32        v5, v5, 0x0
+
+    .if (\iter_end - \iter)
+      mCopyMisalignedPhase1Store (\iter + 1), \iter_end
+    .endif
+  .endm
+
+    mCopyMisalignedPhase1Store 0, (kCopyMisalignedUnroll - 1)
+
+    s_branch                L_COPY_MISALIGNED_PHASE_1_LOOP
+
+  L_COPY_MISALIGNED_PHASE_1_DONE:
+
+// Phase 2 addresses: v[2:3] = s[8:9] + v0, v[4:5] = s[10:11] + v0.
+    v_mov_b32               v3, s9
+    V_ADD_CO_U32            v2, v0, s8
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+    v_mov_b32               v5, s11
+    V_ADD_CO_U32            v4, v0, s10
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+// Phase 2: one byte per lane per iteration while the source address is
+// below s[12:13]; finished lanes are disabled through exec.
+  L_COPY_MISALIGNED_PHASE_2_LOOP:
+
+    V_CMP_LT_U64            v[2:3], s[12:13]
+    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+
+
+// The source address is advanced while the load is still in flight.
+    flat_load_ubyte         v1, v[2:3]
+    V_ADD_CO_U32            v2, v2, s16
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+    s_waitcnt               vmcnt(0)
+
+    flat_store_byte         v[4:5], v1
+    V_ADD_CO_U32            v4, v4, s16
+    V_ADD_CO_CI_U32         v5, v5, 0x0
+
+    s_branch                L_COPY_MISALIGNED_PHASE_2_LOOP
+
+// exec is not restored here because the program ends immediately.
+  L_COPY_MISALIGNED_PHASE_2_DONE:
+    s_endpgm
+
+
@@ -0,0 +1,183 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+//                 AMD Research and AMD HSA Software Development
+//
+//                 Advanced Micro Devices, Inc.
+//
+//                 www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+//  - Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimers.
+//  - Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimers in
+//    the documentation and/or other materials provided with the distribution.
+//  - Neither the names of Advanced Micro Devices, Inc,
+//    nor the names of its contributors may be used to endorse or promote
+//    products derived from this Software without specific prior written
+//    permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////
+
+.text
+
+// V_ADD_CO_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add spelled for the gfx generation being assembled:
+// v_add_u32 before gfx9, v_add_co_u32 from gfx9 on. The second operand
+// (presumably the carry-out destination) is vcc before gfx10 and vcc_lo from
+// gfx10 on.
+.macro V_ADD_CO_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_add_u32           \vdst, vcc, \src0, \vsrc1
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_add_co_u32        \vdst, vcc, \src0, \vsrc1
+	.else
+		v_add_co_u32        \vdst, vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+
+// V_ADD_CO_CI_U32 vdst, src0, vsrc1
+// Emits the 32-bit vector add that also takes vcc/vcc_lo as a trailing input
+// operand (presumably the carry-in), spelled for the gfx generation being
+// assembled: v_addc_u32 before gfx9, v_addc_co_u32 on gfx9 and
+// v_add_co_ci_u32 with vcc_lo from gfx10 on.
+.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 9)
+		v_addc_u32          \vdst, vcc, \src0, \vsrc1, vcc
+	.elseif (.amdgcn.gfx_generation_number < 10)
+		v_addc_co_u32       \vdst, vcc, \src0, \vsrc1, vcc
+	.else
+		v_add_co_ci_u32     \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
+	.endif
+.endm
+
+// V_CMP_LT_U64 src0, vsrc1
+// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to vcc
+// before gfx10 and to vcc_lo from gfx10 on.
+.macro V_CMP_LT_U64 src0, vsrc1
+	.if (.amdgcn.gfx_generation_number < 10)
+		v_cmp_lt_u64        vcc, \src0, \vsrc1
+	.else
+		v_cmp_lt_u64        vcc_lo, \src0, \vsrc1
+	.endif
+.endm
+
+// Fill: memory fill kernel that writes a 32-bit value in two phases.
+//
+// Arguments are read from the block addressed by s[0:1] (presumably the
+// kernarg segment pointer, given enable_sgpr_kernarg_segment_ptr = 1):
+//   0x00  s[4:5]  phase 1 destination start
+//   0x08  s[6:7]  phase 1 destination end / phase 2 destination start
+//   0x10  s[8:9]  phase 2 destination end
+//   0x18  s10     32-bit fill value
+//   0x1c  s11     per-iteration stride, counted in elements of the phase
+//
+// Phase 1 stores kFillVecWidth dwords per lane (dwordx4 when the width is
+// 4), phase 2 stores single dwords.
+// Register use: v0 lane index, v1 byte offset, v[2:3] destination address,
+// v4..v7 fill value replicated.
+.set kFillVecWidth, 4
+.set kFillUnroll, 1
+
+// NOTE(review): kFillNumVGPRs evaluates to 5, yet mFillPattern below writes
+// v4..v7, so v0..v7 are in use - confirm the VGPR accounting is intended.
+.set kFillNumSGPRs, 13
+.set kFillNumVGPRs, 4 + kFillUnroll
+
+// Clamp to zero; with the constants above the values are already non-negative.
+.set FillRsrc1SGPRs , (kFillNumSGPRs - 1) / 8
+  .if FillRsrc1SGPRs  < 0
+    .set FillRsrc1SGPRs , 0
+  .endif
+
+.set FillRsrc1VGPRs , (kFillNumVGPRs - 1) / 4
+  .if FillRsrc1VGPRs  < 0
+    .set FillRsrc1VGPRs , 0
+  .endif
+
+.p2align 8
+
+Fill:
+
+    compute_pgm_rsrc1_sgprs = FillRsrc1SGPRs
+    compute_pgm_rsrc1_vgprs = FillRsrc1VGPRs
+    compute_pgm_rsrc2_user_sgpr = 2
+    compute_pgm_rsrc2_tgid_x_en = 1
+    enable_sgpr_kernarg_segment_ptr = 1
+
+// Fetch all arguments, then wait for the scalar loads to complete.
+    s_load_dwordx4  s[4:7], s[0:1], 0x0
+    s_load_dwordx4  s[8:11], s[0:1], 0x10
+    s_waitcnt       lgkmcnt(0)
+
+// v0 = s2 * 64 + v0. Presumably workgroup id * 64 + work-item id, i.e. a
+// flat lane index - confirm against the dispatch setup.
+    s_lshl_b32       s2, s2, 0x6
+    V_ADD_CO_U32     v0, s2, v0
+
+// Recursive macro: copies the fill value s10 into v4, v5, ... so a vector
+// store can write kFillVecWidth copies at once.
+.macro mFillPattern iter iter_end
+    v_mov_b32              v[4 + \iter], s10
+
+    .if (\iter_end - \iter)
+      mFillPattern (\iter + 1), \iter_end
+    .endif
+  .endm
+
+  mFillPattern 0, (kFillVecWidth - 1)
+
+// s12 = phase 1 stride in bytes: s11 * 16 for dwordx4, s11 * 4 for dword.
+  .if kFillVecWidth == 4
+      s_lshl_b32            s12, s11, 0x4
+  .else
+      s_lshl_b32            s12, s11, 0x2
+  .endif
+
+
+// v1 = this lane's byte offset, scaled the same way as the stride.
+  .if kFillVecWidth == 4
+    v_lshlrev_b32          v1, 0x4, v0
+  .else
+    v_lshlrev_b32          v1, 0x2, v0
+  .endif
+
+// Phase 1 destination address: v[2:3] = s[4:5] + v1.
+   v_mov_b32               v3, s5
+   V_ADD_CO_U32            v2, v1, s4
+   V_ADD_CO_CI_U32         v3, v3, 0x0
+
+// Phase 1: vector stores while the destination address is below s[6:7].
+// exec is not narrowed here, so all lanes store as long as any lane is in
+// range. NOTE(review): presumably the host sizes this phase so that every
+// lane stays in range together - confirm against the caller.
+  L_FILL_PHASE_1_LOOP:
+
+    V_CMP_LT_U64            v[2:3], s[6:7]
+    s_cbranch_vccz          L_FILL_PHASE_1_DONE
+
+// Recursive macro: one store per unroll step, always from v[4:7] (or v4),
+// advancing the destination address by s12 after each store.
+.macro mFillPhase1 iter iter_end
+    .if kFillVecWidth == 4
+      flat_store_dwordx4   v[2:3], v[4:7]
+    .else
+      flat_store_dword     v[2:3], v4
+    .endif
+
+     V_ADD_CO_U32          v2, v2, s12
+     V_ADD_CO_CI_U32       v3, v3, 0x0
+
+    .if \iter < \iter_end
+      mFillPhase1 (\iter + 1), \iter_end
+    .endif
+.endm
+
+mFillPhase1 0, kFillUnroll - 1
+
+  s_branch                L_FILL_PHASE_1_LOOP
+
+  L_FILL_PHASE_1_DONE:
+
+// Phase 2 setup: dword granularity, so stride and lane offset are scaled by 4.
+    s_lshl_b32              s12, s11, 0x2
+
+// v[2:3] = s[6:7] + v0 * 4.
+    v_lshlrev_b32           v1, 0x2, v0
+    v_mov_b32               v3, s7
+    V_ADD_CO_U32            v2, v1, s6
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+// Phase 2: one dword per lane per iteration while the destination address
+// is below s[8:9]; finished lanes are disabled through exec.
+  L_FILL_PHASE_2_LOOP:
+
+    V_CMP_LT_U64            v[2:3], s[8:9]
+    s_cbranch_vccz          L_FILL_PHASE_2_DONE
+    s_and_b64               exec, exec, vcc
+
+
+    flat_store_dword        v[2:3], v4
+    V_ADD_CO_U32            v2, v2, s12
+    V_ADD_CO_CI_U32         v3, v3, 0x0
+
+    s_branch                L_FILL_PHASE_2_LOOP
+
+// exec is not restored here because the program ends immediately.
+  L_FILL_PHASE_2_DONE:
+    s_endpgm
+
+
+
@@ -0,0 +1,78 @@
+#!/bin/bash -e
+################################################################################
+##
+## The University of Illinois/NCSA
+## Open Source License (NCSA)
+##
+## Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+##
+## Developed by:
+##
+##                 AMD Research and AMD HSA Software Development
+##
+##                 Advanced Micro Devices, Inc.
+##
+##                 www.amd.com
+##
+## Permission is hereby granted, free of charge, to any person obtaining a copy
+## of this software and associated documentation files (the "Software"), to
+## deal with the Software without restriction, including without limitation
+## the rights to use, copy, modify, merge, publish, distribute, sublicense,
+## and/or sell copies of the Software, and to permit persons to whom the
+## Software is furnished to do so, subject to the following conditions:
+##
+##  - Redistributions of source code must retain the above copyright notice,
+##    this list of conditions and the following disclaimers.
+##  - Redistributions in binary form must reproduce the above copyright
+##    notice, this list of conditions and the following disclaimers in
+##    the documentation and/or other materials provided with the distribution.
+##  - Neither the names of Advanced Micro Devices, Inc,
+##    nor the names of its contributors may be used to endorse or promote
+##    products derived from this Software without specific prior written
+##    permission.
+##
+## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+## DEALINGS WITH THE SOFTWARE.
+##
+################################################################################
+
+# Usage: create_blit_shader_header.sh <output-header> [code-object ...]
+#
+# Writes a C++ header to <output-header> that wraps the "xxd -i" dump of every
+# listed code object in namespace rocr::AMD. Exits non-zero if the output path
+# is missing, xxd is unavailable, or any code object cannot be dumped.
+
+# The "-e" on the shebang line is lost when the script is started as
+# "bash create_blit_shader_header.sh ...", so request it explicitly.
+set -e
+
+if [ "$#" -lt 1 ]
+then
+    echo "usage: $0 <output-header> [code-object ...]" >&2
+    exit 1
+fi
+
+amd_gpu_shaders="$1"
+shift
+
+if ! command -v xxd >/dev/null
+then
+    echo "xxd not found!" >&2
+    exit 1
+fi
+
+# Create the file in a temporary location and then move it in atomically, so a
+# failed run never leaves a truncated header behind for the build to pick up.
+# The temporary file lives next to the destination so the final mv is a rename
+# on the same filesystem.
+tmp_header="${amd_gpu_shaders}.tmp.$$"
+trap 'rm -f "$tmp_header"' EXIT
+
+{
+cat <<EOF
+//==============================================================================
+//  This file is automatically generated during build process, don't modify it
+//==============================================================================
+
+namespace rocr {
+namespace AMD {
+
+EOF
+
+for file in "$@"
+do
+    # Quoted so paths containing spaces or glob characters reach xxd intact.
+    xxd -i "$file"
+    echo -e '\n'
+done
+
+cat <<EOF
+} // namespace AMD
+} // namespace rocr
+
+EOF
+
+} > "$tmp_header"
+
+mv -f "$tmp_header" "$amd_gpu_shaders"
+