Use LLVM compiler to build blit shaders
Generates shader bytecode stream in amd_blit_shaders_v2.h at build time
Change-Id: I5228ec5442a78d074fd85ca9cd7f7a156dd84da3
[ROCm/ROCR-Runtime commit: 4e675ce730]
Tá an tiomantas seo le fáil i:
tiomanta ag
Shweta Khatri
tuismitheoir
590cac0321
tiomantas
e2c5ecb8dc
@@ -123,7 +123,8 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode
|
||||
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler)
|
||||
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler
|
||||
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/blit_shaders)
|
||||
|
||||
|
||||
## ------------------------- Linux Compiler and Linker options -------------------------
|
||||
@@ -202,6 +203,10 @@ target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${SRCS} )
|
||||
add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/trap_handler )
|
||||
add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 )
|
||||
|
||||
## Depend on blit shader target.
|
||||
add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders )
|
||||
add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2)
|
||||
|
||||
if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" )
|
||||
set ( IMAGE_SUPPORT ON )
|
||||
endif()
|
||||
|
||||
@@ -156,174 +156,6 @@ static const unsigned int kCodeFill8[] = {
|
||||
0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyAligned940[] = {
|
||||
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
|
||||
0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050,
|
||||
0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900,
|
||||
0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05,
|
||||
0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000,
|
||||
0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103,
|
||||
0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105,
|
||||
0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02,
|
||||
0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501,
|
||||
0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000,
|
||||
0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
|
||||
0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
|
||||
0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901,
|
||||
0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05,
|
||||
0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000,
|
||||
0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
|
||||
0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
|
||||
0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03,
|
||||
0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105,
|
||||
0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002,
|
||||
0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyMisaligned940[] = {
|
||||
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
|
||||
0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205,
|
||||
0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04,
|
||||
0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032,
|
||||
0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
|
||||
0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
|
||||
0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
|
||||
0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
|
||||
0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05,
|
||||
0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05,
|
||||
0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05,
|
||||
0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05,
|
||||
0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03,
|
||||
0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105,
|
||||
0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002,
|
||||
0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000,
|
||||
0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee,
|
||||
0xbf810000, 0x00000000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeFill940[] = {
|
||||
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602,
|
||||
0x32000002, 0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b,
|
||||
0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103,
|
||||
0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02,
|
||||
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082,
|
||||
0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a,
|
||||
0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02,
|
||||
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyAligned10[] = {
|
||||
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
|
||||
0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
|
||||
0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02,
|
||||
0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006,
|
||||
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E,
|
||||
0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03,
|
||||
0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05,
|
||||
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209,
|
||||
0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04,
|
||||
0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E,
|
||||
0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
|
||||
0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05,
|
||||
0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02,
|
||||
0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E,
|
||||
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E,
|
||||
0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
|
||||
0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05,
|
||||
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010,
|
||||
0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05,
|
||||
0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000,
|
||||
0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyMisaligned10[] = {
|
||||
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
|
||||
0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002,
|
||||
0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207,
|
||||
0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102,
|
||||
0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
|
||||
0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
|
||||
0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
|
||||
0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
|
||||
0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810,
|
||||
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810,
|
||||
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810,
|
||||
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810,
|
||||
0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008,
|
||||
0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05,
|
||||
0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
|
||||
0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70,
|
||||
0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105,
|
||||
0xBF82FFEE, 0xBF810000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeFill10[] = {
|
||||
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602,
|
||||
0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
|
||||
0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03,
|
||||
0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402,
|
||||
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B,
|
||||
0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103,
|
||||
0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402,
|
||||
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyAligned11[] = {
|
||||
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
|
||||
0xF4080400, 0xF8000030, 0xF4080500, 0xF8000040, 0xF4000600, 0xF8000050,
|
||||
0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002, 0x7E060205, 0xD7006A02,
|
||||
0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207, 0xD7006A04, 0x00000D00,
|
||||
0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102, 0xBFA3000F, 0x8BFE6A7E,
|
||||
0xDC400000, 0x017C0002, 0xBF8903FF, 0xD7006A02, 0x00003102, 0xD5206A03,
|
||||
0x01A90103, 0xDC600000, 0x007C0104, 0xD7006A04, 0x00003104, 0xD5206A05,
|
||||
0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x84198418, 0x30020084, 0x7E060209,
|
||||
0xD7006A02, 0x00001101, 0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04,
|
||||
0x00001501, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000E,
|
||||
0xDC5C0000, 0x087C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
|
||||
0xBF8903FF, 0xDC740000, 0x007C0804, 0xD7006A04, 0x00003304, 0xD5206A05,
|
||||
0x01A90105, 0xBFA0FFEF, 0x84198218, 0x30020082, 0x7E06020D, 0xD7006A02,
|
||||
0x00001901, 0xD5206A03, 0x01A90103, 0x7E0A020F, 0xD7006A04, 0x00001D01,
|
||||
0xD5206A05, 0x01A90105, 0xD459006A, 0x00002102, 0xBFA3000F, 0x8BFE6A7E,
|
||||
0xDC500000, 0x017C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
|
||||
0xBF8903FF, 0xDC680000, 0x007C0104, 0xD7006A04, 0x00003304, 0xD5206A05,
|
||||
0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x7E060211, 0xD7006A02, 0x00002100,
|
||||
0xD5206A03, 0x01A90103, 0x7E0A0213, 0xD7006A04, 0x00002500, 0xD5206A05,
|
||||
0x01A90105, 0xD459006A, 0x00002902, 0xBFA30006, 0x8BFE6A7E, 0xDC400000,
|
||||
0x017C0002, 0xBF8903FF, 0xDC600000, 0x007C0104, 0xBFB00000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeCopyMisaligned11[] = {
|
||||
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
|
||||
0xF4000400, 0xF8000030, 0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002,
|
||||
0x7E060205, 0xD7006A02, 0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207,
|
||||
0xD7006A04, 0x00000D00, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102,
|
||||
0xBFA30032, 0xDC400000, 0x067C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
|
||||
0x01A90103, 0xDC400000, 0x077C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
|
||||
0x01A90103, 0xDC400000, 0x087C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
|
||||
0x01A90103, 0xDC400000, 0x097C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
|
||||
0x01A90103, 0xBF8903FF, 0xDC600000, 0x007C0604, 0xD7006A04, 0x00002104,
|
||||
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0704, 0xD7006A04, 0x00002104,
|
||||
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0804, 0xD7006A04, 0x00002104,
|
||||
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0904, 0xD7006A04, 0x00002104,
|
||||
0xD5206A05, 0x01A90105, 0xBFA0FFCB, 0x7E060209, 0xD7006A02, 0x00001100,
|
||||
0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04, 0x00001500, 0xD5206A05,
|
||||
0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000F, 0x8BFE6A7E, 0xDC400000,
|
||||
0x017C0002, 0xD7006A02, 0x00002102, 0xD5206A03, 0x01A90103, 0xBF8903FF,
|
||||
0xDC600000, 0x007C0104, 0xD7006A04, 0x00002104, 0xD5206A05, 0x01A90105,
|
||||
0xBFA0FFEE, 0xBFB00000,
|
||||
};
|
||||
|
||||
static const unsigned int kCodeFill11[] = {
|
||||
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xBF89FC0F, 0x84028602,
|
||||
0xD7006A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
|
||||
0x840C840B, 0x30020084, 0x7E060205, 0xD7006A02, 0x00000901, 0xD5206A03,
|
||||
0x01A90103, 0xD459006A, 0x00000D02, 0xBFA30007, 0xDC740000, 0x007C0402,
|
||||
0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF6, 0x840C820B,
|
||||
0x30020082, 0x7E060207, 0xD7006A02, 0x00000D01, 0xD5206A03, 0x01A90103,
|
||||
0xD459006A, 0x00001102, 0xBFA30008, 0x8BFE6A7E, 0xDC680000, 0x007C0402,
|
||||
0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF5, 0xBFB00000,
|
||||
};
|
||||
|
||||
} // namespace AMD
|
||||
} // namespace rocr
|
||||
|
||||
|
||||
+46
-45
@@ -70,6 +70,7 @@
|
||||
#include "core/inc/amd_blit_shaders.h"
|
||||
// Generated headers
|
||||
#include "amd_trap_handler_v2.h"
|
||||
#include "amd_blit_shaders_v2.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
// libdrm headers
|
||||
@@ -257,63 +258,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
|
||||
std::map<std::string, CompiledShader> compiled_shaders = {
|
||||
{"TrapHandler",
|
||||
{
|
||||
{NULL, 0, 0, 0}, // gfx7
|
||||
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
|
||||
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9
|
||||
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a
|
||||
{NULL, 0, 0, 0}, // gfx940
|
||||
{NULL, 0, 0, 0}, // gfx942
|
||||
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010
|
||||
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10
|
||||
{NULL, 0, 0, 0}, // gfx11
|
||||
{NULL, 0, 0, 0}, // gfx7
|
||||
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
|
||||
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9
|
||||
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a
|
||||
{NULL, 0, 0, 0}, // gfx940
|
||||
{NULL, 0, 0, 0}, // gfx942
|
||||
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010
|
||||
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10
|
||||
{NULL, 0, 0, 0}, // gfx11
|
||||
}},
|
||||
{"TrapHandlerKfdExceptions",
|
||||
{
|
||||
{NULL, 0, 0, 0}, // gfx7
|
||||
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
|
||||
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9
|
||||
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a
|
||||
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940
|
||||
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942
|
||||
{kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010
|
||||
{kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10
|
||||
{kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11
|
||||
{NULL, 0, 0, 0}, // gfx7
|
||||
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
|
||||
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9
|
||||
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a
|
||||
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940
|
||||
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942
|
||||
{kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, // gfx1010
|
||||
{kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10
|
||||
{kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11
|
||||
}},
|
||||
{"CopyAligned",
|
||||
{
|
||||
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx9
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx90a
|
||||
{kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx942
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10
|
||||
{kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11
|
||||
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7
|
||||
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8
|
||||
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9
|
||||
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a
|
||||
{kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940
|
||||
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010
|
||||
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10
|
||||
{kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11
|
||||
}},
|
||||
{"CopyMisaligned",
|
||||
{
|
||||
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx9
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx90a
|
||||
{kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx942
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10
|
||||
{kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11
|
||||
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7
|
||||
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8
|
||||
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9
|
||||
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a
|
||||
{kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10}, // gfx940
|
||||
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010
|
||||
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10
|
||||
{kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11
|
||||
}},
|
||||
{"Fill",
|
||||
{
|
||||
{kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx9
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx90a
|
||||
{kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx942
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10
|
||||
{kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11
|
||||
{kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7
|
||||
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8
|
||||
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9
|
||||
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a
|
||||
{kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940
|
||||
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010
|
||||
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10
|
||||
{kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11
|
||||
}}};
|
||||
|
||||
auto compiled_shader_it = compiled_shaders.find(func_name);
|
||||
|
||||
+169
@@ -0,0 +1,169 @@
|
||||
################################################################################
|
||||
##
|
||||
## The University of Illinois/NCSA
|
||||
## Open Source License (NCSA)
|
||||
##
|
||||
## Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
##
|
||||
## Developed by:
|
||||
##
|
||||
## AMD Research and AMD HSA Software Development
|
||||
##
|
||||
## Advanced Micro Devices, Inc.
|
||||
##
|
||||
## www.amd.com
|
||||
##
|
||||
## Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
## of this software and associated documentation files (the "Software"), to
|
||||
## deal with the Software without restriction, including without limitation
|
||||
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
## and/or sell copies of the Software, and to permit persons to whom the
|
||||
## Software is furnished to do so, subject to the following conditions:
|
||||
##
|
||||
## - Redistributions of source code must retain the above copyright notice,
|
||||
## this list of conditions and the following disclaimers.
|
||||
## - Redistributions in binary form must reproduce the above copyright
|
||||
## notice, this list of conditions and the following disclaimers in
|
||||
## the documentation and/or other materials provided with the distribution.
|
||||
## - Neither the names of Advanced Micro Devices, Inc,
|
||||
## nor the names of its contributors may be used to endorse or promote
|
||||
## products derived from this Software without specific prior written
|
||||
## permission.
|
||||
##
|
||||
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
## DEALINGS WITH THE SOFTWARE.
|
||||
##
|
||||
##
|
||||
################################################################################
|
||||
|
||||
# Minimum required version of CMake
|
||||
cmake_minimum_required ( VERSION 3.7 )
|
||||
|
||||
# Find Clang package and LLVM package
|
||||
find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm )
|
||||
find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm )
|
||||
|
||||
# Set the target devices
|
||||
set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100")
|
||||
# Set the postfix for each target device
|
||||
set (POSTFIX "9;940;1010;10;11")
|
||||
|
||||
# If verbose output is enabled, print paths and target devices
|
||||
if(${CMAKE_VERBOSE_MAKEFILE})
|
||||
get_property(clang_path TARGET clang PROPERTY LOCATION)
|
||||
get_property(objcopy_path TARGET llvm-objcopy PROPERTY LOCATION)
|
||||
message("Using clang from: ${clang_path}")
|
||||
message("Using llvm-objcopy from: ${objcopy_path}")
|
||||
message("Blit Shaders assembled for: ${TARGET_DEVS}")
|
||||
endif()
|
||||
|
||||
# Function to assemble a shader source into a code object (.hsaco) and extract its .text section
|
||||
function(gen_kernel_bc TARGET_ID INPUT_FILE OUTPUT_FILE)
|
||||
set(CODE_OBJECT "${OUTPUT_FILE}.hsaco")
|
||||
|
||||
# Separate clang arguments
|
||||
separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-x assembler -target amdgcn-amd-amdhsa -mcode-object-version=5 -fPIC -mcpu=${TARGET_ID} -o ${CODE_OBJECT} ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE}")
|
||||
|
||||
# Add custom command to assemble the shader source into a code object
|
||||
add_custom_command(OUTPUT ${CODE_OBJECT} COMMAND clang ${CLANG_ARG_LIST}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE} clang
|
||||
COMMENT "BUILDING bitcode for ${OUTPUT_FILE}..."
|
||||
VERBATIM)
|
||||
|
||||
separate_arguments(OBJCOPY_ARG_LIST UNIX_COMMAND "--dump-section=.text=${OUTPUT_FILE} ${CODE_OBJECT}")
|
||||
|
||||
# Add custom command to dump the .text section of the code object into the output file
|
||||
add_custom_command(OUTPUT ${OUTPUT_FILE}
|
||||
COMMAND llvm-objcopy ${OBJCOPY_ARG_LIST}
|
||||
DEPENDS ${CODE_OBJECT} llvm-objcopy
|
||||
COMMENT "Extracting binary for ${OUTPUT_FILE}..."
|
||||
VERBATIM)
|
||||
|
||||
if(${CMAKE_VERBOSE_MAKEFILE})
|
||||
message(" Blit Shader Source: " ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE})
|
||||
message(" Blit Shader Binary: " ${OUTPUT_FILE})
|
||||
endif()
|
||||
|
||||
endfunction(gen_kernel_bc)
|
||||
|
||||
# Function to build a kernel for each target device
|
||||
function(build_kernel BLIT_SHADER_NAME BLIT_FILE TARGET_ID POSTFIX)
|
||||
set(CODE_OBJECT_FILE "${BLIT_SHADER_NAME}${POSTFIX}")
|
||||
gen_kernel_bc(${TARGET_ID} ${BLIT_FILE} ${CODE_OBJECT_FILE})
|
||||
list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}")
|
||||
set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE)
|
||||
|
||||
endfunction(build_kernel)
|
||||
|
||||
# Function to build kernels for all devices and shaders
|
||||
function(build_kernels_for_devices SHADER_NAMES SHADER_FILES)
|
||||
set(HSACO_TARG_LIST "")
|
||||
|
||||
list(LENGTH TARGET_DEVS num_target_devices)
|
||||
math(EXPR num_target_devices "${num_target_devices} - 1")
|
||||
list(LENGTH SHADER_NAMES num_shader_names)
|
||||
math(EXPR num_shader_names "${num_shader_names} - 1")
|
||||
|
||||
foreach(shader_index RANGE ${num_shader_names})
|
||||
list(GET SHADER_NAMES ${shader_index} shader_name)
|
||||
list(GET SHADER_FILES ${shader_index} shader_file)
|
||||
foreach(device_index RANGE ${num_target_devices})
|
||||
# Get device from list of target devices
|
||||
list(GET TARGET_DEVS ${device_index} target_device)
|
||||
# Get postfix from list of postfixes
|
||||
list(GET POSTFIX ${device_index} postfix)
|
||||
if(${CMAKE_VERBOSE_MAKEFILE})
|
||||
message("\n Generating: ${target_device} for ${shader_name} ...")
|
||||
endif()
|
||||
|
||||
# Define the output name for this shader/device pair (shader name followed by the device postfix)
|
||||
set(CODE_OBJECT_FILE "${shader_name}${postfix}")
|
||||
|
||||
# Assemble the code object and extract the shader binary for the current device and shader
|
||||
gen_kernel_bc(${target_device} ${shader_file} ${CODE_OBJECT_FILE})
|
||||
# Append the output file for this shader/device pair to the list
|
||||
list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}")
|
||||
endforeach(device_index)
|
||||
endforeach(shader_index)
|
||||
|
||||
# Make the list of code object files available in the parent scope
|
||||
set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE)
|
||||
|
||||
endfunction(build_kernels_for_devices)
|
||||
|
||||
|
||||
# Function to generate the bytecode stream and create the header file
|
||||
function(generate_bytecodeStrm HeaderFILE)
|
||||
set(ARG_LIST "${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h")
|
||||
|
||||
# Copy the shell script to the build directory
|
||||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create_blit_shader_header.sh
|
||||
${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh
|
||||
COPYONLY)
|
||||
|
||||
# Add a custom command to generate the header file
|
||||
add_custom_command(OUTPUT ${HeaderFILE}.h
|
||||
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh ${ARG_LIST} ${HSACO_TARG_LIST}
|
||||
COMMENT "Collating blit shaders..."
|
||||
DEPENDS ${HSACO_TARG_LIST} ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh)
|
||||
|
||||
# Add a custom target that depends on the header file
|
||||
add_custom_target(${HeaderFILE} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h)
|
||||
|
||||
endfunction(generate_bytecodeStrm)
|
||||
|
||||
|
||||
# Build the CopyAligned, CopyMisaligned and Fill kernels for every target device
|
||||
build_kernels_for_devices("kCodeCopyAligned;kCodeCopyMisaligned;kCodeFill" "blit_copyAligned.s;blit_copyMisaligned.s;blit_fill.s")
|
||||
|
||||
# Generate bytecode stream
|
||||
generate_bytecodeStrm("amd_blit_shaders_v2")
|
||||
|
||||
|
||||
|
||||
|
||||
+257
@@ -0,0 +1,257 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
.text
|
||||
|
||||
// V_ADD_CO_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst with VCC as the
// carry-out operand. The mnemonic and the VCC spelling are picked at assembly
// time from .amdgcn.gfx_generation_number:
//   below 9   : v_add_u32    with vcc
//   exactly 9 : v_add_co_u32 with vcc
//   10 and up : v_add_co_u32 with vcc_lo
.macro V_ADD_CO_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_add_u32 \vdst, vcc, \src0, \vsrc1
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_add_co_u32 \vdst, vcc, \src0, \vsrc1
  .else
    v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
|
||||
// V_ADD_CO_CI_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst that names VCC
// both as the carry-in and as the carry-out operand. It is used right after
// V_ADD_CO_U32 to produce the upper half of a 64-bit address. The mnemonic
// and the VCC spelling follow .amdgcn.gfx_generation_number:
//   below 9   : v_addc_u32      with vcc
//   exactly 9 : v_addc_co_u32   with vcc
//   10 and up : v_add_co_ci_u32 with vcc_lo
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .else
    v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
  .endif
.endm
|
||||
|
||||
// V_CMP_LT_U64 src0, vsrc1
//
// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to VCC.
// Generations below 10 name the destination vcc; 10 and up name it vcc_lo.
.macro V_CMP_LT_U64 src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 10)
    v_cmp_lt_u64 vcc, \src0, \vsrc1
  .else
    v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
|
||||
.p2align 8

// CopyAligned: four-phase memory copy.
//
// Register roles, as used by the code below:
//   s[0:1]    base address of every scalar argument load (presumably the
//             kernarg segment pointer - confirm against the dispatch setup)
//   s2        multiplied by 64 and added to v0 (presumably workgroup id and
//             lane id, giving a flat work-item index - confirm)
//   s[4:5]    phase 1 source base        s[6:7]    phase 1 destination base
//   s[8:9]    phase 1 source limit and phase 2 source base
//   s[10:11]  phase 2 destination base
//   s[12:13]  phase 2 source limit and phase 3 source base
//   s[14:15]  phase 3 destination base
//   s[16:17]  phase 3 source limit and phase 4 source base
//   s[18:19]  phase 4 destination base
//   s[20:21]  phase 4 source limit
//   s[22:23]  loaded but never referenced in this kernel
//   s24       per-iteration stride, in elements of the current phase
//   s25       s24 scaled to bytes for phases 2 and 3
//   v0        work-item index, v1 scratch / byte offset
//   v[2:3]    current source address, v[4:5] current destination address
//   v8 and up phase 2 data registers
CopyAligned:
  .set kCopyAlignedVecWidth, 4
  compute_pgm_rsrc2_user_sgpr = 2
  compute_pgm_rsrc2_tgid_x_en = 1
  enable_sgpr_kernarg_segment_ptr = 1

  .set kCopyAlignedUnroll, 1
  .set kCopyAlignedNumSGPRs, 32
  .set kCopyAlignedNumVGPRs, (8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
  .set CopyAlignedRsrc1SGPRs, (kCopyAlignedNumSGPRs - 1)/8
  .set CopyAlignedRsrc1VGPRs, (kCopyAlignedNumVGPRs - 1)/4

  compute_pgm_rsrc1_sgprs = CopyAlignedRsrc1SGPRs
  compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs

  // Fetch all arguments, then wait for the scalar loads to land.
  s_load_dwordx4 s[4:7], s[0:1], 0x0
  s_load_dwordx4 s[8:11], s[0:1], 0x10
  s_load_dwordx4 s[12:15], s[0:1], 0x20
  s_load_dwordx4 s[16:19], s[0:1], 0x30
  s_load_dwordx4 s[20:23], s[0:1], 0x40
  s_load_dword s24, s[0:1], 0x50
  s_waitcnt lgkmcnt(0)

  // v0 = s2 * 64 + v0
  s_lshl_b32 s2, s2, 0x6
  V_ADD_CO_U32 v0, s2, v0

  // Phase 1 source address: v[2:3] = s[4:5] + v0
  v_mov_b32 v3, s5
  V_ADD_CO_U32 v2, v0, s4
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 1 destination address: v[4:5] = s[6:7] + v0
  v_mov_b32 v5, s7
  V_ADD_CO_U32 v4, v0, s6
  V_ADD_CO_CI_U32 v5, v5, 0x0

// Phase 1: one byte per lane per iteration while the source address is below
// s[8:9]. Lanes that fail the compare are removed from exec.
L_COPY_ALIGNED_PHASE_1_LOOP:
  V_CMP_LT_U64 v[2:3], s[8:9]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE
  s_and_b64 exec, exec, vcc

  flat_load_ubyte v1, v[2:3]
  s_waitcnt vmcnt(0)
  V_ADD_CO_U32 v2, v2, s24
  V_ADD_CO_CI_U32 v3, v3, 0x0

  flat_store_byte v[4:5], v1
  V_ADD_CO_U32 v4, v4, s24
  V_ADD_CO_CI_U32 v5, v5, 0x0

  s_branch L_COPY_ALIGNED_PHASE_1_LOOP

L_COPY_ALIGNED_PHASE_1_DONE:
  // Re-enable every lane for phase 2.
  s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF

  // Byte stride (s25) and per-lane byte offset (v1) for the phase 2 element
  // size: 16 bytes when kCopyAlignedVecWidth is 4, otherwise 4 bytes.
  .if kCopyAlignedVecWidth == 4
    s_lshl_b32 s25, s24, 0x4
    v_lshlrev_b32 v1, 0x4, v0
  .else
    s_lshl_b32 s25, s24, 0x2
    v_lshlrev_b32 v1, 0x2, v0
  .endif

  // Phase 2 source address: v[2:3] = s[8:9] + v1
  v_mov_b32 v3, s9
  V_ADD_CO_U32 v2, v1, s8
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 2 destination address: v[4:5] = s[10:11] + v1
  v_mov_b32 v5, s11
  V_ADD_CO_U32 v4, v1, s10
  V_ADD_CO_CI_U32 v5, v5, 0x0

  // Emits loads number \iter through \iter_end (inclusive) by recursion.
  // Each load fills its own data registers starting at v8 and then advances
  // the source address by s25.
  .macro mCopyAlignedPhase2Load iter iter_end
    .if kCopyAlignedVecWidth == 4
      flat_load_dwordx4 v[8 + (\iter * 4):8 + (\iter * 4) + 3], v[2:3]
    .else
      flat_load_dword v[8 + \iter], v[2:3]
    .endif

    V_ADD_CO_U32 v2, v2, s25
    V_ADD_CO_CI_U32 v3, v3, 0x0

    .if (\iter_end - \iter)
      mCopyAlignedPhase2Load (\iter + 1), \iter_end
    .endif
  .endm

  // Store counterpart of mCopyAlignedPhase2Load: writes the data registers of
  // iterations \iter through \iter_end and advances the destination address.
  .macro mCopyAlignedPhase2Store iter iter_end
    .if kCopyAlignedVecWidth == 4
      flat_store_dwordx4 v[4:5], v[8 + (\iter * 4):8 + (\iter * 4) + 3]
    .else
      flat_store_dword v[4:5], v[8 + \iter]
    .endif

    V_ADD_CO_U32 v4, v4, s25
    V_ADD_CO_CI_U32 v5, v5, 0x0

    .if (\iter_end - \iter)
      mCopyAlignedPhase2Store (\iter + 1), \iter_end
    .endif
  .endm

// Phase 2: vector-width copy while the source address is below s[12:13].
// exec is not narrowed here, so every lane runs the body as long as VCC is
// non-zero (presumably the host sizes this range so that all lanes finish on
// the same iteration - confirm against the dispatcher).
L_COPY_ALIGNED_PHASE_2_LOOP:
  V_CMP_LT_U64 v[2:3], s[12:13]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE

  mCopyAlignedPhase2Load 0, (kCopyAlignedUnroll - 1)

  s_waitcnt vmcnt(0)

  mCopyAlignedPhase2Store 0, (kCopyAlignedUnroll - 1)

  s_branch L_COPY_ALIGNED_PHASE_2_LOOP

L_COPY_ALIGNED_PHASE_2_DONE:
  // Phase 3 works on single dwords: 4-byte stride and 4-byte lane offset.
  s_lshl_b32 s25, s24, 0x2

  v_lshlrev_b32 v1, 0x2, v0
  // Phase 3 source address: v[2:3] = s[12:13] + v1
  v_mov_b32 v3, s13
  V_ADD_CO_U32 v2, v1, s12
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 3 destination address: v[4:5] = s[14:15] + v1
  v_mov_b32 v5, s15
  V_ADD_CO_U32 v4, v1, s14
  V_ADD_CO_CI_U32 v5, v5, 0x0

// Phase 3: one dword per lane per iteration while the source address is
// below s[16:17]; finished lanes are removed from exec.
L_COPY_ALIGNED_PHASE_3_LOOP:
  V_CMP_LT_U64 v[2:3], s[16:17]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE
  s_and_b64 exec, exec, vcc

  flat_load_dword v1, v[2:3]
  V_ADD_CO_U32 v2, v2, s25
  V_ADD_CO_CI_U32 v3, v3, 0x0
  s_waitcnt vmcnt(0)

  flat_store_dword v[4:5], v1
  V_ADD_CO_U32 v4, v4, s25
  V_ADD_CO_CI_U32 v5, v5, 0x0

  s_branch L_COPY_ALIGNED_PHASE_3_LOOP

L_COPY_ALIGNED_PHASE_3_DONE:
  // Re-enable every lane for phase 4.
  s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF

  // Phase 4 source address: v[2:3] = s[16:17] + v0
  v_mov_b32 v3, s17
  V_ADD_CO_U32 v2, v0, s16
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 4 destination address: v[4:5] = s[18:19] + v0
  v_mov_b32 v5, s19
  V_ADD_CO_U32 v4, v0, s18
  V_ADD_CO_CI_U32 v5, v5, 0x0

  // Phase 4: a single, non-looping byte copy for the lanes whose source
  // address is below s[20:21].
  V_CMP_LT_U64 v[2:3], s[20:21]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE
  s_and_b64 exec, exec, vcc

  flat_load_ubyte v1, v[2:3]
  s_waitcnt vmcnt(0)

  flat_store_byte v[4:5], v1

L_COPY_ALIGNED_PHASE_4_DONE:
  s_endpgm
|
||||
|
||||
+179
@@ -0,0 +1,179 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
.text
|
||||
|
||||
// V_ADD_CO_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst with VCC as the
// carry-out operand. The mnemonic and the VCC spelling are picked at assembly
// time from .amdgcn.gfx_generation_number:
//   below 9   : v_add_u32    with vcc
//   exactly 9 : v_add_co_u32 with vcc
//   10 and up : v_add_co_u32 with vcc_lo
.macro V_ADD_CO_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_add_u32 \vdst, vcc, \src0, \vsrc1
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_add_co_u32 \vdst, vcc, \src0, \vsrc1
  .else
    v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
|
||||
// V_ADD_CO_CI_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst that names VCC
// both as the carry-in and as the carry-out operand. It is used right after
// V_ADD_CO_U32 to produce the upper half of a 64-bit address. The mnemonic
// and the VCC spelling follow .amdgcn.gfx_generation_number:
//   below 9   : v_addc_u32      with vcc
//   exactly 9 : v_addc_co_u32   with vcc
//   10 and up : v_add_co_ci_u32 with vcc_lo
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .else
    v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
  .endif
.endm
|
||||
|
||||
// V_CMP_LT_U64 src0, vsrc1
//
// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to VCC.
// Generations below 10 name the destination vcc; 10 and up name it vcc_lo.
.macro V_CMP_LT_U64 src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 10)
    v_cmp_lt_u64 vcc, \src0, \vsrc1
  .else
    v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
// Assembly-time constants for the CopyMisaligned kernel.

// Number of byte loads (and matching stores) emitted per phase 1 iteration.
.set kCopyMisalignedUnroll, 4

// Register budget: s0..s16 are referenced by the kernel; v0..v5 hold the
// index and addresses, and each unrolled load needs one data VGPR from v6 up.
.set kCopyMisalignedNumSGPRs, 17
.set kCopyMisalignedNumVGPRs, (6 + kCopyMisalignedUnroll)

// Register counts expressed in blocks of 8 SGPRs / 4 VGPRs, minus one,
// clamped so that they never go negative.
.set CopyMisalignedRsrc1SGPRs, ((kCopyMisalignedNumSGPRs - 1) / 8)
.if CopyMisalignedRsrc1SGPRs < 0
  .set CopyMisalignedRsrc1SGPRs, 0
.endif

.set CopyMisalignedRsrc1VGPRs, ((kCopyMisalignedNumVGPRs - 1) / 4)
.if CopyMisalignedRsrc1VGPRs < 0
  .set CopyMisalignedRsrc1VGPRs, 0
.endif
|
||||
|
||||
.p2align 8

// CopyMisaligned: two-phase, byte-granular memory copy.
//
// Register roles, as used by the code below:
//   s[0:1]    base address of every scalar argument load (presumably the
//             kernarg segment pointer - confirm against the dispatch setup)
//   s2        multiplied by 64 and added to v0 (presumably workgroup id and
//             lane id, giving a flat work-item index - confirm)
//   s[4:5]    phase 1 source base        s[6:7]    phase 1 destination base
//   s[8:9]    phase 1 source limit and phase 2 source base
//   s[10:11]  phase 2 destination base
//   s[12:13]  phase 2 source limit
//   s[14:15]  loaded but never referenced in this kernel
//   s16       per-iteration stride in bytes
//   v0        work-item index, v1 phase 2 data
//   v[2:3]    current source address, v[4:5] current destination address
//   v6 and up phase 1 data registers (one per unrolled load)
CopyMisaligned:
  compute_pgm_rsrc1_sgprs = CopyMisalignedRsrc1SGPRs
  compute_pgm_rsrc1_vgprs = CopyMisalignedRsrc1VGPRs
  compute_pgm_rsrc2_user_sgpr = 2
  compute_pgm_rsrc2_tgid_x_en = 1
  enable_sgpr_kernarg_segment_ptr = 1

  // Fetch all arguments, then wait for the scalar loads to land.
  s_load_dwordx4 s[4:7], s[0:1], 0x0
  s_load_dwordx4 s[8:11], s[0:1], 0x10
  s_load_dwordx4 s[12:15], s[0:1], 0x20
  s_load_dword s16, s[0:1], 0x30
  s_waitcnt lgkmcnt(0)

  // v0 = s2 * 64 + v0
  s_lshl_b32 s2, s2, 0x6
  V_ADD_CO_U32 v0, s2, v0

  // Phase 1 source address: v[2:3] = s[4:5] + v0
  v_mov_b32 v3, s5
  V_ADD_CO_U32 v2, v0, s4
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 1 destination address: v[4:5] = s[6:7] + v0
  v_mov_b32 v5, s7
  V_ADD_CO_U32 v4, v0, s6
  V_ADD_CO_CI_U32 v5, v5, 0x0

  // Emits byte loads number \iter through \iter_end (inclusive) by recursion.
  // Load \iter lands in v[6 + \iter]; the source address then advances by s16.
  .macro mCopyMisalignedPhase1Load iter iter_end
    flat_load_ubyte v[6 + \iter], v[2:3]
    V_ADD_CO_U32 v2, v2, s16
    V_ADD_CO_CI_U32 v3, v3, 0x0

    .if (\iter_end - \iter)
      mCopyMisalignedPhase1Load (\iter + 1), \iter_end
    .endif
  .endm

  // Store counterpart of mCopyMisalignedPhase1Load: writes v[6 + \iter] and
  // advances the destination address by s16.
  .macro mCopyMisalignedPhase1Store iter iter_end
    flat_store_byte v[4:5], v[6 + \iter]
    V_ADD_CO_U32 v4, v4, s16
    V_ADD_CO_CI_U32 v5, v5, 0x0

    .if (\iter_end - \iter)
      mCopyMisalignedPhase1Store (\iter + 1), \iter_end
    .endif
  .endm

// Phase 1: kCopyMisalignedUnroll bytes per lane per iteration while the
// source address is below s[8:9]. exec is not narrowed here, so every lane
// runs the body as long as VCC is non-zero (presumably the host sizes this
// range so that all lanes finish together - confirm against the dispatcher).
L_COPY_MISALIGNED_PHASE_1_LOOP:
  V_CMP_LT_U64 v[2:3], s[8:9]
  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE

  mCopyMisalignedPhase1Load 0, (kCopyMisalignedUnroll - 1)

  s_waitcnt vmcnt(0)

  mCopyMisalignedPhase1Store 0, (kCopyMisalignedUnroll - 1)

  s_branch L_COPY_MISALIGNED_PHASE_1_LOOP

L_COPY_MISALIGNED_PHASE_1_DONE:
  // Phase 2 source address: v[2:3] = s[8:9] + v0
  v_mov_b32 v3, s9
  V_ADD_CO_U32 v2, v0, s8
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Phase 2 destination address: v[4:5] = s[10:11] + v0
  v_mov_b32 v5, s11
  V_ADD_CO_U32 v4, v0, s10
  V_ADD_CO_CI_U32 v5, v5, 0x0

// Phase 2: one byte per lane per iteration while the source address is below
// s[12:13]; lanes that fail the compare are removed from exec.
L_COPY_MISALIGNED_PHASE_2_LOOP:
  V_CMP_LT_U64 v[2:3], s[12:13]
  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
  s_and_b64 exec, exec, vcc

  flat_load_ubyte v1, v[2:3]
  V_ADD_CO_U32 v2, v2, s16
  V_ADD_CO_CI_U32 v3, v3, 0x0
  s_waitcnt vmcnt(0)

  flat_store_byte v[4:5], v1
  V_ADD_CO_U32 v4, v4, s16
  V_ADD_CO_CI_U32 v5, v5, 0x0

  s_branch L_COPY_MISALIGNED_PHASE_2_LOOP

L_COPY_MISALIGNED_PHASE_2_DONE:
  s_endpgm
|
||||
|
||||
|
||||
+183
@@ -0,0 +1,183 @@
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// The University of Illinois/NCSA
|
||||
// Open Source License (NCSA)
|
||||
//
|
||||
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Developed by:
|
||||
//
|
||||
// AMD Research and AMD HSA Software Development
|
||||
//
|
||||
// Advanced Micro Devices, Inc.
|
||||
//
|
||||
// www.amd.com
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to
|
||||
// deal with the Software without restriction, including without limitation
|
||||
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
// and/or sell copies of the Software, and to permit persons to whom the
|
||||
// Software is furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// - Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimers.
|
||||
// - Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimers in
|
||||
// the documentation and/or other materials provided with the distribution.
|
||||
// - Neither the names of Advanced Micro Devices, Inc,
|
||||
// nor the names of its contributors may be used to endorse or promote
|
||||
// products derived from this Software without specific prior written
|
||||
// permission.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
// DEALINGS WITH THE SOFTWARE.
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
.text
|
||||
|
||||
// V_ADD_CO_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst with VCC as the
// carry-out operand. The mnemonic and the VCC spelling are picked at assembly
// time from .amdgcn.gfx_generation_number:
//   below 9   : v_add_u32    with vcc
//   exactly 9 : v_add_co_u32 with vcc
//   10 and up : v_add_co_u32 with vcc_lo
.macro V_ADD_CO_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_add_u32 \vdst, vcc, \src0, \vsrc1
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_add_co_u32 \vdst, vcc, \src0, \vsrc1
  .else
    v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
|
||||
// V_ADD_CO_CI_U32 vdst, src0, vsrc1
//
// Emits one 32-bit vector add of \src0 and \vsrc1 into \vdst that names VCC
// both as the carry-in and as the carry-out operand. It is used right after
// V_ADD_CO_U32 to produce the upper half of a 64-bit address. The mnemonic
// and the VCC spelling follow .amdgcn.gfx_generation_number:
//   below 9   : v_addc_u32      with vcc
//   exactly 9 : v_addc_co_u32   with vcc
//   10 and up : v_add_co_ci_u32 with vcc_lo
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 9)
    v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .elseif (.amdgcn.gfx_generation_number < 10)
    v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
  .else
    v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
  .endif
.endm
|
||||
|
||||
// V_CMP_LT_U64 src0, vsrc1
//
// Emits v_cmp_lt_u64 on \src0 and \vsrc1 with the result written to VCC.
// Generations below 10 name the destination vcc; 10 and up name it vcc_lo.
.macro V_CMP_LT_U64 src0, vsrc1
  .if (.amdgcn.gfx_generation_number < 10)
    v_cmp_lt_u64 vcc, \src0, \vsrc1
  .else
    v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
  .endif
.endm
|
||||
|
||||
// Assembly-time constants for the Fill kernel.

// Dwords written by each phase 1 store (4 selects flat_store_dwordx4).
.set kFillVecWidth, 4
// Number of stores emitted per phase 1 loop iteration.
.set kFillUnroll, 1

// s0..s12 are referenced by the kernel.
.set kFillNumSGPRs, 13

// v0..v3 hold the index, byte offset and address; the fill pattern occupies
// v4..v[4 + kFillVecWidth - 1]. The unroll factor does not consume registers
// because every unrolled store reuses the same pattern registers, so the
// count is tied to the vector width, not to kFillUnroll. With the current
// values the block count below is 1 either way.
.set kFillNumVGPRs, 4 + kFillVecWidth

// Register counts expressed in blocks of 8 SGPRs / 4 VGPRs, minus one,
// clamped so that they never go negative.
.set FillRsrc1SGPRs, (kFillNumSGPRs - 1) / 8
.if FillRsrc1SGPRs < 0
  .set FillRsrc1SGPRs, 0
.endif

.set FillRsrc1VGPRs, (kFillNumVGPRs - 1) / 4
.if FillRsrc1VGPRs < 0
  .set FillRsrc1VGPRs, 0
.endif
|
||||
|
||||
.p2align 8

// Fill: two-phase memory fill with a 32-bit pattern.
//
// Register roles, as used by the code below:
//   s[0:1]    base address of every scalar argument load (presumably the
//             kernarg segment pointer - confirm against the dispatch setup)
//   s2        multiplied by 64 and added to v0 (presumably workgroup id and
//             lane id, giving a flat work-item index - confirm)
//   s[4:5]    phase 1 base address
//   s[6:7]    phase 1 limit and phase 2 base address
//   s[8:9]    phase 2 limit
//   s10       32-bit fill value, replicated into v4..v[4 + kFillVecWidth - 1]
//   s11       per-iteration stride, in elements of the current phase
//   s12       s11 scaled to bytes
//   v0        work-item index, v1 per-lane byte offset
//   v[2:3]    current store address
Fill:
  compute_pgm_rsrc1_sgprs = FillRsrc1SGPRs
  compute_pgm_rsrc1_vgprs = FillRsrc1VGPRs
  compute_pgm_rsrc2_user_sgpr = 2
  compute_pgm_rsrc2_tgid_x_en = 1
  enable_sgpr_kernarg_segment_ptr = 1

  // Fetch all arguments, then wait for the scalar loads to land.
  s_load_dwordx4 s[4:7], s[0:1], 0x0
  s_load_dwordx4 s[8:11], s[0:1], 0x10
  s_waitcnt lgkmcnt(0)

  // v0 = s2 * 64 + v0
  s_lshl_b32 s2, s2, 0x6
  V_ADD_CO_U32 v0, s2, v0

  // Copies the fill value s10 into v[4 + \iter] for \iter through \iter_end
  // (inclusive) by recursion.
  .macro mFillPattern iter iter_end
    v_mov_b32 v[4 + \iter], s10

    .if (\iter_end - \iter)
      mFillPattern (\iter + 1), \iter_end
    .endif
  .endm

  mFillPattern 0, (kFillVecWidth - 1)

  // Byte stride (s12) and per-lane byte offset (v1) for the phase 1 element
  // size: 16 bytes when kFillVecWidth is 4, otherwise 4 bytes.
  .if kFillVecWidth == 4
    s_lshl_b32 s12, s11, 0x4
    v_lshlrev_b32 v1, 0x4, v0
  .else
    s_lshl_b32 s12, s11, 0x2
    v_lshlrev_b32 v1, 0x2, v0
  .endif

  // Phase 1 address: v[2:3] = s[4:5] + v1
  v_mov_b32 v3, s5
  V_ADD_CO_U32 v2, v1, s4
  V_ADD_CO_CI_U32 v3, v3, 0x0

  // Emits stores number \iter through \iter_end (inclusive) by recursion.
  // Every store writes the same pattern registers and then advances the
  // address by s12.
  .macro mFillPhase1 iter iter_end
    .if kFillVecWidth == 4
      flat_store_dwordx4 v[2:3], v[4:7]
    .else
      flat_store_dword v[2:3], v4
    .endif

    V_ADD_CO_U32 v2, v2, s12
    V_ADD_CO_CI_U32 v3, v3, 0x0

    .if \iter < \iter_end
      mFillPhase1 (\iter + 1), \iter_end
    .endif
  .endm

// Phase 1: vector-width stores while the address is below s[6:7]. exec is not
// narrowed here, so every lane runs the body as long as VCC is non-zero
// (presumably the host sizes this range so that all lanes finish together -
// confirm against the dispatcher).
L_FILL_PHASE_1_LOOP:
  V_CMP_LT_U64 v[2:3], s[6:7]
  s_cbranch_vccz L_FILL_PHASE_1_DONE

  mFillPhase1 0, (kFillUnroll - 1)

  s_branch L_FILL_PHASE_1_LOOP

L_FILL_PHASE_1_DONE:
  // Phase 2 works on single dwords: 4-byte stride and 4-byte lane offset.
  s_lshl_b32 s12, s11, 0x2

  v_lshlrev_b32 v1, 0x2, v0
  // Phase 2 address: v[2:3] = s[6:7] + v1
  v_mov_b32 v3, s7
  V_ADD_CO_U32 v2, v1, s6
  V_ADD_CO_CI_U32 v3, v3, 0x0

// Phase 2: one dword per lane per iteration while the address is below
// s[8:9]; lanes that fail the compare are removed from exec.
L_FILL_PHASE_2_LOOP:
  V_CMP_LT_U64 v[2:3], s[8:9]
  s_cbranch_vccz L_FILL_PHASE_2_DONE
  s_and_b64 exec, exec, vcc

  flat_store_dword v[2:3], v4
  V_ADD_CO_U32 v2, v2, s12
  V_ADD_CO_CI_U32 v3, v3, 0x0

  s_branch L_FILL_PHASE_2_LOOP

L_FILL_PHASE_2_DONE:
  s_endpgm
|
||||
|
||||
|
||||
|
||||
Inrite
@@ -0,0 +1,78 @@
|
||||
#!/bin/bash -e
|
||||
################################################################################
|
||||
##
|
||||
## The University of Illinois/NCSA
|
||||
## Open Source License (NCSA)
|
||||
##
|
||||
## Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
##
|
||||
## Developed by:
|
||||
##
|
||||
## AMD Research and AMD HSA Software Development
|
||||
##
|
||||
## Advanced Micro Devices, Inc.
|
||||
##
|
||||
## www.amd.com
|
||||
##
|
||||
## Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
## of this software and associated documentation files (the "Software"), to
|
||||
## deal with the Software without restriction, including without limitation
|
||||
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
## and/or sell copies of the Software, and to permit persons to whom the
|
||||
## Software is furnished to do so, subject to the following conditions:
|
||||
##
|
||||
## - Redistributions of source code must retain the above copyright notice,
|
||||
## this list of conditions and the following disclaimers.
|
||||
## - Redistributions in binary form must reproduce the above copyright
|
||||
## notice, this list of conditions and the following disclaimers in
|
||||
## the documentation and/or other materials provided with the distribution.
|
||||
## - Neither the names of Advanced Micro Devices, Inc,
|
||||
## nor the names of its contributors may be used to endorse or promote
|
||||
## products derived from this Software without specific prior written
|
||||
## permission.
|
||||
##
|
||||
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
## DEALINGS WITH THE SOFTWARE.
|
||||
##
|
||||
################################################################################
|
||||
|
||||
# Usage: <this script> OUTPUT_HEADER [SHADER_BINARY...]
#
# Writes a C++ header to OUTPUT_HEADER containing, inside namespace
# rocr::AMD, the `xxd -i` array dump of every SHADER_BINARY listed after it.

# The -e on the shebang line is lost when the script is started as
# "bash <script>", so request it explicitly as well.
set -e

if [ "$#" -lt 1 ]
then
  echo "usage: $0 <output-header> [shader-binary...]" >&2
  exit 1
fi

amd_gpu_shaders="$1"
shift

if ! command -v xxd >/dev/null
then
  echo "xxd not found!" >&2
  exit 1
fi

# Create the file in a temporary location and then move it in atomically, so
# that a failing xxd never leaves a truncated header with a fresh timestamp
# for the build to pick up.
tmp_header="${amd_gpu_shaders}.tmp.$$"
trap 'rm -f "$tmp_header"' EXIT

{
  cat <<EOF
//==============================================================================
// This file is automatically generated during build process, don't modify it
//==============================================================================

namespace rocr {
namespace AMD {

EOF

  for file in "$@"
  do
    # Quoted so that paths containing spaces or glob characters stay intact.
    xxd -i "$file"
    echo -e '\n'
  done

  cat <<EOF
} // namespace AMD
} // namespace rocr

EOF
} > "$tmp_header"

mv -f "$tmp_header" "$amd_gpu_shaders"
|
||||
|
||||
Tagairt in Eagrán Nua
Cuir bac ar úsáideoir