Use LLVM compiler to build blit shaders

Generates shader bytecode stream in amd_blit_shaders_v2.h at build time

Change-Id: I5228ec5442a78d074fd85ca9cd7f7a156dd84da3


[ROCm/ROCR-Runtime commit: 4e675ce730]
Tá an tiomantas seo le fáil i:
Shweta Khatri
2023-08-22 16:44:07 -04:00
tiomanta ag Shweta Khatri
tuismitheoir 590cac0321
tiomantas e2c5ecb8dc
D'athraigh 8 comhad le 918 breiseanna agus 214 scriosta
@@ -123,7 +123,8 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler)
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/blit_shaders)
## ------------------------- Linux Compiler and Linker options -------------------------
@@ -202,6 +203,10 @@ target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${SRCS} )
## Pull in the trap handler sub-project and make the runtime target wait for
## its amd_trap_handler_v2 target before building.
add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/trap_handler )
add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 )
## Depend on blit shader target.
## add_dependencies only orders the build: amd_blit_shaders_v2 must be built
## before ${CORE_RUNTIME_TARGET}; nothing is linked by this call.
add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders )
add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2)
## Default IMAGE_SUPPORT to ON for x86 / x86-64 processors, but only when the
## variable has not been defined already (e.g. on the command line).
if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" )
set ( IMAGE_SUPPORT ON )
endif()
@@ -156,174 +156,6 @@ static const unsigned int kCodeFill8[] = {
0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
};
// Pre-assembled shader words for the "CopyAligned" blit shader; this table is
// the one referenced by the gfx940 row of the "CopyAligned" entry in
// GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyAligned940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050,
0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900,
0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05,
0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000,
0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103,
0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105,
0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02,
0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501,
0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000,
0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901,
0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05,
0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000,
0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03,
0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105,
0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002,
0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000,
};
// Pre-assembled shader words for the "CopyMisaligned" blit shader; this table
// is the one referenced by the gfx940 row of the "CopyMisaligned" entry in
// GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyMisaligned940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205,
0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04,
0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032,
0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03,
0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105,
0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002,
0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000,
0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee,
0xbf810000, 0x00000000,
};
// Pre-assembled shader words for the "Fill" blit shader; this table is the one
// referenced by the gfx940 row of the "Fill" entry in GpuAgent::AssembleShader.
// The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeFill940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602,
0x32000002, 0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b,
0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103,
0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02,
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082,
0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a,
0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02,
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000,
};
// Pre-assembled shader words for the "CopyAligned" blit shader; this table is
// the one referenced by the gfx1010 and gfx10 rows of the "CopyAligned" entry
// in GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyAligned10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02,
0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006,
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E,
0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03,
0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05,
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209,
0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04,
0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E,
0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05,
0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02,
0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E,
0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E,
0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103,
0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 0x00020819, 0xD5286A05,
0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010,
0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05,
0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000,
0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000,
};
// Pre-assembled shader words for the "CopyMisaligned" blit shader; this table
// is the one referenced by the gfx1010 and gfx10 rows of the "CopyMisaligned"
// entry in GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyMisaligned10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002,
0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207,
0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102,
0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03,
0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810,
0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008,
0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05,
0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000,
0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70,
0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105,
0xBF82FFEE, 0xBF810000,
};
// Pre-assembled shader words for the "Fill" blit shader; this table is the one
// referenced by the gfx1010 and gfx10 rows of the "Fill" entry in
// GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeFill10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602,
0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03,
0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402,
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B,
0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103,
0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402,
0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
};
// Pre-assembled shader words for the "CopyAligned" blit shader; this table is
// the one referenced by the gfx11 row of the "CopyAligned" entry in
// GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyAligned11[] = {
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
0xF4080400, 0xF8000030, 0xF4080500, 0xF8000040, 0xF4000600, 0xF8000050,
0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002, 0x7E060205, 0xD7006A02,
0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207, 0xD7006A04, 0x00000D00,
0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102, 0xBFA3000F, 0x8BFE6A7E,
0xDC400000, 0x017C0002, 0xBF8903FF, 0xD7006A02, 0x00003102, 0xD5206A03,
0x01A90103, 0xDC600000, 0x007C0104, 0xD7006A04, 0x00003104, 0xD5206A05,
0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x84198418, 0x30020084, 0x7E060209,
0xD7006A02, 0x00001101, 0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04,
0x00001501, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000E,
0xDC5C0000, 0x087C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
0xBF8903FF, 0xDC740000, 0x007C0804, 0xD7006A04, 0x00003304, 0xD5206A05,
0x01A90105, 0xBFA0FFEF, 0x84198218, 0x30020082, 0x7E06020D, 0xD7006A02,
0x00001901, 0xD5206A03, 0x01A90103, 0x7E0A020F, 0xD7006A04, 0x00001D01,
0xD5206A05, 0x01A90105, 0xD459006A, 0x00002102, 0xBFA3000F, 0x8BFE6A7E,
0xDC500000, 0x017C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103,
0xBF8903FF, 0xDC680000, 0x007C0104, 0xD7006A04, 0x00003304, 0xD5206A05,
0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x7E060211, 0xD7006A02, 0x00002100,
0xD5206A03, 0x01A90103, 0x7E0A0213, 0xD7006A04, 0x00002500, 0xD5206A05,
0x01A90105, 0xD459006A, 0x00002902, 0xBFA30006, 0x8BFE6A7E, 0xDC400000,
0x017C0002, 0xBF8903FF, 0xDC600000, 0x007C0104, 0xBFB00000,
};
// Pre-assembled shader words for the "CopyMisaligned" blit shader; this table
// is the one referenced by the gfx11 row of the "CopyMisaligned" entry in
// GpuAgent::AssembleShader. The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeCopyMisaligned11[] = {
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020,
0xF4000400, 0xF8000030, 0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002,
0x7E060205, 0xD7006A02, 0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207,
0xD7006A04, 0x00000D00, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102,
0xBFA30032, 0xDC400000, 0x067C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
0x01A90103, 0xDC400000, 0x077C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
0x01A90103, 0xDC400000, 0x087C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
0x01A90103, 0xDC400000, 0x097C0002, 0xD7006A02, 0x00002102, 0xD5206A03,
0x01A90103, 0xBF8903FF, 0xDC600000, 0x007C0604, 0xD7006A04, 0x00002104,
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0704, 0xD7006A04, 0x00002104,
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0804, 0xD7006A04, 0x00002104,
0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0904, 0xD7006A04, 0x00002104,
0xD5206A05, 0x01A90105, 0xBFA0FFCB, 0x7E060209, 0xD7006A02, 0x00001100,
0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04, 0x00001500, 0xD5206A05,
0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000F, 0x8BFE6A7E, 0xDC400000,
0x017C0002, 0xD7006A02, 0x00002102, 0xD5206A03, 0x01A90103, 0xBF8903FF,
0xDC600000, 0x007C0104, 0xD7006A04, 0x00002104, 0xD5206A05, 0x01A90105,
0xBFA0FFEE, 0xBFB00000,
};
// Pre-assembled shader words for the "Fill" blit shader; this table is the one
// referenced by the gfx11 row of the "Fill" entry in GpuAgent::AssembleShader.
// The values are opaque machine code - do not edit by hand.
static const unsigned int kCodeFill11[] = {
0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xBF89FC0F, 0x84028602,
0xD7006A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A,
0x840C840B, 0x30020084, 0x7E060205, 0xD7006A02, 0x00000901, 0xD5206A03,
0x01A90103, 0xD459006A, 0x00000D02, 0xBFA30007, 0xDC740000, 0x007C0402,
0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF6, 0x840C820B,
0x30020082, 0x7E060207, 0xD7006A02, 0x00000D01, 0xD5206A03, 0x01A90103,
0xD459006A, 0x00001102, 0xBFA30008, 0x8BFE6A7E, 0xDC680000, 0x007C0402,
0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF5, 0xBFB00000,
};
} // namespace AMD
} // namespace rocr
@@ -70,6 +70,7 @@
#include "core/inc/amd_blit_shaders.h"
// Generated header
#include "amd_trap_handler_v2.h"
#include "amd_blit_shaders_v2.h"
#if defined(__linux__)
// libdrm headers
@@ -257,63 +258,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar
std::map<std::string, CompiledShader> compiled_shaders = {
{"TrapHandler",
{
{NULL, 0, 0, 0}, // gfx7
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a
{NULL, 0, 0, 0}, // gfx940
{NULL, 0, 0, 0}, // gfx942
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10
{NULL, 0, 0, 0}, // gfx11
{NULL, 0, 0, 0}, // gfx7
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
{kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9
{kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a
{NULL, 0, 0, 0}, // gfx940
{NULL, 0, 0, 0}, // gfx942
{kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010
{kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10
{NULL, 0, 0, 0}, // gfx11
}},
{"TrapHandlerKfdExceptions",
{
{NULL, 0, 0, 0}, // gfx7
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942
{kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010
{kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10
{kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11
{NULL, 0, 0, 0}, // gfx7
{kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9
{kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940
{kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942
{kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, // gfx1010
{kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10
{kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11
}},
{"CopyAligned",
{
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx9
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx90a
{kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx942
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10
{kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11
{kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7
{kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a
{kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940
{kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010
{kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10
{kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11
}},
{"CopyMisaligned",
{
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx9
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx90a
{kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx942
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10
{kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11
{kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7
{kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a
{kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10}, // gfx940
{kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010
{kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10
{kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11
}},
{"Fill",
{
{kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx9
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx90a
{kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx942
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10
{kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11
{kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7
{kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a
{kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940
{kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010
{kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10
{kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11
}}};
auto compiled_shader_it = compiled_shaders.find(func_name);
@@ -0,0 +1,169 @@
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
##
################################################################################
# Minimum required version of CMake
cmake_minimum_required ( VERSION 3.7 )

# Build the list of LLVM install hints: "<prefix>/llvm" for EVERY entry of
# CMAKE_PREFIX_PATH. Writing ${CMAKE_PREFIX_PATH}/llvm directly only appends
# "/llvm" to the last entry when the prefix path holds more than one directory.
set (BLIT_SHADER_LLVM_HINTS "")
foreach (blit_prefix IN LISTS CMAKE_PREFIX_PATH)
  list (APPEND BLIT_SHADER_LLVM_HINTS "${blit_prefix}/llvm")
endforeach ()

# Find Clang package and LLVM package (hints first, then the default ROCm
# install location as a fallback).
find_package(Clang REQUIRED HINTS ${BLIT_SHADER_LLVM_HINTS} PATHS /opt/rocm/llvm )
find_package(LLVM REQUIRED HINTS ${BLIT_SHADER_LLVM_HINTS} PATHS /opt/rocm/llvm )

# Set the target devices
set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100")

# Set the postfix for each target device.
# POSTFIX is index-aligned with TARGET_DEVS: entry i is the suffix appended to
# the shader name for TARGET_DEVS[i] (e.g. gfx900 -> "9").
set (POSTFIX "9;940;1010;10;11")

# If verbose output is enabled, print paths and target devices.
# The variable name is passed to if() so it is evaluated as a boolean even when
# it is unset or empty.
if(CMAKE_VERBOSE_MAKEFILE)
  get_property(clang_path TARGET clang PROPERTY LOCATION)
  get_property(objcopy_path TARGET llvm-objcopy PROPERTY LOCATION)
  message("Using clang from: ${clang_path}")
  message("Using llvm-objcopy from: ${objcopy_path}")
  message("Blit Shaders assembled for: ${TARGET_DEVS}")
endif()
# Function to assemble one blit shader source for one GPU target.
#
# Despite the name, no LLVM bitcode is produced: clang is run in assembler mode
# to create a code object (<OUTPUT_FILE>.hsaco), and llvm-objcopy then dumps
# that object's .text section into <OUTPUT_FILE>.
#
# Arguments:
#   TARGET_ID   - value passed to clang's -mcpu (e.g. gfx900).
#   INPUT_FILE  - assembly source, relative to CMAKE_CURRENT_SOURCE_DIR.
#   OUTPUT_FILE - name of the raw .text dump; relative names resolve against
#                 the current binary directory.
#
# Only build rules are registered here; nothing runs at configure time.
function(gen_kernel_bc TARGET_ID INPUT_FILE OUTPUT_FILE)
  set(CODE_OBJECT "${OUTPUT_FILE}.hsaco")
  set(SHADER_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE}")

  # Keep the command line as a real CMake list. Building one string and
  # re-splitting it on whitespace would break any path that contains a space.
  set(CLANG_ARG_LIST
    -x assembler
    -target amdgcn-amd-amdhsa
    -mcode-object-version=5
    -fPIC
    "-mcpu=${TARGET_ID}"
    -o "${CODE_OBJECT}"
    "${SHADER_SOURCE}")

  # Add custom command to assemble the shader into a code object
  add_custom_command(OUTPUT ${CODE_OBJECT}
    COMMAND clang ${CLANG_ARG_LIST}
    DEPENDS ${SHADER_SOURCE} clang
    COMMENT "BUILDING bitcode for ${OUTPUT_FILE}..."
    VERBATIM)

  set(OBJCOPY_ARG_LIST
    "--dump-section=.text=${OUTPUT_FILE}"
    "${CODE_OBJECT}")

  # Add custom command to extract the raw .text bytes from the code object
  add_custom_command(OUTPUT ${OUTPUT_FILE}
    COMMAND llvm-objcopy ${OBJCOPY_ARG_LIST}
    DEPENDS ${CODE_OBJECT} llvm-objcopy
    COMMENT "Extracting binary for ${OUTPUT_FILE}..."
    VERBATIM)

  if(CMAKE_VERBOSE_MAKEFILE)
    message(" Blit Shader Source: " "${SHADER_SOURCE}")
    message(" Blit Shader Binary: " "${OUTPUT_FILE}")
  endif()
endfunction()
# Registers the build rules for a single shader / single target pair.
#
# Arguments:
#   BLIT_SHADER_NAME - base name of the generated binary.
#   BLIT_FILE        - assembly source handed to gen_kernel_bc.
#   TARGET_ID        - GPU target handed to gen_kernel_bc.
#   POSTFIX          - suffix appended to BLIT_SHADER_NAME to form the output name.
#
# Side effect: the output name is added to HSACO_TARG_LIST in the caller's scope.
function(build_kernel BLIT_SHADER_NAME BLIT_FILE TARGET_ID POSTFIX)
  # Output name = shader name immediately followed by the per-target suffix.
  string(CONCAT shader_binary "${BLIT_SHADER_NAME}" "${POSTFIX}")

  gen_kernel_bc(${TARGET_ID} ${BLIT_FILE} ${shader_binary})

  # Publish the extended list straight into the parent scope.
  set(HSACO_TARG_LIST ${HSACO_TARG_LIST} ${shader_binary} PARENT_SCOPE)
endfunction()
# Function to build kernels for all devices and shaders.
#
# Arguments:
#   SHADER_NAMES - list of output base names, one per shader.
#   SHADER_FILES - list of assembly sources, index-aligned with SHADER_NAMES.
#
# Reads from the calling scope:
#   TARGET_DEVS - list of GPU targets to assemble for.
#   POSTFIX     - list of name suffixes, index-aligned with TARGET_DEVS.
#
# For every (shader, device) pair a gen_kernel_bc rule producing
# "<shader_name><postfix>" is registered. The resulting names are returned to
# the caller in HSACO_TARG_LIST.
function(build_kernels_for_devices SHADER_NAMES SHADER_FILES)
  set(HSACO_TARG_LIST "")

  list(LENGTH SHADER_NAMES num_shader_names)
  list(LENGTH SHADER_FILES num_shader_files)
  list(LENGTH TARGET_DEVS num_target_devices)
  list(LENGTH POSTFIX num_postfixes)

  # The lists are walked by index, so the second list of each pair must be at
  # least as long as the first. Fail with a readable message instead of an
  # out-of-range list(GET) error in the middle of the loop.
  if(num_shader_files LESS num_shader_names)
    message(FATAL_ERROR
      "build_kernels_for_devices: ${num_shader_names} shader names but only ${num_shader_files} shader files")
  endif()
  if(num_postfixes LESS num_target_devices)
    message(FATAL_ERROR
      "build_kernels_for_devices: ${num_target_devices} target devices but only ${num_postfixes} postfixes")
  endif()

  # foreach(RANGE n) is inclusive of n, so it cannot express "zero iterations";
  # skip the loops entirely when there is nothing to build.
  if(num_shader_names GREATER 0 AND num_target_devices GREATER 0)
    math(EXPR last_shader_index "${num_shader_names} - 1")
    math(EXPR last_device_index "${num_target_devices} - 1")

    foreach(shader_index RANGE ${last_shader_index})
      list(GET SHADER_NAMES ${shader_index} shader_name)
      list(GET SHADER_FILES ${shader_index} shader_file)

      foreach(device_index RANGE ${last_device_index})
        # Get device from list of target devices
        list(GET TARGET_DEVS ${device_index} target_device)
        # Get postfix from list of postfixes
        list(GET POSTFIX ${device_index} postfix)

        if(CMAKE_VERBOSE_MAKEFILE)
          message("\n Generating: ${target_device} for ${shader_name} ...")
        endif()

        # Define the name of the output file for this shader/device pair
        set(CODE_OBJECT_FILE "${shader_name}${postfix}")
        # Register the assemble + extract rules for the current device and shader
        gen_kernel_bc(${target_device} ${shader_file} ${CODE_OBJECT_FILE})
        # Append the output file to the list
        list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}")
      endforeach()
    endforeach()
  endif()

  # Make the list of output files available in the parent scope
  set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE)
endfunction()
# Function to generate the bytecode stream and create the header file.
#
# Arguments:
#   HeaderFILE - base name (no extension) of the generated header; it is also
#                used as the name of the custom target that builds the header.
#
# Reads from the calling scope:
#   HSACO_TARG_LIST - shader binaries passed to the script and used as
#                     dependencies of the header.
#
# The header is written to ${CMAKE_CURRENT_BINARY_DIR}/<HeaderFILE>.h by
# create_blit_shader_header.sh, which receives the header path followed by the
# shader binaries as its arguments.
function(generate_bytecodeStrm HeaderFILE)
  set(ARG_LIST "${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h")
  set(HEADER_SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh")

  # Copy the shell script to the build directory
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create_blit_shader_header.sh
    ${HEADER_SCRIPT}
    COPYONLY)

  # Add a custom command to generate the header file. OUTPUT is spelled with
  # the same absolute path the custom target depends on below, and VERBATIM
  # keeps argument escaping identical across platforms and generators.
  add_custom_command(OUTPUT ${ARG_LIST}
    COMMAND ${HEADER_SCRIPT} ${ARG_LIST} ${HSACO_TARG_LIST}
    COMMENT "Collating blit shaders..."
    DEPENDS ${HSACO_TARG_LIST} ${HEADER_SCRIPT}
    VERBATIM)

  # Add a custom target that depends on the header file
  add_custom_target(${HeaderFILE} DEPENDS ${ARG_LIST})
endfunction()
# Build every blit shader for every target device. The first list holds the
# output base names and the second the matching assembly sources; both lists
# are index-aligned.
build_kernels_for_devices("kCodeCopyAligned;kCodeCopyMisaligned;kCodeFill" "blit_copyAligned.s;blit_copyMisaligned.s;blit_fill.s")
# Generate bytecode stream: creates amd_blit_shaders_v2.h in the current binary
# directory and a custom target named amd_blit_shaders_v2 that builds it.
generate_bytecodeStrm("amd_blit_shaders_v2")
@@ -0,0 +1,257 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
///////////////////////////////////////////////////////////////////////////////////////
.text
// 32-bit add with carry-out: \vdst = \src0 + \vsrc1, carry written to VCC.
// The mnemonic and carry operand are picked from the GFX generation being
// assembled: before 9 -> v_add_u32 / vcc, 9 -> v_add_co_u32 / vcc,
// 10 and later -> v_add_co_u32 / vcc_lo.
.macro V_ADD_CO_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number < 9)
  v_add_u32 \vdst, vcc, \src0, \vsrc1
.elseif (.amdgcn.gfx_generation_number < 10)
  v_add_co_u32 \vdst, vcc, \src0, \vsrc1
.else
  v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
.endif
.endm
// 32-bit add with carry-in and carry-out: \vdst = \src0 + \vsrc1 + carry,
// where the carry is read from and written back to VCC.
// Before GFX9 -> v_addc_u32 / vcc, GFX9 -> v_addc_co_u32 / vcc,
// GFX10 and later -> v_add_co_ci_u32 / vcc_lo.
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number < 9)
  v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
.elseif (.amdgcn.gfx_generation_number < 10)
  v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
.else
  v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
.endif
.endm
// Unsigned 64-bit compare \src0 < \vsrc1 with the per-lane result in VCC.
// Before GFX10 the destination is written as vcc; from GFX10 on as vcc_lo.
.macro V_CMP_LT_U64 src0, vsrc1
.if (.amdgcn.gfx_generation_number < 10)
  v_cmp_lt_u64 vcc, \src0, \vsrc1
.else
  v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
.endif
.endm
// CopyAligned blit shader.
//
// Copies memory in four consecutive phases, each with its own source start,
// destination start and source end address taken from the kernarg segment:
//   phase 1 - byte loop        (flat_load_ubyte / flat_store_byte)
//   phase 2 - vector loop      (flat_load/store_dwordx4 when kCopyAlignedVecWidth == 4)
//   phase 3 - dword loop       (flat_load_dword / flat_store_dword)
//   phase 4 - one byte per lane, no loop
//
// Register use established by the code below:
//   s[0:1]   pointer the kernarg loads read from
//   s[4:5]   phase 1 source start      s[6:7]   phase 1 destination start
//   s[8:9]   phase 1 end / phase 2 source start
//   s[10:11] phase 2 destination start
//   s[12:13] phase 2 end / phase 3 source start
//   s[14:15] phase 3 destination start
//   s[16:17] phase 3 end / phase 4 source start
//   s[18:19] phase 4 destination start
//   s[20:21] phase 4 end
//   s24      per-iteration element stride, s25 = s24 scaled to bytes per phase
//   v0       per-lane element index, v[2:3] source address, v[4:5] destination address
// Start the kernel on a 2^8 = 256 byte boundary.
.p2align 8
CopyAligned:
// Dwords moved per phase 2 access; 4 selects the dwordx4 instructions below.
.set kCopyAlignedVecWidth, 4
// NOTE(review): the next three lines are plain symbol assignments in this file;
// they look like kernel-descriptor field values - confirm where (or whether)
// they are consumed.
compute_pgm_rsrc2_user_sgpr = 2
compute_pgm_rsrc2_tgid_x_en = 1
enable_sgpr_kernarg_segment_ptr = 1
// Number of load/store pairs emitted per phase 2 loop iteration.
.set kCopyAlignedUnroll, 1
.set kCopyAlignedNumSGPRs, 32
// 8 + (1 * 4) = 12 with the values above; phase 2 data lives in v8 and up.
.set kCopyAlignedNumVGPRs, (8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))
// (32 - 1) / 8 = 3 and (12 - 1) / 4 = 2 with the values above.
.set CopyAlignedRsrc1SGPRs, (kCopyAlignedNumSGPRs - 1)/8
.set CopyAlignedRsrc1VGPRs, (kCopyAlignedNumVGPRs - 1)/4
compute_pgm_rsrc1_sgprs = CopyAlignedRsrc1SGPRs
compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs
// Fetch kernarg bytes 0x00..0x53 into s[4:24], then wait for the loads to land.
s_load_dwordx4 s[4:7], s[0:1], 0x0
s_load_dwordx4 s[8:11], s[0:1], 0x10
s_load_dwordx4 s[12:15], s[0:1], 0x20
s_load_dwordx4 s[16:19], s[0:1], 0x30
s_load_dwordx4 s[20:23], s[0:1], 0x40
s_load_dword s24, s[0:1], 0x50
s_waitcnt lgkmcnt(0)
// v0 = (s2 << 6) + v0.
// NOTE(review): presumably workgroup id * 64 + local work-item id, i.e. a
// global lane index for 64-wide workgroups - confirm against the dispatch code.
s_lshl_b32 s2, s2, 0x6
V_ADD_CO_U32 v0, s2, v0
// Phase 1 setup: v[2:3] = s[4:5] + v0 (source), v[4:5] = s[6:7] + v0 (destination).
v_mov_b32 v3, s5
V_ADD_CO_U32 v2, v0, s4
V_ADD_CO_CI_U32 v3, v3, 0x0
v_mov_b32 v5, s7
V_ADD_CO_U32 v4, v0, s6
V_ADD_CO_CI_U32 v5, v5, 0x0
// Phase 1: copy one byte per lane per iteration while source < s[8:9].
L_COPY_ALIGNED_PHASE_1_LOOP:
V_CMP_LT_U64 v[2:3], s[8:9]
// Leave the loop once no lane is below the end address.
s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE
// Switch off the lanes that are already at or past the end address.
s_and_b64 exec, exec, vcc
flat_load_ubyte v1, v[2:3]
s_waitcnt vmcnt(0)
// Advance both addresses by s24 bytes.
V_ADD_CO_U32 v2, v2, s24
V_ADD_CO_CI_U32 v3, v3, 0x0
flat_store_byte v[4:5], v1
V_ADD_CO_U32 v4, v4, s24
V_ADD_CO_CI_U32 v5, v5, 0x0
s_branch L_COPY_ALIGNED_PHASE_1_LOOP
L_COPY_ALIGNED_PHASE_1_DONE:
// Re-enable every lane for the next phase.
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
// Phase 2 setup: s25 = stride in bytes and v1 = lane offset in bytes; both are
// the element count shifted by 4 (x16) for dwordx4 or by 2 (x4) otherwise.
.if kCopyAlignedVecWidth == 4
s_lshl_b32 s25, s24, 0x4
.else
s_lshl_b32 s25, s24, 0x2
.endif
.if kCopyAlignedVecWidth == 4
v_lshlrev_b32 v1, 0x4, v0
.else
v_lshlrev_b32 v1, 0x2, v0
.endif
// v[2:3] = s[8:9] + v1 (source), v[4:5] = s[10:11] + v1 (destination).
v_mov_b32 v3, s9
V_ADD_CO_U32 v2, v1, s8
V_ADD_CO_CI_U32 v3, v3, 0x0
v_mov_b32 v5, s11
V_ADD_CO_U32 v4, v1, s10
V_ADD_CO_CI_U32 v5, v5, 0x0
// Phase 2: vector copy while source < s[12:13].
// NOTE(review): unlike phases 1, 3 and 4 this loop does not mask exec with vcc,
// so all lanes keep running as long as any lane is in range - presumably the
// host sizes this phase so that is safe; confirm.
L_COPY_ALIGNED_PHASE_2_LOOP:
V_CMP_LT_U64 v[2:3], s[12:13]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE
// Recursive macro: emits one load plus a source-address advance for every
// value from iter up to and including iter_end.
.macro mCopyAlignedPhase2Load iter iter_end
.if kCopyAlignedVecWidth == 4
flat_load_dwordx4 v[8 + (\iter * 4):8 + (\iter * 4) + 3], v[2:3]
.else
flat_load_dword v[8 + \iter], v[2:3]
.endif
V_ADD_CO_U32 v2, v2, s25
V_ADD_CO_CI_U32 v3, v3, 0x0
.if (\iter_end - \iter)
mCopyAlignedPhase2Load (\iter + 1), \iter_end
.endif
.endm
// With kCopyAlignedUnroll = 1 this expands to a single load (iter 0 only).
mCopyAlignedPhase2Load 0, (kCopyAlignedUnroll - 1)
// All loads must complete before their registers are stored.
s_waitcnt vmcnt(0)
// Recursive macro: emits one store plus a destination-address advance for every
// value from iter up to and including iter_end, mirroring the load macro.
.macro mCopyAlignedPhase2Store iter iter_end
.if kCopyAlignedVecWidth == 4
flat_store_dwordx4 v[4:5], v[8 + (\iter * 4):8 + (\iter * 4) + 3]
.else
flat_store_dword v[4:5], v[8 + \iter]
.endif
V_ADD_CO_U32 v4, v4, s25
V_ADD_CO_CI_U32 v5, v5, 0x0
.if (\iter_end - \iter)
mCopyAlignedPhase2Store (\iter + 1), \iter_end
.endif
.endm
mCopyAlignedPhase2Store 0, (kCopyAlignedUnroll - 1)
s_branch L_COPY_ALIGNED_PHASE_2_LOOP
L_COPY_ALIGNED_PHASE_2_DONE:
// Phase 3 setup: dword granularity, so stride and lane offset are scaled by 4.
s_lshl_b32 s25, s24, 0x2
v_lshlrev_b32 v1, 0x2, v0
// v[2:3] = s[12:13] + v1 (source), v[4:5] = s[14:15] + v1 (destination).
v_mov_b32 v3, s13
V_ADD_CO_U32 v2, v1, s12
V_ADD_CO_CI_U32 v3, v3, 0x0
v_mov_b32 v5, s15
V_ADD_CO_U32 v4, v1, s14
V_ADD_CO_CI_U32 v5, v5, 0x0
// Phase 3: copy one dword per lane per iteration while source < s[16:17].
L_COPY_ALIGNED_PHASE_3_LOOP:
V_CMP_LT_U64 v[2:3], s[16:17]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE
// Switch off the lanes that are already at or past the end address.
s_and_b64 exec, exec, vcc
flat_load_dword v1, v[2:3]
V_ADD_CO_U32 v2, v2, s25
V_ADD_CO_CI_U32 v3, v3, 0x0
s_waitcnt vmcnt(0)
flat_store_dword v[4:5], v1
V_ADD_CO_U32 v4, v4, s25
V_ADD_CO_CI_U32 v5, v5, 0x0
s_branch L_COPY_ALIGNED_PHASE_3_LOOP
L_COPY_ALIGNED_PHASE_3_DONE:
// Re-enable every lane for the final phase.
s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF
// Phase 4 setup: v[2:3] = s[16:17] + v0 (source), v[4:5] = s[18:19] + v0 (destination).
v_mov_b32 v3, s17
V_ADD_CO_U32 v2, v0, s16
V_ADD_CO_CI_U32 v3, v3, 0x0
v_mov_b32 v5, s19
V_ADD_CO_U32 v4, v0, s18
V_ADD_CO_CI_U32 v5, v5, 0x0
// Phase 4: a single pass - each lane below s[20:21] copies exactly one byte.
V_CMP_LT_U64 v[2:3], s[20:21]
s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE
s_and_b64 exec, exec, vcc
flat_load_ubyte v1, v[2:3]
s_waitcnt vmcnt(0)
flat_store_byte v[4:5], v1
L_COPY_ALIGNED_PHASE_4_DONE:
s_endpgm
@@ -0,0 +1,179 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////////
.text
// V_ADD_CO_U32 vdst, src0, vsrc1
//
// Emits a 32-bit add of src0 and vsrc1 into vdst, with the mnemonic and the
// carry-out operand spelled for the selected target generation:
//   generation >= 10 : v_add_co_u32, carry-out operand vcc_lo
//   generation == 9  : v_add_co_u32, carry-out operand vcc
//   older            : v_add_u32,    carry-out operand vcc
.macro V_ADD_CO_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
.elseif (.amdgcn.gfx_generation_number >= 9)
v_add_co_u32 \vdst, vcc, \src0, \vsrc1
.else
v_add_u32 \vdst, vcc, \src0, \vsrc1
.endif
.endm
// V_ADD_CO_CI_U32 vdst, src0, vsrc1
//
// Emits a 32-bit add of src0 and vsrc1 into vdst that takes its carry-in from
// the same condition-code register it writes its carry-out to. It is used
// right after V_ADD_CO_U32 to extend an address add to the high dword.
//   generation >= 10 : v_add_co_ci_u32, carry operand vcc_lo
//   generation == 9  : v_addc_co_u32,   carry operand vcc
//   older            : v_addc_u32,      carry operand vcc
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
.elseif (.amdgcn.gfx_generation_number >= 9)
v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
.else
v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
.endif
.endm
// V_CMP_LT_U64 src0, vsrc1
//
// Emits an unsigned 64-bit "src0 < vsrc1" compare. The result operand is
// vcc_lo on generation >= 10 and vcc on every older generation.
.macro V_CMP_LT_U64 src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
.else
v_cmp_lt_u64 vcc, \src0, \vsrc1
.endif
.endm
// Build-time parameters of the CopyMisaligned kernel.
// kCopyMisalignedUnroll is the number of byte loads (followed by the same
// number of byte stores) expanded into each phase 1 loop iteration.
.set kCopyMisalignedUnroll, 4
// The kernel references s0 through s16.
.set kCopyMisalignedNumSGPRs, 17
// v0..v5 plus one staging register per unrolled load (v[6 + iter]).
.set kCopyMisalignedNumVGPRs, 6 + kCopyMisalignedUnroll
// Register counts reduced to (count - 1) / 8 for SGPRs and (count - 1) / 4
// for VGPRs, clamped so the result is never negative.
// NOTE(review): presumably these are the granulated register counts expected
// by the COMPUTE_PGM_RSRC1 register - confirm against the host-side setup.
.set CopyMisalignedRsrc1SGPRs , (kCopyMisalignedNumSGPRs - 1) / 8
.if CopyMisalignedRsrc1SGPRs < 0
.set CopyMisalignedRsrc1SGPRs , 0
.endif
.set CopyMisalignedRsrc1VGPRs , (kCopyMisalignedNumVGPRs - 1) / 4
.if CopyMisalignedRsrc1VGPRs < 0
.set CopyMisalignedRsrc1VGPRs , 0
.endif
// CopyMisaligned: byte-granular buffer copy in two phases.
//   Phase 1 copies kCopyMisalignedUnroll bytes per lane per loop iteration.
//   Phase 2 copies one byte per lane per iteration with out-of-range lanes
//   masked off.
//
// Kernel arguments as read below (offsets relative to s[0:1]):
//   0x00 s[4:5]   phase 1 source start       0x08 s[6:7]   phase 1 destination start
//   0x10 s[8:9]   phase 1 source end, also phase 2 source start
//   0x18 s[10:11] phase 2 destination start
//   0x20 s[12:13] phase 2 source end         0x28 s[14:15] loaded but not read here
//   0x30 s16      byte step added to both addresses after every access
// NOTE(review): s[0:1] is presumably the kernarg segment pointer, s2 the
// workgroup id in X and v0 the work-item id - confirm against the dispatch setup.

// Start the entry point on a 256-byte boundary.
.p2align 8
CopyMisaligned:
// Plain symbol assignments describing the launch configuration.
// NOTE(review): nothing in this file reads these symbols; presumably the host
// programs the matching dispatch fields itself - confirm.
compute_pgm_rsrc1_sgprs = CopyMisalignedRsrc1SGPRs
compute_pgm_rsrc1_vgprs = CopyMisalignedRsrc1VGPRs
compute_pgm_rsrc2_user_sgpr = 2
compute_pgm_rsrc2_tgid_x_en = 1
enable_sgpr_kernarg_segment_ptr = 1
// Fetch all kernel arguments and wait for the scalar loads to land.
s_load_dwordx4 s[4:7], s[0:1], 0x0
s_load_dwordx4 s[8:11], s[0:1], 0x10
s_load_dwordx4 s[12:15], s[0:1], 0x20
s_load_dword s16, s[0:1], 0x30
s_waitcnt lgkmcnt(0)
// v0 = s2 * 64 + v0: the per-lane byte index used as the starting offset.
s_lshl_b32 s2, s2, 0x6
V_ADD_CO_U32 v0, s2, v0
// v[2:3] = s[4:5] + v0 (phase 1 source address, carry propagated into v3).
v_mov_b32 v3, s5
V_ADD_CO_U32 v2, v0, s4
V_ADD_CO_CI_U32 v3, v3, 0x0
// v[4:5] = s[6:7] + v0 (phase 1 destination address).
v_mov_b32 v5, s7
V_ADD_CO_U32 v4, v0, s6
V_ADD_CO_CI_U32 v5, v5, 0x0
L_COPY_MISALIGNED_PHASE_1_LOOP:
// Leave the loop once no lane has a source address below s[8:9].
// exec is not narrowed here, so every lane runs the body while any lane is
// still in range.
// NOTE(review): presumably the host sizes phase 1 so that all lanes finish on
// the same iteration - confirm.
V_CMP_LT_U64 v[2:3], s[8:9]
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE
// Recursive macro: expands one byte load into v[6 + iter] plus a source
// address advance by s16, for iter = 0 .. iter_end.
.macro mCopyMisalignedPhase1Load iter iter_end
flat_load_ubyte v[6 + \iter], v[2:3]
V_ADD_CO_U32 v2, v2, s16
V_ADD_CO_CI_U32 v3, v3, 0x0
.if (\iter_end - \iter)
mCopyMisalignedPhase1Load (\iter + 1), \iter_end
.endif
.endm
mCopyMisalignedPhase1Load 0, (kCopyMisalignedUnroll - 1)
// All loads must have returned before the staged bytes are stored.
s_waitcnt vmcnt(0)
// Recursive macro: expands one byte store from v[6 + iter] plus a destination
// address advance by s16, for iter = 0 .. iter_end.
.macro mCopyMisalignedPhase1Store iter iter_end
flat_store_byte v[4:5], v[6 + \iter]
V_ADD_CO_U32 v4, v4, s16
V_ADD_CO_CI_U32 v5, v5, 0x0
.if (\iter_end - \iter)
mCopyMisalignedPhase1Store (\iter + 1), \iter_end
.endif
.endm
mCopyMisalignedPhase1Store 0, (kCopyMisalignedUnroll - 1)
s_branch L_COPY_MISALIGNED_PHASE_1_LOOP
L_COPY_MISALIGNED_PHASE_1_DONE:
// Rebuild both addresses from the phase 2 bases:
// v[2:3] = s[8:9] + v0 and v[4:5] = s[10:11] + v0.
v_mov_b32 v3, s9
V_ADD_CO_U32 v2, v0, s8
V_ADD_CO_CI_U32 v3, v3, 0x0
v_mov_b32 v5, s11
V_ADD_CO_U32 v4, v0, s10
V_ADD_CO_CI_U32 v5, v5, 0x0
L_COPY_MISALIGNED_PHASE_2_LOOP:
// Exit when no lane is below s[12:13]; otherwise drop the lanes that are
// already past the end. exec is never restored because the program ends
// right after this loop.
V_CMP_LT_U64 v[2:3], s[12:13]
s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
s_and_b64 exec, exec, vcc
// Copy one byte per active lane, then step both addresses by s16.
flat_load_ubyte v1, v[2:3]
V_ADD_CO_U32 v2, v2, s16
V_ADD_CO_CI_U32 v3, v3, 0x0
s_waitcnt vmcnt(0)
flat_store_byte v[4:5], v1
V_ADD_CO_U32 v4, v4, s16
V_ADD_CO_CI_U32 v5, v5, 0x0
s_branch L_COPY_MISALIGNED_PHASE_2_LOOP
L_COPY_MISALIGNED_PHASE_2_DONE:
s_endpgm
@@ -0,0 +1,183 @@
////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
////////////////////////////////////////////////////////////////////////////////////
.text
// V_ADD_CO_U32 vdst, src0, vsrc1
//
// Emits a 32-bit add of src0 and vsrc1 into vdst, with the mnemonic and the
// carry-out operand spelled for the selected target generation:
//   generation >= 10 : v_add_co_u32, carry-out operand vcc_lo
//   generation == 9  : v_add_co_u32, carry-out operand vcc
//   older            : v_add_u32,    carry-out operand vcc
.macro V_ADD_CO_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
.elseif (.amdgcn.gfx_generation_number >= 9)
v_add_co_u32 \vdst, vcc, \src0, \vsrc1
.else
v_add_u32 \vdst, vcc, \src0, \vsrc1
.endif
.endm
// V_ADD_CO_CI_U32 vdst, src0, vsrc1
//
// Emits a 32-bit add of src0 and vsrc1 into vdst that takes its carry-in from
// the same condition-code register it writes its carry-out to. It is used
// right after V_ADD_CO_U32 to extend an address add to the high dword.
//   generation >= 10 : v_add_co_ci_u32, carry operand vcc_lo
//   generation == 9  : v_addc_co_u32,   carry operand vcc
//   older            : v_addc_u32,      carry operand vcc
.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
.elseif (.amdgcn.gfx_generation_number >= 9)
v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
.else
v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
.endif
.endm
// V_CMP_LT_U64 src0, vsrc1
//
// Emits an unsigned 64-bit "src0 < vsrc1" compare. The result operand is
// vcc_lo on generation >= 10 and vcc on every older generation.
.macro V_CMP_LT_U64 src0, vsrc1
.if (.amdgcn.gfx_generation_number >= 10)
v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
.else
v_cmp_lt_u64 vcc, \src0, \vsrc1
.endif
.endm
// Build-time parameters of the Fill kernel.
//   kFillVecWidth : dwords written per phase 1 store; 4 selects
//                   flat_store_dwordx4, any other value flat_store_dword.
//   kFillUnroll   : number of stores expanded into each phase 1 iteration.
.set kFillVecWidth, 4
.set kFillUnroll, 1
// The kernel references s0 through s12.
.set kFillNumSGPRs, 13
// v0..v3 hold the lane index, byte offset and 64-bit address; the fill
// pattern occupies v4 .. v[4 + kFillVecWidth - 1]. Unrolled stores reuse the
// same pattern registers, so the VGPR count follows the vector width and not
// kFillUnroll. With the current values the resulting granule count is the
// same (1) either way.
.set kFillNumVGPRs, 4 + kFillVecWidth
// Register counts reduced to (count - 1) / 8 for SGPRs and (count - 1) / 4
// for VGPRs, clamped so the result is never negative.
.set FillRsrc1SGPRs , (kFillNumSGPRs - 1) / 8
.if FillRsrc1SGPRs < 0
.set FillRsrc1SGPRs , 0
.endif
.set FillRsrc1VGPRs , (kFillNumVGPRs - 1) / 4
.if FillRsrc1VGPRs < 0
.set FillRsrc1VGPRs , 0
.endif
// Fill: writes a 32-bit pattern over a buffer in two phases.
//   Phase 1 issues vector stores (16 bytes per lane when kFillVecWidth == 4).
//   Phase 2 issues single dword stores with out-of-range lanes masked off.
//
// Kernel arguments as read below (offsets relative to s[0:1]):
//   0x00 s[4:5] phase 1 start address
//   0x08 s[6:7] phase 1 end address, also phase 2 start address
//   0x10 s[8:9] phase 2 end address
//   0x18 s10    fill pattern
//   0x1c s11    per-iteration step in elements; scaled to bytes into s12
// NOTE(review): s[0:1] is presumably the kernarg segment pointer, s2 the
// workgroup id in X and v0 the work-item id - confirm against the dispatch setup.

// Start the entry point on a 256-byte boundary.
.p2align 8
Fill:
// Plain symbol assignments describing the launch configuration.
// NOTE(review): nothing in this file reads these symbols; presumably the host
// programs the matching dispatch fields itself - confirm.
compute_pgm_rsrc1_sgprs = FillRsrc1SGPRs
compute_pgm_rsrc1_vgprs = FillRsrc1VGPRs
compute_pgm_rsrc2_user_sgpr = 2
compute_pgm_rsrc2_tgid_x_en = 1
enable_sgpr_kernarg_segment_ptr = 1
// Fetch all kernel arguments and wait for the scalar loads to land.
s_load_dwordx4 s[4:7], s[0:1], 0x0
s_load_dwordx4 s[8:11], s[0:1], 0x10
s_waitcnt lgkmcnt(0)
// v0 = s2 * 64 + v0: the per-lane element index.
s_lshl_b32 s2, s2, 0x6
V_ADD_CO_U32 v0, s2, v0
// Recursive macro: copies the pattern in s10 into v[4 + iter] for
// iter = 0 .. iter_end, i.e. v4..v7 for the default vector width.
.macro mFillPattern iter iter_end
v_mov_b32 v[4 + \iter], s10
.if (\iter_end - \iter)
mFillPattern (\iter + 1), \iter_end
.endif
.endm
mFillPattern 0, (kFillVecWidth - 1)
// s12 = s11 scaled to the byte size of one phase 1 store (16 or 4 bytes).
.if kFillVecWidth == 4
s_lshl_b32 s12, s11, 0x4
.else
s_lshl_b32 s12, s11, 0x2
.endif
// v1 = v0 scaled the same way: this lane's starting byte offset.
.if kFillVecWidth == 4
v_lshlrev_b32 v1, 0x4, v0
.else
v_lshlrev_b32 v1, 0x2, v0
.endif
// v[2:3] = s[4:5] + v1 (phase 1 address, carry propagated into v3).
v_mov_b32 v3, s5
V_ADD_CO_U32 v2, v1, s4
V_ADD_CO_CI_U32 v3, v3, 0x0
L_FILL_PHASE_1_LOOP:
// Leave the loop once no lane has an address below s[6:7].
// exec is not narrowed here, so every lane runs the body while any lane is
// still in range.
// NOTE(review): presumably the host sizes phase 1 so that all lanes finish on
// the same iteration - confirm.
V_CMP_LT_U64 v[2:3], s[6:7]
s_cbranch_vccz L_FILL_PHASE_1_DONE
// Recursive macro: expands one store plus an address advance by s12 for
// iter = 0 .. iter_end. Every expansion stores from the same registers.
.macro mFillPhase1 iter iter_end
.if kFillVecWidth == 4
flat_store_dwordx4 v[2:3], v[4:7]
.else
flat_store_dword v[2:3], v4
.endif
V_ADD_CO_U32 v2, v2, s12
V_ADD_CO_CI_U32 v3, v3, 0x0
.if \iter < \iter_end
mFillPhase1 (\iter + 1), \iter_end
.endif
.endm
mFillPhase1 0, kFillUnroll - 1
s_branch L_FILL_PHASE_1_LOOP
L_FILL_PHASE_1_DONE:
// Phase 2 always works in single dwords: rescale the step and the lane
// offset by 4 bytes and rebuild the address from s[6:7].
s_lshl_b32 s12, s11, 0x2
v_lshlrev_b32 v1, 0x2, v0
v_mov_b32 v3, s7
V_ADD_CO_U32 v2, v1, s6
V_ADD_CO_CI_U32 v3, v3, 0x0
L_FILL_PHASE_2_LOOP:
// Exit when no lane is below s[8:9]; otherwise drop the lanes that are
// already past the end. exec is never restored because the program ends
// right after this loop.
V_CMP_LT_U64 v[2:3], s[8:9]
s_cbranch_vccz L_FILL_PHASE_2_DONE
s_and_b64 exec, exec, vcc
// Store one pattern dword per active lane, then step the address by s12.
flat_store_dword v[2:3], v4
V_ADD_CO_U32 v2, v2, s12
V_ADD_CO_CI_U32 v3, v3, 0x0
s_branch L_FILL_PHASE_2_LOOP
L_FILL_PHASE_2_DONE:
s_endpgm
@@ -0,0 +1,78 @@
#!/bin/bash -e
################################################################################
##
## The University of Illinois/NCSA
## Open Source License (NCSA)
##
## Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
##
## Developed by:
##
## AMD Research and AMD HSA Software Development
##
## Advanced Micro Devices, Inc.
##
## www.amd.com
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to
## deal with the Software without restriction, including without limitation
## the rights to use, copy, modify, merge, publish, distribute, sublicense,
## and/or sell copies of the Software, and to permit persons to whom the
## Software is furnished to do so, subject to the following conditions:
##
## - Redistributions of source code must retain the above copyright notice,
## this list of conditions and the following disclaimers.
## - Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimers in
## the documentation and/or other materials provided with the distribution.
## - Neither the names of Advanced Micro Devices, Inc,
## nor the names of its contributors may be used to endorse or promote
## products derived from this Software without specific prior written
## permission.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
## DEALINGS WITH THE SOFTWARE.
##
################################################################################
# Usage: <this script> <output-header> [shader-binary ...]
#
# Writes a C++ header to <output-header> containing, inside namespace
# rocr::AMD, the `xxd -i` array dump of every shader binary listed after it.

# Abort on the first failing command even when the script is started as
# `bash <script>`, in which case the -e on the shebang line is not applied.
set -e

amd_gpu_shaders="${1:-}"

if [ -z "$amd_gpu_shaders" ]
then
  echo "usage: $0 <output-header> [shader-binary ...]" >&2
  exit 1
fi

if ! command -v xxd >/dev/null
then
  echo "xxd not found!" >&2
  exit 1
fi

# Create the file in a temporary location and then move it in atomically, so
# that a failed or interrupted run never leaves a truncated header behind.
# The temporary file sits next to the destination so the final mv is a rename
# within one filesystem.
tmp_header="${amd_gpu_shaders}.tmp.$$"
trap 'rm -f "$tmp_header"' EXIT

{
  cat <<EOF
//==============================================================================
// This file is automatically generated during build process, don't modify it
//==============================================================================
namespace rocr {
namespace AMD {
EOF

  # Drop the output path; the remaining arguments are the shader binaries.
  shift
  for file in "$@"
  do
    # Quoted so that paths containing spaces or glob characters reach xxd intact.
    xxd -i "$file"
    echo -e '\n'
  done

  cat <<EOF
} // namespace AMD
} // namespace rocr
EOF
} > "$tmp_header"

mv -f "$tmp_header" "$amd_gpu_shaders"