From e2c5ecb8dc44cc6dbb0f06b52b083a27c7db9260 Mon Sep 17 00:00:00 2001 From: Shweta Khatri Date: Tue, 22 Aug 2023 16:44:07 -0400 Subject: [PATCH] Use LLVM compiler to build blit shaders Generates shader bytecode stream in amd_blit_shaders_v2.h at build time Change-Id: I5228ec5442a78d074fd85ca9cd7f7a156dd84da3 [ROCm/ROCR-Runtime commit: 4e675ce7305d4999e274d03d72441ecc4f0ab916] --- .../runtime/hsa-runtime/CMakeLists.txt | 7 +- .../hsa-runtime/core/inc/amd_blit_shaders.h | 168 ------------ .../core/runtime/amd_gpu_agent.cpp | 91 ++++--- .../core/runtime/blit_shaders/CMakeLists.txt | 169 ++++++++++++ .../runtime/blit_shaders/blit_copyAligned.s | 257 ++++++++++++++++++ .../blit_shaders/blit_copyMisaligned.s | 179 ++++++++++++ .../core/runtime/blit_shaders/blit_fill.s | 183 +++++++++++++ .../blit_shaders/create_blit_shader_header.sh | 78 ++++++ 8 files changed, 918 insertions(+), 214 deletions(-) create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyAligned.s create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyMisaligned.s create mode 100644 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_fill.s create mode 100755 projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/create_blit_shader_header.sh diff --git a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt index 9307d3aeb2..130815c51e 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt +++ b/projects/rocr-runtime/runtime/hsa-runtime/CMakeLists.txt @@ -123,7 +123,8 @@ target_include_directories( ${CORE_RUNTIME_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode - ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler) + ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler + 
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/blit_shaders) ## ------------------------- Linux Compiler and Linker options ------------------------- @@ -202,6 +203,10 @@ target_sources( ${CORE_RUNTIME_TARGET} PRIVATE ${SRCS} ) add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/trap_handler ) add_dependencies( ${CORE_RUNTIME_TARGET} amd_trap_handler_v2 ) +## Depend on blit shader target. +add_subdirectory( ${CMAKE_CURRENT_SOURCE_DIR}/core/runtime/blit_shaders ) +add_dependencies( ${CORE_RUNTIME_TARGET} amd_blit_shaders_v2) + if ( NOT DEFINED IMAGE_SUPPORT AND CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" ) set ( IMAGE_SUPPORT ON ) endif() diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_shaders.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_shaders.h index 12c52cc4b3..4bc1daa6ef 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_shaders.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_shaders.h @@ -156,174 +156,6 @@ static const unsigned int kCodeFill8[] = { 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, }; -static const unsigned int kCodeCopyAligned940[] = { - 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, - 0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050, - 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900, - 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05, - 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000, - 0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103, - 0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105, - 0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02, - 0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501, - 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000, - 0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, - 
0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, - 0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901, - 0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05, - 0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000, - 0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, - 0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, - 0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03, - 0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105, - 0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002, - 0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000, -}; - -static const unsigned int kCodeCopyMisaligned940[] = { - 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, - 0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, - 0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, - 0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032, - 0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, - 0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, - 0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, - 0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, - 0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05, - 0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05, - 0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05, - 0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05, - 0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03, - 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105, - 0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002, - 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000, - 0x007f0104, 0xd1196a04, 0x00002104, 
0xd11c6a05, 0x01a90105, 0xbf82ffee, - 0xbf810000, 0x00000000, -}; - -static const unsigned int kCodeFill940[] = { - 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602, - 0x32000002, 0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b, - 0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103, - 0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02, - 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082, - 0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a, - 0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02, - 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000, -}; - -static const unsigned int kCodeCopyAligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, - 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, 0x7E060205, 0xD70F6A02, - 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, 0xD70F6A04, 0x00020006, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, 0xBF86000F, 0x87FE6A7E, - 0xDC200000, 0x017D0002, 0xBF8C3F70, 0xD70F6A02, 0x00020418, 0xD5286A03, - 0x01A90103, 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020818, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x8F198418, 0x34020084, 0x7E060209, - 0xD70F6A02, 0x00020208, 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, - 0x0002020A, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000E, - 0xDC380000, 0x087D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 0xDC780000, 0x007D0804, 0xD70F6A04, 0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEF, 0x8F198218, 0x34020082, 0x7E06020D, 0xD70F6A02, - 0x0002020C, 0xD5286A03, 0x01A90103, 0x7E0A020F, 0xD70F6A04, 0x0002020E, - 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00002102, 0xBF86000F, 0x87FE6A7E, - 0xDC300000, 0x017D0002, 0xD70F6A02, 0x00020419, 0xD5286A03, 0x01A90103, - 0xBF8C3F70, 0xDC700000, 0x007D0104, 0xD70F6A04, 
0x00020819, 0xD5286A05, - 0x01A90105, 0xBF82FFEE, 0xBEFE04C1, 0x7E060211, 0xD70F6A02, 0x00020010, - 0xD5286A03, 0x01A90103, 0x7E0A0213, 0xD70F6A04, 0x00020012, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00002902, 0xBF860006, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xBF8C3F70, 0xDC600000, 0x007D0104, 0xBF810000, -}; - -static const unsigned int kCodeCopyMisaligned10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, - 0xF4000400, 0xFA000030, 0xBF8CC07F, 0x8F028602, 0xD70F6A00, 0x00020002, - 0x7E060205, 0xD70F6A02, 0x00020004, 0xD5286A03, 0x01A90103, 0x7E0A0207, - 0xD70F6A04, 0x00020006, 0xD5286A05, 0x01A90105, 0xD4E1006A, 0x00001102, - 0xBF860032, 0xDC200000, 0x067D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x077D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x087D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xDC200000, 0x097D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, - 0x01A90103, 0xBF8C3F70, 0xDC600000, 0x007D0604, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0704, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0804, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xDC600000, 0x007D0904, 0xD70F6A04, 0x00020810, - 0xD5286A05, 0x01A90105, 0xBF82FFCB, 0x7E060209, 0xD70F6A02, 0x00020008, - 0xD5286A03, 0x01A90103, 0x7E0A020B, 0xD70F6A04, 0x0002000A, 0xD5286A05, - 0x01A90105, 0xD4E1006A, 0x00001902, 0xBF86000F, 0x87FE6A7E, 0xDC200000, - 0x017D0002, 0xD70F6A02, 0x00020410, 0xD5286A03, 0x01A90103, 0xBF8C3F70, - 0xDC600000, 0x007D0104, 0xD70F6A04, 0x00020810, 0xD5286A05, 0x01A90105, - 0xBF82FFEE, 0xBF810000, -}; - -static const unsigned int kCodeFill10[] = { - 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xBF8CC07F, 0x8F028602, - 0xD70F6A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, - 0x8F0C840B, 0x34020084, 0x7E060205, 0xD70F6A02, 0x00020204, 0xD5286A03, - 0x01A90103, 0xD4E1006A, 0x00000D02, 0xBF860007, 0xDC780000, 0x007D0402, 
- 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF6, 0x8F0C820B, - 0x34020082, 0x7E060207, 0xD70F6A02, 0x00020206, 0xD5286A03, 0x01A90103, - 0xD4E1006A, 0x00001102, 0xBF860008, 0x87FE6A7E, 0xDC700000, 0x007D0402, - 0xD70F6A02, 0x0002040C, 0xD5286A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, -}; - -static const unsigned int kCodeCopyAligned11[] = { - 0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xF4080300, 0xF8000020, - 0xF4080400, 0xF8000030, 0xF4080500, 0xF8000040, 0xF4000600, 0xF8000050, - 0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002, 0x7E060205, 0xD7006A02, - 0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207, 0xD7006A04, 0x00000D00, - 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102, 0xBFA3000F, 0x8BFE6A7E, - 0xDC400000, 0x017C0002, 0xBF8903FF, 0xD7006A02, 0x00003102, 0xD5206A03, - 0x01A90103, 0xDC600000, 0x007C0104, 0xD7006A04, 0x00003104, 0xD5206A05, - 0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x84198418, 0x30020084, 0x7E060209, - 0xD7006A02, 0x00001101, 0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04, - 0x00001501, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000E, - 0xDC5C0000, 0x087C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103, - 0xBF8903FF, 0xDC740000, 0x007C0804, 0xD7006A04, 0x00003304, 0xD5206A05, - 0x01A90105, 0xBFA0FFEF, 0x84198218, 0x30020082, 0x7E06020D, 0xD7006A02, - 0x00001901, 0xD5206A03, 0x01A90103, 0x7E0A020F, 0xD7006A04, 0x00001D01, - 0xD5206A05, 0x01A90105, 0xD459006A, 0x00002102, 0xBFA3000F, 0x8BFE6A7E, - 0xDC500000, 0x017C0002, 0xD7006A02, 0x00003302, 0xD5206A03, 0x01A90103, - 0xBF8903FF, 0xDC680000, 0x007C0104, 0xD7006A04, 0x00003304, 0xD5206A05, - 0x01A90105, 0xBFA0FFEE, 0xBEFE01C1, 0x7E060211, 0xD7006A02, 0x00002100, - 0xD5206A03, 0x01A90103, 0x7E0A0213, 0xD7006A04, 0x00002500, 0xD5206A05, - 0x01A90105, 0xD459006A, 0x00002902, 0xBFA30006, 0x8BFE6A7E, 0xDC400000, - 0x017C0002, 0xBF8903FF, 0xDC600000, 0x007C0104, 0xBFB00000, -}; - -static const unsigned int kCodeCopyMisaligned11[] = { - 0xF4080100, 0xF8000000, 0xF4080200, 
0xF8000010, 0xF4080300, 0xF8000020, - 0xF4000400, 0xF8000030, 0xBF89FC0F, 0x84028602, 0xD7006A00, 0x00020002, - 0x7E060205, 0xD7006A02, 0x00000900, 0xD5206A03, 0x01A90103, 0x7E0A0207, - 0xD7006A04, 0x00000D00, 0xD5206A05, 0x01A90105, 0xD459006A, 0x00001102, - 0xBFA30032, 0xDC400000, 0x067C0002, 0xD7006A02, 0x00002102, 0xD5206A03, - 0x01A90103, 0xDC400000, 0x077C0002, 0xD7006A02, 0x00002102, 0xD5206A03, - 0x01A90103, 0xDC400000, 0x087C0002, 0xD7006A02, 0x00002102, 0xD5206A03, - 0x01A90103, 0xDC400000, 0x097C0002, 0xD7006A02, 0x00002102, 0xD5206A03, - 0x01A90103, 0xBF8903FF, 0xDC600000, 0x007C0604, 0xD7006A04, 0x00002104, - 0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0704, 0xD7006A04, 0x00002104, - 0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0804, 0xD7006A04, 0x00002104, - 0xD5206A05, 0x01A90105, 0xDC600000, 0x007C0904, 0xD7006A04, 0x00002104, - 0xD5206A05, 0x01A90105, 0xBFA0FFCB, 0x7E060209, 0xD7006A02, 0x00001100, - 0xD5206A03, 0x01A90103, 0x7E0A020B, 0xD7006A04, 0x00001500, 0xD5206A05, - 0x01A90105, 0xD459006A, 0x00001902, 0xBFA3000F, 0x8BFE6A7E, 0xDC400000, - 0x017C0002, 0xD7006A02, 0x00002102, 0xD5206A03, 0x01A90103, 0xBF8903FF, - 0xDC600000, 0x007C0104, 0xD7006A04, 0x00002104, 0xD5206A05, 0x01A90105, - 0xBFA0FFEE, 0xBFB00000, -}; - -static const unsigned int kCodeFill11[] = { - 0xF4080100, 0xF8000000, 0xF4080200, 0xF8000010, 0xBF89FC0F, 0x84028602, - 0xD7006A00, 0x00020002, 0x7E08020A, 0x7E0A020A, 0x7E0C020A, 0x7E0E020A, - 0x840C840B, 0x30020084, 0x7E060205, 0xD7006A02, 0x00000901, 0xD5206A03, - 0x01A90103, 0xD459006A, 0x00000D02, 0xBFA30007, 0xDC740000, 0x007C0402, - 0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF6, 0x840C820B, - 0x30020082, 0x7E060207, 0xD7006A02, 0x00000D01, 0xD5206A03, 0x01A90103, - 0xD459006A, 0x00001102, 0xBFA30008, 0x8BFE6A7E, 0xDC680000, 0x007C0402, - 0xD7006A02, 0x00001902, 0xD5206A03, 0x01A90103, 0xBFA0FFF5, 0xBFB00000, -}; - } // namespace AMD } // namespace rocr diff --git 
a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 0893246671..f0797cd24d 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -70,6 +70,7 @@ #include "core/inc/amd_blit_shaders.h" // Generated header #include "amd_trap_handler_v2.h" +#include "amd_blit_shaders_v2.h" #if defined(__linux__) // libdrm headers @@ -257,63 +258,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar std::map compiled_shaders = { {"TrapHandler", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 - {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a - {NULL, 0, 0, 0}, // gfx940 - {NULL, 0, 0, 0}, // gfx942 - {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 - {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 - {NULL, 0, 0, 0}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a + {NULL, 0, 0, 0}, // gfx940 + {NULL, 0, 0, 0}, // gfx942 + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 + {NULL, 0, 0, 0}, // gfx11 }}, {"TrapHandlerKfdExceptions", { - {NULL, 0, 0, 0}, // gfx7 - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 - {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // 
gfx942 - {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010 - {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 - {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 + {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, // gfx1010 + {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 + {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 }}, {"CopyAligned", { - {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx9 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx90a - {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx942 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 - {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 + {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx9 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx90a + {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 + {kCodeCopyAligned9, sizeof(kCodeCopyAligned9), 32, 12}, // gfx942 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 + {kCodeCopyAligned10, 
sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 + {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 }}, {"CopyMisaligned", { - {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx9 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx90a - {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940 - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx942 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 - {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 + {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx9 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx90a + {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10}, // gfx940 + {kCodeCopyMisaligned9, sizeof(kCodeCopyMisaligned9), 23, 10}, // gfx942 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 + {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 }}, {"Fill", { - {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx9 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx90a - {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx942 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 - {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 + {kCodeFill7, 
sizeof(kCodeFill7), 19, 8}, // gfx7 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx9 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx90a + {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 + {kCodeFill9, sizeof(kCodeFill9), 19, 8}, // gfx942 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 + {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 }}}; auto compiled_shader_it = compiled_shaders.find(func_name); diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt new file mode 100644 index 0000000000..dc32b2f2bc --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt @@ -0,0 +1,169 @@ +################################################################################ +## +## The University of Illinois/NCSA +## Open Source License (NCSA) +## +## Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved. +## +## Developed by: +## +## AMD Research and AMD HSA Software Development +## +## Advanced Micro Devices, Inc. +## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and/or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. 
+## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and/or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. +## +## +################################################################################ + +# Minimum required version of CMake +cmake_minimum_required ( VERSION 3.7 ) + +# Find Clang package and LLVM package +find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) +find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) + +# Set the target devices +set (TARGET_DEVS "gfx900;gfx940;gfx1010;gfx1030;gfx1100") +# Set the postfix for each target device (index-aligned with TARGET_DEVS; appended to the shader name to form each output file name) +set (POSTFIX "9;940;1010;10;11") + +# If verbose output is enabled, print paths and target devices +if(${CMAKE_VERBOSE_MAKEFILE}) + get_property(clang_path TARGET clang PROPERTY LOCATION) + get_property(objcopy_path TARGET llvm-objcopy PROPERTY LOCATION) + message("Using clang from: ${clang_path}") + message("Using llvm-objcopy from: ${objcopy_path}") + message("Blit Shaders assembled for: ${TARGET_DEVS}") +endif() + +# Function to assemble INPUT_FILE for TARGET_ID into ${OUTPUT_FILE}.hsaco and dump that code object's .text section to OUTPUT_FILE +function(gen_kernel_bc TARGET_ID INPUT_FILE OUTPUT_FILE) + set(CODE_OBJECT "${OUTPUT_FILE}.hsaco")
+ + # Separate clang arguments + separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-x assembler -target amdgcn-amd-amdhsa -mcode-object-version=5 -fPIC -mcpu=${TARGET_ID} -o ${CODE_OBJECT} ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE}") + + # Add custom command to assemble the shader source into the .hsaco code object + add_custom_command(OUTPUT ${CODE_OBJECT} COMMAND clang ${CLANG_ARG_LIST} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE} clang + COMMENT "BUILDING bitcode for ${OUTPUT_FILE}..." + VERBATIM) + + separate_arguments(OBJCOPY_ARG_LIST UNIX_COMMAND "--dump-section=.text=${OUTPUT_FILE} ${CODE_OBJECT}") + + # Add custom command to dump the code object's .text section into OUTPUT_FILE + add_custom_command(OUTPUT ${OUTPUT_FILE} + COMMAND llvm-objcopy ${OBJCOPY_ARG_LIST} + DEPENDS ${CODE_OBJECT} llvm-objcopy + COMMENT "Extracting binary for ${OUTPUT_FILE}..." + VERBATIM) + + if(${CMAKE_VERBOSE_MAKEFILE}) + message(" Blit Shader Source: " ${CMAKE_CURRENT_SOURCE_DIR}/${INPUT_FILE}) + message(" Blit Shader Binary: " ${OUTPUT_FILE}) + endif() + +endfunction(gen_kernel_bc) + +# Function to build one shader for one target device and append its output name to HSACO_TARG_LIST in the caller's scope (not called within this file) +function(build_kernel BLIT_SHADER_NAME BLIT_FILE TARGET_ID POSTFIX) + set(CODE_OBJECT_FILE "${BLIT_SHADER_NAME}${POSTFIX}") + gen_kernel_bc(${TARGET_ID} ${BLIT_FILE} ${CODE_OBJECT_FILE}) + list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}") + set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE) + +endfunction(build_kernel) + +# Function to build kernels for all devices and shaders; SHADER_NAMES and SHADER_FILES are index-aligned lists +function(build_kernels_for_devices SHADER_NAMES SHADER_FILES) + set(HSACO_TARG_LIST "") + + list(LENGTH TARGET_DEVS num_target_devices) + math(EXPR num_target_devices "${num_target_devices} - 1") + list(LENGTH SHADER_NAMES num_shader_names) + math(EXPR num_shader_names "${num_shader_names} - 1") + + foreach(shader_index RANGE ${num_shader_names}) + list(GET SHADER_NAMES ${shader_index} shader_name) + list(GET SHADER_FILES ${shader_index} shader_file) + foreach(device_index RANGE ${num_target_devices}) + # Get
device from list of target devices + list(GET TARGET_DEVS ${device_index} target_device) + # Get postfix from list of postfixes (same index as the target device) + list(GET POSTFIX ${device_index} postfix) + if(${CMAKE_VERBOSE_MAKEFILE}) + message("\n Generating: ${target_device} for ${shader_name} ...") + endif() + + # Define the output name: shader name followed by the device postfix + set(CODE_OBJECT_FILE "${shader_name}${postfix}") + + # Assemble the current shader for the current device + gen_kernel_bc(${target_device} ${shader_file} ${CODE_OBJECT_FILE}) + # Append the output file name to the list + list(APPEND HSACO_TARG_LIST "${CODE_OBJECT_FILE}") + endforeach(device_index) + endforeach(shader_index) + + # Make the list of output files available in the parent scope + set(HSACO_TARG_LIST ${HSACO_TARG_LIST} PARENT_SCOPE) + +endfunction(build_kernels_for_devices) + + +# Function to generate the bytecode stream and create the header file; reads HSACO_TARG_LIST from the calling scope +function(generate_bytecodeStrm HeaderFILE) + set(ARG_LIST "${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h") + + # Copy the shell script to the build directory + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/create_blit_shader_header.sh + ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh + COPYONLY) + + # Add a custom command to generate the header file + add_custom_command(OUTPUT ${HeaderFILE}.h + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh ${ARG_LIST} ${HSACO_TARG_LIST} + COMMENT "Collating blit shaders..."
+ DEPENDS ${HSACO_TARG_LIST} ${CMAKE_CURRENT_BINARY_DIR}/create_blit_shader_header.sh) + + # Add a custom target that depends on the header file + add_custom_target(${HeaderFILE} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${HeaderFILE}.h) + +endfunction(generate_bytecodeStrm) + + +# Build every blit shader for every target device +build_kernels_for_devices("kCodeCopyAligned;kCodeCopyMisaligned;kCodeFill" "blit_copyAligned.s;blit_copyMisaligned.s;blit_fill.s") + +# Generate bytecode stream +generate_bytecodeStrm("amd_blit_shaders_v2") + + + + diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyAligned.s b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyAligned.s new file mode 100644 index 0000000000..20dcd87d53 --- /dev/null +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyAligned.s @@ -0,0 +1,257 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +/////////////////////////////////////////////////////////////////////////////////////// + +.text + +.macro V_ADD_CO_U32 vdst, src0, vsrc1 + .if (.amdgcn.gfx_generation_number >= 10) + v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1 // generation 10 and later: carry-out operand is vcc_lo + .elseif (.amdgcn.gfx_generation_number >= 9) + v_add_co_u32 \vdst, vcc, \src0, \vsrc1 + .else + v_add_u32 \vdst, vcc, \src0, \vsrc1 // generations before 9 use the v_add_u32 mnemonic with a vcc operand + .endif +.endm + + +.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1 + .if (.amdgcn.gfx_generation_number >= 10) + v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo // add with carry-in (last operand) and carry-out + .elseif (.amdgcn.gfx_generation_number >= 9) + v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc + .else + v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc + .endif +.endm + +.macro V_CMP_LT_U64 src0, vsrc1 + .if (.amdgcn.gfx_generation_number >= 10) + v_cmp_lt_u64 vcc_lo, \src0, \vsrc1 + .else + v_cmp_lt_u64 vcc, \src0, \vsrc1 + .endif +.endm + + +.p2align 8 // align the following CopyAligned label to 2^8 = 256 bytes + +CopyAligned: +.set kCopyAlignedVecWidth, 4 +compute_pgm_rsrc2_user_sgpr = 2 +compute_pgm_rsrc2_tgid_x_en = 1 +enable_sgpr_kernarg_segment_ptr = 1 + +.set kCopyAlignedUnroll, 1 +.set
kCopyAlignedNumSGPRs, 32 +.set kCopyAlignedNumVGPRs, (8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth)) +.set CopyAlignedRsrc1SGPRs, (kCopyAlignedNumSGPRs - 1)/8 +.set CopyAlignedRsrc1VGPRs, (kCopyAlignedNumVGPRs - 1)/4 + +compute_pgm_rsrc1_sgprs = CopyAlignedRsrc1SGPRs +compute_pgm_rsrc1_vgprs = CopyAlignedRsrc1VGPRs + + + s_load_dwordx4 s[4:7], s[0:1], 0x0 // s[0:1] is presumably the kernarg segment pointer (enable_sgpr_kernarg_segment_ptr = 1) -- TODO confirm + s_load_dwordx4 s[8:11], s[0:1], 0x10 + s_load_dwordx4 s[12:15], s[0:1], 0x20 + s_load_dwordx4 s[16:19], s[0:1], 0x30 + s_load_dwordx4 s[20:23], s[0:1], 0x40 + s_load_dword s24, s[0:1], 0x50 + s_waitcnt lgkmcnt(0) // wait for the scalar loads above before s4..s24 are read + + + s_lshl_b32 s2, s2, 0x6 // s2 *= 64, then added to v0 below; s2 is presumably the workgroup id X (tgid_x_en = 1) -- TODO confirm + V_ADD_CO_U32 v0, s2, v0 + + v_mov_b32 v3, s5 + V_ADD_CO_U32 v2, v0, s4 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + + v_mov_b32 v5, s7 + V_ADD_CO_U32 v4, v0, s6 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + L_COPY_ALIGNED_PHASE_1_LOOP: // Phase 1: byte copy from v[2:3] to v[4:5] while v[2:3] < s[8:9]; both pointers step by s24 + + V_CMP_LT_U64 v[2:3], s[8:9] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE + s_and_b64 exec, exec, vcc // keep only the lanes whose compare passed + + + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + V_ADD_CO_U32 v2, v2, s24 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + + flat_store_byte v[4:5], v1 + V_ADD_CO_U32 v4, v4, s24 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + s_branch L_COPY_ALIGNED_PHASE_1_LOOP + + L_COPY_ALIGNED_PHASE_1_DONE: + + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF // set all 64 exec bits again + +.if kCopyAlignedVecWidth == 4 + s_lshl_b32 s25, s24, 0x4 + .else + s_lshl_b32 s25, s24, 0x2 + .endif + + .if kCopyAlignedVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + .else + v_lshlrev_b32 v1, 0x2, v0 + .endif + + + v_mov_b32 v3, s9 + V_ADD_CO_U32 v2, v1, s8 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + v_mov_b32 v5, s11 + V_ADD_CO_U32 v4, v1, s10 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + L_COPY_ALIGNED_PHASE_2_LOOP: // Phase 2: vector copy (dwordx4 when kCopyAlignedVecWidth == 4) while v[2:3] < s[12:13]; pointers step by s25 + + V_CMP_LT_U64 v[2:3], s[12:13] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE + +.macro mCopyAlignedPhase2Load iter iter_end + .if kCopyAlignedVecWidth == 4 + flat_load_dwordx4 v[8 + (\iter * 4):8 + (\iter * 4) + 3], v[2:3] + .else + flat_load_dword v[8 + \iter], v[2:3] + .endif + + V_ADD_CO_U32 v2, v2, s25 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + .if (\iter_end
- \iter) // expand again with iter + 1 until iter reaches iter_end + mCopyAlignedPhase2Load (\iter + 1), \iter_end + .endif +.endm + +mCopyAlignedPhase2Load 0, (kCopyAlignedUnroll - 1) + + s_waitcnt vmcnt(0) // all loads issued by the macro above must land before the stores below + +.macro mCopyAlignedPhase2Store iter iter_end + .if kCopyAlignedVecWidth == 4 + flat_store_dwordx4 v[4:5], v[8 + (\iter * 4):8 + (\iter * 4) + 3] + .else + flat_store_dword v[4:5], v[8 + \iter] + .endif + + V_ADD_CO_U32 v4, v4, s25 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + + .if (\iter_end - \iter) + mCopyAlignedPhase2Store (\iter + 1), \iter_end + .endif +.endm + +mCopyAlignedPhase2Store 0, (kCopyAlignedUnroll - 1) + + s_branch L_COPY_ALIGNED_PHASE_2_LOOP + + L_COPY_ALIGNED_PHASE_2_DONE: + + s_lshl_b32 s25, s24, 0x2 // stride for phase 3: s24 * 4 + + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s13 + V_ADD_CO_U32 v2, v1, s12 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + v_mov_b32 v5, s15 + V_ADD_CO_U32 v4, v1, s14 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + L_COPY_ALIGNED_PHASE_3_LOOP: // Phase 3: dword copy while v[2:3] < s[16:17]; pointers step by s25 + + V_CMP_LT_U64 v[2:3], s[16:17] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE + s_and_b64 exec, exec, vcc + + + flat_load_dword v1, v[2:3] + V_ADD_CO_U32 v2, v2, s25 + V_ADD_CO_CI_U32 v3, v3, 0x0 + s_waitcnt vmcnt(0) + + + flat_store_dword v[4:5], v1 + V_ADD_CO_U32 v4, v4, s25 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + s_branch L_COPY_ALIGNED_PHASE_3_LOOP + + L_COPY_ALIGNED_PHASE_3_DONE: + + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + v_mov_b32 v3, s17 + V_ADD_CO_U32 v2, v0, s16 + V_ADD_CO_CI_U32 v3, v3, 0x0 + + v_mov_b32 v5, s19 + V_ADD_CO_U32 v4, v0, s18 + V_ADD_CO_CI_U32 v5, v5, 0x0 + + V_CMP_LT_U64 v[2:3], s[20:21] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE // Phase 4 (no loop): skip when no lane has v[2:3] < s[20:21] + s_and_b64 exec, exec, vcc + + flat_load_ubyte v1, v[2:3] + s_waitcnt vmcnt(0) + + flat_store_byte v[4:5], v1 + + L_COPY_ALIGNED_PHASE_4_DONE: + s_endpgm + diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyMisaligned.s b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyMisaligned.s new file mode 100644 index 0000000000..dd0b15d10b --- /dev/null +++
b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_copyMisaligned.s
@@ -0,0 +1,179 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////////
+// Blit shader CopyMisaligned: byte-wise copy in two loops, an unrolled one and a second one that starts where the first stops.
+.text
+// 32-bit add with carry-out: gfx10+ writes the carry to vcc_lo, gfx9 to vcc, older targets use the v_add_u32 mnemonic.
+.macro V_ADD_CO_U32 vdst, src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
+  .elseif (.amdgcn.gfx_generation_number >= 9)
+    v_add_co_u32 \vdst, vcc, \src0, \vsrc1
+  .else
+    v_add_u32 \vdst, vcc, \src0, \vsrc1
+  .endif
+.endm
+
+// 32-bit add with carry-in and carry-out through the same register (vcc_lo on gfx10+, vcc otherwise).
+.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
+  .elseif (.amdgcn.gfx_generation_number >= 9)
+    v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
+  .else
+    v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
+  .endif
+.endm
+// Unsigned 64-bit less-than compare; the per-lane result goes to vcc_lo on gfx10+ and to vcc otherwise.
+.macro V_CMP_LT_U64 src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
+  .else
+    v_cmp_lt_u64 vcc, \src0, \vsrc1
+  .endif
+.endm
+// Unroll factor of the phase 1 loop, then register counts: s0-s16 are referenced; v0-v5 plus one data VGPR per unrolled byte.
+.set kCopyMisalignedUnroll, 4
+.set kCopyMisalignedNumSGPRs, 17
+.set kCopyMisalignedNumVGPRs, 6 + kCopyMisalignedUnroll
+.set CopyMisalignedRsrc1SGPRs , (kCopyMisalignedNumSGPRs - 1) / 8
+// Clamp the derived SGPR value to zero if it ever evaluates negative.
+.if CopyMisalignedRsrc1SGPRs < 0
+  .set CopyMisalignedRsrc1SGPRs , 0
+.endif
+// Same derivation for the VGPR count, with a divisor of 4, and the same clamp.
+.set CopyMisalignedRsrc1VGPRs , (kCopyMisalignedNumVGPRs - 1) / 4
+.if CopyMisalignedRsrc1VGPRs < 0
+  .set CopyMisalignedRsrc1VGPRs , 0
+.endif
+// NOTE(review): the /8 and /4 divisors look like register allocation granules for COMPUTE_PGM_RSRC1 - confirm.
+.p2align 8
+// Entry point; .p2align 8 above places it on a 256-byte boundary.
+CopyMisaligned:
+  compute_pgm_rsrc1_sgprs = CopyMisalignedRsrc1SGPRs
+  compute_pgm_rsrc1_vgprs = CopyMisalignedRsrc1VGPRs
+  compute_pgm_rsrc2_user_sgpr = 2
+  compute_pgm_rsrc2_tgid_x_en = 1
+  enable_sgpr_kernarg_segment_ptr = 1
+// The assignments above only define assembler symbols. Next: load 13 argument dwords from the address in s[0:1] (s[14:15] are loaded but not used below).
+  s_load_dwordx4 s[4:7], s[0:1], 0x0
+  s_load_dwordx4 s[8:11], s[0:1], 0x10
+  s_load_dwordx4 s[12:15], s[0:1], 0x20
+  s_load_dword s16, s[0:1], 0x30
+  s_waitcnt lgkmcnt(0)
+// v0 = (s2 << 6) + v0. NOTE(review): presumably workgroup id * 64 + lane id, i.e. a global work-item index - confirm.
+  s_lshl_b32 s2, s2, 0x6
+  V_ADD_CO_U32 v0, s2, v0
+// v[2:3] = s[4:5] + v0: per-lane load address for phase 1.
+  v_mov_b32 v3, s5
+  V_ADD_CO_U32 v2, v0, s4
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+// v[4:5] = s[6:7] + v0: per-lane store address for phase 1.
+  v_mov_b32 v5, s7
+  V_ADD_CO_U32 v4, v0, s6
+  V_ADD_CO_CI_U32 v5, v5, 0x0
+// Phase 1: copy kCopyMisalignedUnroll bytes per lane per trip, advancing both addresses by s16 after every byte.
+  L_COPY_MISALIGNED_PHASE_1_LOOP:
+// Leave the loop once no lane has a load address below s[8:9].
+  V_CMP_LT_U64 v[2:3], s[8:9]
+  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE
+// NOTE(review): exec is not narrowed in this loop, so all lanes keep copying until the branch above is taken - confirm the host sizes phase 1 accordingly.
+// Recursive macro: one byte load per index from iter to iter_end, each into its own VGPR starting at v6.
+  .macro mCopyMisalignedPhase1Load iter iter_end
+    flat_load_ubyte v[6 + \iter], v[2:3]
+    V_ADD_CO_U32 v2, v2, s16
+    V_ADD_CO_CI_U32 v3, v3, 0x0
+
+    .if (\iter_end - \iter)
+      mCopyMisalignedPhase1Load (\iter + 1), \iter_end
+    .endif
+  .endm
+// Issue all kCopyMisalignedUnroll loads back to back.
+  mCopyMisalignedPhase1Load 0, (kCopyMisalignedUnroll - 1)
+// Wait until every outstanding load has returned before storing.
+  s_waitcnt vmcnt(0)
+// Recursive macro: the matching byte stores, reading the same VGPRs the loads filled.
+  .macro mCopyMisalignedPhase1Store iter iter_end
+    flat_store_byte v[4:5], v[6 + \iter]
+    V_ADD_CO_U32 v4, v4, s16
+    V_ADD_CO_CI_U32 v5, v5, 0x0
+
+    .if (\iter_end - \iter)
+      mCopyMisalignedPhase1Store (\iter + 1), \iter_end
+    .endif
+  .endm
+
+  mCopyMisalignedPhase1Store 0, (kCopyMisalignedUnroll - 1)
+
+  s_branch L_COPY_MISALIGNED_PHASE_1_LOOP
+
+  L_COPY_MISALIGNED_PHASE_1_DONE:
+// Phase 2: restart from s[8:9] (load) and s[10:11] (store), again offset by v0.
+  v_mov_b32 v3, s9
+  V_ADD_CO_U32 v2, v0, s8
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+
+  v_mov_b32 v5, s11
+  V_ADD_CO_U32 v4, v0, s10
+  V_ADD_CO_CI_U32 v5, v5, 0x0
+// Copy one byte per lane per trip until the load address reaches s[12:13].
+  L_COPY_MISALIGNED_PHASE_2_LOOP:
+// Branch out when no lane is below the limit; otherwise disable the lanes that are already done.
+  V_CMP_LT_U64 v[2:3], s[12:13]
+  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
+  s_and_b64 exec, exec, vcc
+
+// Load one byte, advance the load address by s16, then wait for the data.
+  flat_load_ubyte v1, v[2:3]
+  V_ADD_CO_U32 v2, v2, s16
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+  s_waitcnt vmcnt(0)
+// Store the byte and advance the store address by s16.
+  flat_store_byte v[4:5], v1
+  V_ADD_CO_U32 v4, v4, s16
+  V_ADD_CO_CI_U32 v5, v5, 0x0
+
+  s_branch L_COPY_MISALIGNED_PHASE_2_LOOP
+
+  L_COPY_MISALIGNED_PHASE_2_DONE:
+  s_endpgm
+
+
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_fill.s b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_fill.s
new file mode 100644
index 0000000000..859de116bc
--- /dev/null
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/blit_fill.s
@@ -0,0 +1,183 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////
+// Blit shader Fill: writes the 32-bit value from s10 to memory in two loops, wide stores first and single-dword stores second.
+.text
+// 32-bit add with carry-out: gfx10+ writes the carry to vcc_lo, gfx9 to vcc, older targets use the v_add_u32 mnemonic.
+.macro V_ADD_CO_U32 vdst, src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_add_co_u32 \vdst, vcc_lo, \src0, \vsrc1
+  .elseif (.amdgcn.gfx_generation_number >= 9)
+    v_add_co_u32 \vdst, vcc, \src0, \vsrc1
+  .else
+    v_add_u32 \vdst, vcc, \src0, \vsrc1
+  .endif
+.endm
+
+// 32-bit add with carry-in and carry-out through the same register (vcc_lo on gfx10+, vcc otherwise).
+.macro V_ADD_CO_CI_U32 vdst, src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_add_co_ci_u32 \vdst, vcc_lo, \src0, \vsrc1, vcc_lo
+  .elseif (.amdgcn.gfx_generation_number >= 9)
+    v_addc_co_u32 \vdst, vcc, \src0, \vsrc1, vcc
+  .else
+    v_addc_u32 \vdst, vcc, \src0, \vsrc1, vcc
+  .endif
+.endm
+// Unsigned 64-bit less-than compare; the per-lane result goes to vcc_lo on gfx10+ and to vcc otherwise.
+.macro V_CMP_LT_U64 src0, vsrc1
+  .if (.amdgcn.gfx_generation_number >= 10)
+    v_cmp_lt_u64 vcc_lo, \src0, \vsrc1
+  .else
+    v_cmp_lt_u64 vcc, \src0, \vsrc1
+  .endif
+.endm
+// Dwords written per phase 1 store, and number of stores emitted per phase 1 trip.
+.set kFillVecWidth, 4
+.set kFillUnroll, 1
+// Register counts: s0-s12 are referenced; VGPRs are v0-v3 plus kFillVecWidth pattern registers from v4 up (not one per unroll step).
+.set kFillNumSGPRs, 13
+.set kFillNumVGPRs, 4 + kFillVecWidth
+// Derived values, clamped to zero if they ever evaluate negative.
+.set FillRsrc1SGPRs , (kFillNumSGPRs - 1) / 8
+  .if FillRsrc1SGPRs < 0
+    .set FillRsrc1SGPRs , 0
+  .endif
+// NOTE(review): the /8 and /4 divisors look like register allocation granules for COMPUTE_PGM_RSRC1 - confirm.
+.set FillRsrc1VGPRs , (kFillNumVGPRs - 1) / 4
+  .if FillRsrc1VGPRs < 0
+    .set FillRsrc1VGPRs , 0
+  .endif
+// Entry point; .p2align 8 below places it on a 256-byte boundary.
+.p2align 8
+
+Fill:
+// The five assignments below only define assembler symbols.
+  compute_pgm_rsrc1_sgprs = FillRsrc1SGPRs
+  compute_pgm_rsrc1_vgprs = FillRsrc1VGPRs
+  compute_pgm_rsrc2_user_sgpr = 2
+  compute_pgm_rsrc2_tgid_x_en = 1
+  enable_sgpr_kernarg_segment_ptr = 1
+// Load 8 argument dwords from the address in s[0:1] into s[4:11].
+  s_load_dwordx4 s[4:7], s[0:1], 0x0
+  s_load_dwordx4 s[8:11], s[0:1], 0x10
+  s_waitcnt lgkmcnt(0)
+// v0 = (s2 << 6) + v0. NOTE(review): presumably workgroup id * 64 + lane id, i.e. a global work-item index - confirm.
+  s_lshl_b32 s2, s2, 0x6
+  V_ADD_CO_U32 v0, s2, v0
+// Recursive macro: copy the fill value in s10 into consecutive VGPRs starting at v4, one per index from iter to iter_end.
+.macro mFillPattern iter iter_end
+  v_mov_b32 v[4 + \iter], s10
+
+  .if (\iter_end - \iter)
+    mFillPattern (\iter + 1), \iter_end
+  .endif
+  .endm
+// Replicate the value into kFillVecWidth registers (v4-v7 for a width of 4).
+  mFillPattern 0, (kFillVecWidth - 1)
+// s12 = s11 scaled by the bytes one lane writes per store (16 or 4). NOTE(review): s11 is presumably the total work-item count - confirm.
+  .if kFillVecWidth == 4
+    s_lshl_b32 s12, s11, 0x4
+  .else
+    s_lshl_b32 s12, s11, 0x2
+  .endif
+
+// v1 = v0 scaled the same way: the byte offset of this lane within one trip.
+  .if kFillVecWidth == 4
+    v_lshlrev_b32 v1, 0x4, v0
+  .else
+    v_lshlrev_b32 v1, 0x2, v0
+  .endif
+// v[2:3] = s[4:5] + v1: per-lane store address for phase 1.
+  v_mov_b32 v3, s5
+  V_ADD_CO_U32 v2, v1, s4
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+// Phase 1: wide stores until no lane has an address below s[6:7].
+  L_FILL_PHASE_1_LOOP:
+// NOTE(review): exec is not narrowed in this loop, so all lanes keep storing until the branch is taken - confirm the host sizes phase 1 accordingly.
+  V_CMP_LT_U64 v[2:3], s[6:7]
+  s_cbranch_vccz L_FILL_PHASE_1_DONE
+// Recursive macro: one store plus an address advance by s12 per index from iter to iter_end.
+.macro mFillPhase1 iter iter_end
+  .if kFillVecWidth == 4
+    flat_store_dwordx4 v[2:3], v[4:7]
+  .else
+    flat_store_dword v[2:3], v4
+  .endif
+
+  V_ADD_CO_U32 v2, v2, s12
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+
+  .if \iter < \iter_end
+    mFillPhase1 (\iter + 1), \iter_end
+  .endif
+.endm
+// Emit kFillUnroll stores per trip.
+mFillPhase1 0, kFillUnroll - 1
+
+  s_branch L_FILL_PHASE_1_LOOP
+
+  L_FILL_PHASE_1_DONE:
+// Phase 2: single-dword stores from s[6:7] up to s[8:9], so the stride and the lane offset are rescaled to 4 bytes.
+  s_lshl_b32 s12, s11, 0x2
+
+  v_lshlrev_b32 v1, 0x2, v0
+  v_mov_b32 v3, s7
+  V_ADD_CO_U32 v2, v1, s6
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+
+  L_FILL_PHASE_2_LOOP:
+// Branch out when no lane is below the limit; otherwise disable the lanes that are already done.
+  V_CMP_LT_U64 v[2:3], s[8:9]
+  s_cbranch_vccz L_FILL_PHASE_2_DONE
+  s_and_b64 exec, exec, vcc
+
+// Store one dword of the pattern (v4) and advance the address by s12.
+  flat_store_dword v[2:3], v4
+  V_ADD_CO_U32 v2, v2, s12
+  V_ADD_CO_CI_U32 v3, v3, 0x0
+
+  s_branch L_FILL_PHASE_2_LOOP
+
+  L_FILL_PHASE_2_DONE:
+  s_endpgm
+
+
+
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/create_blit_shader_header.sh b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/create_blit_shader_header.sh
new file mode 100755
index 0000000000..09e654fdfa
--- /dev/null
+++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/blit_shaders/create_blit_shader_header.sh
@@ -0,0 +1,78 @@
+#!/bin/bash -e
+################################################################################
+##
+## The University of Illinois/NCSA
+## Open Source License (NCSA)
+##
+## Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+##
+## Developed by:
+##
+## AMD Research and AMD HSA Software Development
+##
+## Advanced Micro Devices, Inc.
+## +## www.amd.com +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal with the Software without restriction, including without limitation +## the rights to use, copy, modify, merge, publish, distribute, sublicense, +## and/or sell copies of the Software, and to permit persons to whom the +## Software is furnished to do so, subject to the following conditions: +## +## - Redistributions of source code must retain the above copyright notice, +## this list of conditions and the following disclaimers. +## - Redistributions in binary form must reproduce the above copyright +## notice, this list of conditions and the following disclaimers in +## the documentation and/or other materials provided with the distribution. +## - Neither the names of Advanced Micro Devices, Inc, +## nor the names of its contributors may be used to endorse or promote +## products derived from this Software without specific prior written +## permission. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +## THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +## OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +## ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +## DEALINGS WITH THE SOFTWARE. +## +################################################################################ + +amd_gpu_shaders="$1" + +if ! command -v xxd >/dev/null +then + echo "xxd not found!" + exit 1 +fi + +# Create the file in a temporary location and then move it in atomically +{ +cat < "$amd_gpu_shaders" +