From 5ca7af2d3011ae5885dfa85a84a55ab3e8f9b1ec Mon Sep 17 00:00:00 2001 From: Joseph Macaranas <145489236+jayhawk-commits@users.noreply.github.com> Date: Fri, 5 Sep 2025 10:32:44 -0400 Subject: [PATCH] Migrate amdgpu-windows-interop to rocm-systems (#808) --- .dvc/.gitignore | 3 + .dvc/config | 6 + .dvcignore | 3 + .github/labeler.yml | 4 + CONTRIBUTING.md | 48 +- shared/amdgpu-windows-interop/LICENSE | 198 + .../lib/loaders/elf/CMakeLists.txt | 26 + .../hsail-compiler/lib/loaders/elf/elf.cpp | 1557 +++++ .../hsail-compiler/lib/loaders/elf/elf.hpp | 416 ++ .../lib/loaders/elf/elf_utils.cpp | 285 + .../lib/loaders/elf/elf_utils.hpp | 150 + .../lib/loaders/elf/utils/CMakeLists.txt | 12 + .../lib/loaders/elf/utils/common/_elftc.h | 539 ++ .../loaders/elf/utils/common/elfdefinitions.h | 2618 ++++++++ .../elf/utils/common/native-elf-format | 47 + .../elf/utils/common/native-elf-format.h | 14 + .../lib/loaders/elf/utils/common/uthash.h | 906 +++ .../lib/loaders/elf/utils/common/win32/ar.h | 24 + .../loaders/elf/utils/common/win32/compat.h | 57 + .../elf/utils/common/win32/sys/cdefs.h | 19 + .../elf/utils/common/win32/sys/elf32.h | 163 + .../elf/utils/common/win32/sys/elf64.h | 176 + .../elf/utils/common/win32/sys/elf_common.h | 355 + .../loaders/elf/utils/common/win32/sys/mman.h | 15 + .../elf/utils/common/win32/sys/param.h | 13 + .../elf/utils/common/win32/sys/queue.h | 671 ++ .../loaders/elf/utils/libelf/CMakeLists.txt | 29 + .../lib/loaders/elf/utils/libelf/README | 12 + .../lib/loaders/elf/utils/libelf/README.build | 64 + .../lib/loaders/elf/utils/libelf/Version.map | 97 + .../lib/loaders/elf/utils/libelf/_libelf.h | 216 + .../lib/loaders/elf/utils/libelf/_libelf_ar.h | 55 + .../loaders/elf/utils/libelf/_libelf_config.h | 288 + .../lib/loaders/elf/utils/libelf/elf.c | 41 + .../lib/loaders/elf/utils/libelf/elf_begin.c | 334 + .../lib/loaders/elf/utils/libelf/elf_cntl.c | 58 + .../lib/loaders/elf/utils/libelf/elf_data.c | 254 + .../lib/loaders/elf/utils/libelf/elf_end.c 
| 99 + .../lib/loaders/elf/utils/libelf/elf_errmsg.c | 88 + .../lib/loaders/elf/utils/libelf/elf_errno.c | 43 + .../lib/loaders/elf/utils/libelf/elf_fill.c | 39 + .../lib/loaders/elf/utils/libelf/elf_flag.c | 195 + .../loaders/elf/utils/libelf/elf_getarhdr.c | 47 + .../loaders/elf/utils/libelf/elf_getarsym.c | 58 + .../loaders/elf/utils/libelf/elf_getbase.c | 48 + .../loaders/elf/utils/libelf/elf_getident.c | 68 + .../lib/loaders/elf/utils/libelf/elf_hash.c | 56 + .../lib/loaders/elf/utils/libelf/elf_kind.c | 44 + .../lib/loaders/elf/utils/libelf/elf_memory.c | 92 + .../lib/loaders/elf/utils/libelf/elf_next.c | 62 + .../lib/loaders/elf/utils/libelf/elf_phnum.c | 67 + .../lib/loaders/elf/utils/libelf/elf_rand.c | 59 + .../loaders/elf/utils/libelf/elf_rawfile.c | 53 + .../lib/loaders/elf/utils/libelf/elf_scn.c | 245 + .../lib/loaders/elf/utils/libelf/elf_shnum.c | 67 + .../loaders/elf/utils/libelf/elf_shstrndx.c | 82 + .../lib/loaders/elf/utils/libelf/elf_strptr.c | 136 + .../lib/loaders/elf/utils/libelf/elf_update.c | 1264 ++++ .../loaders/elf/utils/libelf/elf_version.c | 52 + .../lib/loaders/elf/utils/libelf/gelf.h | 114 + .../lib/loaders/elf/utils/libelf/gelf_cap.c | 148 + .../loaders/elf/utils/libelf/gelf_checksum.c | 58 + .../lib/loaders/elf/utils/libelf/gelf_dyn.c | 143 + .../lib/loaders/elf/utils/libelf/gelf_ehdr.c | 167 + .../lib/loaders/elf/utils/libelf/gelf_fsize.c | 62 + .../loaders/elf/utils/libelf/gelf_getclass.c | 39 + .../lib/loaders/elf/utils/libelf/gelf_move.c | 154 + .../lib/loaders/elf/utils/libelf/gelf_phdr.c | 177 + .../lib/loaders/elf/utils/libelf/gelf_rel.c | 152 + .../lib/loaders/elf/utils/libelf/gelf_rela.c | 155 + .../lib/loaders/elf/utils/libelf/gelf_shdr.c | 129 + .../lib/loaders/elf/utils/libelf/gelf_sym.c | 153 + .../loaders/elf/utils/libelf/gelf_syminfo.c | 147 + .../loaders/elf/utils/libelf/gelf_symshndx.c | 128 + .../lib/loaders/elf/utils/libelf/gelf_xlate.c | 81 + .../lib/loaders/elf/utils/libelf/libelf.h | 262 + 
.../loaders/elf/utils/libelf/libelf_align.c | 185 + .../elf/utils/libelf/libelf_allocate.c | 228 + .../lib/loaders/elf/utils/libelf/libelf_ar.c | 458 ++ .../loaders/elf/utils/libelf/libelf_ar_util.c | 354 + .../elf/utils/libelf/libelf_checksum.c | 100 + .../loaders/elf/utils/libelf/libelf_convert.c | 3100 +++++++++ .../loaders/elf/utils/libelf/libelf_data.c | 114 + .../loaders/elf/utils/libelf/libelf_ehdr.c | 211 + .../elf/utils/libelf/libelf_extended.c | 135 + .../loaders/elf/utils/libelf/libelf_fsize.c | 147 + .../loaders/elf/utils/libelf/libelf_msize.c | 145 + .../loaders/elf/utils/libelf/libelf_phdr.c | 160 + .../loaders/elf/utils/libelf/libelf_shdr.c | 56 + .../loaders/elf/utils/libelf/libelf_xlate.c | 150 + .../lib/loaders/elf/utils/libelf/memfile.cpp | 403 ++ .../lib/loaders/elf/utils/libelf/memfile.h | 40 + .../lib/loaders/elf/utils/libelf/roundup.h | 15 + .../amdgpu-windows-interop/pal/CMakeLists.txt | 92 + shared/amdgpu-windows-interop/pal/LICENSE | 21 + .../amdgpu-windows-interop/pal/inc/core/pal.h | 833 +++ .../pal/inc/core/palCmdAllocator.h | 204 + .../pal/inc/core/palCmdBuffer.h | 5118 ++++++++++++++ .../pal/inc/core/palCmdTracking.h | 370 ++ .../pal/inc/core/palDestroyable.h | 70 + .../pal/inc/core/palDeveloperHooks.h | 626 ++ .../pal/inc/core/palDevice.h | 5896 +++++++++++++++++ .../pal/inc/core/palFence.h | 171 + .../pal/inc/core/palFormat.h | 506 ++ .../pal/inc/core/palFormatInfo.h | 881 +++ .../pal/inc/core/palGpuMemory.h | 711 ++ .../pal/inc/core/palGpuMemoryBindable.h | 139 + .../pal/inc/core/palImage.h | 983 +++ .../pal/inc/core/palLib.h | 508 ++ .../pal/inc/core/palMsaaState.h | 187 + .../pal/inc/core/palPerfExperiment.h | 619 ++ .../pal/inc/core/palPipeline.h | 896 +++ .../pal/inc/core/palPlatform.h | 645 ++ .../pal/inc/core/palQueryPool.h | 234 + .../pal/inc/core/palQueue.h | 765 +++ .../pal/inc/core/palQueueSemaphore.h | 275 + .../pal/inc/core/palShaderLibrary.h | 251 + .../inc/gpuUtil/palCodeObjectTraceSource.h | 212 + 
.../pal/inc/gpuUtil/palGpaSession.h | 1201 ++++ .../pal/inc/gpuUtil/palGpuUtil.h | 141 + .../inc/gpuUtil/palQueueTimingsTraceSource.h | 236 + .../inc/gpuUtil/palRenderOpTraceController.h | 150 + .../pal/inc/gpuUtil/palTraceSession.h | 737 +++ .../pal/inc/util/palAssert.h | 368 + .../pal/inc/util/palAutoBuffer.h | 212 + .../pal/inc/util/palBuddyAllocator.h | 202 + .../pal/inc/util/palBuddyAllocatorImpl.h | 607 ++ .../pal/inc/util/palDbgPrint.h | 364 + .../pal/inc/util/palDeque.h | 319 + .../pal/inc/util/palEvent.h | 141 + .../pal/inc/util/palFile.h | 300 + .../pal/inc/util/palHashBase.h | 529 ++ .../pal/inc/util/palHashBaseImpl.h | 650 ++ .../pal/inc/util/palHashMap.h | 143 + .../pal/inc/util/palHashMapImpl.h | 250 + .../pal/inc/util/palHashSet.h | 144 + .../pal/inc/util/palHashSetImpl.h | 231 + .../pal/inc/util/palInlineFuncs.h | 1565 +++++ .../pal/inc/util/palIntrusiveList.h | 293 + .../pal/inc/util/palLinearAllocator.h | 346 + .../pal/inc/util/palMath.h | 226 + .../pal/inc/util/palMemTracker.h | 158 + .../pal/inc/util/palMemTrackerImpl.h | 331 + .../pal/inc/util/palMutex.h | 398 ++ .../pal/inc/util/palSpan.h | 447 ++ .../pal/inc/util/palStringUtil.h | 188 + .../pal/inc/util/palStringView.h | 228 + .../pal/inc/util/palSysMemory.h | 820 +++ .../pal/inc/util/palSysUtil.h | 816 +++ .../pal/inc/util/palTime.h | 87 + .../pal/inc/util/palUtil.h | 714 ++ .../pal/inc/util/palVector.h | 455 ++ .../pal/lib/Release/x64/.gitignore | 32 + .../Release/x64/DriverUtilsService.lib.dvc | 5 + .../Release/x64/SettingsRpcService2.lib.dvc | 5 + .../lib/Release/x64/UberTraceService.lib.dvc | 5 + .../pal/lib/Release/x64/addrlib.lib.dvc | 5 + .../pal/lib/Release/x64/amdrdf.lib.dvc | 5 + .../pal/lib/Release/x64/cwpack.lib.dvc | 5 + .../pal/lib/Release/x64/ddCommon.lib.dvc | 5 + .../pal/lib/Release/x64/ddCore.lib.dvc | 5 + .../pal/lib/Release/x64/ddEventClient.lib.dvc | 5 + .../pal/lib/Release/x64/ddEventParser.lib.dvc | 5 + .../pal/lib/Release/x64/ddEventServer.lib.dvc | 5 + 
.../lib/Release/x64/ddEventStreamer.lib.dvc | 5 + .../pal/lib/Release/x64/ddNet.lib.dvc | 5 + .../pal/lib/Release/x64/ddRpcClient.lib.dvc | 5 + .../pal/lib/Release/x64/ddRpcServer.lib.dvc | 5 + .../pal/lib/Release/x64/ddRpcShared.lib.dvc | 5 + .../pal/lib/Release/x64/ddSocket.lib.dvc | 5 + .../pal/lib/Release/x64/ddYaml.lib.dvc | 5 + .../pal/lib/Release/x64/dd_common.lib.dvc | 5 + .../pal/lib/Release/x64/dd_libyaml.lib.dvc | 5 + .../pal/lib/Release/x64/dd_settings.lib.dvc | 5 + .../pal/lib/Release/x64/devdriver.lib.dvc | 5 + .../pal/lib/Release/x64/metrohash.lib.dvc | 5 + .../pal/lib/Release/x64/mpack.lib.dvc | 5 + .../pal/lib/Release/x64/pal.lib.dvc | 5 + .../lib/Release/x64/palCompilerDeps.lib.dvc | 5 + .../pal/lib/Release/x64/palUtil.lib.dvc | 5 + .../pal/lib/Release/x64/pal_lz4.lib.dvc | 5 + .../pal/lib/Release/x64/pal_uuid.lib.dvc | 5 + .../pal/lib/Release/x64/stb_sprintf.lib.dvc | 5 + .../pal/lib/Release/x64/vam.lib.dvc | 5 + .../pal/lib/Release/x64/zstd.lib.dvc | 5 + .../shared/legacy/core/inc/ddcDefs.h | 269 + .../shared/legacy/core/inc/ddcPlatform.h | 824 +++ .../shared/legacy/core/inc/ddcTemplate.h | 378 ++ .../core/inc/platforms/ddcWinPlatform.h | 115 + .../shared/legacy/inc/baseProtocolServer.h | 62 + .../devdriver/shared/legacy/inc/ddDefs.h | 28 + .../shared/legacy/inc/ddLegacyDefs.h | 48 + .../devdriver/shared/legacy/inc/ddPlatform.h | 28 + .../devdriver/shared/legacy/inc/ddTemplate.h | 28 + .../shared/legacy/inc/ddUriInterface.h | 291 + .../shared/legacy/inc/devDriverServer.h | 103 + .../devdriver/shared/legacy/inc/gpuopen.h | 590 ++ .../devdriver/shared/legacy/inc/msgChannel.h | 271 + .../shared/legacy/inc/msgTransport.h | 70 + .../legacy/inc/platforms/ddWinPlatform.h | 28 + .../shared/legacy/inc/protocolClient.h | 54 + .../shared/legacy/inc/protocolServer.h | 53 + .../shared/legacy/inc/protocolSession.h | 175 + .../inc/protocols/driverControlProtocol.h | 531 ++ .../inc/protocols/driverControlServer.h | 157 + 
.../shared/legacy/inc/protocols/rgpProtocol.h | 537 ++ .../shared/legacy/inc/protocols/rgpServer.h | 174 + .../legacy/inc/protocols/systemProtocols.h | 217 + .../shared/legacy/inc/protocols/typemap.h | 168 + .../shared/legacy/inc/util/ddMetroHash.h | 86 + .../legacy/inc/util/ddStructuredReader.h | 335 + .../shared/legacy/inc/util/hashFunc.h | 102 + .../shared/legacy/inc/util/sharedptr.h | 291 + .../devdriver/shared/legacy/inc/util/string.h | 292 + .../devdriver/shared/legacy/inc/util/vector.h | 605 ++ .../third_party/dd_crc32/inc/dd_crc32.h | 77 + .../pal/shared/inc/trackedCmdLocation.h | 187 + .../pal/shared/metrohash/CMakeLists.txt | 70 + .../pal/shared/metrohash/LICENSE | 201 + .../pal/shared/metrohash/README.md | 56 + .../shared/metrohash/make/Makefile.metrohash | 20 + .../pal/shared/metrohash/src/metrohash.h | 24 + .../pal/shared/metrohash/src/metrohash128.cpp | 419 ++ .../pal/shared/metrohash/src/metrohash128.h | 85 + .../shared/metrohash/src/metrohash128crc.cpp | 178 + .../shared/metrohash/src/metrohash128crc.h | 33 + .../pal/shared/metrohash/src/metrohash64.cpp | 415 ++ .../pal/shared/metrohash/src/metrohash64.h | 86 + .../pal/shared/metrohash/src/platform.h | 62 + .../pal/shared/metrohash/src/testvector.h | 63 + .../sc/HSAIL/ext/libamdhsacode/CMakeLists.txt | 82 + .../HSAIL/ext/libamdhsacode/amd_elf_image.cpp | 1717 +++++ .../HSAIL/ext/libamdhsacode/amd_hsa_code.cpp | 1910 ++++++ .../ext/libamdhsacode/amd_hsa_code_util.cpp | 1048 +++ .../ext/libamdhsacode/amd_hsa_code_util.hpp | 195 + .../HSAIL/ext/libamdhsacode/amd_hsa_locks.cpp | 94 + .../HSAIL/ext/libamdhsacode/amd_hsa_locks.hpp | 127 + .../HSAIL/ext/libamdhsacode/amd_options.cpp | 381 ++ .../HSAIL/ext/libamdhsacode/amd_options.hpp | 476 ++ .../HSAIL/ext/loader/AMDHSAKernelDescriptor.h | 263 + .../sc/HSAIL/ext/loader/CMakeLists.txt | 31 + .../sc/HSAIL/ext/loader/executable.cpp | 1983 ++++++ .../sc/HSAIL/ext/loader/executable.hpp | 686 ++ .../sc/HSAIL/ext/loader/loaders.cpp | 307 + 
.../sc/HSAIL/ext/loader/loaders.hpp | 114 + .../sc/HSAIL/hsail-tools/libHSAIL/Brig.h | 1131 ++++ .../sc/HSAIL/include/amd_elf_image.hpp | 266 + .../sc/HSAIL/include/amd_hsa_code.hpp | 432 ++ .../sc/HSAIL/include/amd_hsa_common.h | 91 + .../sc/HSAIL/include/amd_hsa_elf.h | 476 ++ .../sc/HSAIL/include/amd_hsa_kernel_code.h | 274 + .../sc/HSAIL/include/amd_hsa_loader.hpp | 520 ++ .../sc/HSAIL/include/amd_hsa_program.hpp | 347 + .../sc/HSAIL/include/amd_hsa_queue.h | 87 + .../sc/HSAIL/include/amd_hsa_signal.h | 80 + .../sc/HSAIL/include/hsa.h | 5692 ++++++++++++++++ .../sc/HSAIL/include/hsa_ext_finalize.h | 531 ++ .../sc/HSAIL/include/hsa_ext_image.h | 1454 ++++ .../sc/HSAIL/include/hsa_ven_amd_loader.h | 667 ++ .../sc/HSAIL/include/public/amdid.h | 30 + .../sc/HSAIL/include/public/cz_id.h | 46 + 261 files changed, 86831 insertions(+), 2 deletions(-) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 shared/amdgpu-windows-interop/LICENSE create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.cpp create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.hpp create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.cpp create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.hpp create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/_elftc.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/elfdefinitions.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format create mode 100644 
shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/uthash.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/ar.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/compat.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/cdefs.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf32.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf64.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf_common.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/mman.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/param.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/queue.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README.build create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/Version.map create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_ar.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_config.h create mode 100644 
shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_begin.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_cntl.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_data.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_end.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errmsg.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errno.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_fill.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_flag.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarhdr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarsym.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getbase.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getident.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_hash.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_kind.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_memory.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_next.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_phnum.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rand.c create mode 100644 
shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rawfile.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_scn.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shnum.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shstrndx.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_strptr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_update.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_version.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_cap.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_checksum.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_dyn.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_ehdr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_fsize.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_getclass.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_move.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_phdr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rel.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rela.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_shdr.c create mode 
100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_sym.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_syminfo.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_symshndx.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_xlate.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_align.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_allocate.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar_util.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_checksum.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_convert.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_data.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ehdr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_extended.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_fsize.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_msize.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_phdr.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_shdr.c create mode 100644 
shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_xlate.c create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.cpp create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.h create mode 100644 shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/roundup.h create mode 100644 shared/amdgpu-windows-interop/pal/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/pal/LICENSE create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/pal.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palDevice.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palFence.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palFormat.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palImage.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palLib.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h create mode 
100644 shared/amdgpu-windows-interop/pal/inc/core/palQueue.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palAssert.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palDeque.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palEvent.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palFile.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h create mode 
100644 shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palMath.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palMutex.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palSpan.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palStringView.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palTime.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palUtil.h create mode 100644 shared/amdgpu-windows-interop/pal/inc/util/palVector.h create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/.gitignore create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc create mode 100644 
shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h create 
mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h create mode 100644 
shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/README.md create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h create mode 100644 
shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h create mode 100644 shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_elf_image.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/AMDHSAKernelDescriptor.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/CMakeLists.txt create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.cpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/hsail-tools/libHSAIL/Brig.h create mode 100644 
shared/amdgpu-windows-interop/sc/HSAIL/include/amd_elf_image.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_code.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_common.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_elf.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_kernel_code.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_loader.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_program.hpp create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_queue.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_signal.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/hsa.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_finalize.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_image.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ven_amd_loader.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/public/amdid.h create mode 100644 shared/amdgpu-windows-interop/sc/HSAIL/include/public/cz_id.h diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000000..375b342d55 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,6 @@ +[core] + autostage = true + remote = storage +['remote "storage"'] + url = s3://therock-dvc/rocm-systems + allow_anonymous_login = true diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000000..5197305523 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.github/labeler.yml b/.github/labeler.yml index e4b807c14d..a05305133e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -62,6 +62,10 @@ - changed-files: - any-glob-to-any-file: 'projects/roctracer/**/*' +"shared: amdgpu-windows-interop": +- changed-files: + - any-glob-to-any-file: 'shared/amdgpu-windows-interop/**/*' + documentation: - changed-files: - any-glob-to-any-file: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 696685e5c1..513d1083dd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ To limit your local checkout to only the project(s) you work on and improve perf git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems.git cd rocm-systems git sparse-checkout init --cone -git sparse-checkout set projects/rocblas shared/tensile +git sparse-checkout set projects/rocprofiler-sdk shared/rocprofiler-compute git checkout develop # or the branch you are starting from ``` @@ -33,7 +33,7 @@ The checkout command of the two projects lasted less than 90 seconds. If your work involves changing projects or introducing new projects, you can update your sparse-checkout environment: ```bash -git sparse-checkout set projects/hipsparse projects/rocsparse +git sparse-checkout set projects/hip projects/clr projects/hip-tests ``` This keeps your working directory clean and fast, as you won't need to clone the entire super-repo. @@ -130,6 +130,50 @@ Please refer to [this documentation](/docs/continuous-integration.md) for furthe --- +## Large File Storage + +[Data Version Control](https://dvc.org) is the system for large file storage in this super-repo. It provides staging capabilities on top of what Git LFS typically provides that ROCm CI/CD workflows can make use of. Files are stored in an AWS S3 bucket that has public-read access. + +Currently, `dvc` utilization is limited to the `pal` libraries in the `shared/amdgpu-windows-interop` directory. 
+If your development does not involve these files, you do not need to install `dvc`. + +### Installing DVC + +`dvc` can be installed as a python module via pip and is cross-platform. Visit the [dvc installation page](https://dvc.org/doc/install) if you want to use another method of installation. Due to our use of an AWS S3 bucket with `dvc`, the `dvc[s3]` module should be installed. The configuration to download the large files from the AWS S3 bucket is already set in this repository. + +```bash +pip install dvc[s3] +``` + +### Retrieving large files: + +```bash +git pull +dvc pull +``` + +### Switching to versions in other branches or commits: + +```bash +git checkout feature-branch +dvc checkout +``` + +### Update large files + +Write-access requires authentication. Please reach out to a project lead for credentials. To make updates to files maintained by `DVC`: + +```bash +dvc add [path-to-large-file-modified] +dvc push +git status +git add [dvc-files-mentioned-from-status-output] +git commit -m "commit message" +git push +``` + +--- + ## Gardener Rotation In order to achieve the goal of keeping the `develop` branch healthy, a team of ROCm engineers will be dedicated towards monitoring and triaging issues that arise. diff --git a/shared/amdgpu-windows-interop/LICENSE b/shared/amdgpu-windows-interop/LICENSE new file mode 100644 index 0000000000..2d16201200 --- /dev/null +++ b/shared/amdgpu-windows-interop/LICENSE @@ -0,0 +1,198 @@ +AMD Software End User License Agreement + +IMPORTANT-READ CAREFULLY: DO NOT INSTALL, COPY OR USE THE ENCLOSED SOFTWARE, +DOCUMENTATION (AS DEFINED BELOW), OR ANY PORTION THEREOF, UNTIL YOU HAVE +CAREFULLY READ AND AGREED TO THE FOLLOWING TERMS AND CONDITIONS. THIS IS A LEGAL +AGREEMENT ("AGREEMENT") BETWEEN YOU (EITHER AN INDIVIDUAL OR AN ENTITY) ("YOU") +AND ADVANCED MICRO DEVICES, INC. ("AMD"). +IF YOU DO NOT AGREE TO THE TERMS OF THIS AGREEMENT, DO NOT INSTALL, COPY OR USE +THIS SOFTWARE. 
BY INSTALLING, COPYING OR USING THE SOFTWARE YOU AGREE TO ALL THE +TERMS AND CONDITIONS OF THIS AGREEMENT. + +1. DEFINITIONS + 1. “Derivative Works” means any work, revision, modification or adaptation made to or +derived from the Software, or any work that incorporates the Software, in whole or in +part. + 2. “Documentation” means install scripts and online or electronic documentation +associated, included, or provided in connection with the Software, or any portion +thereof. + 3. “Free Software License” means an open source or other license that requires, as a +condition of use, modification or distribution, that any resulting software must be (a) +disclosed or distributed in source code form; (b) licensed for the purpose of making +derivative works; or (c) redistributable at no charge. + 4. “Intellectual Property Rights” means all copyrights, trademarks, trade secrets, patents, +mask works, and all related, similar, or other intellectual property rights recognized in +any jurisdiction worldwide, including all applications and registrations with respect +thereto. + 5. “Object Code” means machine readable computer programming code files, which is not +in a human readable form. + 6. “Software” means the enclosed AMD software program or any portion thereof that is +provided to You. + 7. “Source Code” means computer programming code in human readable form and +related system level documentation, including all comments, symbols and any +procedural code such as job control language. + +2. LICENSE +Subject to the terms and conditions of this Agreement, AMD hereby grants You a non-exclusive, +royalty-free, revocable, non-transferable, limited, copyright license to + 1. install and use the Software solely in Object Code form in conjunction with systems or +components that include or incorporate AMD products, as applicable; + 2. 
create Derivative Works solely in Object Code form of the Software for use with systems +or components that include or incorporate AMD products, as applicable; + 3. unless otherwise prohibited by a confidentiality agreement, make and distribute copies +of the Derivative Works to Your partners and customers for use in conjunction with +systems or components that include or incorporate AMD products, provided that such +distribution shall be under a license agreement with terms and conditions at least as +restrictive as those set forth in the Agreement; and + 4. use and reference the Documentation, if any, solely in connection with the Software and +Derivative Works. + +3. RESTRICTIONS +Except for the limited license expressly granted in Section 2 herein, You have no other rights in +the Software, whether express, implied, arising by estoppel or otherwise. Further restrictions +regarding Your use of the Software are set forth below. Except for the limited license expressly +granted in Section 2, You may not: + 1. modify or create derivative works of the Software or Documentation; + 2. distribute, publish, display, sublicense, assign or otherwise transfer the Software or +Documentation; + 3. decompile, reverse engineer, disassemble or otherwise reduce the Software to Source +Code form (except as allowed by applicable law); + 4. alter or remove any copyright, trademark or patent notice(s) in the Software or +Documentation; or + 5. use the Software and Documentation to: (i) develop inventions directly derived from +Confidential Information to seek patent protection; (ii) assist in the analysis of Your +patents and patent applications; or (iii) modify existing patents; or + 6. use, modify and/or distribute any of the Software or Documentation so that any part +becomes subject to a Free Software License. + +4. 
THIRD-PARTY COMPONENTS + The Software or Documentation may come bundled with third party technologies for which You +must obtain licenses from parties other than AMD (“Third Party Components”). By accessing +and using the Software or Documentation, You are agreeing to fully comply with the terms of +the applicable Third Party Component license. To the extent that a Third Party Component +license conflicts with the terms and conditions of this Agreement, then the Third Party +Component license shall control solely with respect to the applicable Third Party Component. +To the extent that any Third Party Components in the Software or Documentation requires an +offer for corresponding source code, AMD hereby makes such an offer for corresponding +source code form. + +5. PRE-PRODUCTION SOFTWARE + The Software may be a pre-production version, intended to provide advance access to features +that may or may not eventually be included into production version of the Software. +Accordingly, pre-production Software may not be fully functional relative to production +versions of the Software. Use of pre-production Software may result in unexpected results, loss +of data, project delays or other unpredictable damage or loss. Pre-production Software is not +intended for use in production, and Your use of pre-production Software is at Your own risk. + +6. FEEDBACK + You have no obligation to give AMD any suggestions, comments or other feedback +(“Feedback”) relating to the Software or Documentation. However, AMD may use and include +any Feedback that it receives from You to improve the Software, Documentation, or other AMD +products, software, and technologies. 
Accordingly, for any Feedback You provide to AMD, You +grant AMD and its affiliates and subsidiaries a worldwide, non-exclusive, irrevocable,royaltyfree, +perpetual license to, directly or indirectly, use, reproduce, license, sublicense, distribute, +make, have made, sell and otherwise commercialize the Feedback in the Software, +Documentation, or other AMD products, software and technologies. You further agree not to +provide any Feedback that (a) You know is subject to any Intellectual Property Rights of any +third party or (b) is subject to license terms which seek to require any products incorporating or +derived from such Feedback, or other AMD intellectual property, to be licensed to or otherwise +shared with any third party. + +7. OWNERSHIP AND COPYRIGHT OF SOFTWARE + The Software, including all Intellectual Property Rights therein, and the Documentation are and +remain the sole and exclusive property of AMD or its licensors, and You shall have no right, title +or interest therein except as expressly set forth in this Agreement. + +8. WARRANTY DISCLAIMER + THE SOFTWARE AND DOCUMENTATION ARE PROVIDED "AS IS" WITHOUT WARRANTY OF ANY +KIND. AMD DISCLAIMS ALL WARRANTIES, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING BUT +NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE, TITLE, NON-INFRINGEMENT, THAT THE SOFTWARE OR DOCUMENTATION WILL RUN +UNINTERRUPTED OR ERROR-FREE OR WARRANTIES ARISING FROM CUSTOM OF TRADE OR +COURSE OF USAGE. THE ENTIRE RISK ASSOCIATED WITH THE USE OF THE SOFTWARE AND +DOCUMENTATION IS ASSUMED BY YOU. Some jurisdictions do not allow the exclusion of +implied warranties, so the above exclusion may not apply to You. + +9. 
LIMITATION OF LIABILITY AND INDEMNIFICATION + AMD AND ITS LICENSORS WILL NOT, UNDER ANY CIRCUMSTANCES BE LIABLE TO YOU FOR ANY +PUNITIVE, DIRECT, INCIDENTAL, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING +FROM USE OF THE SOFTWARE, DOCUMENTATION, OR THIS AGREEMENT EVEN IF AMD AND ITS +LICENSORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. In no event shall +AMD's total liability to You for all damages, losses, and causes of action (whether in contract, +tort (including negligence) or otherwise) exceed the amount of $100 USD. You agree to defend, +indemnify and hold harmless AMD and its licensors, and any of their directors, officers, +employees, affiliates or agents from and against any and all loss, damage, liability and other +expenses (including reasonable attorneys' fees), resulting from Your use of the Software, +Documentation, or violation of the terms and conditions of this Agreement. + +10. EXPORT RESTRICTIONS + You shall adhere to all applicable U.S. import/export laws and regulations, as well as the +import/export control laws and regulations of other countries as applicable. You further agree +to not export, re-export, or transfer, directly or indirectly, any product, technical data, software +or source code received from AMD under this license, or the direct product of such technical +data or software to any country for which the United States or any other applicable +government requires an export license or other governmental approval without first obtaining +such licenses or approvals; or in violation of any applicable laws or regulations of the United +States or the country where the technical data or software was obtained. You acknowledge the +technical data and software received will not, in the absence of authorization from U.S. 
or local +law and regulations as applicable, be used by or exported, re-exported or transferred to: (i) any +sanctioned or embargoed country, or to nationals or residents of such countries; (ii) any +restricted end-user as identified on any applicable government end-user list; or (iii) any party +where the end-use involves nuclear, chemical/biological weapons, rocket systems, or +unmanned air vehicles. For the most current Country Group listings, or for additional +information about the EAR or Your obligations under those regulations, please refer to the U.S. +Bureau of Industry and Security’s website at http://www.bis.doc.gov/. + +11. NOTICE TO U.S. GOVERNMENT END USERS + The Software and Documentation are "commercial items", as that term is defined at 48 C.F.R. +§2.101, consisting of "commercial computer software" and "commercial computer software +documentation", as such terms are used in 48 C.F.R. §12.212 and 48 C.F.R. §227.7202, +respectively. Consistent with 48 C.F.R. §12.212 or 48 C.F.R. §227.7202-1 through 227.7202-4, as +applicable, the commercial computer software and commercial computer software +documentation are being licensed to U.S. Government end users (a) only as commercial items +and (b) with only those rights as are granted to all other end users pursuant to the terms and +conditions set forth in this Agreement. Unpublished rights are reserved under the copyright +laws of the United States. + +12. TERMINATION OF LICENSE + This Agreement will terminate immediately without notice from AMD or judicial resolution if (1) +You fail to comply with any provisions of this Agreement, or (2) You provide AMD with notice +that You would like to terminate this Agreement. Upon termination of this Agreement, You +must delete or destroy all copies of the Software. Upon termination or expiration of this +Agreement, all provisions survive except for Section 2. + +13. 
SUPPORT AND UPDATES + AMD is under no obligation to provide any kind of support under this Agreement. AMD may, in +its sole discretion, provide You with updates to the Software and Documentation, and such +updates will be covered under this Agreement. + +14. GOVERNING LAW + This Agreement is made under and shall be construed according to the laws of the State of +California, excluding conflicts of law rules. Each party submits to the jurisdiction of the state +and federal courts of Santa Clara County and the Northern District of California for the purposes +of this Agreement. You acknowledge that Your breach of this Agreement may cause irreparable +damage and agree that AMD shall be entitled to seek injunctive relief under this Agreement, as +well as such further relief as may be granted by a court of competent jurisdiction. + +15. PRIVACY + We may be required under applicable data protection law to provide you with certain +information about who we are, how we process your personal data and for what purposes and +your rights in relation to your personal information and how to exercise them. This information +is provided in www.amd.com/en/corporate/privacy. It is important that you read that +information. AMD’s Cookie Policy, sets out information about the cookies AMD uses. + +16. GENERAL PROVISIONS + You may not assign this Agreement without the prior written consent of AMD and any +assignment without such consent will be null and void. The parties do not intend that any +agency or partnership relationship be created between them by this Agreement. Each +provision of this Agreement shall be interpreted in such a manner as to be effective and valid +under applicable law. However, in the event that any provision of this Agreement becomes or +is declared unenforceable by any court of competent jurisdiction, such provision shall be +deemed deleted and the remainder of this Agreement shall remain in full force and effect. + +17. 
ENTIRE AGREEMENT + This Agreement sets forth the entire agreement and understanding between the parties with +respect to the Software and supersedes and merges all prior oral and written agreements, +discussions and understandings between them regarding the subject matter of this +Agreement. No waiver or modification of any provision of this Agreement shall be binding +unless made in writing and signed by an authorized representative of each party. diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/CMakeLists.txt b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/CMakeLists.txt new file mode 100644 index 0000000000..6affc1d788 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/CMakeLists.txt @@ -0,0 +1,26 @@ +## +######################################################################################################################## +# +# Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +# +######################################################################################################################## + +cmake_minimum_required(VERSION 3.10) + +file(GLOB sources + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp +) + +add_library(clCompilerElfLoader ${sources}) + +add_subdirectory(utils) + +include_directories(${lib}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${elf-toolchain}/common) +if(WIN32) +include_directories(${elf-toolchain}/common/win32) +endif() +include_directories(${elf-toolchain}/libelf) + +add_definitions(-DBSD_LIBELF) diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.cpp b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.cpp new file mode 100644 index 0000000000..5a631d52d4 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.cpp @@ -0,0 +1,1557 @@ +/* +************************************************************************************************************************ 
+* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +// +// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved. +// +#include "elf.hpp" + +#include +#include +#include + +#if defined(__linux__) +#include +#endif + +#include "os/os.hpp" +#include "_libelf.h" +namespace amd { + +using namespace oclelfutils; + +#if !defined(ELFMAG) +#define ELFMAG "\177ELF" +#define SELFMAG 4 +#endif + +/* + Opague data type definition. +*/ +struct symbol_handle { + union { + Elf64_Sym sym64; + Elf32_Sym sym32; + } u; +}; + +typedef struct { + OclElf::oclElfSections id; + const char *name; + Elf_Type d_type; + uint64_t d_align; // section alignment in bytes + Elf32_Word sh_type; // section type + Elf32_Word sh_flags; // section flags + const char *desc; +} OclElfSectionsDesc; + +namespace { + // Objects that are visible only within this module + + const OclElfSectionsDesc oclElfSecDesc[] = + { + { OclElf::LLVMIR, ".llvmir", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "ASIC-independent LLVM IR" }, + { OclElf::SOURCE, ".source", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "OpenCL source" }, + { OclElf::ILTEXT, ".amdil", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "AMD IL text" }, + { OclElf::ASTEXT, ".astext", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "X86 assembly text" }, + { OclElf::CAL, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, + "AMD CalImage" }, + { OclElf::DLL, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, + "x86 dll" }, + { OclElf::STRTAB, ".strtab", ELF_T_BYTE, 1, SHT_STRTAB, SHF_STRINGS, + "String table" }, + { OclElf::SYMTAB, ".symtab", ELF_T_SYM, sizeof(Elf64_Xword), SHT_SYMTAB, 0, + "Symbol table" }, + { OclElf::RODATA, ".rodata", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC, + "Read-only data" }, + { OclElf::SHSTRTAB, ".shstrtab", ELF_T_BYTE, 1, SHT_STRTAB, SHF_STRINGS, + "Section names" }, + { OclElf::NOTES, 
".note", ELF_T_NOTE, 1, SHT_NOTE, 0, + "used by loader for notes" }, + { OclElf::COMMENT, ".comment", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Version string" }, + { OclElf::ILDEBUG, ".debugil", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "AMD Debug IL" }, + { OclElf::DEBUG_INFO, ".debug_info", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug info" }, + { OclElf::DEBUG_ABBREV, ".debug_abbrev", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug abbrev" }, + { OclElf::DEBUG_LINE, ".debug_line", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug line" }, + { OclElf::DEBUG_PUBNAMES, ".debug_pubnames", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug pubnames" }, + { OclElf::DEBUG_PUBTYPES, ".debug_pubtypes", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug pubtypes" }, + { OclElf::DEBUG_LOC, ".debug_loc", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug loc" }, + { OclElf::DEBUG_ARANGES, ".debug_aranges", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug aranges" }, + { OclElf::DEBUG_RANGES, ".debug_ranges", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug ranges" }, + { OclElf::DEBUG_MACINFO, ".debug_macinfo", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug macinfo" }, + { OclElf::DEBUG_STR, ".debug_str", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug str" }, + { OclElf::DEBUG_FRAME, ".debug_frame", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Dwarf debug frame" }, + { OclElf::JITBINARY, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, + "x86 JIT Binary" }, + { OclElf::CODEGEN, ".cg", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Target dependent IL" }, + { OclElf::TEXT, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR, + "Device specific ISA" }, + { OclElf::INTERNAL, ".internal", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Internal usage" }, + { OclElf::SPIR, ".spir", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "Vendor/Device-independent LLVM IR" }, + { OclElf::SPIRV, ".spirv", ELF_T_BYTE, 1, SHT_PROGBITS, 0, + "SPIR-V Binary" }, + { OclElf::RUNTIME_METADATA,".AMDGPU.runtime_metadata", ELF_T_BYTE, 1, 
SHT_PROGBITS, 0, + "AMDGPU runtime metadata" }, + }; + + // index 0 is reserved and must be there (NULL section) + const char shstrtab[] = { + /* index 0 */ '\0', + /* index 1 */ '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', '\0', + /* index 11 */ '.', 's', 't', 'r', 't', 'a', 'b', '\0' + }; + +#define SHSTRTAB_NAME_NDX 1 +#define STRTAB_NAME_NDX 11 + + // index 0 is reserved and must be there (NULL name) + const char strtab[] = { + /* index 0 */ '\0' + }; + +} + + bool +isElfMagic(const char* p) +{ + if (p==NULL || strncmp(p, ELFMAG, SELFMAG) != 0) { + return false; + } + return true; +} + +// + bool +isElfHeader(const char* p, signed char ec) +{ + if (!isElfMagic(p)) { + return false; + } + signed char libVersion = elf_version(EV_CURRENT); + signed char fileVersion = p[EI_VERSION]; + signed char elfClass = p[EI_CLASS]; + if( fileVersion > libVersion) { + return false; + } + + // class check: + if ( elfClass != ec) { + return false; + } + + return true; +} + + bool +isCALTarget(const char* p, signed char ec) +{ + if (!isElfMagic(p)) { + return false; + } + + Elf64_Half machine; + if (ec == ELFCLASS32) { + machine = ((Elf32_Ehdr*)p)->e_machine; + + } + else { + machine = ((Elf64_Ehdr*)p)->e_machine; + } + +#if !defined(WITH_LIGHTNING_COMPILER) + if ( (machine >= OclElf::CAL_FIRST) && (machine <= OclElf::CAL_LAST) ) { + return true; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + + return false; +} + +/////////////////////////////////////////////////////////////// +////////////////////// elf initializers /////////////////////// +/////////////////////////////////////////////////////////////// + +OclElf::OclElf ( + unsigned char eclass, + const char* rawElfBytes, + uint64_t rawElfSize, + const char* elfFileName, + Elf_Cmd elfcmd + ) +: _fd (-1), + _fname (elfFileName), + _e (0), + _err (), + _eclass (eclass), + _rawElfBytes (rawElfBytes), + _rawElfSize (rawElfSize), + _elfCmd (elfcmd), + _elfMemory(), + _shstrtab_ndx (0), + _strtab_ndx (0) +{ + if (rawElfBytes != 
NULL) { + /* + In general, 'eclass' should be the same as rawElfBytes's. 'eclass' is what the runtime + will use for generating an ELF, and therefore it expects the input ELF to have this 'eclass'. + However, GPU needs to accept both 32-bit and 64-bit ELF for compatibility (we used to + generate 64-bit ELF, which is the bad design in the first place). Here we just uses eclass + from rawElfBytes, and overrides the input 'eclass'. + */ + _eclass = (unsigned char)rawElfBytes[EI_CLASS]; + } + (void)Init(); +} + +OclElf::~OclElf() +{ +#if 0 + Elf_Cmd c = (_errCmd == ELF_C_READ) ? ELF_C_NULL : _errCmd; + if (elf_update(_e, c < 0) { + _err.xfail("OclElf::Fini() : elf_update() failed: %s", elf_errmsg(-1); + return; + } +#endif + _err.Fini(); + + elf_end(_e); + _e = 0; + + if (_fd != -1) { + xclose(_err, _fname, _fd); + char* tname= const_cast(_fname); + if (tname) { + unlink(tname); + free(tname); + } + _fd = -1; + _fname = NULL; + + } + + elfMemoryRelease(); +} + + bool +OclElf::Clear() +{ + if (_e) { + elf_end(_e); + _e = NULL; + } + + if (_fd != -1) { + if (xclose(_err, _fname, _fd) < 0) { + return false; + } + _fd = -1; + } + + elfMemoryRelease(); + + _err.Fini(); + + // Re-initialize the object + Init(); + + return !hasError(); +} + +/* + Initialize OclElf object + */ + bool +OclElf::Init() +{ + _err.Init(); + + // Create a temporary file if it is needed + if (_elfCmd != ELF_C_READ) { + if (_fname != NULL) { + size_t sz = strlen(_fname) + 1; + + char* tname = (char*)xmalloc(_err, sz); + if (tname == 0) { + _err.xfail("OclElf::Init() failed to malloc()"); + return false; + } + strcpy(tname, _fname); + _fname = static_cast(tname); + } + } + + if (elf_version(EV_CURRENT) == EV_NONE) { + _err.xfail("OclElf::Init(): Application expects CURRENT elf version"); + return false; + } + int oflag, pmode; +#if defined(_MSC_VER) + if (_elfCmd == ELF_C_READ) { + oflag = _O_RDONLY | _O_BINARY; + } + else { + oflag = _O_CREAT | _O_RDWR | _O_TRUNC | _O_BINARY; + } + pmode = _S_IREAD | 
_S_IWRITE; +#else + if (_elfCmd == ELF_C_READ) { + oflag = O_RDONLY; + } + else { + oflag = O_CREAT | O_RDWR | O_TRUNC; + } + pmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; // 0644 +#endif + if ((_fd == -1) && (_rawElfBytes == NULL)) { + // case 1: elf object is in file '_fname' + + _fd = xopen(_err, _fname, oflag, pmode); + if (_fd == -1) { + _err.xfail("OclElf::Init(): Cannot Open File %s!", _fname); + return false; + } + + _e = elf_begin(_fd, _elfCmd, NULL, NULL); + if (_e == NULL) { + _err.xfail ("OclElf::Init(): elf_begin failed"); + return false; + } + } + else if (_fd == -1) { + // case 2: elf object is in memory + if (_elfCmd == ELF_C_READ) { + assert ((_fname == NULL) && "ELF file name should not be provided for a read only elf."); + } else { + _fd = xopen(_err, _fname, oflag, pmode); + if (_fd == -1) { + _err.xfail("OclElf::Init(): Cannot Open File %s!", _fname); + return false; + } + } + + // const_cast is safe + _e = elf_memory(const_cast(_rawElfBytes), _rawElfSize, NULL); + if ( _e == NULL) { + _err.xfail("OclElf::Init(): elf_memory failed: %s", + elf_errmsg(-1)); + return false; + } + // If _fd != -1, then we are a read/write and not just a read, so change accordingly. + if (_fd != -1) { + _e->e_fd = _fd; + _e->e_cmd = _elfCmd; + } + } + else { // _fd != -1 + // case 3: elf object is in a file with file descriptor '_fd' + + _e = elf_begin(_fd, _elfCmd, NULL, NULL); + if (_e == NULL) { + _err.xfail ("OclElf::Init(): elf_begin failed: %s", + elf_errmsg(-1)); + return false; + } + } + + if (!InitElf()) { + return false; + } + + // Success + return true; +} + +/* + Return true: if InitElf() is successful + Return false: if InitElf() failed. 
+ */ + bool +OclElf::InitElf () +{ + assert (_e && "libelf object should have been created already"); + + if (_elfCmd != ELF_C_WRITE) { + // Set up _shstrtab_ndx and _strtab_ndx + GElf_Ehdr gehdr; + if (gelf_getehdr(_e, &gehdr) == NULL) { + _err.xfail("OclElf::InitElf() failed in gelf_getehdr()- %s", + elf_errmsg(-1)); + return false; + } + + _shstrtab_ndx = gehdr.e_shstrndx; + + Elf_Scn* scn; + if (!getSectionDesc(scn, STRTAB)) { + _err.xfail("OclElf::InitElf() failed in getSectionDesc(STRTAB)"); + return false; + } + + // Sanity check. Each ELF binary should have STRTAB ! + if (scn != NULL) { + _strtab_ndx = elf_ndxscn(scn); + } + + return true; + } + + /*********************************/ + /******** ELF_C_WRITE ************/ + /*********************************/ + + // + // 1. Create ELF header + // + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_newehdr(_e); + if (ehdr32 == NULL) { + _err.xfail("OclElf::InitElf() failed in elf32_newehdr: %s.", + elf_errmsg(-1)); + return false; + } + } + else { + Elf64_Ehdr* ehdr64 = elf64_newehdr(_e); + if (ehdr64 == NULL) { + _err.xfail("OclElf::InitElf() failed in elf32_newehdr : %s.", + elf_errmsg(-1)); + return false; + } + } + +#if 0 + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("elf_update() failed"); + return -1; + } +#endif + + // + // 2. 
Create ELF shstrtab + // + Elf_Scn* scn_shstrtab = elf_newscn(_e); + if (scn_shstrtab == NULL) { + _err.xfail("Elf::InitElf() failed in elf_newscn : %s", elf_errmsg(-1)); + return false; + } + + /* addng ELF_Data descriptor associated with section scn */ + Elf_Data* data_shstrtab = createElfData(scn_shstrtab, SHSTRTAB, + const_cast(shstrtab), (uint64_t)sizeof(shstrtab), false); + if (data_shstrtab == NULL) { + return false; + } + + if (!createShdr(SHSTRTAB, scn_shstrtab, SHSTRTAB_NAME_NDX)) { + return false; + } + + // Save shstrtab section index + _shstrtab_ndx = elf_ndxscn(scn_shstrtab); +#if defined(BSD_LIBELF) + elf_setshstrndx(_e, _shstrtab_ndx); +#else + elfx_update_shstrndx(_e, _shstrtab_ndx); +#endif + +#if 0 + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("elf_update() failed"); + return -1; + } +#endif + + // + // 3. Create .strtab section + // + Elf_Scn* scn_strtab = elf_newscn(_e); + if (scn_strtab == NULL) { + _err.xfail("Elf::InitElf() failed in elf_newscn : %s", elf_errmsg(-1)); + return false; + } + + /* addng ELF_Data descriptor associated with section scn */ + Elf_Data* data_strtab = createElfData(scn_strtab, STRTAB, + const_cast(strtab), (uint64_t)sizeof(strtab), false); + if (data_strtab == NULL) { + return false; + } + + if (!createShdr(STRTAB, scn_strtab, STRTAB_NAME_NDX)) { + return false; + } + + // Save strtab section index + _strtab_ndx = elf_ndxscn(scn_strtab); + + // Need to update section header + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("elf_update() failed"); + return false; + } + + // + // 4. Create the symbol table + // + + // Create the first reserved symbol (undefined symbol) + size_t sym_sz = (_eclass == ELFCLASS32) ? 
sizeof(Elf32_Sym) : sizeof(Elf64_Sym); + void* sym = oclelf_calloc(sym_sz); + if (sym == NULL) { + _err.xfail("OclElf::InitElf() failed to alloc memory"); + return false; + } + + Elf_Scn* scn_symtab = newSection(SYMTAB, sym, sym_sz, false); + if (scn_symtab == NULL) { + // Use newSection()'s error message. + return false; + } + + return true; +} + +Elf_Data* +OclElf::createElfData( + Elf_Scn*& scn, + oclElfSections id, + void* d_buf, + uint64_t d_size, + bool do_copy + ) +{ + /* addng Elf_Data descriptor associated with section scn */ + Elf_Data* data = elf_newdata(scn); + if (data == NULL) { + _err.xfail("OclElf::createElfData() failed in elf_newdata() - %s", + elf_errmsg(-1)); + return NULL; + } + + void* newbuf; + if (do_copy) { + newbuf = oclelf_allocAndCopy((void*)d_buf, d_size); + } + else { + newbuf = d_buf; + } + + data->d_align = oclElfSecDesc[id].d_align; + data->d_off = 0LL; + data->d_buf = newbuf; + data->d_type = oclElfSecDesc[id].d_type; + data->d_size = d_size; + data->d_version = EV_CURRENT ; + + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("elf_update() failed"); + return NULL; + } + return data; +} + +bool +OclElf::createShdr ( + oclElfSections id, + Elf_Scn*& scn, + Elf64_Word shname, + Elf64_Word shlink + ) +{ + if (_eclass == ELFCLASS32) { + Elf32_Shdr* shdr32 = elf32_getshdr(scn); + if (shdr32 == NULL) { + _err.xfail("Elf::createShdr() failed in elf32_getshdr(): %s.", elf_errmsg(-1)); + return false; + } + + shdr32->sh_name = (Elf32_Word)shname; + shdr32->sh_type = (Elf32_Word)oclElfSecDesc[id].sh_type; + shdr32->sh_flags = (Elf32_Word)oclElfSecDesc[id].sh_flags; + + shdr32->sh_link = (Elf32_Word)shlink; + } + else { + Elf64_Shdr* shdr64 = elf64_getshdr(scn); + if (shdr64 == NULL) { + _err.xfail("Elf::InitElf() failed in elf64_getshdr(): %s.", elf_errmsg(-1)); + return false; + } + + shdr64->sh_name = (Elf64_Word)shname; + shdr64->sh_type = (Elf64_Word)oclElfSecDesc[id].sh_type; + shdr64->sh_flags = 
(Elf64_Xword)oclElfSecDesc[id].sh_flags; + + shdr64->sh_link = (Elf64_Word)shlink; + } + return true; +} + + bool +OclElf::getTarget(uint16_t& machine, oclElfPlatform& platform) +{ + assert(_e != 0); + + GElf_Ehdr ehdrO; + GElf_Ehdr *ehdr = gelf_getehdr(_e, &ehdrO); + if (ehdr == NULL) { + return false; + } + + Elf64_Half mach = ehdr->e_machine; + if ((mach >= CPU_FIRST) && (mach <= CPU_LAST)) { + platform = CPU_PLATFORM; + machine = mach - CPU_BASE; + } +#if !defined(WITH_LIGHTNING_COMPILER) + else if ( (mach >= CAL_FIRST) && (mach <= CAL_LAST)) { + platform = CAL_PLATFORM; + machine = mach - CAL_BASE; + } +#endif // !defined(WITH_LIGHTNING_COMPILER) + else if (mach == EM_386 + || mach == EM_HSAIL + || mach == EM_HSAIL_64 + || mach == EM_AMDIL + || mach == EM_AMDIL_64 + || mach == EM_X86_64) { + platform = COMPLIB_PLATFORM; + machine = mach; + } else { + // Invalid machine + return false; + } + + return true; +} + + bool +OclElf::setTarget(uint16_t machine, oclElfPlatform platform) +{ + assert(_e != 0); + + Elf64_Half mach; + if (platform == CPU_PLATFORM) + mach = machine + CPU_BASE; + else if (platform == CAL_PLATFORM) + mach = machine + CAL_BASE; + else + mach = machine; + + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_getehdr(_e); + + if (ehdr32 == NULL) { + _err.xfail("setTarget() : failed in elf32_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr32->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr32->e_type = ET_NONE; + ehdr32->e_machine = (Elf32_Half)mach; + } + else { + Elf64_Ehdr* ehdr64 = elf64_getehdr(_e); + + if (ehdr64 == NULL) { + _err.xfail("setTarget() : failed in elf64_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr64->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr64->e_type = ET_NONE; + ehdr64->e_machine = mach; + } + + return true; +} + +bool +OclElf::getType(uint16_t &type) { + assert(_e != 0); + + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_getehdr(_e); + + if (ehdr32 == NULL) { + _err.xfail("setTarget() : 
failed in elf32_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + type = ehdr32->e_type; + } + else { + Elf64_Ehdr* ehdr64 = elf64_getehdr(_e); + + if (ehdr64 == NULL) { + _err.xfail("setTarget() : failed in elf64_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + type = ehdr64->e_type; + } + + return true; +} + +bool +OclElf::setType(uint16_t type) { + assert(_e != 0); + + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_getehdr(_e); + + if (ehdr32 == NULL) { + _err.xfail("setTarget() : failed in elf32_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr32->e_type = type; + } + else { + Elf64_Ehdr* ehdr64 = elf64_getehdr(_e); + + if (ehdr64 == NULL) { + _err.xfail("setTarget() : failed in elf64_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr64->e_type = type; + } + + return true; +} + +bool +OclElf::getFlags(uint32_t &flag) { + assert(_e != 0); + + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_getehdr(_e); + + if (ehdr32 == NULL) { + _err.xfail("setTarget() : failed in elf32_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + flag = ehdr32->e_flags; + } + else { + Elf64_Ehdr* ehdr64 = elf64_getehdr(_e); + + if (ehdr64 == NULL) { + _err.xfail("setTarget() : failed in elf64_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + flag = ehdr64->e_flags; + } + + return true; +} + +bool +OclElf::setFlags(uint32_t flag) { + assert(_e != 0); + + if (_eclass == ELFCLASS32) { + Elf32_Ehdr* ehdr32 = elf32_getehdr(_e); + + if (ehdr32 == NULL) { + _err.xfail("setTarget() : failed in elf32_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr32->e_flags = flag; + } + else { + Elf64_Ehdr* ehdr64 = elf64_getehdr(_e); + + if (ehdr64 == NULL) { + _err.xfail("setTarget() : failed in elf64_getehdr()- %s.", elf_errmsg(-1)); + return false; + } + + ehdr64->e_flags = flag; + } + + return true; +} + +/* + returns true if success; return false if fail. + scn will return scn for 'id'. 
+ */ + bool +OclElf::getSectionDesc(Elf_Scn*& scn, OclElf::oclElfSections id) const +{ + if ( ((id == SHSTRTAB) && (_shstrtab_ndx != 0)) || + ((id == STRTAB) && (_strtab_ndx != 0)) ) { + // Special (efficient) processing of SHSTRTAB/STRTAB + size_t idx = (id == SHSTRTAB) ? _shstrtab_ndx : _strtab_ndx; + if ((scn = elf_getscn(_e, idx)) == NULL) { + _err.xfail("OclElf::addSectionDesc(): elf_getscn() failed - %s", + elf_errmsg(-1)); + return false; + } + } + else { + /* Search sections */ + const char* sname = oclElfSecDesc[id].name; + for (scn = elf_nextscn(_e, 0); + scn != NULL; + scn = elf_nextscn(_e, scn)) + { + size_t idx = elf_ndxscn(scn); + if ( ((idx == _shstrtab_ndx) && (_shstrtab_ndx != 0)) || + ((idx == _strtab_ndx) && (_strtab_ndx != 0)) ) { + continue; + } + + GElf_Shdr shdr; + if (gelf_getshdr(scn, &shdr) != &shdr) { + _err.xfail("OclElf::getSectionDesc() : failed in gelf_getshdr()- %s.", + elf_errmsg(-1)); + return false; + } + + /* Convert an index (to the shdr string table) to a char pointer */ + char *nm = elf_strptr(_e, _shstrtab_ndx, shdr.sh_name); + if (strcmp(sname, nm ? nm : "") == 0) { + // Found ! + break; + } + } + } + return true; +} + +/* + Return true if success; return false if fail. + data will return Elf_Data. + */ + bool +OclElf::getSectionData(Elf_Data*& data, OclElf::oclElfSections id) const +{ + assert(_e != 0); + + data = NULL; + Elf_Scn* scn; + if (!getSectionDesc(scn, id)) { + return false; + } + if (scn != NULL) { + // There is only one data descriptor (we are reading!) 
+ data = elf_getdata(scn, 0); + } + return true; +} + +/* + Get the whole section, assuming that there is only one data descriptor + */ + bool +OclElf::getSection(OclElf::oclElfSections id, char** dst, size_t* sz) const +{ + assert((oclElfSecDesc[id].id == id) && + "oclElfSecDesc[] should be in the same order as enum oclElfSections"); + + Elf_Data* data; + if (!getSectionData(data, id)) { + _err.xfail("OclElf::getSection() failed in getSectionData()"); + return false; + } + if (data == NULL) { + *dst = NULL; + *sz = 0; + } + else { + *sz = (size_t)data->d_size; + *dst = (char*)data->d_buf; + } + + return true; +} + +/* + API routines for manipulating symbols + */ + Sym_Handle +OclElf::nextSymbol(Sym_Handle symHandle) const +{ + size_t sz; + char* beg, *end; + + if (!getSection(SYMTAB, &beg, &sz)) { + _err.xfail("OclElf::nextSymbol() failed in getSection()"); + return NULL; + } + + if ( (beg == 0) || (sz == 0) ) { + return NULL; + } + + end = beg + sz; + if (_eclass == ELFCLASS64) { + // Skip the first dummy symbol (STT_NOTYPE) + beg += sizeof(Elf64_Sym); + + if (beg == end) { // No valid symbols in the table + return NULL; + } + + if (symHandle == NULL) { + // Return the first symbol + return reinterpret_cast(beg); + } + + // Return the next symbol + Elf64_Sym* sym64 = reinterpret_cast(symHandle); + sym64++; + if (reinterpret_cast(sym64) == end) { + return NULL; + } + return reinterpret_cast(sym64); + } + else { + // Skip the first dummy symbol (STT_NOTYPE) + beg += sizeof(Elf32_Sym); + + if (beg == end) { // No valid symbols in the table + return NULL; + } + + if (symHandle == NULL) { + // Return the first symbol + return reinterpret_cast(beg); + } + + Elf32_Sym* sym32 = reinterpret_cast(symHandle); + sym32++; + if (reinterpret_cast(sym32) == end) { + return NULL; + } + return reinterpret_cast(sym32); + } + + // UNREACHABLE + return NULL; +} + +/* + Given a symbol handle, return info for this symbol + + Fails with symbols which have special section indexes (like 
absolute symbols). + It is impossible to return valid SymbolInfo for such symbols because + correct section names are unknown (unspecified in ELF). + */ + bool +OclElf::getSymbolInfo(Sym_Handle symHandle, SymbolInfo* symInfo) const +{ + assert(_e != 0); + + Elf_Scn *scn; + char* sym_name; + Elf64_Addr st_value; /* visibility */ + Elf64_Xword st_size; /* index of related section */ + + if (_eclass == ELFCLASS64) { + Elf64_Sym* sym64 = reinterpret_cast(symHandle); + if (sym64->st_shndx >= SHN_LORESERVE && sym64->st_shndx <= SHN_HIRESERVE) { + return false; + } + + sym_name = elf_strptr(_e, _strtab_ndx, sym64->st_name); + st_value = (Elf64_Addr)(sym64->st_value); + st_size = (Elf64_Xword)(sym64->st_size); + + // get section + scn = elf_getscn(_e, sym64->st_shndx); + } + else { + Elf32_Sym* sym32 = reinterpret_cast(symHandle); + if (sym32->st_shndx >= SHN_LORESERVE && sym32->st_shndx <= SHN_HIRESERVE) { + return false; + } + + sym_name = elf_strptr(_e, _strtab_ndx, sym32->st_name); + st_value = (Elf64_Addr)(sym32->st_value); + st_size = (Elf64_Xword)(sym32->st_size); + + // get section + scn = elf_getscn(_e, sym32->st_shndx); + } + + GElf_Shdr gshdr; + if (gelf_getshdr(scn, &gshdr) == NULL) { + _err.xfail("OclElf::getSymbolInfo() failed in gelf_getshdr() - %s.", + elf_errmsg(-1)); + return false; + } + char* sec_name = elf_strptr(_e, _shstrtab_ndx, gshdr.sh_name); + + // Assume there is only one Elf_Data. For reading, it's always true + Elf_Data* data = elf_getdata(scn, 0); + if (data == NULL) { + symInfo->sec_addr = (char*)NULL; + symInfo->sec_size = 0; + symInfo->address = (char*)NULL; + symInfo->size = (uint64_t)0; + } + else { + symInfo->sec_addr = (char*)data->d_buf; + symInfo->sec_size = data->d_size; + symInfo->address = symInfo->sec_addr + (size_t)st_value; + symInfo->size = (uint64_t)st_size; + } + symInfo->sec_name = sec_name; + symInfo->sym_name = sym_name; + + return true; +} + +/* + AddSectionData() will add data into a section. 
Return the offset + of the data in this section if success; return -1 if fail. + */ +bool +OclElf::addSectionData ( + Elf64_Xword& outOffset, + oclElfSections id, + const void* buffer, + size_t size, + bool do_copy // true if buffer needs to be copied + ) +{ + outOffset = 0; + const char* secName = oclElfSecDesc[id].name; + GElf_Shdr shdr; + Elf_Scn* scn; + if (!getSectionDesc(scn, id)) { + return false; + } + assert (scn && "Elf_Scn should have been created already"); + + if (gelf_getshdr(scn, &shdr) != &shdr) { + _err.xfail("OclElf::addSectionData(): gelf_getshdr() failed - %s", + elf_errmsg(-1)); + return false; + } + outOffset = (Elf64_Xword)shdr.sh_size; + + /* addng Elf_Data descriptor associated with section scn */ + Elf_Data* data = createElfData(scn, id, const_cast(buffer), + (uint64_t)size, do_copy); + if (data == NULL) { + return false; + } + + return true; +} + +/* + getShdrNdx() returns an index to the .shstrtab in 'outNdx' for "name" if it + is in .shstrtab (outNdx == 0 means it is not in .shstrtab). It return true if + it is successful; return false if en error occured. 
+ */ + bool +OclElf::getShstrtabNdx(Elf64_Word& outNdx, const char* name) +{ + outNdx = 0; + + // .shstrtab must be created already + Elf_Scn* scn = elf_getscn(_e, _shstrtab_ndx); + if (scn == NULL) { + _err.xfail("OclElf::getShdrNdx() failed in elf_getscn for section .shstrtab - %s", + elf_errmsg(-1)); + return false; + } + + Elf_Data* data = elf_getdata(scn, NULL); + if (data == NULL) { + _err.xfail("Elf::getShdrNdx() failed in elf_getdata for section .shstrtab - %s", + elf_errmsg(-1)); + return false; + } + + size_t name_sz = strlen(name); + uint64_t data_offset = 0; + do { + if (data->d_size > name_sz) { + char* base = (char*)data->d_buf; + char* end = base + (size_t)data->d_size; + char* b = base; + char* e; + + while ( b != end) { + e = b; + + // find the next 0 char + while ( (e != end) && (*e != 0) ) { + e++; + } + + if ((e != end) && ((size_t)(e - b) == name_sz) && + (strcmp(b, name) == 0)) { + outNdx = (Elf64_Word)((b - base) + data_offset); + return true; + } + b = e+1; + } + } + data_offset += data->d_size; + } while ((data = elf_getdata(scn, data)) != NULL); + + return true; +} + +/* + newSection() assumes that .shstrtab and .strtab have been created already. + Return the pointer to the new section if success; return 0 if fail. 
+ */ +Elf_Scn* +OclElf::newSection ( + OclElf::oclElfSections id, + const void* d_buf, + size_t d_size, + bool do_copy + ) +{ + Elf64_Word sh_name; + if (!getShstrtabNdx(sh_name, oclElfSecDesc[id].name)) { + _err.xfail("OclElf::newSection() failed in getShstrtabNdx() for section %s", + oclElfSecDesc[id].name); + return NULL; + } + + if (sh_name == 0) { // Need to create a new entry for this section name + Elf64_Xword offset; + if (!addSectionData(offset, SHSTRTAB, oclElfSecDesc[id].name, + strlen(oclElfSecDesc[id].name) + 1, false)) { + _err.xfail("OclElf::newSection() failed in getSectionData() for section %s", + oclElfSecDesc[id].name); + return NULL; + } + sh_name = (Elf64_Word)offset; + } + + // Create a new section + Elf_Scn* scn = elf_newscn(_e); + if (scn == NULL) { + _err.xfail("OclElf::newSection() failed in elf_newscn() - %s.", + elf_errmsg(-1)); + return NULL; + } + + // If there is no data, skip creating Elf_Data + if ((d_buf != NULL) && (d_size != 0)) { + Elf_Data* data = createElfData(scn, id, + const_cast(d_buf), (uint64_t)d_size, do_copy); + if (data == NULL) { + return NULL; + } + } + + if (!createShdr(id, scn, sh_name, (id == SYMTAB) ? 
_strtab_ndx : 0)) { + return NULL; + } + + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("OclElf::newSection(): elf_update() failed"); + return NULL; + } + + return scn; +} + +/* + Return true: success +false: fail +*/ +bool +OclElf::addSection ( + oclElfSections id, + const void* d_buf, + size_t d_size, + bool do_copy + ) +{ + assert(oclElfSecDesc[id].id == id && + "struct oclElfSecDesc should be ordered by id same as enum Elf::oclElfSections"); + + /* If section is already in elf object, simply return its address */ + Elf_Scn* scn; + if (!getSectionDesc(scn, id)) { + // Failed + return false; + } + + if (scn != NULL) { + Elf64_Xword sec_offset; + if (!addSectionData(sec_offset, id, d_buf, d_size, do_copy)) { + _err.xfail("OclElf::addSection() failed in addSectionData() for section name %s.", + oclElfSecDesc[id].name); + return false; + } + } + else { + scn = newSection(id, d_buf, d_size, do_copy); + if (scn == NULL) { + _err.xfail("OclElf::addSection() failed in newSection() for section name %s.", + oclElfSecDesc[id].name); + return false; + } + } + return true; +} + +bool +OclElf::addSymbol( + oclElfSections id, + const char* symbolName, + const void* buffer, + size_t size, + bool do_copy + ) +{ + assert(oclElfSecDesc[id].id == id && + "The order of oclElfSecDesc[] and Elf::oclElfSections mismatches."); + + const char* sectionName = oclElfSecDesc[id].name; + + bool isFunction = ((id == OclElf::CAL) || (id == OclElf::DLL) || (id == OclElf::JITBINARY)) ? true : false; + + // Get section index + Elf_Scn* scn; + if (!getSectionDesc(scn, id)) { + _err.xfail("OclElf::addSymbol() failed in getSectionDesc"); + return false; + } + if (scn == NULL) { + // Create a new section. 
+ if ((scn = newSection(id, NULL, 0, false)) == NULL) { + _err.xfail("OclElf::addSymbol() failed in newSection"); + return false; + } + } + size_t sec_ndx = elf_ndxscn(scn); + if (sec_ndx == SHN_UNDEF) { + _err.xfail("OclElf::addSymbol() failed in elf_ndxscn() - %s.", + elf_errmsg(-1)); + return false; + } + + // Put symbolName into .strtab section + Elf64_Xword strtab_offset; + if (!addSectionData(strtab_offset, STRTAB, (void*)symbolName, + strlen(symbolName)+1, true)) { + _err.xfail("OclElf::addSymbol() failed in addSectionData(.strtab)"); + return false; + } + + // Put buffer into section + Elf64_Xword sec_offset = 0; + if ( (buffer != NULL) && (size != 0) ) { + if (!addSectionData(sec_offset, id, buffer, size, do_copy)) { + _err.xfail("OclElf::addSymbol() failed in addSectionData(%s)", sectionName); + return false; + } + } + + bool retvalue; + Elf64_Xword symtab_offset; + if (_eclass == ELFCLASS64) { + Elf64_Sym* sym64 = (Elf64_Sym*)oclelf_calloc(sizeof(Elf64_Sym)); + + sym64->st_name = (Elf64_Word)strtab_offset; + sym64->st_value = (Elf64_Addr)sec_offset; + sym64->st_size = (Elf64_Xword)size; + sym64->st_info = (isFunction)? STT_FUNC : STT_OBJECT; + sym64->st_shndx = (Elf64_Section)sec_ndx; + + retvalue = addSectionData(symtab_offset, SYMTAB, sym64, sizeof(Elf64_Sym), false); + } + else { // _eclass == ELFCLASS32 + Elf32_Sym* sym32 = (Elf32_Sym*)oclelf_calloc(sizeof(Elf32_Sym)); + + sym32->st_name = (Elf32_Word)strtab_offset; + sym32->st_value = (Elf32_Addr)sec_offset; + sym32->st_size = (Elf32_Word)size; + sym32->st_info = (isFunction)? 
STT_FUNC : STT_OBJECT; + sym32->st_shndx = (Elf32_Section)sec_ndx; + + retvalue = addSectionData(symtab_offset, SYMTAB, sym32, sizeof(Elf32_Sym), false); + } + + if (!retvalue) { + _err.xfail("OclElf::addSymbol() failed in addSectionData(.symtab)"); + return false; + } + + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("OclElf::addSymbol() : elf_update() failed"); + return false; + } + + return true; +} + +bool +OclElf::getSymbol( + oclElfSections id, + const char* symbolName, + char** buffer, + size_t* size + ) const +{ + assert(oclElfSecDesc[id].id == id && + "The order of oclElfSecDesc[] and Elf::oclElfSections mismatches."); + if (!size || !buffer || !symbolName) { + return false; + } + // Initialize the size and buffer to invalid data points. + (*size) = 0; + (*buffer) = NULL; + for (amd::Sym_Handle s = nextSymbol(NULL); s; s = nextSymbol(s)) { + amd::OclElf::SymbolInfo si; + // Continue if symbol information is not retrieved. + if (!getSymbolInfo(s, &si)) { + continue; + } + // Continue if the symbol is in the wrong section. + if (strcmp(oclElfSecDesc[id].name, si.sec_name)) { + continue; + } + // Continue if the symbol name doesn't match. + if (strcmp(symbolName, si.sym_name)) { + continue; + } + // Set the size and the address and return true. 
+ (*size) = si.size; + (*buffer) = si.address; + return true; + } + return false; +} + +bool +OclElf::addNote( + const char* noteName, + const char* noteDesc, + size_t nameSize, + size_t descSize + ) +{ + if ((nameSize == 0 && descSize == 0) + || (nameSize != 0 && noteName == NULL) + || (descSize != 0 && noteDesc == NULL)) { + _err.xfail("OclElf::addNote() empty note"); + return false; + } + + const oclElfSections sid = NOTES; + assert(oclElfSecDesc[sid].id == sid && + "The order of oclElfSecDesc[] and Elf::oclElfSections mismatches."); + + // Get section + Elf_Scn* scn; + if (!getSectionDesc(scn, sid)) { + _err.xfail("OclElf::addNote() failed in getSectionDesc"); + return false; + } + if (scn == NULL) { + // Create a new section. + if ((scn = newSection(sid, NULL, 0, false)) == NULL) { + _err.xfail("OclElf::addNote() failed in newSection"); + return false; + } + } + + // Put note into section + Elf64_Xword sec_offset = 0; + size_t bufsize = sizeof(Elf_Note) + nameSize + descSize; + char* buffer = (char*)oclelf_calloc(bufsize); + if (buffer == NULL) return false; + + Elf_Note* en = reinterpret_cast(buffer); + en->n_namesz = nameSize; + en->n_descsz = descSize; + en->n_type = 0; + memcpy(buffer+sizeof(Elf_Note), noteName, nameSize); + memcpy(buffer+sizeof(Elf_Note)+nameSize, noteDesc, descSize); + if (!addSectionData(sec_offset, sid, buffer, bufsize, false/*not copy*/)) { + const char* sectionName = oclElfSecDesc[sid].name; + _err.xfail("OclElf::addNote() failed in addSectionData(%s)", sectionName); + return false; + } + + if (elf_update(_e, ELF_C_NULL) < 0) { + _err.xfail("OclElf::addNote() : elf_update() failed"); + return false; + } + + return true; +} + +bool +OclElf::getNote( + const char* noteName, + char** noteDesc, + size_t *descSize + ) +{ + if (!descSize || !noteDesc || !noteName) { + return false; + } + + const oclElfSections sid = NOTES; + assert(oclElfSecDesc[sid].id == sid && + "The order of oclElfSecDesc[] and Elf::oclElfSections mismatches."); + + // 
Get section + Elf_Scn* scn; + if (!getSectionDesc(scn, sid)) { + _err.xfail("OclElf::getNote() failed in getSectionDesc"); + return false; + } + if (scn == NULL) { + _err.xfail("OclElf::getNote() failed: .note section not found"); + return false; + } + + // read the whole .note section + Elf_Data* data = elf_getdata(scn, 0); + + // Initialize the size and buffer to invalid data points. + *descSize = 0; + *noteDesc = NULL; + + // look for the specified note + char* ptr = (char*)data->d_buf; + while (ptr < (char*)data->d_buf + data->d_size) { + Elf_Note* note = reinterpret_cast(ptr); + + // Continue if the note name doesn't match. + if (strlen(noteName) != note->n_namesz + || strncmp(noteName, ptr+sizeof(Elf_Note), note->n_namesz) != 0) { + ptr += sizeof(Elf_Note) + note->n_namesz + note->n_descsz; + continue; + } + // Set the size and the address and return true. + *descSize = note->n_descsz; + *noteDesc = ptr + sizeof(Elf_Note) + note->n_namesz; + return true; + } + return false; +} + + bool +OclElf::dumpImage(char** buff, size_t* len) +{ + if (buff == NULL || len == NULL ) { + return false; + } + + assert ((_fd != -1) && "_fd in Elf::dumpImage should be defined"); + + // Now, write the ELF into the file + if (elf_update(_e, ELF_C_WRITE) < 0) { + _err.xfail("OclElf::dumpImage() : elf_update() failed - %s", + elf_errmsg(-1)); + return false; + } + + int buff_sz = xlseek(_err, _fname, _fd, 0, SEEK_END); + if (buff_sz == -1) { + return false; + } + + /* + The memory is owned by caller, and caller assumes that the memory is new'ed. 
+ So, use new instead of malloc + */ + *buff = new char[buff_sz]; + if (*buff == NULL) { + _err.xfail("OclElf::dumpImage() : new char[sz] failed"); + return false; + } + + if (xlseek(_err, _fname, _fd, 0, SEEK_SET) == -1) { + _err.xfail("OclElf::dumpImage() failed in xlseek()"); + delete [] *buff; + return false; + } + + if (xread(_err, _fname, _fd, *buff, buff_sz) != buff_sz) { + _err.xfail("OclElf::dumpImage() failed in xread()"); + delete [] *buff; + *buff = 0; + return false; + } + + *len = buff_sz; + return true; +} + + void* +OclElf::oclelf_allocAndCopy(void* p, size_t sz) +{ + if (p == 0 || sz == 0) return p; + + void* buf = xmalloc(_err, sz); + if (buf == 0) { + _err.xfail("OclElf::oclelf_allocAndCopy() failed"); + return 0; + } + + memcpy(buf, p, sz); + _elfMemory.insert( std::make_pair(buf, sz)); + return buf; +} + + void* +OclElf::oclelf_calloc(size_t sz) +{ + void* buf = xmalloc(_err, sz); + if (buf == 0) { + _err.xfail("OclElf::oclelf_calloc() failed"); + return 0; + } + _elfMemory.insert( std::make_pair(buf, sz)); + return buf; +} + + void +OclElf::elfMemoryRelease() +{ + for(EMemory::iterator it = _elfMemory.begin(); it != _elfMemory.end(); it++) { + free(it->first); + } + _elfMemory.clear(); +} + +} // namespace amd diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.hpp b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.hpp new file mode 100644 index 0000000000..7ba8a0879f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf.hpp @@ -0,0 +1,416 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +// +// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. 
+// +#ifndef ELF_HPP_ +#define ELF_HPP_ + +#include + +#include "top.hpp" +#include "elf_utils.hpp" +#if !defined(WITH_LIGHTNING_COMPILER) +#include "caltarget.h" // using CALtargetEnum +#endif // !defined(WITH_LIGHTNING_COMPILER) + +#include "libelf.h" +#include "gelf.h" + +// Not sure where to put these in the libelf +#define AMD_BIF2 2 // AMD BIF Version 2.0 +#define AMD_BIF3 3 // AMD BIF Version 3.0 + +// These two definitions need to stay in sync with +// the definitions elfdefinitions.h until they get +// properly upstreamed to gcc/libelf. +#ifndef EM_HSAIL +#define EM_HSAIL 0xAF5A +#endif +#ifndef EM_HSAIL_64 +#define EM_HSAIL_64 0xAF5B +#endif +#ifndef EM_AMDIL +#define EM_AMDIL 0x4154 +#endif +#ifndef EM_ATI_CALIMAGE_BINARY +#define EM_ATI_CALIMAGE_BINARY 125 +#endif +#ifndef EM_AMDGPU +#define EM_AMDGPU 224 +#endif +#ifndef ELFOSABI_AMD_OPENCL +#define ELFOSABI_AMD_OPENCL 201 +#endif +#ifndef ELFOSABI_HSAIL +#define ELFOSABI_HSAIL 202 +#endif +#ifndef ELFOSABI_AMDIL +#define ELFOSABI_AMDIL 203 +#endif +#ifndef ELFOSABI_CALIMAGE +#define ELFOSABI_CALIMAGE 100 +#endif +namespace amd { + +// Test: is it ELF file (with a given bitness) ? +bool isElfHeader(const char* p, signed char ec); +bool isElfMagic(const char* p); + +// Test: is it ELF for CAL ? +bool isCALTarget(const char* p, signed char ec); + +// Symbol handle +typedef struct symbol_handle *Sym_Handle; + +class OclElf +{ +public: + enum { + CAL_BASE = 1001, // A number that is not dependent on libelf.h + CPU_BASE = 2001, + CPU_FEATURES_FIRST = 0, // Never generated, but keep it for simplicity. + CPU_FEATURES_LAST = 0xF // This should be consistent with cpudevice.hpp + } oclElfBase; + + typedef enum { + // NOTE!!! Never remove an entry or change the order. + +#if !defined(WITH_LIGHTNING_COMPILER) + // All CAL targets are within [CAL_FIRST, CAL_LAST]. 
+ CAL_FIRST = CAL_TARGET_600 + CAL_BASE, + CAL_LAST = CAL_TARGET_LAST + CAL_BASE, +#endif // !defined(WITH_LIGHTNING_COMPILER) + + // All CPU targets are within [CPU_FIRST, CPU_LAST] + CPU_FIRST = CPU_FEATURES_FIRST + CPU_BASE, + CPU_LAST = CPU_FEATURES_LAST + CPU_BASE, + + OCL_TARGETS_LAST, + } oclElfTargets; + + typedef enum { + CAL_PLATFORM = 0, + CPU_PLATFORM = 1, + COMPLIB_PLATFORM = 2, + LC_PLATFORM = 3, + LAST_PLATFORM = 4 + } oclElfPlatform; + + typedef enum { + LLVMIR = 0, + SOURCE, + ILTEXT, + ASTEXT, + CAL, + DLL, + STRTAB, + SYMTAB, + RODATA, + SHSTRTAB, + NOTES, + COMMENT, + ILDEBUG, + DEBUG_INFO, + DEBUG_ABBREV, + DEBUG_LINE, + DEBUG_PUBNAMES, + DEBUG_PUBTYPES, + DEBUG_LOC, + DEBUG_ARANGES, + DEBUG_RANGES, + DEBUG_MACINFO, + DEBUG_STR, + DEBUG_FRAME, + JITBINARY, + CODEGEN, + TEXT, + INTERNAL, + SPIR, + SPIRV, + RUNTIME_METADATA, + OCL_ELF_SECTIONS_LAST + } oclElfSections; + + typedef struct { + char* sec_name; //! section name + char* sec_addr; //! section address + uint64_t sec_size; //! section size + char* sym_name; //! symbol name + char* address; //! address of corresponding to symbol data + uint64_t size; //! size of data corresponding to symbol + } SymbolInfo; + +private: + + // file descriptor + int _fd; + + // file name + const char* _fname; + + // pointer to libelf structure + ::Elf* _e; + + // Error Object + mutable OclElfErr _err; + + // Bitness of the Elf object. + unsigned char _eclass; + + // Raw ELF bytes in memory from which Elf object is initialized + // The memory is owned by the client, not this OclElf object ! 
+ const char* _rawElfBytes; + uint64_t _rawElfSize; + + // Read, write, or read and write for this Elf object + const Elf_Cmd _elfCmd; + + // Memory management + typedef std::map EMemory; + EMemory _elfMemory; + + // Indexes of .shstrtab and .strtab (for convenience) + Elf64_Word _shstrtab_ndx; + Elf64_Word _strtab_ndx; + +public: + + /* + OclElf object can be created for reading or writing (it could be created for + both reading and writing, which is not supported yet at this time). Currently, + it has two forms: + + 1) OclElf(eclass, rawElfBytes, rawElfSize, 0, ELF_C_READ) + + To load ELF from raw bytes in memory and generate OclElf object. And this + object is for reading only. + + 2) OclElf(eclass, NULL, 0, elfFileName|NULL, ELF_C_WRITE) + + To create an ELF for writing and save it into a file 'elfFileName' (if it + is NULL, the OclElf will create a temporary file and set it to 'elfFileName'. + + Since we need to read the ELF into memory, this file 'elfFileName' is created + with both read and write, so that the runtime can use dumpImage() to get ELF + raw bytes by reading this file. + + 'eclass' is ELF's bitness and it must be the same as the eclass of ELF to + be loaded (for example, rawElfBytes). + + Return values of all public APIs with bool return type + true : on success; + false : on error. + */ + OclElf ( + unsigned char eclass, // eclass for this ELF + const char* rawElfBytes, // raw ELF bytes to be loaded + uint64_t rawElfSize, // size of the ELF raw bytes + const char* elfFileName, // File to save this ELF. + Elf_Cmd elfcmd // ELF_C_READ/ELF_C_WRITE + ); + + ~OclElf (); + + /* + dumpImage() will finalize the ELF and write it into the file. It then reads + it into the memory; and returns it via . + + The memory pointed by buff is owned by OclElf object. + */ + bool dumpImage(char** buff, size_t* len); + + /* + addSection() is used to create a single ELF section with data . 
If + do_copy is true, the OclElf object will make a copy of d_buf and uses that copy to + create an ELF section. + + When setting do_copy = false, the caller should make sure that <d_buf> will + be unchanged and available during the lifetime of this OclElf object; ie before + calling dumpImage(). + */ + bool addSection ( + oclElfSections id, + const void* d_buf, + size_t d_size, + bool do_copy = true + ); + + /* + getSection() will return the whole section in <dst, sz>. + + The memory pointed by <dst> is owned by the OclElf object. + */ + bool getSection(oclElfSections id, char** dst, size_t* sz) const; + + /* + addSymbol() adds a symbol with name 'symbolName' and data <buffer, size> + into the ELF. 'id' indicates which section <buffer, size> will go + into. The meaning of 'do_copy' is the same as addSection(). + */ + bool addSymbol( + oclElfSections id, // Section in which symbol is added + const char* symbolName, // Name of symbol + const void* buffer, // Symbol's data + size_t size, // Symbol's size + bool do_copy = true // If true, add a copy of buffer into the section + ); + + /* + * getSymbol() will return the data associated with + * the symbol from the Elf. + * + * The memory pointed by <buffer> is owned by the OclElf object + */ + bool getSymbol( + oclElfSections id, // Section in which symbol is in + const char* symbolName, // Name of the symbol to retrieve + char** buffer, // Symbol's data + size_t* size // Symbol's size + ) const; + + /* + nextSymbol() and getSymbolInfo() use the symbol handle to access symbols + + For example: + for( Sym_Handle s = nextSymbol(NULL); s ; s = nextSymbol(s)) { + SymbolInfo si; + if (!getSymbolInfo(s, &si)) { + Error; + } + use si + } + + where nextSymbol(NULL) will return the first symbol. + + Note that memory space pointed to by si is owned by OclElf. + */ + bool getSymbolInfo(Sym_Handle sym, SymbolInfo* symInfo) const; + Sym_Handle nextSymbol(Sym_Handle symhandle) const; + + /* + Adds a note with name 'noteName' and description "noteDesc" + into the .note section of ELF.
Length of note name is 'nameSize'.
+      Length of note description is 'descSize'.
+    */
+    bool addNote(const char* noteName, const char* noteDesc,
+                 size_t nameSize, size_t descSize);
+
+    /*
+      Returns the description of a note whose name is 'noteName'
+      in 'noteDesc'.
+      Returns the length of the description in 'descSize'.
+    */
+    bool getNote(const char* noteName, char** noteDesc, size_t *descSize);
+
+    /*
+      Get/set machine and platform (target) for which elf is built.
+    */
+    bool getTarget(uint16_t& machine, oclElfPlatform& platform);
+    bool setTarget(uint16_t machine, oclElfPlatform platform);
+
+    /*
+      Get/set elf type field from header
+    */
+    bool getType(uint16_t &type);
+    bool setType(uint16_t type);
+
+    /*
+      Get/set elf flag field from header.
+    */
+    bool getFlags(uint32_t &flag);
+    bool setFlags(uint32_t flag);
+
+    /*
+      Clear() will return the status of OclElf to just after ctor() is invoked.
+      However, it will not regenerate a temporary file name like ctor() does.
+
+      It is useful when the ELF content needs to be discarded for some reason.
+    */
+    bool Clear();
+
+    /*
+      Read-only accessors. They are const so they can be used through a const
+      OclElf; '_err' is declared mutable and getOclElfError() is itself const.
+
+      hasError()    : true when the stored error message is non-empty.
+      getErrMsg()   : the stored error message (empty string when no error).
+      getELFClass() : the ELF class (bitness) this object was created with.
+    */
+    bool hasError() const { return (_err.getOclElfError())[0] != 0; }
+    const char* getErrMsg() const { return _err.getOclElfError(); }
+    unsigned char getELFClass() const { return _eclass; }
+
+    /*
+      Returns true when the e_machine field of the ELF header is EM_AMDGPU.
+
+      elf32_getehdr()/elf64_getehdr() return NULL when the header cannot be
+      retrieved, so a missing header yields false instead of a NULL dereference.
+    */
+    bool isHsaCo() const {
+        if (_eclass == ELFCLASS32) {
+            const Elf32_Ehdr* ehdr32 = elf32_getehdr(_e);
+            return (ehdr32 != NULL) && (ehdr32->e_machine == EM_AMDGPU);
+        }
+        const Elf64_Ehdr* ehdr64 = elf64_getehdr(_e);
+        return (ehdr64 != NULL) && (ehdr64->e_machine == EM_AMDGPU);
+    }
+
+private:
+
+    /* Initialization */
+    bool Init();
+
+    /*
+      Initialize ELF object by creating ELF header and key sections such as
+      .shstrtab, .strtab, and .symtab.
+    */
+    bool InitElf ();
+
+    // Wrapper for creating a section header and Elf_Data
+    bool createShdr (
+        oclElfSections id,
+        Elf_Scn*&      scn,
+        Elf64_Word     shname,
+        Elf64_Word     shlink = 0
+        );
+
+    Elf_Data* createElfData(
+        Elf_Scn*&      scn,
+        oclElfSections id,
+        void*          d_buf,
+        uint64_t       d_size,
+        bool           do_copy
+        );
+
+    /*
+      Create a new section (id) with data <d_buf, d_size>.
If do_copy is true, + make a copy of d_buf and create a new section with that copy. + + Return the valid Elf_Scn on success; return NULL on error. + + Note that newSection() uses Section Header's size, so make sure elf_update() + is invoked properly before invoking newSection(). + */ + Elf_Scn* newSection ( + oclElfSections id, + const void* d_buf, + size_t d_size, + bool do_copy = true // if true, add a copy of d_buf + ); + + /* + Add a new data into a section by creating a new data descriptor. + And the new data's offset is returned in 'outOffset'. + */ + bool addSectionData( + Elf64_Xword& outOffset, + oclElfSections id, + const void* buffer, + size_t size, + bool do_copy=true // if true, add a copy of buffer + ); + + // Return Elf_Data for this section 'id' + bool getSectionData(Elf_Data*& data, oclElfSections id) const; + + // Return Elf_Scn for this section 'id' + bool getSectionDesc(Elf_Scn*& scn, oclElfSections id) const; + + // + bool getShstrtabNdx(Elf64_Word& outNdx, const char*); + + void* oclelf_allocAndCopy(void* p, size_t sz); + void* oclelf_calloc(size_t sz); + + void elfMemoryRelease(); +}; + +} // namespace amd + +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.cpp b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.cpp new file mode 100644 index 0000000000..a693159e38 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.cpp @@ -0,0 +1,285 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +// +// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved. 
+//
+
+#include "elf_utils.hpp"
+#include "memfile.h"
+
+#include
+#include
+#include
+#include
+
+/*
+   The contract of each function defined here is described in elf_utils.hpp.
+ */
+
+namespace amd {
+
+// File primitives used by the wrappers below; ELF_READ narrows the requested
+// length to unsigned int before forwarding it to mem_read().
+#define ELF_OPEN mem_open
+#define ELF_READ(f, b, l) mem_read((f), (b), (unsigned int)(l))
+#define ELF_WRITE mem_write
+#define ELF_CLOSE mem_close
+#define ELF_LSEEK mem_lseek
+
+/*
+   Format the message into _lastErrMsg (truncated to the buffer size). Unless
+   NDEBUG is defined, the message is also printed and the process terminates
+   with exit(1).
+ */
+void OclElfErr::xfail(const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(&_lastErrMsg[0], (size_t)MAX_ERROR_MESSAGE_LENGTH, fmt, args);
+    va_end(args);
+
+#ifndef NDEBUG
+    printf("%s\n", _lastErrMsg);
+    exit(1);
+#endif
+}
+
+namespace oclelfutils {
+
+/*
+   Allocate 'len' zero-initialized bytes. On failure the error is recorded
+   through xfail() and NULL is returned.
+ */
+void* xmalloc(OclElfErr& err, const size_t len)
+{
+    void *block = calloc(1, len);
+    if (block != NULL) {
+        return block;
+    }
+    err.xfail("xmalloc failed: out of memory");
+    return NULL;
+}
+
+/*
+   Open 'fname' and return its descriptor; on failure record the error with
+   xfail() and return -1.
+ */
+int xopen(OclElfErr& err, const char *fname, const int in_flags, const int perms)
+{
+    const int fd = ELF_OPEN(fname, in_flags, perms);
+    if (fd != -1) {
+        return fd;
+    }
+    err.xfail("Failed to open '%s': %s", fname, strerror(errno));
+    return -1;
+}
+
+/*
+   Close 'fd', retrying while the call fails with EINTR.
+   Return 0 on success; return -1 on error.
+ */
+int xclose(OclElfErr& err, const char *fname, const int fd)
+{
+    int status;
+    do {
+        status = :: ELF_CLOSE(fd);
+    } while ((status == -1) && (errno == EINTR));
+
+    if (status != -1) {
+        return status;
+    }
+    err.xfail("Failed to close '%s': %s", fname, strerror(errno));
+    return -1;
+}
+
+/*
+   Return the file offset location on success; return -1 on error.
+ */
+off_t xlseek(
+    OclElfErr&   err,
+    const char*  fname,
+    const int    fd,
+    const off_t  offset,
+    const int    whence)
+{
+    // lseek/_lseek is enough for now; really big files would need
+    // _lseeki64/lseek64.
+    const off_t position = ELF_LSEEK(fd, offset, whence);
+    if (position != -1) {
+        return position;
+    }
+    err.xfail("Failed to seek in '%s': %s", fname, strerror(errno));
+    return -1;
+}
+
+/*
+   Read up to 'buf_len' bytes into 'buf', retrying while the call fails with
+   EINTR. Return the number of bytes read on success; return -1 on error.
+ */
+ssize_t xread(
+    OclElfErr&   err,
+    const char*  fname,
+    const int    fd,
+    void*        buf,
+    const size_t buf_len
+    )
+{
+    ssize_t nread;
+    do {
+        nread = ELF_READ(fd, buf, buf_len);
+    } while ((nread == -1) && (errno == EINTR));
+
+    if (nread >= 0) {
+        return nread;
+    }
+    err.xfail("Failed to read '%s': %s", fname, strerror(errno));
+    return -1;
+}
+
+#if 0
+
+/*
+   Write 'len' bytes from 'buf', retrying while the call fails with EINTR.
+   A short write is treated as an error.
+   Return the number of bytes written on success; return -1 on error.
+ */
+ssize_t xwrite(OclElfErr& err,
+               const char*  fname,
+               const int    fd,
+               const void*  buf,
+               const size_t len)
+{
+    ssize_t nwritten;
+    do {
+        nwritten = ELF_WRITE(fd, buf, len);
+    } while ((nwritten == -1) && (errno == EINTR));
+
+    if ((nwritten != -1) && (nwritten == (ssize_t)len)) {
+        return nwritten;
+    }
+    err.xfail("Failed to write '%s': %s", fname, strerror(errno));
+    return -1;
+}
+
+/*
+   Duplicate 'str' into memory obtained from xmalloc().
+   Returns the address of the copy, or NULL (after xfail()) on error.
+ */
+char* xstrdup(OclElfErr& err, const char *str)
+{
+    const size_t bytes = strlen(str) + 1;   // include the terminating NUL
+    char* copy = (char*)xmalloc(err, bytes);
+    if (copy == NULL) {
+        err.xfail("xstrdup failed: cannot allocate new char string");
+        return NULL;
+    }
+    strcpy(copy, str);
+    return copy;
+}
+
+/*
+   get the length of an open file in bytes. return -1 on error.
+ */
+uint64_t xget_file_size(OclElfErr& err, const char *fname, const int fd)
+{
+    struct stat statbuf;
+    if (fstat(fd, &statbuf) == -1) {
+        err.xfail("Failed to fstat '%s': %s", fname, strerror(errno));
+        // The return type is unsigned, so the documented -1 reaches the caller
+        // as the all-ones value; make that conversion explicit.
+        return (uint64_t)-1;
+    }
+    return (uint64_t) statbuf.st_size;
+}
+
+/*
+   Copy file 'infd' to file 'outfd', starting from the beginning of 'infd'.
+   'in' and 'out' are the corresponding file names, used in error messages.
+   The scratch buffer is allocated on first use and cached in err._copyBuffer.
+   Return the total number of bytes copied on success; return -1 on error.
+ */
+int64_t xcopyfile(
+    OclElfErr&  err,
+    const char* in,
+    const int   infd,
+    const char* out,
+    const int   outfd
+    )
+{
+    uint64_t retval = 0;
+    ssize_t  rc = 0;
+    off_t    res = xlseek(err, in, infd, 0, SEEK_SET);
+    if (res == -1) {
+        err.xfail("xcopyfile failed in xlseek : in %s, out %s", in, out);
+        return -1;
+    }
+
+    uint8_t* copybuf = err._copyBuffer;
+    if (copybuf == NULL) {
+        copybuf = (uint8_t*)xmalloc(err, IO_BUF_SIZE);
+        if (copybuf == NULL) {
+            // xmalloc() has already recorded the failure through xfail();
+            // do not hand a NULL buffer to xread().
+            return -1;
+        }
+        err._copyBuffer = copybuf;
+    }
+
+    while ( (rc = xread(err, in, infd, copybuf, IO_BUF_SIZE)) > 0 ) {
+        retval += (uint64_t) rc;
+        // Keep the full ssize_t result rather than narrowing it to int.
+        const ssize_t written = xwrite(err, out, outfd, copybuf, rc);
+        if (written == -1) {
+            err.xfail("xcopyfile failed in xwrite: in %s, out %s", in, out);
+            return -1;
+        }
+    }
+    if (rc == -1) {
+        err.xfail("xcopyfile failed in xread: in %s, out %s", in, out);
+        return -1;
+    }
+    return retval;
+}
+
+/*
+   Copy file from 'infd' to current offset in 'outfd', for 'size' bytes.
+   Return 'size' on success; return -1 on error.
+ */
+int64_t
+xcopyfile_range(
+    OclElfErr& err, const char *in, const int infd,
+    const char *out, const int outfd,
+    const uint64_t offset, const uint64_t size
+    )
+{
+    // The scratch buffer is allocated on first use and cached in 'err'.
+    uint8_t* copybuf = err._copyBuffer;
+    if (copybuf == NULL) {
+        copybuf = (uint8_t*)xmalloc(err, IO_BUF_SIZE);
+        if (copybuf == NULL) {
+            // xmalloc() has already recorded the failure through xfail().
+            return -1;
+        }
+        err._copyBuffer = copybuf;
+    }
+
+    ssize_t rc = xlseek(err, in, infd, (off_t) offset, SEEK_SET);
+    if (rc == -1) {
+        err.xfail("xcopyfile_range: xlseek() failed: %s", in);
+        return -1;
+    }
+
+    const uint64_t chunk = (uint64_t)(IO_BUF_SIZE);
+    uint64_t remaining = size;
+    while (remaining > 0) {
+        // Never request more than what is left of the range: asking for a
+        // full buffer on the tail would read past offset + size and then fail
+        // the length check whenever the input continues beyond the range.
+        const size_t want = (size_t)((remaining < chunk) ? remaining : chunk);
+
+        rc = xread(err, in, infd, copybuf, want);
+        if ((rc == -1) || (rc != (ssize_t)want)) {
+            err.xfail("xcopyfile_range: xread() failed %s", in);
+            return -1;
+        }
+        // A failed write must abort the copy; otherwise the function would
+        // carry on and report 'size' bytes as successfully copied.
+        if (xwrite(err, out, outfd, copybuf, want) == -1) {
+            err.xfail("xcopyfile_range: xwrite() failed: %s", out);
+            return -1;
+        }
+
+        remaining -= (uint64_t)want;
+    }
+    return size;
+}
+
+/*
+   Round 'offset' up to the next multiple of 4096 (1 << 12).
+ */
+uint64_t
+align_to_page(const uint64_t offset)
+{
+    // TODO_jugu don't use hardcoded pagesize.
+    return (offset + ((1LL << 12) -1)) & ((uint64_t)(-(1LL << 12)));
+}
+
+#endif
+
+} // namespace oclelfutils
+
+} // namespace amd
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.hpp b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.hpp
new file mode 100644
index 0000000000..2d3eeb09c8
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/elf_utils.hpp
@@ -0,0 +1,150 @@
+/*
+************************************************************************************************************************
+*
+* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved.
+*
+***********************************************************************************************************************/
+
+//
+// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
+//
+#ifndef _ELF_UTILS_HPP
+#define _ELF_UTILS_HPP
+
+#include
+#include
+#include
+#include
+
+#include "top.hpp"
+
+namespace amd {
+
+// Capacity, in bytes, of the error message buffer kept by OclElfErr.
+#define MAX_ERROR_MESSAGE_LENGTH 1024
+// Size, in bytes, of the file-copy scratch buffer. The replacement list is
+// parenthesized so the macro expands as a single value inside any expression
+// (for example 'x / IO_BUF_SIZE' or 'x % IO_BUF_SIZE').
+#define IO_BUF_SIZE (16 * 1024)
+
+//
+// Holds the last error message and a lazily allocated scratch buffer used by
+// the file-copy helpers.
+//
+// NOTE(review): the class frees _copyBuffer in its destructor but does not
+// define copy construction/assignment, so copying an instance that owns a
+// buffer would free that buffer twice.
+//
+class OclElfErr
+{
+public:
+    // Temporary buffer for copying from file to file
+    uint8_t* _copyBuffer;  // Initialized first time it is used
+
+private:
+    // Keep the last error message.
+    char _lastErrMsg[MAX_ERROR_MESSAGE_LENGTH];
+
+public:
+
+    OclElfErr() : _copyBuffer(NULL) { _lastErrMsg[0] = 0; }
+    ~OclElfErr() {
+        free(_copyBuffer);   // free(NULL) is a no-op
+    }
+
+    // Clear the stored error message.
+    void Init() { _lastErrMsg[0] = 0; }
+
+    // Clear the stored error message and release the scratch buffer.
+    void Fini() {
+        _lastErrMsg[0] = 0;
+        free(_copyBuffer);   // free(NULL) is a no-op
+        _copyBuffer = NULL;
+    }
+
+    // Return the last error message.
+    const char* getOclElfError() const { return _lastErrMsg; }
+
+    //
+    // Save the error string in _lastErrMsg. If it is built without NDEBUG, the program
+    // will terminate immediately with exit(1).
+    //
+    void xfail(const char *fmt, ...);
+
+};
+
+namespace oclelfutils {
+
+/*
+   Wrap malloc() with xfail(), so this returns newly-allocated memory or 0.
+   The memory is guaranteed to be initialized to zero.
+ */
+void* xmalloc(OclElfErr& err, const size_t len);
+
+/*
+   Return file descriptor on success; return -1 on error and invoke xfail()
+   to record the error.
+ */
+int xopen(OclElfErr& err, const char *fname, const int flags, const int perms);
+
+/*
+   Return 0 on success; return -1 on error.
+ */
+int xclose(OclElfErr& err, const char *fname, const int fd);
+
+/*
+   Return the file offset location on success; return -1 on error.
+ */ +off_t xlseek(OclElfErr& err, const char *fname, const int fd, + const off_t o, const int whence); + +/* + Return the number of bytes that are read on success; return -1 on error. + */ +ssize_t xread( + OclElfErr& err, + const char* fname, // File name for file descriptor 'fd' + const int fd, // File descriptor + void* buf, // buffer for reading + const size_t buf_len // capacity of buffer in bytes + ); + +#if 0 + +/* + Return the number of bytes that have been written on success; return -1 on error. + */ +ssize_t xwrite( + OclElfErr& err, + const char* fname, // File name for file descriptor 'fd' + const int fd, // File descriptor + const void* buf, // data buffer to be written out + const size_t buf_len // the size of data in bytes + ); + +/* + Allocate a copy of (str), invoke xfail() on failure. + Returns 0 on error, or address of the allocated copy + */ +char* xstrdup(OclElfErr& err, const char *str); + +/* + get the length of an open file in bytes. return -1 if error. + */ +uint64_t xget_file_size(OclElfErr& err, const char *fname, const int fd); + +/* + Copy file 'infd' to file 'outfd'. + Return the total number of bytes copied on success; return -1 on error. + */ +int64_t xcopyfile(OclElfErr& err, const char *in, const int infd, + const char *out, const int outfd); + +/* + Copy file from 'infd' to current offset in 'outfd', for 'size' bytes. + Return 'size' on success; return -1 on error. + */ +int64_t xcopyfile_range(OclElfErr& err, const char *in, const int infd, + const char *out, const int outfd, + const uint64_t offset, const uint64_t size); + +// Align a value to the page size. 
+uint64_t align_to_page(const uint64_t offset); + +#endif + +} // namespace elfutils + +} // namespace amd + +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/CMakeLists.txt b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/CMakeLists.txt new file mode 100644 index 0000000000..c6a567f49c --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/CMakeLists.txt @@ -0,0 +1,12 @@ +## +######################################################################################################################## +# +# Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +# +######################################################################################################################## + +cmake_minimum_required(VERSION 3.10) + +if(NOT TARGET oclelf) + add_subdirectory(libelf) +endif() diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/_elftc.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/_elftc.h new file mode 100644 index 0000000000..b121eaff44 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/_elftc.h @@ -0,0 +1,539 @@ +/*- + * Copyright (c) 2009 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: _elftc.h 2064 2011-10-26 15:12:32Z jkoshy $ + */ + +/** + ** Miscellaneous definitions needed by multiple components. + **/ + +#ifndef _ELFTC_H +#define _ELFTC_H + +#ifndef NULL +#define NULL ((void *) 0) +#endif + +#ifndef offsetof +// change from (int) to (char *) for x64 gcc +#define offsetof(T, M) ((char *) &((T*) 0) -> M) +#endif + +/* + * Supply macros missing from <sys/queue.h> + */ + +#ifndef STAILQ_FOREACH_SAFE +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +#ifndef STAILQ_LAST +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? \ + NULL : \ + ((struct type *)(void *) \ + ((char *)((head)->stqh_last) - offsetof(struct type, field)))) +#endif + +#ifndef TAILQ_FOREACH_SAFE +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) +#endif + +/* + * Symbols that are sometimes missing in system headers.
+ */ + +#ifndef DT_DEPRECATED_SPARC_REGISTER +#define DT_DEPRECATED_SPARC_REGISTER 0x70000001 +#endif + +#ifndef DT_GNU_PRELINKED +#define DT_GNU_PRELINKED 0x6FFFFDF5U +#endif + +#ifndef DT_GNU_CONFLICTSZ +#define DT_GNU_CONFLICTSZ 0x6FFFFDF6U +#endif + +#ifndef DT_GNU_LIBLISTSZ +#define DT_GNU_LIBLISTSZ 0x6FFFFDF7U +#endif + +#ifndef DT_GNU_HASH +#define DT_GNU_HASH 0x6FFFFEF5U +#endif + +#ifndef DT_GNU_CONFLICT +#define DT_GNU_CONFLICT 0x6FFFFEF8U +#endif + +#ifndef DT_GNU_LIBLIST +#define DT_GNU_LIBLIST 0x6FFFFEF9U +#endif + +#ifndef DT_MAXPOSTAGS +#define DT_MAXPOSTAGS 34 +#endif + +#ifndef DT_SUNW_AUXILIARY +#define DT_SUNW_AUXILIARY 0x6000000D +#endif + +#ifndef DT_SUNW_CAP +#define DT_SUNW_CAP 0x60000010 +#endif + +#ifndef DT_SUNW_FILTER +#define DT_SUNW_FILTER 0x6000000F +#endif + +#ifndef DT_SUNW_RTLDINF +#define DT_SUNW_RTLDINF 0x6000000E +#endif + +#ifndef DT_USED +#define DT_USED 0x7FFFFFFE +#endif + +#ifndef ELFOSABI_86OPEN +#define ELFOSABI_86OPEN 5 +#endif + +#ifndef ELFOSABI_AIX +#define ELFOSABI_AIX 7 +#endif + +#ifndef ELFOSABI_HURD +#define ELFOSABI_HURD 4 +#endif + +#ifndef ELFOSABI_NONE +#define ELFOSABI_NONE 0 +#endif + +#ifndef ELFOSABI_NSK +#define ELFOSABI_NSK 14 +#endif + +#ifndef ELFOSABI_OPENVMS +#define ELFOSABI_OPENVMS 13 +#endif + +/* + * Supply missing EM_XXX definitions. 
+ */ +#ifndef EM_68HC05 +#define EM_68HC05 72 +#endif + +#ifndef EM_68HC08 +#define EM_68HC08 71 +#endif + +#ifndef EM_68HC11 +#define EM_68HC11 70 +#endif + +#ifndef EM_68HC16 +#define EM_68HC16 69 +#endif + +#ifndef EM_ARCA +#define EM_ARCA 109 +#endif + +#ifndef EM_ARC_A5 +#define EM_ARC_A5 93 +#endif + +#ifndef EM_AVR +#define EM_AVR 83 +#endif + +#ifndef EM_BLACKFIN +#define EM_BLACKFIN 106 +#endif + +#ifndef EM_CR +#define EM_CR 103 +#endif + +#ifndef EM_CRIS +#define EM_CRIS 76 +#endif + +#ifndef EM_D10V +#define EM_D10V 85 +#endif + +#ifndef EM_D30V +#define EM_D30V 86 +#endif + +#ifndef EM_F2MC16 +#define EM_F2MC16 104 +#endif + +#ifndef EM_FIREPATH +#define EM_FIREPATH 78 +#endif + +#ifndef EM_FR30 +#define EM_FR30 84 +#endif + +#ifndef EM_FX66 +#define EM_FX66 66 +#endif + +#ifndef EM_HUANY +#define EM_HUANY 81 +#endif + +#ifndef EM_IP2K +#define EM_IP2K 101 +#endif + +#ifndef EM_JAVELIN +#define EM_JAVELIN 77 +#endif + +#ifndef EM_M32R +#define EM_M32R 88 +#endif + +#ifndef EM_MAX +#define EM_MAX 102 +#endif + +#ifndef EM_MMIX +#define EM_MMIX 80 +#endif + +#ifndef EM_MN10200 +#define EM_MN10200 90 +#endif + +#ifndef EM_MN10300 +#define EM_MN10300 89 +#endif + +#ifndef EM_MSP430 +#define EM_MSP430 105 +#endif + +#ifndef EM_NS32K +#define EM_NS32K 97 +#endif + +#ifndef EM_OPENRISC +#define EM_OPENRISC 92 +#endif + +#ifndef EM_PDSP +#define EM_PDSP 63 +#endif + +#ifndef EM_PJ +#define EM_PJ 91 +#endif + +#ifndef EM_PRISM +#define EM_PRISM 82 +#endif + +#ifndef EM_SEP +#define EM_SEP 108 +#endif + +#ifndef EM_SE_C33 +#define EM_SE_C33 107 +#endif + +#ifndef EM_SNP1K +#define EM_SNP1K 99 +#endif + +#ifndef EM_ST19 +#define EM_ST19 74 +#endif + +#ifndef EM_ST200 +#define EM_ST200 100 +#endif + +#ifndef EM_ST7 +#define EM_ST7 68 +#endif + +#ifndef EM_ST9PLUS +#define EM_ST9PLUS 67 +#endif + +#ifndef EM_SVX +#define EM_SVX 73 +#endif + +#ifndef EM_TMM_GPP +#define EM_TMM_GPP 96 +#endif + +#ifndef EM_TPC +#define EM_TPC 98 +#endif + +#ifndef EM_UNICORE +#define 
EM_UNICORE 110 +#endif + +#ifndef EM_V850 +#define EM_V850 87 +#endif + +#ifndef EM_VAX +#define EM_VAX 75 +#endif + +#ifndef EM_VIDEOCORE +#define EM_VIDEOCORE 95 +#endif + +#ifndef EM_XTENSA +#define EM_XTENSA 94 +#endif + +#ifndef EM_ZSP +#define EM_ZSP 79 +#endif + +#ifndef PN_XNUM +#define PN_XNUM 0xFFFFU +#endif + +#ifndef R_IA_64_DIR32LSB +#define R_IA_64_DIR32LSB 0x25 +#endif + +#ifndef R_IA_64_DIR64LSB +#define R_IA_64_DIR64LSB 0x27 +#endif + +#ifndef R_IA_64_SECREL32LSB +#define R_IA_64_SECREL32LSB 0x65 +#endif + +#ifndef R_MIPS_32 +#define R_MIPS_32 0x2 +#endif + +#ifndef R_PPC_ADDR32 +#define R_PPC_ADDR32 0x1 +#endif + +#ifndef R_SPARC_UA32 +#define R_SPARC_UA32 23 +#endif + +#ifndef R_SPARC_UA64 +#define R_SPARC_UA64 54 +#endif + +#ifndef R_X86_64_32 +#define R_X86_64_32 10 +#endif + +#ifndef R_X86_64_64 +#define R_X86_64_64 1 +#endif + +#ifndef SHT_AMD64_UNWIND +#define SHT_AMD64_UNWIND 0x70000001 +#endif + +#ifndef SHT_SUNW_ANNOTATE +#define SHT_SUNW_ANNOTATE 0X6FFFFFF7 +#endif + +#ifndef SHT_SUNW_DEBUGSTR +#define SHT_SUNW_DEBUGSTR 0X6FFFFFF8 +#endif + +#ifndef SHT_SUNW_DEBUG +#define SHT_SUNW_DEBUG 0X6FFFFFF9 +#endif + +#ifndef SHT_SUNW_cap +#define SHT_SUNW_cap 0x6FFFFFF5 +#endif + +#ifndef SHT_SUNW_dof +#define SHT_SUNW_dof 0x6FFFFFF4 +#endif + +#ifndef SHT_SUNW_verdef +#define SHT_SUNW_verdef 0x6FFFFFFD +#endif + +#ifndef SHT_SUNW_verneed +#define SHT_SUNW_verneed 0x6FFFFFFE +#endif + +#ifndef SHT_SUNW_versym +#define SHT_SUNW_versym 0x6FFFFFFF +#endif + +#ifndef SHN_XINDEX +#define SHN_XINDEX 0xFFFFU +#endif + +#ifndef SHT_GNU_ATTRIBUTES +#define SHT_GNU_ATTRIBUTES 0x6FFFFFF5U +#endif + +#ifndef SHT_GNU_HASH +#define SHT_GNU_HASH 0x6FFFFFF6U +#endif + +#ifndef SHT_GNU_LIBLIST +#define SHT_GNU_LIBLIST 0x6FFFFFF7U +#endif + +/* + * VCS Ids. 
+ */ + +#ifndef ELFTC_VCSID + +#if defined(__FreeBSD__) +#define ELFTC_VCSID(ID) __FBSDID(ID) +#endif + +#if defined(__linux__) || defined(__GNU__) || defined(__GLIBC__) +#if defined(__GNUC__) +#define ELFTC_VCSID(ID) __asm__(".ident\t\"" ID "\"") +#else +#define ELFTC_VCSID(ID) /**/ +#endif +#endif + +#if defined(__NetBSD__) +#define ELFTC_VCSID(ID) __RCSID(ID) +#endif + +#endif /* ELFTC_VCSID */ + +/* + * Provide an equivalent for getprogname(3). + */ + +#ifndef ELFTC_GETPROGNAME + +#if defined(__FreeBSD__) || defined(__NetBSD__) + +#include + +#define ELFTC_GETPROGNAME() getprogname() + +#endif /* defined(__FreeBSD__) || defined(__NetBSD__) */ + +#if defined(__linux__) + +#ifdef __cplusplus +extern "C" { +#endif +/* + * GLIBC based systems have a global 'char *' pointer referencing + * the executable's name. + */ +extern /*const*/ char *program_invocation_short_name; +#ifdef __cplusplus +} +#endif + +#define ELFTC_GETPROGNAME() program_invocation_short_name + +#endif /* __linux__ */ + +#endif /* ELFTC_GETPROGNAME */ + +/** + ** Per-OS configuration. + **/ + +#if defined(__linux__) + +#include + +#define ELFTC_BYTE_ORDER __BYTE_ORDER +#define ELFTC_BYTE_ORDER_LITTLE_ENDIAN __LITTLE_ENDIAN +#define ELFTC_BYTE_ORDER_BIG_ENDIAN __BIG_ENDIAN + +/* + * Debian GNU/Linux is missing strmode(3). + */ +#define ELFTC_HAVE_STRMODE 0 + +/* Whether we need to supply {be,le}32dec. 
*/ +#define ELFTC_NEED_BYTEORDER_EXTENSIONS 1 + +#define roundup2 roundup + +#endif /* __linux__ */ + +#if defined(__FreeBSD__) + +#include +#include + +#define ELFTC_BYTE_ORDER _BYTE_ORDER +#define ELFTC_BYTE_ORDER_LITTLE_ENDIAN _LITTLE_ENDIAN +#define ELFTC_BYTE_ORDER_BIG_ENDIAN _BIG_ENDIAN + +#define ELFTC_HAVE_STRMODE 1 +#if __FreeBSD_version <= 900000 +#define ELFTC_BROKEN_YY_NO_INPUT 1 +#endif +#endif /* __FreeBSD__ */ + +#if defined(__NetBSD__) + +#include + +#define ELFTC_BYTE_ORDER _BYTE_ORDER +#define ELFTC_BYTE_ORDER_LITTLE_ENDIAN _LITTLE_ENDIAN +#define ELFTC_BYTE_ORDER_BIG_ENDIAN _BIG_ENDIAN + +#define ELFTC_HAVE_STRMODE 1 +#define ELFTC_BROKEN_YY_NO_INPUT 1 +#endif /* __NetBSD __ */ + +#endif /* _ELFTC_H */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/elfdefinitions.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/elfdefinitions.h new file mode 100644 index 0000000000..10051dbcc3 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/elfdefinitions.h @@ -0,0 +1,2618 @@ +/*- + * Copyright (c) 2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: elfdefinitions.h 2064 2011-10-26 15:12:32Z jkoshy $ + */ + +/* + * These definitions are based on: + * - The public specification of the ELF format as defined in the + * October 2009 draft of System V ABI. + * See: http://www.sco.com/developers/gabi/latest/ch4.intro.html + * - The May 1998 (version 1.5) draft of "The ELF-64 object format". + * - Processor-specific ELF ABI definitions for sparc, i386, amd64, mips, + * ia64, and powerpc processors. + * - The "Linkers and Libraries Guide", from Sun Microsystems. + */ + +#ifndef _ELFDEFINITIONS_H_ +#define _ELFDEFINITIONS_H_ + +#ifdef WIN32 +#include "compat.h" +#else +#include +#endif + +/* + * Types of capabilities. + */ + +#define _ELF_DEFINE_CAPABILITIES() \ +_ELF_DEFINE_CA(CA_SUNW_NULL, 0, "ignored") \ +_ELF_DEFINE_CA(CA_SUNW_HW_1, 1, "hardware capability") \ +_ELF_DEFINE_CA(CA_SUNW_SW_1, 2, "software capability") + +#undef _ELF_DEFINE_CA +#define _ELF_DEFINE_CA(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_CAPABILITIES() + CA__LAST__ +}; + +/* + * Flags used with dynamic linking entries. 
+ */ + +#define _ELF_DEFINE_DYN_FLAGS() \ +_ELF_DEFINE_DF(DF_ORIGIN, 0x1, \ + "object being loaded may refer to $ORIGIN") \ +_ELF_DEFINE_DF(DF_SYMBOLIC, 0x2, \ + "search library for references before executable") \ +_ELF_DEFINE_DF(DF_TEXTREL, 0x4, \ + "relocation entries may modify text segment") \ +_ELF_DEFINE_DF(DF_BIND_NOW, 0x8, \ + "process relocation entries at load time") \ +_ELF_DEFINE_DF(DF_STATIC_TLS, 0x10, \ + "uses static thread-local storage") +#undef _ELF_DEFINE_DF +#define _ELF_DEFINE_DF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_DYN_FLAGS() + DF__LAST__ +}; + + +/* + * Dynamic linking entry types. + */ + +#define _ELF_DEFINE_DYN_TYPES() \ +_ELF_DEFINE_DT(DT_NULL, 0, "end of array") \ +_ELF_DEFINE_DT(DT_NEEDED, 1, "names a needed library") \ +_ELF_DEFINE_DT(DT_PLTRELSZ, 2, \ + "size in bytes of associated relocation entries") \ +_ELF_DEFINE_DT(DT_PLTGOT, 3, \ + "address associated with the procedure linkage table") \ +_ELF_DEFINE_DT(DT_HASH, 4, \ + "address of the symbol hash table") \ +_ELF_DEFINE_DT(DT_STRTAB, 5, \ + "address of the string table") \ +_ELF_DEFINE_DT(DT_SYMTAB, 6, \ + "address of the symbol table") \ +_ELF_DEFINE_DT(DT_RELA, 7, \ + "address of the relocation table") \ +_ELF_DEFINE_DT(DT_RELASZ, 8, "size of the DT_RELA table") \ +_ELF_DEFINE_DT(DT_RELAENT, 9, "size of each DT_RELA entry") \ +_ELF_DEFINE_DT(DT_STRSZ, 10, "size of the string table") \ +_ELF_DEFINE_DT(DT_SYMENT, 11, \ + "size of a symbol table entry") \ +_ELF_DEFINE_DT(DT_INIT, 12, \ + "address of the initialization function") \ +_ELF_DEFINE_DT(DT_FINI, 13, \ + "address of the finalization function") \ +_ELF_DEFINE_DT(DT_SONAME, 14, "names the shared object") \ +_ELF_DEFINE_DT(DT_RPATH, 15, \ + "runtime library search path") \ +_ELF_DEFINE_DT(DT_SYMBOLIC, 16, \ + "alter symbol resolution algorithm") \ +_ELF_DEFINE_DT(DT_REL, 17, \ + "address of the DT_REL table") \ +_ELF_DEFINE_DT(DT_RELSZ, 18, "size of the DT_REL table") \ +_ELF_DEFINE_DT(DT_RELENT, 19, "size of each 
DT_REL entry") \ +_ELF_DEFINE_DT(DT_PLTREL, 20, \ + "type of relocation entry in the procedure linkage table") \ +_ELF_DEFINE_DT(DT_DEBUG, 21, "used for debugging") \ +_ELF_DEFINE_DT(DT_TEXTREL, 22, \ + "text segment may be written to during relocation") \ +_ELF_DEFINE_DT(DT_JMPREL, 23, \ + "address of relocation entries associated with the procedure linkage table") \ +_ELF_DEFINE_DT(DT_BIND_NOW, 24, \ + "bind symbols at loading time") \ +_ELF_DEFINE_DT(DT_INIT_ARRAY, 25, \ + "pointers to initialization functions") \ +_ELF_DEFINE_DT(DT_FINI_ARRAY, 26, \ + "pointers to termination functions") \ +_ELF_DEFINE_DT(DT_INIT_ARRAYSZ, 27, "size of the DT_INIT_ARRAY") \ +_ELF_DEFINE_DT(DT_FINI_ARRAYSZ, 28, "size of the DT_FINI_ARRAY") \ +_ELF_DEFINE_DT(DT_RUNPATH, 29, \ + "index of library search path string") \ +_ELF_DEFINE_DT(DT_FLAGS, 30, \ + "flags specific to the object being loaded") \ +_ELF_DEFINE_DT(DT_ENCODING, 32, "standard semantics") \ +_ELF_DEFINE_DT(DT_PREINIT_ARRAY, 32, \ + "pointers to pre-initialization functions") \ +_ELF_DEFINE_DT(DT_PREINIT_ARRAYSZ, 33, \ + "size of pre-initialization array") \ +_ELF_DEFINE_DT(DT_MAXPOSTAGS, 34, \ + "the number of positive tags") \ +_ELF_DEFINE_DT(DT_LOOS, 0x6000000DUL, \ + "start of OS-specific types") \ +_ELF_DEFINE_DT(DT_SUNW_AUXILIARY, 0x6000000DUL, \ + "offset of string naming auxiliary filtees") \ +_ELF_DEFINE_DT(DT_SUNW_RTLDINF, 0x6000000EUL, "rtld internal use") \ +_ELF_DEFINE_DT(DT_SUNW_FILTER, 0x6000000FUL, \ + "offset of string naming standard filtees") \ +_ELF_DEFINE_DT(DT_SUNW_CAP, 0x60000010UL, \ + "address of hardware capabilities section") \ +_ELF_DEFINE_DT(DT_HIOS, 0x6FFFF000UL, \ + "end of OS-specific types") \ +_ELF_DEFINE_DT(DT_VALRNGLO, 0x6FFFFD00UL, \ + "start of range using the d_val field") \ +_ELF_DEFINE_DT(DT_GNU_PRELINKED, 0x6FFFFDF5UL, \ + "prelinking timestamp") \ +_ELF_DEFINE_DT(DT_GNU_CONFLICTSZ, 0x6FFFFDF6UL, \ + "size of conflict section") \ +_ELF_DEFINE_DT(DT_GNU_LIBLISTSZ, 0x6FFFFDF7UL, 
\ + "size of library list") \ +_ELF_DEFINE_DT(DT_CHECKSUM, 0x6FFFFDF8UL, \ + "checksum for the object") \ +_ELF_DEFINE_DT(DT_PLTPADSZ, 0x6FFFFDF9UL, \ + "size of PLT padding") \ +_ELF_DEFINE_DT(DT_MOVEENT, 0x6FFFFDFAUL, \ + "size of DT_MOVETAB entries") \ +_ELF_DEFINE_DT(DT_MOVESZ, 0x6FFFFDFBUL, \ + "total size of the MOVETAB table") \ +_ELF_DEFINE_DT(DT_FEATURE_1, 0x6FFFFDFCUL, "feature values") \ +_ELF_DEFINE_DT(DT_POSFLAG_1, 0x6FFFFDFDUL, \ + "dynamic position flags") \ +_ELF_DEFINE_DT(DT_SYMINSZ, 0x6FFFFDFEUL, \ + "size of the DT_SYMINFO table") \ +_ELF_DEFINE_DT(DT_SYMINENT, 0x6FFFFDFFUL, \ + "size of a DT_SYMINFO entry") \ +_ELF_DEFINE_DT(DT_VALRNGHI, 0x6FFFFDFFUL, \ + "end of range using the d_val field") \ +_ELF_DEFINE_DT(DT_ADDRRNGLO, 0x6FFFFE00UL, \ + "start of range using the d_ptr field") \ +_ELF_DEFINE_DT(DT_GNU_HASH, 0x6FFFFEF5UL, \ + "GNU style hash tables") \ +_ELF_DEFINE_DT(DT_GNU_CONFLICT, 0x6FFFFEF8UL, \ + "address of conflict section") \ +_ELF_DEFINE_DT(DT_GNU_LIBLIST, 0x6FFFFEF9UL, \ + "address of conflict section") \ +_ELF_DEFINE_DT(DT_CONFIG, 0x6FFFFEFAUL, \ + "configuration file") \ +_ELF_DEFINE_DT(DT_DEPAUDIT, 0x6FFFFEFBUL, \ + "string defining audit libraries") \ +_ELF_DEFINE_DT(DT_AUDIT, 0x6FFFFEFCUL, \ + "string defining audit libraries") \ +_ELF_DEFINE_DT(DT_PLTPAD, 0x6FFFFEFDUL, "PLT padding") \ +_ELF_DEFINE_DT(DT_MOVETAB, 0x6FFFFEFEUL, \ + "address of a move table") \ +_ELF_DEFINE_DT(DT_SYMINFO, 0x6FFFFEFFUL, \ + "address of the symbol information table") \ +_ELF_DEFINE_DT(DT_ADDRRNGHI, 0x6FFFFEFFUL, \ + "end of range using the d_ptr field") \ +_ELF_DEFINE_DT(DT_VERSYM, 0x6FFFFFF0UL, \ + "address of the version section") \ +_ELF_DEFINE_DT(DT_RELACOUNT, 0x6FFFFFF9UL, \ + "count of RELA relocations") \ +_ELF_DEFINE_DT(DT_RELCOUNT, 0x6FFFFFFAUL, \ + "count of REL relocations") \ +_ELF_DEFINE_DT(DT_FLAGS_1, 0x6FFFFFFBUL, "flag values") \ +_ELF_DEFINE_DT(DT_VERDEF, 0x6FFFFFFCUL, \ + "address of the version definition segment") \ 
+_ELF_DEFINE_DT(DT_VERDEFNUM, 0x6FFFFFFDUL, \ + "the number of version definition entries") \ +_ELF_DEFINE_DT(DT_VERNEED, 0x6FFFFFFEUL, \ + "address of section with needed versions") \ +_ELF_DEFINE_DT(DT_VERNEEDNUM, 0x6FFFFFFFUL, \ + "the number of version needed entries") \ +_ELF_DEFINE_DT(DT_LOPROC, 0x70000000UL, \ + "start of processor-specific types") \ +_ELF_DEFINE_DT(DT_ARM_SYMTABSZ, 0x70000001UL, \ + "number of entries in the dynamic symbol table") \ +_ELF_DEFINE_DT(DT_SPARC_REGISTER, 0x70000001UL, \ + "index of an STT_SPARC_REGISTER symbol") \ +_ELF_DEFINE_DT(DT_ARM_PREEMPTMAP, 0x70000002UL, \ + "address of the preemption map") \ +_ELF_DEFINE_DT(DT_MIPS_RLD_VERSION, 0x70000001UL, \ + "version ID for runtime linker interface") \ +_ELF_DEFINE_DT(DT_MIPS_TIME_STAMP, 0x70000002UL, \ + "timestamp") \ +_ELF_DEFINE_DT(DT_MIPS_ICHECKSUM, 0x70000003UL, \ + "checksum of all external strings and common sizes") \ +_ELF_DEFINE_DT(DT_MIPS_IVERSION, 0x70000004UL, \ + "string table index of a version string") \ +_ELF_DEFINE_DT(DT_MIPS_FLAGS, 0x70000005UL, \ + "MIPS-specific flags") \ +_ELF_DEFINE_DT(DT_MIPS_BASE_ADDRESS, 0x70000006UL, \ + "base address for the executable/DSO") \ +_ELF_DEFINE_DT(DT_MIPS_CONFLICT, 0x70000008UL, \ + "address of .conflict section") \ +_ELF_DEFINE_DT(DT_MIPS_LIBLIST, 0x70000009UL, \ + "address of .liblist section") \ +_ELF_DEFINE_DT(DT_MIPS_LOCAL_GOTNO, 0x7000000AUL, \ + "number of local GOT entries") \ +_ELF_DEFINE_DT(DT_MIPS_CONFLICTNO, 0x7000000BUL, \ + "number of entries in the .conflict section") \ +_ELF_DEFINE_DT(DT_MIPS_LIBLISTNO, 0x70000010UL, \ + "number of entries in the .liblist section") \ +_ELF_DEFINE_DT(DT_MIPS_SYMTABNO, 0x70000011UL, \ + "number of entries in the .dynsym section") \ +_ELF_DEFINE_DT(DT_MIPS_UNREFEXTNO, 0x70000012UL, \ + "index of first external dynamic symbol not ref'ed locally") \ +_ELF_DEFINE_DT(DT_MIPS_GOTSYM, 0x70000013UL, \ + "index of first dynamic symbol corresponds to a GOT entry") \ 
+_ELF_DEFINE_DT(DT_MIPS_HIPAGENO, 0x70000014UL, \ + "number of page table entries in GOT") \ +_ELF_DEFINE_DT(DT_MIPS_RLD_MAP, 0x70000016UL, \ + "address of runtime linker map") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_CLASS, 0x70000017UL, \ + "Delta C++ class definition") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_CLASS_NO, 0x70000018UL, \ + "number of entries in DT_MIPS_DELTA_CLASS") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_INSTANCE, 0x70000019UL, \ + "Delta C++ class instances") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_INSTANCE_NO, 0x7000001AUL, \ + "number of entries in DT_MIPS_DELTA_INSTANCE") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_RELOC, 0x7000001BUL, \ + "Delta relocations") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_RELOC_NO, 0x7000001CUL, \ + "number of entries in DT_MIPS_DELTA_RELOC") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_SYM, 0x7000001DUL, \ + "Delta symbols refered by Delta relocations") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_SYM_NO, 0x7000001EUL, \ + "number of entries in DT_MIPS_DELTA_SYM") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_CLASSSYM, 0x70000020UL, \ + "Delta symbols for class declarations") \ +_ELF_DEFINE_DT(DT_MIPS_DELTA_CLASSSYM_NO, 0x70000021UL, \ + "number of entries in DT_MIPS_DELTA_CLASSSYM") \ +_ELF_DEFINE_DT(DT_MIPS_CXX_FLAGS, 0x70000022UL, \ + "C++ flavor flags") \ +_ELF_DEFINE_DT(DT_MIPS_PIXIE_INIT, 0x70000023UL, \ + "address of an initialization routine created by pixie") \ +_ELF_DEFINE_DT(DT_MIPS_SYMBOL_LIB, 0x70000024UL, \ + "address of .MIPS.symlib section") \ +_ELF_DEFINE_DT(DT_MIPS_LOCALPAGE_GOTIDX, 0x70000025UL, \ + "GOT index of first page table entry for a segment") \ +_ELF_DEFINE_DT(DT_MIPS_LOCAL_GOTIDX, 0x70000026UL, \ + "GOT index of first page table entry for a local symbol") \ +_ELF_DEFINE_DT(DT_MIPS_HIDDEN_GOTIDX, 0x70000027UL, \ + "GOT index of first page table entry for a hidden symbol") \ +_ELF_DEFINE_DT(DT_MIPS_PROTECTED_GOTIDX, 0x70000028UL, \ + "GOT index of first page table entry for a protected symbol") \ +_ELF_DEFINE_DT(DT_MIPS_OPTIONS, 0x70000029UL, \ + "address of .MIPS.options 
section") \ +_ELF_DEFINE_DT(DT_MIPS_INTERFACE, 0x7000002AUL, \ + "address of .MIPS.interface section") \ +_ELF_DEFINE_DT(DT_MIPS_DYNSTR_ALIGN, 0x7000002BUL, "???") \ +_ELF_DEFINE_DT(DT_MIPS_INTERFACE_SIZE, 0x7000002CUL, \ + "size of .MIPS.interface section") \ +_ELF_DEFINE_DT(DT_MIPS_RLD_TEXT_RESOLVE_ADDR, 0x7000002DUL, \ + "address of _rld_text_resolve in GOT") \ +_ELF_DEFINE_DT(DT_MIPS_PERF_SUFFIX, 0x7000002EUL, \ + "default suffix of DSO to be appended by dlopen") \ +_ELF_DEFINE_DT(DT_MIPS_COMPACT_SIZE, 0x7000002FUL, \ + "size of a ucode compact relocation record (o32)") \ +_ELF_DEFINE_DT(DT_MIPS_GP_VALUE, 0x70000030UL, \ + "GP value of a specified GP relative range") \ +_ELF_DEFINE_DT(DT_MIPS_AUX_DYNAMIC, 0x70000031UL, \ + "address of an auxiliary dynamic table") \ +_ELF_DEFINE_DT(DT_MIPS_PLTGOT, 0x70000032UL, \ + "address of the PLTGOT") \ +_ELF_DEFINE_DT(DT_MIPS_RLD_OBJ_UPDATE, 0x70000033UL, \ + "object list update callback") \ +_ELF_DEFINE_DT(DT_MIPS_RWPLT, 0x70000034UL, \ + "address of a writable PLT") \ +_ELF_DEFINE_DT(DT_PPC_GOT, 0x70000000UL, \ + "value of _GLOBAL_OFFSET_TABLE_") \ +_ELF_DEFINE_DT(DT_PPC_TLSOPT, 0x70000001UL, \ + "TLS descriptor should be optimized") \ +_ELF_DEFINE_DT(DT_PPC64_GLINK, 0x70000000UL, \ + "address of .glink section") \ +_ELF_DEFINE_DT(DT_PPC64_OPD, 0x70000001UL, \ + "address of .opd section") \ +_ELF_DEFINE_DT(DT_PPC64_OPDSZ, 0x70000002UL, \ + "size of .opd section") \ +_ELF_DEFINE_DT(DT_PPC64_TLSOPT, 0x70000003UL, \ + "TLS descriptor should be optimized") \ +_ELF_DEFINE_DT(DT_AUXILIARY, 0x7FFFFFFDUL, \ + "offset of string naming auxiliary filtees") \ +_ELF_DEFINE_DT(DT_USED, 0x7FFFFFFEUL, "ignored") \ +_ELF_DEFINE_DT(DT_FILTER, 0x7FFFFFFFUL, \ + "index of string naming filtees") \ +_ELF_DEFINE_DT(DT_HIPROC, 0x7FFFFFFFUL, \ + "end of processor-specific types") + +#undef _ELF_DEFINE_DT +#define _ELF_DEFINE_DT(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_DYN_TYPES() + DT__LAST__ = DT_HIPROC +}; +#ifndef 
DT_DEPRECATED_SPARC_REGISTER +#define DT_DEPRECATED_SPARC_REGISTER DT_SPARC_REGISTER +#endif + +/* + * Flags used in the executable header (field: e_flags). The table groups entries by EF_ARM_*, EF_MIPS_* and EF_SPARC* prefix; numeric values repeat between and within groups (for example 0x00000004UL is EF_ARM_SYMSARESORTED, EF_ARM_INTERWORK and EF_MIPS_CPIC), so a value alone does not identify a flag. + */ +#define _ELF_DEFINE_EHDR_FLAGS() \ +_ELF_DEFINE_EF(EF_ARM_RELEXEC, 0x00000001UL, \ + "dynamic segment describes only how to relocate segments") \ +_ELF_DEFINE_EF(EF_ARM_HASENTRY, 0x00000002UL, \ + "e_entry contains a program entry point") \ +_ELF_DEFINE_EF(EF_ARM_SYMSARESORTED, 0x00000004UL, \ + "subsection of symbol table is sorted by symbol value") \ +_ELF_DEFINE_EF(EF_ARM_DYNSYMSUSESEGIDX, 0x00000008UL, \ + "dynamic symbol st_shndx = containing segment index + 1") \ +_ELF_DEFINE_EF(EF_ARM_MAPSYMSFIRST, 0x00000010UL, \ + "mapping symbols precede other local symbols in symtab") \ +_ELF_DEFINE_EF(EF_ARM_BE8, 0x00800000UL, \ + "file contains BE-8 code") \ +_ELF_DEFINE_EF(EF_ARM_LE8, 0x00400000UL, \ + "file contains LE-8 code") \ +_ELF_DEFINE_EF(EF_ARM_EABIMASK, 0xFF000000UL, \ + "mask for ARM EABI version number (0 denotes GNU or unknown)") \ +_ELF_DEFINE_EF(EF_ARM_INTERWORK, 0x00000004UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_APCS_26, 0x00000008UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_APCS_FLOAT, 0x00000010UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_PIC, 0x00000020UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_ALIGN8, 0x00000040UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_NEW_ABI, 0x00000080UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_OLD_ABI, 0x00000100UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_SOFT_FLOAT, 0x00000200UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_VFP_FLOAT, 0x00000400UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_ARM_MAVERICK_FLOAT, 0x00000800UL, \ + "GNU EABI extension") \ +_ELF_DEFINE_EF(EF_MIPS_NOREORDER, 0x00000001UL, \ + "at least one .noreorder directive appeared in the source") \ +_ELF_DEFINE_EF(EF_MIPS_PIC, 0x00000002UL, \ + "file contains position independent code") \
+_ELF_DEFINE_EF(EF_MIPS_CPIC, 0x00000004UL, \ + "file's code uses standard conventions for calling PIC") \ +_ELF_DEFINE_EF(EF_MIPS_UCODE, 0x00000010UL, \ + "file contains UCODE (obsolete)") \ +_ELF_DEFINE_EF(EF_MIPS_ABI2, 0x00000020UL, \ + "file follows MIPS III 32-bit ABI") \ +_ELF_DEFINE_EF(EF_MIPS_OPTIONS_FIRST, 0x00000080UL, \ + "ld(1) should process .MIPS.options section first") \ +_ELF_DEFINE_EF(EF_MIPS_ARCH_ASE, 0x0F000000UL, \ + "file uses application-specific architectural extensions") \ +_ELF_DEFINE_EF(EF_MIPS_ARCH_ASE_MDMX, 0x08000000UL, \ + "file uses MDMX multimedia extensions") \ +_ELF_DEFINE_EF(EF_MIPS_ARCH_ASE_M16, 0x04000000UL, \ + "file uses MIPS-16 ISA extensions") \ +_ELF_DEFINE_EF(EF_MIPS_ARCH, 0xF0000000UL, \ + "4-bit MIPS architecture field") \ +_ELF_DEFINE_EF(EF_SPARC_EXT_MASK, 0x00ffff00UL, \ + "Vendor Extension mask") \ +_ELF_DEFINE_EF(EF_SPARC_32PLUS, 0x00000100UL, \ + "Generic V8+ features") \ +_ELF_DEFINE_EF(EF_SPARC_SUN_US1, 0x00000200UL, \ + "Sun UltraSPARCTM 1 Extensions") \ +_ELF_DEFINE_EF(EF_SPARC_HAL_R1, 0x00000400UL, "HAL R1 Extensions") \ +_ELF_DEFINE_EF(EF_SPARC_SUN_US3, 0x00000800UL, \ + "Sun UltraSPARC 3 Extensions") \ +_ELF_DEFINE_EF(EF_SPARCV9_MM, 0x00000003UL, \ + "Mask for Memory Model") \ +_ELF_DEFINE_EF(EF_SPARCV9_TSO, 0x00000000UL, \ + "Total Store Ordering") \ +_ELF_DEFINE_EF(EF_SPARCV9_PSO, 0x00000001UL, \ + "Partial Store Ordering") \ +_ELF_DEFINE_EF(EF_SPARCV9_RMO, 0x00000002UL, \ + "Relaxed Memory Ordering") +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. EF__LAST__ has no explicit value and simply follows EF_SPARCV9_RMO. */ +#undef _ELF_DEFINE_EF +#define _ELF_DEFINE_EF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_EHDR_FLAGS() + EF__LAST__ +}; + +/* + * Offsets in the `ei_ident[]` field of an ELF executable header.
+ */ +#define _ELF_DEFINE_EI_OFFSETS() \ +_ELF_DEFINE_EI(EI_MAG0, 0, "magic number") \ +_ELF_DEFINE_EI(EI_MAG1, 1, "magic number") \ +_ELF_DEFINE_EI(EI_MAG2, 2, "magic number") \ +_ELF_DEFINE_EI(EI_MAG3, 3, "magic number") \ +_ELF_DEFINE_EI(EI_CLASS, 4, "file class") \ +_ELF_DEFINE_EI(EI_DATA, 5, "data encoding") \ +_ELF_DEFINE_EI(EI_VERSION, 6, "file version") \ +_ELF_DEFINE_EI(EI_OSABI, 7, "OS ABI kind") \ +_ELF_DEFINE_EI(EI_ABIVERSION, 8, "OS ABI version") \ +_ELF_DEFINE_EI(EI_PAD, 9, "padding start") \ +_ELF_DEFINE_EI(EI_NIDENT, 16, "total size") +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. Offsets 10 to 15 have no named entry, and because EI_NIDENT is 16 the trailing EI__LAST__ sentinel evaluates to 17. */ +#undef _ELF_DEFINE_EI +#define _ELF_DEFINE_EI(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_EI_OFFSETS() + EI__LAST__ +}; + +/* + * The ELF class of an object: ELFCLASS32 (1) for 32 bit objects, ELFCLASS64 (2) for 64 bit objects, ELFCLASSNONE (0) when unknown. + */ +#define _ELF_DEFINE_ELFCLASS() \ +_ELF_DEFINE_EC(ELFCLASSNONE, 0, "Unknown ELF class") \ +_ELF_DEFINE_EC(ELFCLASS32, 1, "32 bit objects") \ +_ELF_DEFINE_EC(ELFCLASS64, 2, "64 bit objects") +/* Same expansion as above; EC__LAST__ follows ELFCLASS64 and evaluates to 3. */ +#undef _ELF_DEFINE_EC +#define _ELF_DEFINE_EC(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ELFCLASS() + EC__LAST__ +}; + +/* + * Endianness of data in an ELF object: ELFDATA2LSB (1) is little endian, ELFDATA2MSB (2) is big endian, ELFDATANONE (0) is unknown. + */ + +#define _ELF_DEFINE_ELF_DATA_ENDIANNESS() \ +_ELF_DEFINE_ED(ELFDATANONE, 0, "Unknown data endianness") \ +_ELF_DEFINE_ED(ELFDATA2LSB, 1, "little endian") \ +_ELF_DEFINE_ED(ELFDATA2MSB, 2, "big endian") +/* Same expansion as above; ED__LAST__ follows ELFDATA2MSB and evaluates to 3. */ +#undef _ELF_DEFINE_ED +#define _ELF_DEFINE_ED(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ELF_DATA_ENDIANNESS() + ED__LAST__ +}; + +/* + * Values of the magic numbers used in identification array: 0x7F followed by the characters E, L and F. Unlike the other tables, rows here carry no description argument. + */ +#define _ELF_DEFINE_ELF_MAGIC() \ +_ELF_DEFINE_EMAG(ELFMAG0, 0x7FU) \ +_ELF_DEFINE_EMAG(ELFMAG1, 'E') \ +_ELF_DEFINE_EMAG(ELFMAG2, 'L') \ +_ELF_DEFINE_EMAG(ELFMAG3, 'F') +/* Two-argument expansion N = V; ELFMAG__LAST__ follows ELFMAG3 and so is one more than the character code of F. */ +#undef _ELF_DEFINE_EMAG +#define _ELF_DEFINE_EMAG(N, V) N = V , +enum { + _ELF_DEFINE_ELF_MAGIC() + ELFMAG__LAST__ +}; + +/* + * ELF OS ABI field.
+ */ +#define _ELF_DEFINE_ELF_OSABI() \ +_ELF_DEFINE_EABI(ELFOSABI_NONE, 0, \ + "No extensions or unspecified") \ +_ELF_DEFINE_EABI(ELFOSABI_SYSV, 0, "SYSV") \ +_ELF_DEFINE_EABI(ELFOSABI_HPUX, 1, "Hewlett-Packard HP-UX") \ +_ELF_DEFINE_EABI(ELFOSABI_NETBSD, 2, "NetBSD") \ +_ELF_DEFINE_EABI(ELFOSABI_GNU, 3, "GNU") \ +_ELF_DEFINE_EABI(ELFOSABI_HURD, 4, "GNU/HURD") \ +_ELF_DEFINE_EABI(ELFOSABI_86OPEN, 5, "86Open Common ABI") \ +_ELF_DEFINE_EABI(ELFOSABI_SOLARIS, 6, "Sun Solaris") \ +_ELF_DEFINE_EABI(ELFOSABI_AIX, 7, "AIX") \ +_ELF_DEFINE_EABI(ELFOSABI_IRIX, 8, "IRIX") \ +_ELF_DEFINE_EABI(ELFOSABI_FREEBSD, 9, "FreeBSD") \ +_ELF_DEFINE_EABI(ELFOSABI_TRU64, 10, "Compaq TRU64 UNIX") \ +_ELF_DEFINE_EABI(ELFOSABI_MODESTO, 11, "Novell Modesto") \ +_ELF_DEFINE_EABI(ELFOSABI_OPENBSD, 12, "Open BSD") \ +_ELF_DEFINE_EABI(ELFOSABI_OPENVMS, 13, "Open VMS") \ +_ELF_DEFINE_EABI(ELFOSABI_NSK, 14, \ + "Hewlett-Packard Non-Stop Kernel") \ +_ELF_DEFINE_EABI(ELFOSABI_AROS, 15, "Amiga Research OS") \ +_ELF_DEFINE_EABI(ELFOSABI_FENIXOS, 16, \ + "The FenixOS highly scalable multi-core OS") \ +_ELF_DEFINE_EABI(ELFOSABI_ARM_AEABI, 64, \ + "ARM specific symbol versioning extensions") \ +_ELF_DEFINE_EABI(ELFOSABI_ARM, 97, "ARM ABI") \ +_ELF_DEFINE_EABI(ELFOSABI_AMD_OPENCL, 201, "AMD OpenCL Runtime ABI") \ +_ELF_DEFINE_EABI(ELFOSABI_HSAIL, 202, "AMD HSAIL Runtime ABI") \ +_ELF_DEFINE_EABI(ELFOSABI_AMDIL, 203, "AMD CAL Runtime ABI") \ +_ELF_DEFINE_EABI(ELFOSABI_STANDALONE, 255, \ + "Standalone (embedded) application") +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. ELFOSABI_NONE and ELFOSABI_SYSV are both 0, and ELFOSABI__LAST__ follows ELFOSABI_STANDALONE (255) so it evaluates to 256. */ +#undef _ELF_DEFINE_EABI +#define _ELF_DEFINE_EABI(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ELF_OSABI() + ELFOSABI__LAST__ +}; +#ifndef ELFOSABI_LINUX +#define ELFOSABI_LINUX ELFOSABI_GNU /* alias for ELFOSABI_GNU (3), provided only when ELFOSABI_LINUX is not already a macro */ +#endif + +/* + * ELF Machine types: (EM_*).
+ */ +#define _ELF_DEFINE_ELF_MACHINES() \ +_ELF_DEFINE_EM(EM_NONE, 0, "No machine") \ +_ELF_DEFINE_EM(EM_M32, 1, "AT&T WE 32100") \ +_ELF_DEFINE_EM(EM_SPARC, 2, "SPARC") \ +_ELF_DEFINE_EM(EM_386, 3, "Intel 80386") \ +_ELF_DEFINE_EM(EM_68K, 4, "Motorola 68000") \ +_ELF_DEFINE_EM(EM_88K, 5, "Motorola 88000") \ +_ELF_DEFINE_EM(EM_860, 7, "Intel 80860") \ +_ELF_DEFINE_EM(EM_MIPS, 8, "MIPS I Architecture") \ +_ELF_DEFINE_EM(EM_S370, 9, "IBM System/370 Processor") \ +_ELF_DEFINE_EM(EM_MIPS_RS3_LE, 10, "MIPS RS3000 Little-endian") \ +_ELF_DEFINE_EM(EM_PARISC, 15, "Hewlett-Packard PA-RISC") \ +_ELF_DEFINE_EM(EM_VPP500, 17, "Fujitsu VPP500") \ +_ELF_DEFINE_EM(EM_SPARC32PLUS, 18, \ + "Enhanced instruction set SPARC") \ +_ELF_DEFINE_EM(EM_960, 19, "Intel 80960") \ +_ELF_DEFINE_EM(EM_PPC, 20, "PowerPC") \ +_ELF_DEFINE_EM(EM_PPC64, 21, "64-bit PowerPC") \ +_ELF_DEFINE_EM(EM_S390, 22, "IBM System/390 Processor") \ +_ELF_DEFINE_EM(EM_SPU, 23, "IBM SPU/SPC") \ +_ELF_DEFINE_EM(EM_V800, 36, "NEC V800") \ +_ELF_DEFINE_EM(EM_FR20, 37, "Fujitsu FR20") \ +_ELF_DEFINE_EM(EM_RH32, 38, "TRW RH-32") \ +_ELF_DEFINE_EM(EM_RCE, 39, "Motorola RCE") \ +_ELF_DEFINE_EM(EM_ARM, 40, "Advanced RISC Machines ARM") \ +_ELF_DEFINE_EM(EM_ALPHA, 41, "Digital Alpha") \ +_ELF_DEFINE_EM(EM_SH, 42, "Hitachi SH") \ +_ELF_DEFINE_EM(EM_SPARCV9, 43, "SPARC Version 9") \ +_ELF_DEFINE_EM(EM_TRICORE, 44, \ + "Siemens TriCore embedded processor") \ +_ELF_DEFINE_EM(EM_ARC, 45, \ + "Argonaut RISC Core, Argonaut Technologies Inc.") \ +_ELF_DEFINE_EM(EM_H8_300, 46, "Hitachi H8/300") \ +_ELF_DEFINE_EM(EM_H8_300H, 47, "Hitachi H8/300H") \ +_ELF_DEFINE_EM(EM_H8S, 48, "Hitachi H8S") \ +_ELF_DEFINE_EM(EM_H8_500, 49, "Hitachi H8/500") \ +_ELF_DEFINE_EM(EM_IA_64, 50, \ + "Intel IA-64 processor architecture") \ +_ELF_DEFINE_EM(EM_MIPS_X, 51, "Stanford MIPS-X") \ +_ELF_DEFINE_EM(EM_COLDFIRE, 52, "Motorola ColdFire") \ +_ELF_DEFINE_EM(EM_68HC12, 53, "Motorola M68HC12") \ +_ELF_DEFINE_EM(EM_MMA, 54, \ + "Fujitsu MMA Multimedia
Accelerator") \ +_ELF_DEFINE_EM(EM_PCP, 55, "Siemens PCP") \ +_ELF_DEFINE_EM(EM_NCPU, 56, \ + "Sony nCPU embedded RISC processor") \ +_ELF_DEFINE_EM(EM_NDR1, 57, "Denso NDR1 microprocessor") \ +_ELF_DEFINE_EM(EM_STARCORE, 58, "Motorola Star*Core processor") \ +_ELF_DEFINE_EM(EM_ME16, 59, "Toyota ME16 processor") \ +_ELF_DEFINE_EM(EM_ST100, 60, \ + "STMicroelectronics ST100 processor") \ +_ELF_DEFINE_EM(EM_TINYJ, 61, \ + "Advanced Logic Corp. TinyJ embedded processor family") \ +_ELF_DEFINE_EM(EM_X86_64, 62, "AMD x86-64 architecture") \ +_ELF_DEFINE_EM(EM_PDSP, 63, "Sony DSP Processor") \ +_ELF_DEFINE_EM(EM_PDP10, 64, \ + "Digital Equipment Corp. PDP-10") \ +_ELF_DEFINE_EM(EM_PDP11, 65, \ + "Digital Equipment Corp. PDP-11") \ +_ELF_DEFINE_EM(EM_FX66, 66, "Siemens FX66 microcontroller") \ +_ELF_DEFINE_EM(EM_ST9PLUS, 67, \ + "STMicroelectronics ST9+ 8/16 bit microcontroller") \ +_ELF_DEFINE_EM(EM_ST7, 68, \ + "STMicroelectronics ST7 8-bit microcontroller") \ +_ELF_DEFINE_EM(EM_68HC16, 69, \ + "Motorola MC68HC16 Microcontroller") \ +_ELF_DEFINE_EM(EM_68HC11, 70, \ + "Motorola MC68HC11 Microcontroller") \ +_ELF_DEFINE_EM(EM_68HC08, 71, \ + "Motorola MC68HC08 Microcontroller") \ +_ELF_DEFINE_EM(EM_68HC05, 72, \ + "Motorola MC68HC05 Microcontroller") \ +_ELF_DEFINE_EM(EM_SVX, 73, "Silicon Graphics SVx") \ +_ELF_DEFINE_EM(EM_ST19, 74, \ + "STMicroelectronics ST19 8-bit microcontroller") \ +_ELF_DEFINE_EM(EM_VAX, 75, "Digital VAX") \ +_ELF_DEFINE_EM(EM_CRIS, 76, \ + "Axis Communications 32-bit embedded processor") \ +_ELF_DEFINE_EM(EM_JAVELIN, 77, \ + "Infineon Technologies 32-bit embedded processor") \ +_ELF_DEFINE_EM(EM_FIREPATH, 78, \ + "Element 14 64-bit DSP Processor") \ +_ELF_DEFINE_EM(EM_ZSP, 79, \ + "LSI Logic 16-bit DSP Processor") \ +_ELF_DEFINE_EM(EM_MMIX, 80, \ + "Donald Knuth's educational 64-bit processor") \ +_ELF_DEFINE_EM(EM_HUANY, 81, \ + "Harvard University machine-independent object files") \ +_ELF_DEFINE_EM(EM_PRISM, 82, "SiTera Prism") \
+_ELF_DEFINE_EM(EM_AVR, 83, \ + "Atmel AVR 8-bit microcontroller") \ +_ELF_DEFINE_EM(EM_FR30, 84, "Fujitsu FR30") \ +_ELF_DEFINE_EM(EM_D10V, 85, "Mitsubishi D10V") \ +_ELF_DEFINE_EM(EM_D30V, 86, "Mitsubishi D30V") \ +_ELF_DEFINE_EM(EM_V850, 87, "NEC v850") \ +_ELF_DEFINE_EM(EM_M32R, 88, "Mitsubishi M32R") \ +_ELF_DEFINE_EM(EM_MN10300, 89, "Matsushita MN10300") \ +_ELF_DEFINE_EM(EM_MN10200, 90, "Matsushita MN10200") \ +_ELF_DEFINE_EM(EM_PJ, 91, "picoJava") \ +_ELF_DEFINE_EM(EM_OPENRISC, 92, \ + "OpenRISC 32-bit embedded processor") \ +_ELF_DEFINE_EM(EM_ARC_COMPACT, 93, \ + "ARC International ARCompact processor") \ +_ELF_DEFINE_EM(EM_XTENSA, 94, \ + "Tensilica Xtensa Architecture") \ +_ELF_DEFINE_EM(EM_VIDEOCORE, 95, \ + "Alphamosaic VideoCore processor") \ +_ELF_DEFINE_EM(EM_TMM_GPP, 96, \ + "Thompson Multimedia General Purpose Processor") \ +_ELF_DEFINE_EM(EM_NS32K, 97, \ + "National Semiconductor 32000 series") \ +_ELF_DEFINE_EM(EM_TPC, 98, "Tenor Network TPC processor") \ +_ELF_DEFINE_EM(EM_SNP1K, 99, "Trebia SNP 1000 processor") \ +_ELF_DEFINE_EM(EM_ST200, 100, \ + "STMicroelectronics (www.st.com) ST200 microcontroller") \ +_ELF_DEFINE_EM(EM_IP2K, 101, \ + "Ubicom IP2xxx microcontroller family") \ +_ELF_DEFINE_EM(EM_MAX, 102, "MAX Processor") \ +_ELF_DEFINE_EM(EM_CR, 103, \ + "National Semiconductor CompactRISC microprocessor") \ +_ELF_DEFINE_EM(EM_F2MC16, 104, "Fujitsu F2MC16") \ +_ELF_DEFINE_EM(EM_MSP430, 105, \ + "Texas Instruments embedded microcontroller msp430") \ +_ELF_DEFINE_EM(EM_BLACKFIN, 106, \ + "Analog Devices Blackfin (DSP) processor") \ +_ELF_DEFINE_EM(EM_SE_C33, 107, \ + "S1C33 Family of Seiko Epson processors") \ +_ELF_DEFINE_EM(EM_SEP, 108, \ + "Sharp embedded microprocessor") \ +_ELF_DEFINE_EM(EM_ARCA, 109, "Arca RISC Microprocessor") \ +_ELF_DEFINE_EM(EM_UNICORE, 110, \ + "Microprocessor series from PKU-Unity Ltd.
and MPRC of Peking University") \ +_ELF_DEFINE_EM(EM_EXCESS, 111, \ + "eXcess: 16/32/64-bit configurable embedded CPU") \ +_ELF_DEFINE_EM(EM_DXP, 112, \ + "Icera Semiconductor Inc. Deep Execution Processor") \ +_ELF_DEFINE_EM(EM_ALTERA_NIOS2, 113, \ + "Altera Nios II soft-core processor") \ +_ELF_DEFINE_EM(EM_CRX, 114, \ + "National Semiconductor CompactRISC CRX microprocessor") \ +_ELF_DEFINE_EM(EM_XGATE, 115, \ + "Motorola XGATE embedded processor") \ +_ELF_DEFINE_EM(EM_C166, 116, \ + "Infineon C16x/XC16x processor") \ +_ELF_DEFINE_EM(EM_M16C, 117, \ + "Renesas M16C series microprocessors") \ +_ELF_DEFINE_EM(EM_DSPIC30F, 118, \ + "Microchip Technology dsPIC30F Digital Signal Controller") \ +_ELF_DEFINE_EM(EM_CE, 119, \ + "Freescale Communication Engine RISC core") \ +_ELF_DEFINE_EM(EM_M32C, 120, \ + "Renesas M32C series microprocessors") \ +_ELF_DEFINE_EM(EM_TSK3000, 131, "Altium TSK3000 core") \ +_ELF_DEFINE_EM(EM_RS08, 132, \ + "Freescale RS08 embedded processor") \ +_ELF_DEFINE_EM(EM_SHARC, 133, \ + "Analog Devices SHARC family of 32-bit DSP processors") \ +_ELF_DEFINE_EM(EM_ECOG2, 134, \ + "Cyan Technology eCOG2 microprocessor") \ +_ELF_DEFINE_EM(EM_SCORE7, 135, \ + "Sunplus S+core7 RISC processor") \ +_ELF_DEFINE_EM(EM_DSP24, 136, \ + "New Japan Radio (NJR) 24-bit DSP Processor") \ +_ELF_DEFINE_EM(EM_VIDEOCORE3, 137, \ + "Broadcom VideoCore III processor") \ +_ELF_DEFINE_EM(EM_LATTICEMICO32, 138, \ + "RISC processor for Lattice FPGA architecture") \ +_ELF_DEFINE_EM(EM_SE_C17, 139, "Seiko Epson C17 family") \ +_ELF_DEFINE_EM(EM_TI_C6000, 140, \ + "The Texas Instruments TMS320C6000 DSP family") \ +_ELF_DEFINE_EM(EM_TI_C2000, 141, \ + "The Texas Instruments TMS320C2000 DSP family") \ +_ELF_DEFINE_EM(EM_TI_C5500, 142, \ + "The Texas Instruments TMS320C55x DSP family") \ +_ELF_DEFINE_EM(EM_MMDSP_PLUS, 160, \ + "STMicroelectronics 64bit VLIW Data Signal Processor") \ +_ELF_DEFINE_EM(EM_CYPRESS_M8C, 161, "Cypress M8C microprocessor") \ +_ELF_DEFINE_EM(EM_R32C, 162,
\ + "Renesas R32C series microprocessors") \ +_ELF_DEFINE_EM(EM_TRIMEDIA, 163, \ + "NXP Semiconductors TriMedia architecture family") \ +_ELF_DEFINE_EM(EM_QDSP6, 164, "QUALCOMM DSP6 Processor") \ +_ELF_DEFINE_EM(EM_8051, 165, "Intel 8051 and variants") \ +_ELF_DEFINE_EM(EM_STXP7X, 166, \ + "STMicroelectronics STxP7x family of configurable and extensible RISC processors") \ +_ELF_DEFINE_EM(EM_NDS32, 167, \ + "Andes Technology compact code size embedded RISC processor family") \ +_ELF_DEFINE_EM(EM_ECOG1, 168, \ + "Cyan Technology eCOG1X family") \ +_ELF_DEFINE_EM(EM_ECOG1X, 168, \ + "Cyan Technology eCOG1X family") \ +_ELF_DEFINE_EM(EM_MAXQ30, 169, \ + "Dallas Semiconductor MAXQ30 Core Micro-controllers") \ +_ELF_DEFINE_EM(EM_XIMO16, 170, \ + "New Japan Radio (NJR) 16-bit DSP Processor") \ +_ELF_DEFINE_EM(EM_MANIK, 171, \ + "M2000 Reconfigurable RISC Microprocessor") \ +_ELF_DEFINE_EM(EM_CRAYNV2, 172, \ + "Cray Inc. NV2 vector architecture") \ +_ELF_DEFINE_EM(EM_RX, 173, "Renesas RX family") \ +_ELF_DEFINE_EM(EM_METAG, 174, \ + "Imagination Technologies META processor architecture") \ +_ELF_DEFINE_EM(EM_MCST_ELBRUS, 175, \ + "MCST Elbrus general purpose hardware architecture") \ +_ELF_DEFINE_EM(EM_ECOG16, 176, \ + "Cyan Technology eCOG16 family") \ +_ELF_DEFINE_EM(EM_CR16, 177, \ + "National Semiconductor CompactRISC CR16 16-bit microprocessor") \ +_ELF_DEFINE_EM(EM_ETPU, 178, \ + "Freescale Extended Time Processing Unit") \ +_ELF_DEFINE_EM(EM_SLE9X, 179, \ + "Infineon Technologies SLE9X core") \ +_ELF_DEFINE_EM(EM_AVR32, 185, \ + "Atmel Corporation 32-bit microprocessor family") \ +_ELF_DEFINE_EM(EM_STM8, 186, \ + "STMicroeletronics STM8 8-bit microcontroller") \ +_ELF_DEFINE_EM(EM_TILE64, 187, \ + "Tilera TILE64 multicore architecture family") \ +_ELF_DEFINE_EM(EM_TILEPRO, 188, \ + "Tilera TILEPro multicore architecture family") \ +_ELF_DEFINE_EM(EM_MICROBLAZE, 189, \ + "Xilinx MicroBlaze 32-bit RISC soft processor core") \ +_ELF_DEFINE_EM(EM_CUDA, 190, "NVIDIA
CUDA architecture") \ +_ELF_DEFINE_EM(EM_TILEGX, 191, \ + "Tilera TILE-Gx multicore architecture family") \ +_ELF_DEFINE_EM(EM_CLOUDSHIELD, 192, \ + "CloudShield architecture family") \ +_ELF_DEFINE_EM(EM_COREA_1ST, 193, \ + "KIPO-KAIST Core-A 1st generation processor family") \ +_ELF_DEFINE_EM(EM_COREA_2ND, 194, \ + "KIPO-KAIST Core-A 2nd generation processor family") \ +_ELF_DEFINE_EM(EM_ARC_COMPACT2, 195, "Synopsys ARCompact V2") \ +_ELF_DEFINE_EM(EM_OPEN8, 196, \ + "Open8 8-bit RISC soft processor core") \ +_ELF_DEFINE_EM(EM_RL78, 197, "Renesas RL78 family") \ +_ELF_DEFINE_EM(EM_VIDEOCORE5, 198, "Broadcom VideoCore V processor") \ +_ELF_DEFINE_EM(EM_78KOR, 199, "Renesas 78KOR family") \ +_ELF_DEFINE_EM(EM_AMDIL, 0x4154, "AMD Intermediate Language Target") \ +_ELF_DEFINE_EM(EM_HSAIL, 0xAF5A, "HSA Intermediate Language Target") \ +_ELF_DEFINE_EM(EM_AMDIL_64, 0x4155, "AMD64 Intermediate Language Target") \ +_ELF_DEFINE_EM(EM_HSAIL_64, 0xAF5B, "HSA64 Intermediate Language Target") \ +_ELF_DEFINE_EM(EM_ARM_64, 0xAA64, "Advanced RISC Machines ARM 64") + +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. The table is not dense (for example 6, 11 to 14 and 121 to 130 are absent), EM_ECOG1 and EM_ECOG1X are both 168, and the last five rows use four-digit hex values well above the 0 to 199 range of the rest. EM__LAST__ follows EM_ARM_64 (0xAA64) and so evaluates to 0xAA65. */ +#undef _ELF_DEFINE_EM +#define _ELF_DEFINE_EM(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ELF_MACHINES() + EM__LAST__ +}; + +/* Older synonyms: EM_ARC_A5 is defined as EM_ARC_COMPACT unless EM_ARC_A5 is already a macro. */ +#ifndef EM_ARC_A5 +#define EM_ARC_A5 EM_ARC_COMPACT +#endif + +/* + * ELF file types: (ET_*).
+ */ +#define _ELF_DEFINE_ELF_TYPES() \ +_ELF_DEFINE_ET(ET_NONE, 0, "No file type") \ +_ELF_DEFINE_ET(ET_REL, 1, "Relocatable object") \ +_ELF_DEFINE_ET(ET_EXEC, 2, "Executable") \ +_ELF_DEFINE_ET(ET_DYN, 3, "Shared object") \ +_ELF_DEFINE_ET(ET_CORE, 4, "Core file") \ +_ELF_DEFINE_ET(ET_LOOS, 0xFE00U, "Begin OS-specific range") \ +_ELF_DEFINE_ET(ET_HIOS, 0xFEFFU, "End OS-specific range") \ +_ELF_DEFINE_ET(ET_LOPROC, 0xFF00U, "Begin processor-specific range") \ +_ELF_DEFINE_ET(ET_HIPROC, 0xFFFFU, "End processor-specific range") +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. ET__LAST__ follows ET_HIPROC (0xFFFFU) and so evaluates to 0x10000. */ +#undef _ELF_DEFINE_ET +#define _ELF_DEFINE_ET(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ELF_TYPES() + ET__LAST__ +}; + +/* ELF file format version numbers. These two are plain macros rather than enumerators. */ +#define EV_NONE 0 +#define EV_CURRENT 1 + +/* + * Flags for section groups. These are plain macros rather than enumerators. + */ +#define GRP_COMDAT 0x1 /* COMDAT semantics */ +#define GRP_MASKOS 0x0ff00000 /* OS-specific flags */ +#define GRP_MASKPROC 0xf0000000 /* processor-specific flags */ + +/* + * Flags used by program header table entries. PF_X, PF_W and PF_R are the single bits 0x1, 0x2 and 0x4; the three PF_ARM_* values all lie inside PF_MASKPROC (0xf0000000). + */ + +#define _ELF_DEFINE_PHDR_FLAGS() \ +_ELF_DEFINE_PF(PF_X, 0x1, "Execute") \ +_ELF_DEFINE_PF(PF_W, 0x2, "Write") \ +_ELF_DEFINE_PF(PF_R, 0x4, "Read") \ +_ELF_DEFINE_PF(PF_MASKOS, 0x0ff00000, "OS-specific flags") \ +_ELF_DEFINE_PF(PF_MASKPROC, 0xf0000000, "Processor-specific flags") \ +_ELF_DEFINE_PF(PF_ARM_SB, 0x10000000, \ + "segment contains the location addressed by the static base") \ +_ELF_DEFINE_PF(PF_ARM_PI, 0x20000000, \ + "segment is position-independent") \ +_ELF_DEFINE_PF(PF_ARM_ABS, 0x40000000, \ + "segment must be loaded at its base address") +/* Same N = V expansion as the other tables; PF__LAST__ follows PF_ARM_ABS (0x40000000) and so evaluates to 0x40000001. */ +#undef _ELF_DEFINE_PF +#define _ELF_DEFINE_PF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_PHDR_FLAGS() + PF__LAST__ +}; + +/* + * Types of program header table entries.
+ */ + +#define _ELF_DEFINE_PHDR_TYPES() \ +_ELF_DEFINE_PT(PT_NULL, 0, "ignored entry") \ +_ELF_DEFINE_PT(PT_LOAD, 1, "loadable segment") \ +_ELF_DEFINE_PT(PT_DYNAMIC, 2, \ + "contains dynamic linking information") \ +_ELF_DEFINE_PT(PT_INTERP, 3, "names an interpreter") \ +_ELF_DEFINE_PT(PT_NOTE, 4, "auxiliary information") \ +_ELF_DEFINE_PT(PT_SHLIB, 5, "reserved") \ +_ELF_DEFINE_PT(PT_PHDR, 6, \ + "describes the program header itself") \ +_ELF_DEFINE_PT(PT_TLS, 7, "thread local storage") \ +_ELF_DEFINE_PT(PT_LOOS, 0x60000000UL, \ + "start of OS-specific range") \ +_ELF_DEFINE_PT(PT_GNU_EH_FRAME, 0x6474E550UL, \ + "GCC generated .eh_frame_hdr segment ") \ +_ELF_DEFINE_PT(PT_GNU_STACK, 0x6474E551UL, \ + "Stack flags") \ +_ELF_DEFINE_PT(PT_GNU_RELRO, 0x6474E552UL, \ + "Segment becomes read-only after relocation") \ +_ELF_DEFINE_PT(PT_HIOS, 0x6FFFFFFFUL, \ + "end of OS-specific range") \ +_ELF_DEFINE_PT(PT_LOPROC, 0x70000000UL, \ + "start of processor-specific range") \ +_ELF_DEFINE_PT(PT_ARM_ARCHEXT, 0x70000000UL, \ + "platform architecture compatibility information") \ +_ELF_DEFINE_PT(PT_ARM_EXIDX, 0x70000001UL, \ + "exception unwind tables") \ +_ELF_DEFINE_PT(PT_MIPS_REGINFO, 0x70000000UL, \ + "register usage information") \ +_ELF_DEFINE_PT(PT_MIPS_RTPROC, 0x70000001UL, \ + "runtime procedure table") \ +_ELF_DEFINE_PT(PT_MIPS_OPTIONS, 0x70000002UL, \ + "options segment") \ +_ELF_DEFINE_PT(PT_HIPROC, 0x7FFFFFFFUL, \ + "end of processor-specific range") +/* Expand each table row into an enumerator N = V; the DESCR text is not used by this expansion. Values repeat in the processor-specific range (PT_LOPROC, PT_ARM_ARCHEXT and PT_MIPS_REGINFO are all 0x70000000UL), and PT__LAST__ is pinned to PT_HIPROC instead of taking the next sequential value. */ +#undef _ELF_DEFINE_PT +#define _ELF_DEFINE_PT(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_PHDR_TYPES() + PT__LAST__ = PT_HIPROC +}; + +/* Synonyms: PT_ARM_UNWIND is another name for PT_ARM_EXIDX. */ +#define PT_ARM_UNWIND PT_ARM_EXIDX + +/* + * Section flags.
+ */ + +#define _ELF_DEFINE_SECTION_FLAGS() \ +_ELF_DEFINE_SHF(SHF_WRITE, 0x1, \ + "writable during program execution") \ +_ELF_DEFINE_SHF(SHF_ALLOC, 0x2, \ + "occupies memory during program execution") \ +_ELF_DEFINE_SHF(SHF_EXECINSTR, 0x4, "executable instructions") \ +_ELF_DEFINE_SHF(SHF_MERGE, 0x10, \ + "may be merged to prevent duplication") \ +_ELF_DEFINE_SHF(SHF_STRINGS, 0x20, \ + "NUL-terminated character strings") \ +_ELF_DEFINE_SHF(SHF_INFO_LINK, 0x40, \ + "the sh_info field holds a link") \ +_ELF_DEFINE_SHF(SHF_LINK_ORDER, 0x80, \ + "special ordering requirements during linking") \ +_ELF_DEFINE_SHF(SHF_OS_NONCONFORMING, 0x100, \ + "requires OS-specific processing during linking") \ +_ELF_DEFINE_SHF(SHF_GROUP, 0x200, \ + "member of a section group") \ +_ELF_DEFINE_SHF(SHF_TLS, 0x400, \ + "holds thread-local storage") \ +_ELF_DEFINE_SHF(SHF_MASKOS, 0x0FF00000UL, \ + "bits reserved for OS-specific semantics") \ +_ELF_DEFINE_SHF(SHF_AMD64_LARGE, 0x10000000UL, \ + "section uses large code model") \ +_ELF_DEFINE_SHF(SHF_ENTRYSECT, 0x10000000UL, \ + "section contains an entry point (ARM)") \ +_ELF_DEFINE_SHF(SHF_COMDEF, 0x80000000UL, \ + "section may be multiply defined in input to link step (ARM)") \ +_ELF_DEFINE_SHF(SHF_MIPS_GPREL, 0x10000000UL, \ + "section must be part of global data area") \ +_ELF_DEFINE_SHF(SHF_MIPS_MERGE, 0x20000000UL, \ + "section data should be merged to eliminate duplication") \ +_ELF_DEFINE_SHF(SHF_MIPS_ADDR, 0x40000000UL, \ + "section data is addressed by default") \ +_ELF_DEFINE_SHF(SHF_MIPS_STRING, 0x80000000UL, \ + "section data is string data by default") \ +_ELF_DEFINE_SHF(SHF_MIPS_NOSTRIP, 0x08000000UL, \ + "section data may not be stripped") \ +_ELF_DEFINE_SHF(SHF_MIPS_LOCAL, 0x04000000UL, \ + "section data local to process") \ +_ELF_DEFINE_SHF(SHF_MIPS_NAMES, 0x02000000UL, \ + "linker must generate implicit hidden weak names") \ +_ELF_DEFINE_SHF(SHF_MIPS_NODUPE, 0x01000000UL, \ + "linker must retain only one copy") \ 
+_ELF_DEFINE_SHF(SHF_ORDERED, 0x40000000UL, \ + "section is ordered with respect to other sections") \ +_ELF_DEFINE_SHF(SHF_EXCLUDE, 0x80000000UL, \ + "section is excluded from executables and shared objects") \ +_ELF_DEFINE_SHF(SHF_MASKPROC, 0xF0000000UL, \ + "bits reserved for processor-specific semantics") + +#undef _ELF_DEFINE_SHF +#define _ELF_DEFINE_SHF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SECTION_FLAGS() + SHF__LAST__ +}; + +/* + * Special section indices. + */ +#define _ELF_DEFINE_SECTION_INDICES() \ +_ELF_DEFINE_SHN(SHN_UNDEF, 0, "undefined section") \ +_ELF_DEFINE_SHN(SHN_LORESERVE, 0xFF00U, "start of reserved area") \ +_ELF_DEFINE_SHN(SHN_LOPROC, 0xFF00U, \ + "start of processor-specific range") \ +_ELF_DEFINE_SHN(SHN_BEFORE, 0xFF00U, "used for section ordering") \ +_ELF_DEFINE_SHN(SHN_AFTER, 0xFF01U, "used for section ordering") \ +_ELF_DEFINE_SHN(SHN_AMD64_LCOMMON, 0xFF02U, "large common block label") \ +_ELF_DEFINE_SHN(SHN_MIPS_ACOMMON, 0xFF00U, \ + "allocated common symbols in a DSO") \ +_ELF_DEFINE_SHN(SHN_MIPS_TEXT, 0xFF01U, "Reserved (obsolete)") \ +_ELF_DEFINE_SHN(SHN_MIPS_DATA, 0xFF02U, "Reserved (obsolete)") \ +_ELF_DEFINE_SHN(SHN_MIPS_SCOMMON, 0xFF03U, \ + "gp-addressable common symbols") \ +_ELF_DEFINE_SHN(SHN_MIPS_SUNDEFINED, 0xFF04U, \ + "gp-addressable undefined symbols") \ +_ELF_DEFINE_SHN(SHN_MIPS_LCOMMON, 0xFF05U, "local common symbols") \ +_ELF_DEFINE_SHN(SHN_MIPS_LUNDEFINED, 0xFF06U, \ + "local undefined symbols") \ +_ELF_DEFINE_SHN(SHN_HIPROC, 0xFF1FU, \ + "end of processor-specific range") \ +_ELF_DEFINE_SHN(SHN_LOOS, 0xFF20U, \ + "start of OS-specific range") \ +_ELF_DEFINE_SHN(SHN_SUNW_IGNORE, 0xFF3FU, "used by dtrace") \ +_ELF_DEFINE_SHN(SHN_HIOS, 0xFF3FU, \ + "end of OS-specific range") \ +_ELF_DEFINE_SHN(SHN_ABS, 0xFFF1U, "absolute references") \ +_ELF_DEFINE_SHN(SHN_COMMON, 0xFFF2U, "references to COMMON areas") \ +_ELF_DEFINE_SHN(SHN_XINDEX, 0xFFFFU, "extended index") \ +_ELF_DEFINE_SHN(SHN_HIRESERVE, 0xFFFFU, "end of 
reserved area") + +#undef _ELF_DEFINE_SHN +#define _ELF_DEFINE_SHN(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SECTION_INDICES() + SHN__LAST__ +}; + +/* + * Section types. + */ + +#define _ELF_DEFINE_SECTION_TYPES() \ +_ELF_DEFINE_SHT(SHT_NULL, 0, "inactive header") \ +_ELF_DEFINE_SHT(SHT_PROGBITS, 1, "program defined information") \ +_ELF_DEFINE_SHT(SHT_SYMTAB, 2, "symbol table") \ +_ELF_DEFINE_SHT(SHT_STRTAB, 3, "string table") \ +_ELF_DEFINE_SHT(SHT_RELA, 4, \ + "relocation entries with addends") \ +_ELF_DEFINE_SHT(SHT_HASH, 5, "symbol hash table") \ +_ELF_DEFINE_SHT(SHT_DYNAMIC, 6, \ + "information for dynamic linking") \ +_ELF_DEFINE_SHT(SHT_NOTE, 7, "additional notes") \ +_ELF_DEFINE_SHT(SHT_NOBITS, 8, "section occupying no space") \ +_ELF_DEFINE_SHT(SHT_REL, 9, \ + "relocation entries without addends") \ +_ELF_DEFINE_SHT(SHT_SHLIB, 10, "reserved") \ +_ELF_DEFINE_SHT(SHT_DYNSYM, 11, "symbol table") \ +_ELF_DEFINE_SHT(SHT_INIT_ARRAY, 14, \ + "pointers to initialization functions") \ +_ELF_DEFINE_SHT(SHT_FINI_ARRAY, 15, \ + "pointers to termination functions") \ +_ELF_DEFINE_SHT(SHT_PREINIT_ARRAY, 16, \ + "pointers to functions called before initialization") \ +_ELF_DEFINE_SHT(SHT_GROUP, 17, "defines a section group") \ +_ELF_DEFINE_SHT(SHT_SYMTAB_SHNDX, 18, \ + "used for extended section numbering") \ +_ELF_DEFINE_SHT(SHT_LOOS, 0x60000000UL, \ + "start of OS-specific range") \ +_ELF_DEFINE_SHT(SHT_SUNW_dof, 0x6FFFFFF4UL, \ + "used by dtrace") \ +_ELF_DEFINE_SHT(SHT_SUNW_cap, 0x6FFFFFF5UL, \ + "capability requirements") \ +_ELF_DEFINE_SHT(SHT_GNU_ATTRIBUTES, 0x6FFFFFF5UL, \ + "object attributes") \ +_ELF_DEFINE_SHT(SHT_SUNW_SIGNATURE, 0x6FFFFFF6UL, \ + "module verification signature") \ +_ELF_DEFINE_SHT(SHT_GNU_HASH, 0x6FFFFFF6UL, \ + "GNU Hash sections") \ +_ELF_DEFINE_SHT(SHT_GNU_LIBLIST, 0x6FFFFFF7UL, \ + "List of libraries to be prelinked") \ +_ELF_DEFINE_SHT(SHT_SUNW_ANNOTATE, 0x6FFFFFF7UL, \ + "special section where unresolved references are allowed") \ 
+_ELF_DEFINE_SHT(SHT_SUNW_DEBUGSTR, 0x6FFFFFF8UL, \ + "debugging information") \ +_ELF_DEFINE_SHT(SHT_CHECKSUM, 0x6FFFFFF8UL, \ + "checksum for dynamic shared objects") \ +_ELF_DEFINE_SHT(SHT_SUNW_DEBUG, 0x6FFFFFF9UL, \ + "debugging information") \ +_ELF_DEFINE_SHT(SHT_SUNW_move, 0x6FFFFFFAUL, \ + "information to handle partially initialized symbols") \ +_ELF_DEFINE_SHT(SHT_SUNW_COMDAT, 0x6FFFFFFBUL, \ + "section supporting merging of multiple copies of data") \ +_ELF_DEFINE_SHT(SHT_SUNW_syminfo, 0x6FFFFFFCUL, \ + "additional symbol information") \ +_ELF_DEFINE_SHT(SHT_SUNW_verdef, 0x6FFFFFFDUL, \ + "symbol versioning information") \ +_ELF_DEFINE_SHT(SHT_SUNW_verneed, 0x6FFFFFFEUL, \ + "symbol versioning requirements") \ +_ELF_DEFINE_SHT(SHT_SUNW_versym, 0x6FFFFFFFUL, \ + "symbol versioning table") \ +_ELF_DEFINE_SHT(SHT_HIOS, 0x6FFFFFFFUL, \ + "end of OS-specific range") \ +_ELF_DEFINE_SHT(SHT_LOPROC, 0x70000000UL, \ + "start of processor-specific range") \ +_ELF_DEFINE_SHT(SHT_ARM_EXIDX, 0x70000001UL, \ + "exception index table") \ +_ELF_DEFINE_SHT(SHT_ARM_PREEMPTMAP, 0x70000002UL, \ + "BPABI DLL dynamic linking preemption map") \ +_ELF_DEFINE_SHT(SHT_ARM_ATTRIBUTES, 0x70000003UL, \ + "object file compatibility attributes") \ +_ELF_DEFINE_SHT(SHT_ARM_DEBUGOVERLAY, 0x70000004UL, \ + "overlay debug information") \ +_ELF_DEFINE_SHT(SHT_ARM_OVERLAYSECTION, 0x70000005UL, \ + "overlay debug information") \ +_ELF_DEFINE_SHT(SHT_MIPS_LIBLIST, 0x70000000UL, \ + "DSO library information used in link") \ +_ELF_DEFINE_SHT(SHT_MIPS_MSYM, 0x70000001UL, \ + "MIPS symbol table extension") \ +_ELF_DEFINE_SHT(SHT_MIPS_CONFLICT, 0x70000002UL, \ + "symbol conflicting with DSO-defined symbols ") \ +_ELF_DEFINE_SHT(SHT_MIPS_GPTAB, 0x70000003UL, \ + "global pointer table") \ +_ELF_DEFINE_SHT(SHT_MIPS_UCODE, 0x70000004UL, \ + "reserved") \ +_ELF_DEFINE_SHT(SHT_MIPS_DEBUG, 0x70000005UL, \ + "reserved (obsolete debug information)") \ +_ELF_DEFINE_SHT(SHT_MIPS_REGINFO, 0x70000006UL, \ + 
"register usage information") \ +_ELF_DEFINE_SHT(SHT_MIPS_PACKAGE, 0x70000007UL, \ + "OSF reserved") \ +_ELF_DEFINE_SHT(SHT_MIPS_PACKSYM, 0x70000008UL, \ + "OSF reserved") \ +_ELF_DEFINE_SHT(SHT_MIPS_RELD, 0x70000009UL, \ + "dynamic relocation") \ +_ELF_DEFINE_SHT(SHT_MIPS_IFACE, 0x7000000BUL, \ + "subprogram interface information") \ +_ELF_DEFINE_SHT(SHT_MIPS_CONTENT, 0x7000000CUL, \ + "section content classification") \ +_ELF_DEFINE_SHT(SHT_MIPS_OPTIONS, 0x7000000DUL, \ + "general options") \ +_ELF_DEFINE_SHT(SHT_MIPS_DELTASYM, 0x7000001BUL, \ + "Delta C++: symbol table") \ +_ELF_DEFINE_SHT(SHT_MIPS_DELTAINST, 0x7000001CUL, \ + "Delta C++: instance table") \ +_ELF_DEFINE_SHT(SHT_MIPS_DELTACLASS, 0x7000001DUL, \ + "Delta C++: class table") \ +_ELF_DEFINE_SHT(SHT_MIPS_DWARF, 0x7000001EUL, \ + "DWARF debug information") \ +_ELF_DEFINE_SHT(SHT_MIPS_DELTADECL, 0x7000001FUL, \ + "Delta C++: declarations") \ +_ELF_DEFINE_SHT(SHT_MIPS_SYMBOL_LIB, 0x70000020UL, \ + "symbol-to-library mapping") \ +_ELF_DEFINE_SHT(SHT_MIPS_EVENTS, 0x70000021UL, \ + "event locations") \ +_ELF_DEFINE_SHT(SHT_MIPS_TRANSLATE, 0x70000022UL, \ + "???") \ +_ELF_DEFINE_SHT(SHT_MIPS_PIXIE, 0x70000023UL, \ + "special pixie sections") \ +_ELF_DEFINE_SHT(SHT_MIPS_XLATE, 0x70000024UL, \ + "address translation table") \ +_ELF_DEFINE_SHT(SHT_MIPS_XLATE_DEBUG, 0x70000025UL, \ + "SGI internal address translation table") \ +_ELF_DEFINE_SHT(SHT_MIPS_WHIRL, 0x70000026UL, \ + "intermediate code") \ +_ELF_DEFINE_SHT(SHT_MIPS_EH_REGION, 0x70000027UL, \ + "C++ exception handling region info") \ +_ELF_DEFINE_SHT(SHT_MIPS_XLATE_OLD, 0x70000028UL, \ + "obsolete") \ +_ELF_DEFINE_SHT(SHT_MIPS_PDR_EXCEPTION, 0x70000029UL, \ + "runtime procedure descriptor table exception information") \ +_ELF_DEFINE_SHT(SHT_SPARC_GOTDATA, 0x70000000UL, \ + "SPARC-specific data") \ +_ELF_DEFINE_SHT(SHT_AMD64_UNWIND, 0x70000001UL, \ + "unwind tables for the AMD64") \ +_ELF_DEFINE_SHT(SHT_ORDERED, 0x7FFFFFFFUL, \ + "sort entries in the 
section") \ +_ELF_DEFINE_SHT(SHT_HIPROC, 0x7FFFFFFFUL, \ + "end of processor-specific range") \ +_ELF_DEFINE_SHT(SHT_LOUSER, 0x80000000UL, \ + "start of application-specific range") \ +_ELF_DEFINE_SHT(SHT_HIUSER, 0xFFFFFFFFUL, \ + "end of application-specific range") + +#undef _ELF_DEFINE_SHT +#define _ELF_DEFINE_SHT(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SECTION_TYPES() + SHT__LAST__ = SHT_HIUSER +}; + +/* Aliases for section types. */ +#ifndef SHT_GNU_verdef +#define SHT_GNU_verdef SHT_SUNW_verdef +#endif +#ifndef SHT_GNU_verneed +#define SHT_GNU_verneed SHT_SUNW_verneed +#endif +#ifndef SHT_GNU_versym +#define SHT_GNU_versym SHT_SUNW_versym +#endif + +/* + * Symbol binding information. + */ + +#define _ELF_DEFINE_SYMBOL_BINDING() \ +_ELF_DEFINE_STB(STB_LOCAL, 0, \ + "not visible outside defining object file") \ +_ELF_DEFINE_STB(STB_GLOBAL, 1, \ + "visible across all object files being combined") \ +_ELF_DEFINE_STB(STB_WEAK, 2, \ + "visible across all object files but with low precedence") \ +_ELF_DEFINE_STB(STB_LOOS, 10, "start of OS-specific range") \ +_ELF_DEFINE_STB(STB_HIOS, 12, "end of OS-specific range") \ +_ELF_DEFINE_STB(STB_LOPROC, 13, \ + "start of processor-specific range") \ +_ELF_DEFINE_STB(STB_HIPROC, 15, \ + "end of processor-specific range") + +#undef _ELF_DEFINE_STB +#define _ELF_DEFINE_STB(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SYMBOL_BINDING() + STB__LAST__ +}; + +/* + * Symbol types + */ + +#define _ELF_DEFINE_SYMBOL_TYPES() \ +_ELF_DEFINE_STT(STT_NOTYPE, 0, "unspecified type") \ +_ELF_DEFINE_STT(STT_OBJECT, 1, "data object") \ +_ELF_DEFINE_STT(STT_FUNC, 2, "executable code") \ +_ELF_DEFINE_STT(STT_SECTION, 3, "section") \ +_ELF_DEFINE_STT(STT_FILE, 4, "source file") \ +_ELF_DEFINE_STT(STT_COMMON, 5, "uninitialized common block") \ +_ELF_DEFINE_STT(STT_TLS, 6, "thread local storage") \ +_ELF_DEFINE_STT(STT_LOOS, 10, "start of OS-specific types") \ +_ELF_DEFINE_STT(STT_HIOS, 12, "end of OS-specific types") \ 
+_ELF_DEFINE_STT(STT_LOPROC, 13, \ + "start of processor-specific types") \ +_ELF_DEFINE_STT(STT_ARM_TFUNC, 13, "Thumb function (GNU)") \ +_ELF_DEFINE_STT(STT_ARM_16BIT, 15, "Thumb label (GNU)") \ +_ELF_DEFINE_STT(STT_HIPROC, 15, \ + "end of processor-specific types") + +#undef _ELF_DEFINE_STT +#define _ELF_DEFINE_STT(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SYMBOL_TYPES() + STT__LAST__ +}; + +/* + * Symbol binding. + */ + +#define _ELF_DEFINE_SYMBOL_BINDING_KINDS() \ +_ELF_DEFINE_SYB(SYMINFO_BT_SELF, 0xFFFFU, \ + "bound to self") \ +_ELF_DEFINE_SYB(SYMINFO_BT_PARENT, 0xFFFEU, \ + "bound to parent") \ +_ELF_DEFINE_SYB(SYMINFO_BT_NONE, 0xFFFDU, \ + "no special binding") + +#undef _ELF_DEFINE_SYB +#define _ELF_DEFINE_SYB(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SYMBOL_BINDING_KINDS() + SYMINFO__LAST__ +}; + +/* + * Symbol visibility. + */ + +#define _ELF_DEFINE_SYMBOL_VISIBILITY() \ +_ELF_DEFINE_STV(STV_DEFAULT, 0, \ + "as specified by symbol type") \ +_ELF_DEFINE_STV(STV_INTERNAL, 1, \ + "as defined by processor semantics") \ +_ELF_DEFINE_STV(STV_HIDDEN, 2, \ + "hidden from other components") \ +_ELF_DEFINE_STV(STV_PROTECTED, 3, \ + "local references are not preemptable") + +#undef _ELF_DEFINE_STV +#define _ELF_DEFINE_STV(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SYMBOL_VISIBILITY() + STV__LAST__ +}; + +/* + * Symbol flags. 
+ */ +#define _ELF_DEFINE_SYMBOL_FLAGS() \ +_ELF_DEFINE_SYF(SYMINFO_FLG_DIRECT, 0x01, \ + "directly assocated reference") \ +_ELF_DEFINE_SYF(SYMINFO_FLG_COPY, 0x04, \ + "definition by copy-relocation") \ +_ELF_DEFINE_SYF(SYMINFO_FLG_LAZYLOAD, 0x08, \ + "object should be lazily loaded") \ +_ELF_DEFINE_SYF(SYMINFO_FLG_DIRECTBIND, 0x10, \ + "reference should be directly bound") \ +_ELF_DEFINE_SYF(SYMINFO_FLG_NOEXTDIRECT, 0x20, \ + "external references not allowed to bind to definition") + +#undef _ELF_DEFINE_SYF +#define _ELF_DEFINE_SYF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_SYMBOL_FLAGS() + SYMINFO_FLG__LAST__ +}; + +/* + * Version dependencies. + */ +#define _ELF_DEFINE_VERSIONING_DEPENDENCIES() \ +_ELF_DEFINE_VERD(VER_NDX_LOCAL, 0, "local scope") \ +_ELF_DEFINE_VERD(VER_NDX_GLOBAL, 1, "global scope") +#undef _ELF_DEFINE_VERD +#define _ELF_DEFINE_VERD(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_VERSIONING_DEPENDENCIES() + VER_NDX__LAST__ +}; + +/* + * Version flags. + */ +#define _ELF_DEFINE_VERSIONING_FLAGS() \ +_ELF_DEFINE_VERF(VER_FLG_BASE, 0x1, "file version") \ +_ELF_DEFINE_VERF(VER_FLG_WEAK, 0x2, "weak version") +#undef _ELF_DEFINE_VERF +#define _ELF_DEFINE_VERF(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_VERSIONING_FLAGS() + VER_FLG__LAST__ +}; + +/* + * Version needs + */ +#define _ELF_DEFINE_VERSIONING_NEEDS() \ +_ELF_DEFINE_VRN(VER_NEED_NONE, 0, "invalid version") \ +_ELF_DEFINE_VRN(VER_NEED_CURRENT, 1, "current version") +#undef _ELF_DEFINE_VRN +#define _ELF_DEFINE_VRN(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_VERSIONING_NEEDS() + VER_NEED__LAST__ +}; + +/* + * Version numbers. + */ +#define _ELF_DEFINE_VERSIONING_NUMBERS() \ +_ELF_DEFINE_VRNU(VER_DEF_NONE, 0, "invalid version") \ +_ELF_DEFINE_VRNU(VER_DEF_CURRENT, 1, "current version") +#undef _ELF_DEFINE_VRNU +#define _ELF_DEFINE_VRNU(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_VERSIONING_NUMBERS() + VER_DEF__LAST__ +}; + +/** + ** Relocation types. 
+ **/ + +#define _ELF_DEFINE_386_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_386_NONE, 0) \ +_ELF_DEFINE_RELOC(R_386_32, 1) \ +_ELF_DEFINE_RELOC(R_386_PC32, 2) \ +_ELF_DEFINE_RELOC(R_386_GOT32, 3) \ +_ELF_DEFINE_RELOC(R_386_PLT32, 4) \ +_ELF_DEFINE_RELOC(R_386_COPY, 5) \ +_ELF_DEFINE_RELOC(R_386_GLOB_DAT, 6) \ +_ELF_DEFINE_RELOC(R_386_JMP_SLOT, 7) \ +_ELF_DEFINE_RELOC(R_386_RELATIVE, 8) \ +_ELF_DEFINE_RELOC(R_386_GOTOFF, 9) \ +_ELF_DEFINE_RELOC(R_386_GOTPC, 10) \ +_ELF_DEFINE_RELOC(R_386_32PLT, 11) \ +_ELF_DEFINE_RELOC(R_386_16, 20) \ +_ELF_DEFINE_RELOC(R_386_PC16, 21) \ +_ELF_DEFINE_RELOC(R_386_8, 22) \ +_ELF_DEFINE_RELOC(R_386_PC8, 23) + +/* + * These are the symbols used in the Sun ``Linkers and Loaders + * Guide'', Document No: 817-1984-17. See the X86_64 relocations list + * below for the spellings used in the ELF specification. + */ +#define _ELF_DEFINE_AMD64_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_AMD64_NONE, 0) \ +_ELF_DEFINE_RELOC(R_AMD64_64, 1) \ +_ELF_DEFINE_RELOC(R_AMD64_PC32, 2) \ +_ELF_DEFINE_RELOC(R_AMD64_GOT32, 3) \ +_ELF_DEFINE_RELOC(R_AMD64_PLT32, 4) \ +_ELF_DEFINE_RELOC(R_AMD64_COPY, 5) \ +_ELF_DEFINE_RELOC(R_AMD64_GLOB_DAT, 6) \ +_ELF_DEFINE_RELOC(R_AMD64_JUMP_SLOT, 7) \ +_ELF_DEFINE_RELOC(R_AMD64_RELATIVE, 8) \ +_ELF_DEFINE_RELOC(R_AMD64_GOTPCREL, 9) \ +_ELF_DEFINE_RELOC(R_AMD64_32, 10) \ +_ELF_DEFINE_RELOC(R_AMD64_32S, 11) \ +_ELF_DEFINE_RELOC(R_AMD64_16, 12) \ +_ELF_DEFINE_RELOC(R_AMD64_PC16, 13) \ +_ELF_DEFINE_RELOC(R_AMD64_8, 14) \ +_ELF_DEFINE_RELOC(R_AMD64_PC8, 15) \ +_ELF_DEFINE_RELOC(R_AMD64_PC64, 24) \ +_ELF_DEFINE_RELOC(R_AMD64_GOTOFF64, 25) \ +_ELF_DEFINE_RELOC(R_AMD64_GOTPC32, 26) + +#define _ELF_DEFINE_ARM_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_ARM_NONE, 0) \ +_ELF_DEFINE_RELOC(R_ARM_PC24, 1) \ +_ELF_DEFINE_RELOC(R_ARM_ABS32, 2) \ +_ELF_DEFINE_RELOC(R_ARM_REL32, 3) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_PC_G0, 4) \ +_ELF_DEFINE_RELOC(R_ARM_ABS16, 5) \ +_ELF_DEFINE_RELOC(R_ARM_ABS12, 6) \ +_ELF_DEFINE_RELOC(R_ARM_THM_ABS5, 7) \ 
+_ELF_DEFINE_RELOC(R_ARM_ABS8, 8) \ +_ELF_DEFINE_RELOC(R_ARM_SBREL32, 9) \ +_ELF_DEFINE_RELOC(R_ARM_THM_CALL, 10) \ +_ELF_DEFINE_RELOC(R_ARM_THM_PC8, 11) \ +_ELF_DEFINE_RELOC(R_ARM_BREL_ADJ, 12) \ +_ELF_DEFINE_RELOC(R_ARM_SWI24, 13) \ +_ELF_DEFINE_RELOC(R_ARM_THM_SWI8, 14) \ +_ELF_DEFINE_RELOC(R_ARM_XPC25, 15) \ +_ELF_DEFINE_RELOC(R_ARM_THM_XPC22, 16) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_DTPMOD32, 17) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_DTPOFF32, 18) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_TPOFF32, 19) \ +_ELF_DEFINE_RELOC(R_ARM_COPY, 20) \ +_ELF_DEFINE_RELOC(R_ARM_GLOB_DAT, 21) \ +_ELF_DEFINE_RELOC(R_ARM_JUMP_SLOT, 22) \ +_ELF_DEFINE_RELOC(R_ARM_RELATIVE, 23) \ +_ELF_DEFINE_RELOC(R_ARM_GOTOFF32, 24) \ +_ELF_DEFINE_RELOC(R_ARM_BASE_PREL, 25) \ +_ELF_DEFINE_RELOC(R_ARM_GOT_BREL, 26) \ +_ELF_DEFINE_RELOC(R_ARM_PLT32, 27) \ +_ELF_DEFINE_RELOC(R_ARM_CALL, 28) \ +_ELF_DEFINE_RELOC(R_ARM_JUMP24, 29) \ +_ELF_DEFINE_RELOC(R_ARM_THM_JUMP24, 30) \ +_ELF_DEFINE_RELOC(R_ARM_BASE_ABS, 31) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PCREL7_0, 32) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PCREL15_8, 33) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PCREL23_15, 34) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_SBREL_11_0, 35) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SBREL_19_12, 36) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SBREL_27_20, 37) \ +_ELF_DEFINE_RELOC(R_ARM_TARGET1, 38) \ +_ELF_DEFINE_RELOC(R_ARM_SBREL31, 39) \ +_ELF_DEFINE_RELOC(R_ARM_V4BX, 40) \ +_ELF_DEFINE_RELOC(R_ARM_TARGET2, 41) \ +_ELF_DEFINE_RELOC(R_ARM_PREL31, 42) \ +_ELF_DEFINE_RELOC(R_ARM_MOVW_ABS_NC, 43) \ +_ELF_DEFINE_RELOC(R_ARM_MOVT_ABS, 44) \ +_ELF_DEFINE_RELOC(R_ARM_MOVW_PREL_NC, 45) \ +_ELF_DEFINE_RELOC(R_ARM_MOVT_PREL, 46) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVW_ABS_NC, 47) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVT_ABS, 48) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVW_PREL_NC, 49) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVT_PREL, 50) \ +_ELF_DEFINE_RELOC(R_ARM_THM_JUMP19, 51) \ +_ELF_DEFINE_RELOC(R_ARM_THM_JUMP6, 52) \ +_ELF_DEFINE_RELOC(R_ARM_THM_ALU_PREL_11_0, 53) \ +_ELF_DEFINE_RELOC(R_ARM_THM_PC12, 54) \
+_ELF_DEFINE_RELOC(R_ARM_ABS32_NOI, 55) \ +_ELF_DEFINE_RELOC(R_ARM_REL32_NOI, 56) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PC_G0_NC, 57) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PC_G0, 58) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PC_G1_NC, 59) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PC_G1, 60) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_PC_G2, 61) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_PC_G1, 62) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_PC_G2, 63) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_PC_G0, 64) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_PC_G1, 65) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_PC_G2, 66) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_PC_G0, 67) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_PC_G1, 68) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_PC_G2, 69) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SB_G0_NC, 70) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SB_G0, 71) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SB_G1_NC, 72) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SB_G1, 73) \ +_ELF_DEFINE_RELOC(R_ARM_ALU_SB_G2, 74) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_SB_G0, 75) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_SB_G1, 76) \ +_ELF_DEFINE_RELOC(R_ARM_LDR_SB_G2, 77) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_SB_G0, 78) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_SB_G1, 79) \ +_ELF_DEFINE_RELOC(R_ARM_LDRS_SB_G2, 80) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_SB_G0, 81) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_SB_G1, 82) \ +_ELF_DEFINE_RELOC(R_ARM_LDC_SB_G2, 83) \ +_ELF_DEFINE_RELOC(R_ARM_MOVW_BREL_NC, 84) \ +_ELF_DEFINE_RELOC(R_ARM_MOVT_BREL, 85) \ +_ELF_DEFINE_RELOC(R_ARM_MOVW_BREL, 86) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVW_BREL_NC, 87) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVT_BREL, 88) \ +_ELF_DEFINE_RELOC(R_ARM_THM_MOVW_BREL, 89) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_GOTDESC, 90) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_CALL, 91) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_DESCSEQ, 92) \ +_ELF_DEFINE_RELOC(R_ARM_THM_TLS_CALL, 93) \ +_ELF_DEFINE_RELOC(R_ARM_PLT32_ABS, 94) \ +_ELF_DEFINE_RELOC(R_ARM_GOT_ABS, 95) \ +_ELF_DEFINE_RELOC(R_ARM_GOT_PREL, 96) \ +_ELF_DEFINE_RELOC(R_ARM_GOT_BREL12, 97) \ +_ELF_DEFINE_RELOC(R_ARM_GOTOFF12, 98) \ +_ELF_DEFINE_RELOC(R_ARM_GOTRELAX, 99) \ +_ELF_DEFINE_RELOC(R_ARM_GNU_VTENTRY, 100) \ 
+_ELF_DEFINE_RELOC(R_ARM_GNU_VTINHERIT, 101) \ +_ELF_DEFINE_RELOC(R_ARM_THM_JUMP11, 102) \ +_ELF_DEFINE_RELOC(R_ARM_THM_JUMP8, 103) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_GD32, 104) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_LDM32, 105) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_LDO32, 106) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_IE32, 107) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_LE32, 108) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_LDO12, 109) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_LE12, 110) \ +_ELF_DEFINE_RELOC(R_ARM_TLS_IE12GP, 111) \ +_ELF_DEFINE_RELOC(R_ARM_ME_TOO, 128) \ +_ELF_DEFINE_RELOC(R_ARM_THM_TLS_DESCSEQ16, 129) \ +_ELF_DEFINE_RELOC(R_ARM_THM_TLS_DESCSEQ32, 130) + +#define _ELF_DEFINE_IA64_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_IA_64_NONE, 0) \ +_ELF_DEFINE_RELOC(R_IA_64_IMM14, 0x21) \ +_ELF_DEFINE_RELOC(R_IA_64_IMM22, 0x22) \ +_ELF_DEFINE_RELOC(R_IA_64_IMM64, 0x23) \ +_ELF_DEFINE_RELOC(R_IA_64_DIR32MSB, 0x24) \ +_ELF_DEFINE_RELOC(R_IA_64_DIR32LSB, 0x25) \ +_ELF_DEFINE_RELOC(R_IA_64_DIR64MSB, 0x26) \ +_ELF_DEFINE_RELOC(R_IA_64_DIR64LSB, 0x27) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL22, 0x2a) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL64I, 0x2b) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL32MSB, 0x2c) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL32LSB, 0x2d) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL64MSB, 0x2e) \ +_ELF_DEFINE_RELOC(R_IA_64_GPREL64LSB, 0x2f) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF22, 0x32) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF64I, 0x33) \ +_ELF_DEFINE_RELOC(R_IA_64_PLTOFF22, 0x3a) \ +_ELF_DEFINE_RELOC(R_IA_64_PLTOFF64I, 0x3b) \ +_ELF_DEFINE_RELOC(R_IA_64_PLTOFF64MSB, 0x3e) \ +_ELF_DEFINE_RELOC(R_IA_64_PLTOFF64LSB, 0x3f) \ +_ELF_DEFINE_RELOC(R_IA_64_FPTR64I, 0x43) \ +_ELF_DEFINE_RELOC(R_IA_64_FPTR32MSB, 0x44) \ +_ELF_DEFINE_RELOC(R_IA_64_FPTR32LSB, 0x45) \ +_ELF_DEFINE_RELOC(R_IA_64_FPTR64MSB, 0x46) \ +_ELF_DEFINE_RELOC(R_IA_64_FPTR64LSB, 0x47) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL60B, 0x48) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL21B, 0x49) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL21M, 0x4a) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL21F, 0x4b) \ 
+_ELF_DEFINE_RELOC(R_IA_64_PCREL32MSB, 0x4c) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL32LSB, 0x4d) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL64MSB, 0x4e) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL64LSB, 0x4f) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR22, 0x52) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR64I, 0x53) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR32MSB, 0x54) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR32LSB, 0x55) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR64MSB, 0x56) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_FPTR64LSB, 0x57) \ +_ELF_DEFINE_RELOC(R_IA_64_SEGREL32MSB, 0x5c) \ +_ELF_DEFINE_RELOC(R_IA_64_SEGREL32LSB, 0x5d) \ +_ELF_DEFINE_RELOC(R_IA_64_SEGREL64MSB, 0x5e) \ +_ELF_DEFINE_RELOC(R_IA_64_SEGREL64LSB, 0x5f) \ +_ELF_DEFINE_RELOC(R_IA_64_SECREL32MSB, 0x64) \ +_ELF_DEFINE_RELOC(R_IA_64_SECREL32LSB, 0x65) \ +_ELF_DEFINE_RELOC(R_IA_64_SECREL64MSB, 0x66) \ +_ELF_DEFINE_RELOC(R_IA_64_SECREL64LSB, 0x67) \ +_ELF_DEFINE_RELOC(R_IA_64_REL32MSB, 0x6c) \ +_ELF_DEFINE_RELOC(R_IA_64_REL32LSB, 0x6d) \ +_ELF_DEFINE_RELOC(R_IA_64_REL64MSB, 0x6e) \ +_ELF_DEFINE_RELOC(R_IA_64_REL64LSB, 0x6f) \ +_ELF_DEFINE_RELOC(R_IA_64_LTV32MSB, 0x74) \ +_ELF_DEFINE_RELOC(R_IA_64_LTV32LSB, 0x75) \ +_ELF_DEFINE_RELOC(R_IA_64_LTV64MSB, 0x76) \ +_ELF_DEFINE_RELOC(R_IA_64_LTV64LSB, 0x77) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL21BIa, 0x79) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL22, 0x7A) \ +_ELF_DEFINE_RELOC(R_IA_64_PCREL64I, 0x7B) \ +_ELF_DEFINE_RELOC(R_IA_64_IPLTMSB, 0x80) \ +_ELF_DEFINE_RELOC(R_IA_64_IPLTLSB, 0x81) \ +_ELF_DEFINE_RELOC(R_IA_64_SUB, 0x85) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF22X, 0x86) \ +_ELF_DEFINE_RELOC(R_IA_64_LDXMOV, 0x87) \ +_ELF_DEFINE_RELOC(R_IA_64_TPREL14, 0x91) \ +_ELF_DEFINE_RELOC(R_IA_64_TPREL22, 0x92) \ +_ELF_DEFINE_RELOC(R_IA_64_TPREL64I, 0x93) \ +_ELF_DEFINE_RELOC(R_IA_64_TPREL64MSB, 0x96) \ +_ELF_DEFINE_RELOC(R_IA_64_TPREL64LSB, 0x97) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_TPREL22, 0x9A) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPMOD64MSB, 0xA6) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPMOD64LSB, 0xA7) \ 
+_ELF_DEFINE_RELOC(R_IA_64_LTOFF_DTPMOD22, 0xAA) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL14, 0xB1) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL22, 0xB2) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL64I, 0xB3) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL32MSB, 0xB4) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL32LSB, 0xB5) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL64MSB, 0xB6) \ +_ELF_DEFINE_RELOC(R_IA_64_DTPREL64LSB, 0xB7) \ +_ELF_DEFINE_RELOC(R_IA_64_LTOFF_DTPREL22, 0xBA) + +#define _ELF_DEFINE_MIPS_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_MIPS_NONE, 0) \ +_ELF_DEFINE_RELOC(R_MIPS_16, 1) \ +_ELF_DEFINE_RELOC(R_MIPS_32, 2) \ +_ELF_DEFINE_RELOC(R_MIPS_REL32, 3) \ +_ELF_DEFINE_RELOC(R_MIPS_26, 4) \ +_ELF_DEFINE_RELOC(R_MIPS_HI16, 5) \ +_ELF_DEFINE_RELOC(R_MIPS_LO16, 6) \ +_ELF_DEFINE_RELOC(R_MIPS_GPREL16, 7) \ +_ELF_DEFINE_RELOC(R_MIPS_LITERAL, 8) \ +_ELF_DEFINE_RELOC(R_MIPS_GOT16, 9) \ +_ELF_DEFINE_RELOC(R_MIPS_PC16, 10) \ +_ELF_DEFINE_RELOC(R_MIPS_CALL16, 11) \ +_ELF_DEFINE_RELOC(R_MIPS_GPREL32, 12) \ +_ELF_DEFINE_RELOC(R_MIPS_64, 18) \ +_ELF_DEFINE_RELOC(R_MIPS_GOTHI16, 21) \ +_ELF_DEFINE_RELOC(R_MIPS_GOTLO16, 22) \ +_ELF_DEFINE_RELOC(R_MIPS_CALLHI16, 30) \ +_ELF_DEFINE_RELOC(R_MIPS_CALLLO16, 31) + +#define _ELF_DEFINE_PPC32_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_PPC_NONE, 0) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR32, 1) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR24, 2) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR16, 3) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR16_LO, 4) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR16_HI, 5) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR16_HA, 6) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR14, 7) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR14_BRTAKEN, 8) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR14_BRNTAKEN, 9) \ +_ELF_DEFINE_RELOC(R_PPC_REL24, 10) \ +_ELF_DEFINE_RELOC(R_PPC_REL14, 11) \ +_ELF_DEFINE_RELOC(R_PPC_REL14_BRTAKEN, 12) \ +_ELF_DEFINE_RELOC(R_PPC_REL14_BRNTAKEN, 13) \ +_ELF_DEFINE_RELOC(R_PPC_GOT16, 14) \ +_ELF_DEFINE_RELOC(R_PPC_GOT16_LO, 15) \ +_ELF_DEFINE_RELOC(R_PPC_GOT16_HI, 16) \ +_ELF_DEFINE_RELOC(R_PPC_GOT16_HA, 17) \ +_ELF_DEFINE_RELOC(R_PPC_PLTREL24, 18) \ 
+_ELF_DEFINE_RELOC(R_PPC_COPY, 19) \ +_ELF_DEFINE_RELOC(R_PPC_GLOB_DAT, 20) \ +_ELF_DEFINE_RELOC(R_PPC_JMP_SLOT, 21) \ +_ELF_DEFINE_RELOC(R_PPC_RELATIVE, 22) \ +_ELF_DEFINE_RELOC(R_PPC_LOCAL24PC, 23) \ +_ELF_DEFINE_RELOC(R_PPC_UADDR32, 24) \ +_ELF_DEFINE_RELOC(R_PPC_UADDR16, 25) \ +_ELF_DEFINE_RELOC(R_PPC_REL32, 26) \ +_ELF_DEFINE_RELOC(R_PPC_PLT32, 27) \ +_ELF_DEFINE_RELOC(R_PPC_PLTREL32, 28) \ +_ELF_DEFINE_RELOC(R_PPC_PLT16_LO, 29) \ +_ELF_DEFINE_RELOC(R_PPC_PLT16_HI, 30) \ +_ELF_DEFINE_RELOC(R_PPC_PLT16_HA, 31) \ +_ELF_DEFINE_RELOC(R_PPC_SDAREL16, 32) \ +_ELF_DEFINE_RELOC(R_PPC_SECTOFF, 33) \ +_ELF_DEFINE_RELOC(R_PPC_SECTOFF_LO, 34) \ +_ELF_DEFINE_RELOC(R_PPC_SECTOFF_HI, 35) \ +_ELF_DEFINE_RELOC(R_PPC_SECTOFF_HA, 36) \ +_ELF_DEFINE_RELOC(R_PPC_ADDR30, 37) \ +_ELF_DEFINE_RELOC(R_PPC_TLS, 67) \ +_ELF_DEFINE_RELOC(R_PPC_DTPMOD32, 68) \ +_ELF_DEFINE_RELOC(R_PPC_TPREL16, 69) \ +_ELF_DEFINE_RELOC(R_PPC_TPREL16_LO, 70) \ +_ELF_DEFINE_RELOC(R_PPC_TPREL16_HI, 71) \ +_ELF_DEFINE_RELOC(R_PPC_TPREL16_HA, 72) \ +_ELF_DEFINE_RELOC(R_PPC_TPREL32, 73) \ +_ELF_DEFINE_RELOC(R_PPC_DTPREL16, 74) \ +_ELF_DEFINE_RELOC(R_PPC_DTPREL16_LO, 75) \ +_ELF_DEFINE_RELOC(R_PPC_DTPREL16_HI, 76) \ +_ELF_DEFINE_RELOC(R_PPC_DTPREL16_HA, 77) \ +_ELF_DEFINE_RELOC(R_PPC_DTPREL32, 78) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSGD16, 79) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSGD16_LO, 80) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSGD16_HI, 81) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSGD16_HA, 82) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSLD16, 83) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSLD16_LO, 84) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSLD16_HI, 85) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TLSLD16_HA, 86) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TPREL16, 87) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TPREL16_LO, 88) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TPREL16_HI, 89) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_TPREL16_HA, 90) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_DTPREL16, 91) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_DTPREL16_LO, 92) \ +_ELF_DEFINE_RELOC(R_PPC_GOT_DTPREL16_HI, 93) \
+_ELF_DEFINE_RELOC(R_PPC_GOT_DTPREL16_HA, 94) \ +_ELF_DEFINE_RELOC(R_PPC_TLSGD, 95) \ +_ELF_DEFINE_RELOC(R_PPC_TLSLD, 96) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_NADDR32, 101) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_NADDR16, 102) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_NADDR16_LO, 103) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_NADDR16_HI, 104) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_NADDR16_HA, 105) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_SDAI16, 106) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_SDA2I16, 107) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_SDA2REL, 108) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_SDA21, 109) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_MRKREF, 110) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_RELSEC16, 111) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_RELST_LO, 112) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_RELST_HI, 113) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_RELST_HA, 114) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_BIT_FLD, 115) \ +_ELF_DEFINE_RELOC(R_PPC_EMB_RELSDA, 116) + +#define _ELF_DEFINE_PPC64_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_PPC64_NONE, 0) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR32, 1) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR24, 2) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16, 3) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_LO, 4) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HI, 5) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HA, 6) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR14, 7) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR14_BRTAKEN, 8) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR14_BRNTAKEN, 9) \ +_ELF_DEFINE_RELOC(R_PPC64_REL24, 10) \ +_ELF_DEFINE_RELOC(R_PPC64_REL14, 11) \ +_ELF_DEFINE_RELOC(R_PPC64_REL14_BRTAKEN, 12) \ +_ELF_DEFINE_RELOC(R_PPC64_REL14_BRNTAKEN, 13) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16, 14) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16_LO, 15) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16_HI, 16) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16_HA, 17) \ +_ELF_DEFINE_RELOC(R_PPC64_COPY, 19) \ +_ELF_DEFINE_RELOC(R_PPC64_GLOB_DAT, 20) \ +_ELF_DEFINE_RELOC(R_PPC64_JMP_SLOT, 21) \ +_ELF_DEFINE_RELOC(R_PPC64_RELATIVE, 22) \ +_ELF_DEFINE_RELOC(R_PPC64_UADDR32, 24) \ +_ELF_DEFINE_RELOC(R_PPC64_UADDR16, 25) \ +_ELF_DEFINE_RELOC(R_PPC64_REL32, 26) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT32, 27) \
+_ELF_DEFINE_RELOC(R_PPC64_PLTREL32, 28) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT16_LO, 29) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT16_HI, 30) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT16_HA, 31) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF, 33) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF_LO, 34) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF_HI, 35) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF_HA, 36) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR30, 37) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR64, 38) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HIGHER, 39) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HIGHERA, 40) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HIGHEST, 41) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_HIGHESTA, 42) \ +_ELF_DEFINE_RELOC(R_PPC64_UADDR64, 43) \ +_ELF_DEFINE_RELOC(R_PPC64_REL64, 44) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT64, 45) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTREL64, 46) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16, 47) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16_LO, 48) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16_HI, 49) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16_HA, 50) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC, 51) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16, 52) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16_LO, 53) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16_HI, 54) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16_HA, 55) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_DS, 56) \ +_ELF_DEFINE_RELOC(R_PPC64_ADDR16_LO_DS, 57) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16_DS, 58) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT16_LO_DS, 59) \ +_ELF_DEFINE_RELOC(R_PPC64_PLT16_LO_DS, 60) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF_DS, 61) \ +_ELF_DEFINE_RELOC(R_PPC64_SECTOFF_LO_DS, 62) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16_DS, 63) \ +_ELF_DEFINE_RELOC(R_PPC64_TOC16_LO_DS, 64) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16_DS, 65) \ +_ELF_DEFINE_RELOC(R_PPC64_PLTGOT16_LO_DS, 66) \ +_ELF_DEFINE_RELOC(R_PPC64_TLS, 67) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPMOD64, 68) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16, 69) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_LO, 70) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HI, 71) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HA, 72) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL64, 73) \
+_ELF_DEFINE_RELOC(R_PPC64_DTPREL16, 74) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_LO, 75) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HI, 76) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HA, 77) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL64, 78) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSGD16, 79) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSGD16_LO, 80) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSGD16_HI, 81) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSGD16_HA, 82) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSLD16, 83) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSLD16_LO, 84) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSLD16_HI, 85) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TLSLD16_HA, 86) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TPREL16_DS, 87) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TPREL16_LO_DS, 88) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TPREL16_HI, 89) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_TPREL16_HA, 90) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_DTPREL16_DS, 91) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_DTPREL16_LO_DS, 92) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_DTPREL16_HI, 93) \ +_ELF_DEFINE_RELOC(R_PPC64_GOT_DTPREL16_HA, 94) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_DS, 95) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_LO_DS, 96) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HIGHER, 97) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HIGHERA, 98) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HIGHEST, 99) \ +_ELF_DEFINE_RELOC(R_PPC64_TPREL16_HIGHESTA, 100) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_DS, 101) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_LO_DS, 102) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HIGHER, 103) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HIGHERA, 104) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HIGHEST, 105) \ +_ELF_DEFINE_RELOC(R_PPC64_DTPREL16_HIGHESTA, 106) \ +_ELF_DEFINE_RELOC(R_PPC64_TLSGD, 107) \ +_ELF_DEFINE_RELOC(R_PPC64_TLSLD, 108) + +#define _ELF_DEFINE_SPARC_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_SPARC_NONE, 0) \ +_ELF_DEFINE_RELOC(R_SPARC_8, 1) \ +_ELF_DEFINE_RELOC(R_SPARC_16, 2) \ +_ELF_DEFINE_RELOC(R_SPARC_32, 3) \ +_ELF_DEFINE_RELOC(R_SPARC_DISP8, 4) \ +_ELF_DEFINE_RELOC(R_SPARC_DISP16, 5) \ +_ELF_DEFINE_RELOC(R_SPARC_DISP32, 6) \ 
+_ELF_DEFINE_RELOC(R_SPARC_WDISP30, 7) \ +_ELF_DEFINE_RELOC(R_SPARC_WDISP22, 8) \ +_ELF_DEFINE_RELOC(R_SPARC_HI22, 9) \ +_ELF_DEFINE_RELOC(R_SPARC_22, 10) \ +_ELF_DEFINE_RELOC(R_SPARC_13, 11) \ +_ELF_DEFINE_RELOC(R_SPARC_LO10, 12) \ +_ELF_DEFINE_RELOC(R_SPARC_GOT10, 13) \ +_ELF_DEFINE_RELOC(R_SPARC_GOT13, 14) \ +_ELF_DEFINE_RELOC(R_SPARC_GOT22, 15) \ +_ELF_DEFINE_RELOC(R_SPARC_PC10, 16) \ +_ELF_DEFINE_RELOC(R_SPARC_PC22, 17) \ +_ELF_DEFINE_RELOC(R_SPARC_WPLT30, 18) \ +_ELF_DEFINE_RELOC(R_SPARC_COPY, 19) \ +_ELF_DEFINE_RELOC(R_SPARC_GLOB_DAT, 20) \ +_ELF_DEFINE_RELOC(R_SPARC_JMP_SLOT, 21) \ +_ELF_DEFINE_RELOC(R_SPARC_RELATIVE, 22) \ +_ELF_DEFINE_RELOC(R_SPARC_UA32, 23) \ +_ELF_DEFINE_RELOC(R_SPARC_PLT32, 24) \ +_ELF_DEFINE_RELOC(R_SPARC_HIPLT22, 25) \ +_ELF_DEFINE_RELOC(R_SPARC_LOPLT10, 26) \ +_ELF_DEFINE_RELOC(R_SPARC_PCPLT32, 27) \ +_ELF_DEFINE_RELOC(R_SPARC_PCPLT22, 28) \ +_ELF_DEFINE_RELOC(R_SPARC_PCPLT10, 29) \ +_ELF_DEFINE_RELOC(R_SPARC_10, 30) \ +_ELF_DEFINE_RELOC(R_SPARC_11, 31) \ +_ELF_DEFINE_RELOC(R_SPARC_64, 32) \ +_ELF_DEFINE_RELOC(R_SPARC_OLO10, 33) \ +_ELF_DEFINE_RELOC(R_SPARC_HH22, 34) \ +_ELF_DEFINE_RELOC(R_SPARC_HM10, 35) \ +_ELF_DEFINE_RELOC(R_SPARC_LM22, 36) \ +_ELF_DEFINE_RELOC(R_SPARC_PC_HH22, 37) \ +_ELF_DEFINE_RELOC(R_SPARC_PC_HM10, 38) \ +_ELF_DEFINE_RELOC(R_SPARC_PC_LM22, 39) \ +_ELF_DEFINE_RELOC(R_SPARC_WDISP16, 40) \ +_ELF_DEFINE_RELOC(R_SPARC_WDISP19, 41) \ +_ELF_DEFINE_RELOC(R_SPARC_7, 43) \ +_ELF_DEFINE_RELOC(R_SPARC_5, 44) \ +_ELF_DEFINE_RELOC(R_SPARC_6, 45) \ +_ELF_DEFINE_RELOC(R_SPARC_DISP64, 46) \ +_ELF_DEFINE_RELOC(R_SPARC_PLT64, 47) \ +_ELF_DEFINE_RELOC(R_SPARC_HIX22, 48) \ +_ELF_DEFINE_RELOC(R_SPARC_LOX10, 49) \ +_ELF_DEFINE_RELOC(R_SPARC_H44, 50) \ +_ELF_DEFINE_RELOC(R_SPARC_M44, 51) \ +_ELF_DEFINE_RELOC(R_SPARC_L44, 52) \ +_ELF_DEFINE_RELOC(R_SPARC_REGISTER, 53) \ +_ELF_DEFINE_RELOC(R_SPARC_UA64, 54) \ +_ELF_DEFINE_RELOC(R_SPARC_UA16, 55) \ +_ELF_DEFINE_RELOC(R_SPARC_GOTDATA_HIX22, 80) \ 
+_ELF_DEFINE_RELOC(R_SPARC_GOTDATA_LOX10, 81) \ +_ELF_DEFINE_RELOC(R_SPARC_GOTDATA_OP_HIX22, 82) \ +_ELF_DEFINE_RELOC(R_SPARC_GOTDATA_OP_LOX10, 83) \ +_ELF_DEFINE_RELOC(R_SPARC_GOTDATA_OP, 84) \ +_ELF_DEFINE_RELOC(R_SPARC_H34, 85) + +#define _ELF_DEFINE_X86_64_RELOCATIONS() \ +_ELF_DEFINE_RELOC(R_X86_64_NONE, 0) \ +_ELF_DEFINE_RELOC(R_X86_64_64, 1) \ +_ELF_DEFINE_RELOC(R_X86_64_PC32, 2) \ +_ELF_DEFINE_RELOC(R_X86_64_GOT32, 3) \ +_ELF_DEFINE_RELOC(R_X86_64_PLT32, 4) \ +_ELF_DEFINE_RELOC(R_X86_64_COPY, 5) \ +_ELF_DEFINE_RELOC(R_X86_64_GLOB_DAT, 6) \ +_ELF_DEFINE_RELOC(R_X86_64_JUMP_SLOT, 7) \ +_ELF_DEFINE_RELOC(R_X86_64_RELATIVE, 8) \ +_ELF_DEFINE_RELOC(R_X86_64_GOTPCREL, 9) \ +_ELF_DEFINE_RELOC(R_X86_64_32, 10) \ +_ELF_DEFINE_RELOC(R_X86_64_32S, 11) \ +_ELF_DEFINE_RELOC(R_X86_64_16, 12) \ +_ELF_DEFINE_RELOC(R_X86_64_PC16, 13) \ +_ELF_DEFINE_RELOC(R_X86_64_8, 14) \ +_ELF_DEFINE_RELOC(R_X86_64_PC8, 15) \ +_ELF_DEFINE_RELOC(R_X86_64_DTPMOD64, 16) \ +_ELF_DEFINE_RELOC(R_X86_64_DTPOFF64, 17) \ +_ELF_DEFINE_RELOC(R_X86_64_TPOFF64, 18) \ +_ELF_DEFINE_RELOC(R_X86_64_TLSGD, 19) \ +_ELF_DEFINE_RELOC(R_X86_64_TLSLD, 20) \ +_ELF_DEFINE_RELOC(R_X86_64_DTPOFF32, 21) \ +_ELF_DEFINE_RELOC(R_X86_64_GOTTPOFF, 22) \ +_ELF_DEFINE_RELOC(R_X86_64_TPOFF32, 23) \ +_ELF_DEFINE_RELOC(R_X86_64_PC64, 24) \ +_ELF_DEFINE_RELOC(R_X86_64_GOTOFF64, 25) \ +_ELF_DEFINE_RELOC(R_X86_64_GOTPC32, 26) \ +_ELF_DEFINE_RELOC(R_X86_64_SIZE32, 32) \ +_ELF_DEFINE_RELOC(R_X86_64_SIZE64, 33) \ +_ELF_DEFINE_RELOC(R_X86_64_GOTPC32_TLSDESC, 34) \ +_ELF_DEFINE_RELOC(R_X86_64_TLSDESC_CALL, 35) \ +_ELF_DEFINE_RELOC(R_X86_64_TLSDESC, 36) + +#define _ELF_DEFINE_HSAIL_RELOCATIONS() \ + _ELF_DEFINE_RELOC(R_HSA_NONE, 0) \ + _ELF_DEFINE_RELOC(R_HSA_RESERVED_TYPE1, 1) /*reserved to use in BRIG linker */\ + _ELF_DEFINE_RELOC(R_HSA_RESERVED_TYPE2, 2) /*reserved to use in BRIG linker */\ + _ELF_DEFINE_RELOC(R_HSA_DWARF_32, 3) \ + _ELF_DEFINE_RELOC(R_HSA_DWARF_TO_BRIG_CODE32, 4) \ + 
_ELF_DEFINE_RELOC(R_HSA_DWARF_TO_BRIG_DIRECTIVES32, 5) \ + _ELF_DEFINE_RELOC(R_HSA_DWARF_64, 6) \ + _ELF_DEFINE_RELOC(R_HSA_DWARF_TO_BRIG_CODE64, 7) \ + _ELF_DEFINE_RELOC(R_HSA_DWARF_TO_BRIG_DIRECTIVES64, 8) + +#define _ELF_DEFINE_RELOCATIONS() \ +_ELF_DEFINE_386_RELOCATIONS() \ +_ELF_DEFINE_AMD64_RELOCATIONS() \ +_ELF_DEFINE_IA64_RELOCATIONS() \ +_ELF_DEFINE_MIPS_RELOCATIONS() \ +_ELF_DEFINE_PPC32_RELOCATIONS() \ +_ELF_DEFINE_PPC64_RELOCATIONS() \ +_ELF_DEFINE_SPARC_RELOCATIONS() \ +_ELF_DEFINE_X86_64_RELOCATIONS() \ +_ELF_DEFINE_HSAIL_RELOCATIONS() + +#undef _ELF_DEFINE_RELOC +#define _ELF_DEFINE_RELOC(N, V) N = V , +enum { + _ELF_DEFINE_RELOCATIONS() + R__LAST__ +}; + +#define PN_XNUM 0xFFFFU /* Use extended section numbering. */ + +/** + ** ELF Types. + **/ + +typedef uint32_t Elf32_Addr; /* Program address. */ +typedef uint8_t Elf32_Byte; /* Unsigned tiny integer. */ +typedef uint16_t Elf32_Half; /* Unsigned medium integer. */ +typedef uint32_t Elf32_Off; /* File offset. */ +typedef uint16_t Elf32_Section; /* Section index. */ +typedef int32_t Elf32_Sword; /* Signed integer. */ +typedef uint32_t Elf32_Word; /* Unsigned integer. */ +typedef uint64_t Elf32_Lword; /* Unsigned long integer. */ + +typedef uint64_t Elf64_Addr; /* Program address. */ +typedef uint8_t Elf64_Byte; /* Unsigned tiny integer. */ +typedef uint16_t Elf64_Half; /* Unsigned medium integer. */ +typedef uint64_t Elf64_Off; /* File offset. */ +typedef uint16_t Elf64_Section; /* Section index. */ +typedef int32_t Elf64_Sword; /* Signed integer. */ +typedef uint32_t Elf64_Word; /* Unsigned integer. */ +typedef uint64_t Elf64_Lword; /* Unsigned long integer. */ +typedef uint64_t Elf64_Xword; /* Unsigned long integer. */ +typedef int64_t Elf64_Sxword; /* Signed long integer. */ + + +/* + * Capability descriptors. + */ + +/* 32-bit capability descriptor. */ +typedef struct { + Elf32_Word c_tag; /* Type of entry. */ + union { + Elf32_Word c_val; /* Integer value. 
*/ + Elf32_Addr c_ptr; /* Pointer value. */ + } c_un; +} Elf32_Cap; + +/* 64-bit capability descriptor. */ +typedef struct { + Elf64_Xword c_tag; /* Type of entry. */ + union { + Elf64_Xword c_val; /* Integer value. */ + Elf64_Addr c_ptr; /* Pointer value. */ + } c_un; +} Elf64_Cap; + +/* + * MIPS .conflict section entries. + */ + +/* 32-bit entry. */ +typedef struct { + Elf32_Addr c_index; +} Elf32_Conflict; + +/* 64-bit entry. */ +typedef struct { + Elf64_Addr c_index; +} Elf64_Conflict; + +/* + * Dynamic section entries. + */ + +/* 32-bit entry. */ +typedef struct { + Elf32_Sword d_tag; /* Type of entry. */ + union { + Elf32_Word d_val; /* Integer value. */ + Elf32_Addr d_ptr; /* Pointer value. */ + } d_un; +} Elf32_Dyn; + +/* 64-bit entry. */ +typedef struct { + Elf64_Sxword d_tag; /* Type of entry. */ + union { + Elf64_Xword d_val; /* Integer value. */ + Elf64_Addr d_ptr; /* Pointer value. */ + } d_un; +} Elf64_Dyn; + + +/* + * The executable header (EHDR). + */ + +/* 32 bit EHDR. */ +typedef struct { + unsigned char e_ident[EI_NIDENT]; /* ELF identification. */ + Elf32_Half e_type; /* Object file type (ET_*). */ + Elf32_Half e_machine; /* Machine type (EM_*). */ + Elf32_Word e_version; /* File format version (EV_*). */ + Elf32_Addr e_entry; /* Start address. */ + Elf32_Off e_phoff; /* File offset to the PHDR table. */ + Elf32_Off e_shoff; /* File offset to the SHDR table. */ + Elf32_Word e_flags; /* Flags (EF_*). */ + Elf32_Half e_ehsize; /* Elf header size in bytes. */ + Elf32_Half e_phentsize; /* PHDR table entry size in bytes. */ + Elf32_Half e_phnum; /* Number of PHDR entries. */ + Elf32_Half e_shentsize; /* SHDR table entry size in bytes. */ + Elf32_Half e_shnum; /* Number of SHDR entries. */ + Elf32_Half e_shstrndx; /* Index of section name string table. */ +} Elf32_Ehdr; + + +/* 64 bit EHDR. */ +typedef struct { + unsigned char e_ident[EI_NIDENT]; /* ELF identification. */ + Elf64_Half e_type; /* Object file type (ET_*).
*/ + Elf64_Half e_machine; /* Machine type (EM_*). */ + Elf64_Word e_version; /* File format version (EV_*). */ + Elf64_Addr e_entry; /* Start address. */ + Elf64_Off e_phoff; /* File offset to the PHDR table. */ + Elf64_Off e_shoff; /* File offset to the SHDR table. */ + Elf64_Word e_flags; /* Flags (EF_*). */ + Elf64_Half e_ehsize; /* Elf header size in bytes. */ + Elf64_Half e_phentsize; /* PHDR table entry size in bytes. */ + Elf64_Half e_phnum; /* Number of PHDR entries. */ + Elf64_Half e_shentsize; /* SHDR table entry size in bytes. */ + Elf64_Half e_shnum; /* Number of SHDR entries. */ + Elf64_Half e_shstrndx; /* Index of section name string table. */ +} Elf64_Ehdr; + + +/* + * Shared object information. + */ + +/* 32-bit entry. */ +typedef struct { + Elf32_Word l_name; /* The name of a shared object. */ + Elf32_Word l_time_stamp; /* 32-bit timestamp. */ + Elf32_Word l_checksum; /* Checksum of visible symbols, sizes. */ + Elf32_Word l_version; /* Interface version string index. */ + Elf32_Word l_flags; /* Flags (LL_*). */ +} Elf32_Lib; + +/* 64-bit entry.
*/ +typedef struct { + Elf64_Word l_name; + Elf64_Word l_time_stamp; + Elf64_Word l_checksum; + Elf64_Word l_version; + Elf64_Word l_flags; +} Elf64_Lib; + +#define _ELF_DEFINE_LL_FLAGS() \ +_ELF_DEFINE_LL(LL_NONE, 0, \ + "no flags") \ +_ELF_DEFINE_LL(LL_EXACT_MATCH, 0x1, \ + "require an exact match") \ +_ELF_DEFINE_LL(LL_IGNORE_INT_VER, 0x2, \ + "ignore version incompatibilities") \ +_ELF_DEFINE_LL(LL_REQUIRE_MINOR, 0x4, \ + "") \ +_ELF_DEFINE_LL(LL_EXPORTS, 0x8, \ + "") \ +_ELF_DEFINE_LL(LL_DELAY_LOAD, 0x10, \ + "") \ +_ELF_DEFINE_LL(LL_DELTA, 0x20, \ + "") + +#undef _ELF_DEFINE_LL +#define _ELF_DEFINE_LL(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_LL_FLAGS() + LL__LAST__ +}; + +/* + * Note tags + */ + +#define _ELF_DEFINE_NOTE_ENTRY_TYPES() \ +_ELF_DEFINE_NT(NT_ABI_TAG, 1, "Tag indicating the ABI") \ +_ELF_DEFINE_NT(NT_GNU_HWCAP, 2, "Hardware capabilities") \ +_ELF_DEFINE_NT(NT_GNU_BUILD_ID, 3, "Build id, set by ld(1)") \ +_ELF_DEFINE_NT(NT_GNU_GOLD_VERSION, 4, \ + "Version number of the GNU gold linker") \ +_ELF_DEFINE_NT(NT_PRSTATUS, 1, "Process status") \ +_ELF_DEFINE_NT(NT_FPREGSET, 2, "Floating point information") \ +_ELF_DEFINE_NT(NT_PRPSINFO, 3, "Process information") \ +_ELF_DEFINE_NT(NT_AUXV, 6, "Auxiliary vector") \ +_ELF_DEFINE_NT(NT_PRXFPREG, 0x46E62B7FUL, \ + "Linux user_xfpregs structure") \ +_ELF_DEFINE_NT(NT_PSTATUS, 10, "Linux process status") \ +_ELF_DEFINE_NT(NT_FPREGS, 12, "Linux floating point regset") \ +_ELF_DEFINE_NT(NT_PSINFO, 13, "Linux process information") \ +_ELF_DEFINE_NT(NT_LWPSTATUS, 16, "Linux lwpstatus_t type") \ +_ELF_DEFINE_NT(NT_LWPSINFO, 17, "Linux lwpinfo_t type") + +#undef _ELF_DEFINE_NT +#define _ELF_DEFINE_NT(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_NOTE_ENTRY_TYPES() + NT__LAST__ +}; + +/* Aliases for the ABI tag. */ +#define NT_FREEBSD_ABI_TAG NT_ABI_TAG +#define NT_GNU_ABI_TAG NT_ABI_TAG +#define NT_NETBSD_IDENT NT_ABI_TAG +#define NT_OPENBSD_IDENT NT_ABI_TAG + +/* + * Note descriptors. 
+ */ + +typedef struct { + uint32_t n_namesz; /* Length of note's name. */ + uint32_t n_descsz; /* Length of note's value. */ + uint32_t n_type; /* Type of note. */ +} Elf_Note; + +typedef Elf_Note Elf32_Nhdr; /* 32-bit note header. */ +typedef Elf_Note Elf64_Nhdr; /* 64-bit note header. */ + +/* + * MIPS ELF options descriptor header. + */ + +typedef struct { + Elf64_Byte kind; /* Type of options. */ + Elf64_Byte size; /* Size of option descriptor. */ + Elf64_Half section; /* Index of section affected. */ + Elf64_Word info; /* Kind-specific information. */ +} Elf_Options; + +/* + * Option kinds. + */ + +#define _ELF_DEFINE_OPTION_KINDS() \ +_ELF_DEFINE_ODK(ODK_NULL, 0, "undefined") \ +_ELF_DEFINE_ODK(ODK_REGINFO, 1, "register usage info") \ +_ELF_DEFINE_ODK(ODK_EXCEPTIONS, 2, "exception processing info") \ +_ELF_DEFINE_ODK(ODK_PAD, 3, "section padding") \ +_ELF_DEFINE_ODK(ODK_HWPATCH, 4, "hardware patch applied") \ +_ELF_DEFINE_ODK(ODK_FILL, 5, "fill value used by linker") \ +_ELF_DEFINE_ODK(ODK_TAGS, 6, "reserved space for tools") \ +_ELF_DEFINE_ODK(ODK_HWAND, 7, "hardware AND patch applied") \ +_ELF_DEFINE_ODK(ODK_HWOR, 8, "hardware OR patch applied") \ +_ELF_DEFINE_ODK(ODK_GP_GROUP, 9, \ + "GP group to use for text/data sections") \ +_ELF_DEFINE_ODK(ODK_IDENT, 10, "ID information") \ +_ELF_DEFINE_ODK(ODK_PAGESIZE, 11, "page size infomation") + +#undef _ELF_DEFINE_ODK +#define _ELF_DEFINE_ODK(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_OPTION_KINDS() + ODK__LAST__ +}; + +/* + * ODK_EXCEPTIONS info field masks. 
+ */ + +#define _ELF_DEFINE_ODK_EXCEPTIONS_MASK() \ +_ELF_DEFINE_OEX(OEX_FPU_MIN, 0x0000001FUL, \ + "minimum FPU exception which must be enabled") \ +_ELF_DEFINE_OEX(OEX_FPU_MAX, 0x00001F00UL, \ + "maximum FPU exception which can be enabled") \ +_ELF_DEFINE_OEX(OEX_PAGE0, 0x00010000UL, \ + "page zero must be mapped") \ +_ELF_DEFINE_OEX(OEX_SMM, 0x00020000UL, \ + "run in sequential memory mode") \ +_ELF_DEFINE_OEX(OEX_PRECISEFP, 0x00040000UL, \ + "run in precise FP exception mode") \ +_ELF_DEFINE_OEX(OEX_DISMISS, 0x00080000UL, \ + "dismiss invalid address traps") + +#undef _ELF_DEFINE_OEX +#define _ELF_DEFINE_OEX(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ODK_EXCEPTIONS_MASK() + OEX__LAST__ +}; + +/* + * ODK_PAD info field masks. + */ + +#define _ELF_DEFINE_ODK_PAD_MASK() \ +_ELF_DEFINE_OPAD(OPAD_PREFIX, 0x0001) \ +_ELF_DEFINE_OPAD(OPAD_POSTFIX, 0x0002) \ +_ELF_DEFINE_OPAD(OPAD_SYMBOL, 0x0004) + +#undef _ELF_DEFINE_OPAD +#define _ELF_DEFINE_OPAD(N, V) N = V , +enum { + _ELF_DEFINE_ODK_PAD_MASK() + OPAD__LAST__ +}; + +/* + * ODK_HWPATCH info field masks. + */ + +#define _ELF_DEFINE_ODK_HWPATCH_MASK() \ +_ELF_DEFINE_OHW(OHW_R4KEOP, 0x00000001UL, \ + "patch for R4000 branch at end-of-page bug") \ +_ELF_DEFINE_OHW(OHW_R8KPFETCH, 0x00000002UL, \ + "R8000 prefetch bug may occur") \ +_ELF_DEFINE_OHW(OHW_R5KEOP, 0x00000004UL, \ + "patch for R5000 branch at end-of-page bug") \ +_ELF_DEFINE_OHW(OHW_R5KCVTL, 0x00000008UL, \ + "R5000 cvt.[ds].l bug: clean == 1") \ +_ELF_DEFINE_OHW(OHW_R10KLDL, 0x00000010UL, \ + "needd patch for R10000 misaligned load") + +#undef _ELF_DEFINE_OHW +#define _ELF_DEFINE_OHW(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ODK_HWPATCH_MASK() + OHW__LAST__ +}; + +/* + * ODK_HWAND/ODK_HWOR info field and hwp_flags[12] masks. 
+ */ + +#define _ELF_DEFINE_ODK_HWP_MASK() \ +_ELF_DEFINE_HWP(OHWA0_R4KEOP_CHECKED, 0x00000001UL, \ + "object checked for R4000 end-of-page bug") \ +_ELF_DEFINE_HWP(OHWA0_R4KEOP_CLEAN, 0x00000002UL, \ + "object verified clean for R4000 end-of-page bug") \ +_ELF_DEFINE_HWP(OHWO0_FIXADE, 0x00000001UL, \ + "object requires call to fixade") + +#undef _ELF_DEFINE_HWP +#define _ELF_DEFINE_HWP(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ODK_HWP_MASK() + OHWX0__LAST__ +}; + +/* + * ODK_IDENT/ODK_GP_GROUP info field masks. + */ + +#define _ELF_DEFINE_ODK_GP_MASK() \ +_ELF_DEFINE_OGP(OGP_GROUP, 0x0000FFFFUL, "GP group number") \ +_ELF_DEFINE_OGP(OGP_SELF, 0x00010000UL, \ + "GP group is self-contained") + +#undef _ELF_DEFINE_OGP +#define _ELF_DEFINE_OGP(N, V, DESCR) N = V , +enum { + _ELF_DEFINE_ODK_GP_MASK() + OGP__LAST__ +}; + +/* + * MIPS ELF register info descriptor. + */ + +/* 32 bit RegInfo entry. */ +typedef struct { + Elf32_Word ri_gprmask; /* Mask of general register used. */ + Elf32_Word ri_cprmask[4]; /* Mask of coprocessor register used. */ + Elf32_Addr ri_gp_value; /* GP register value. */ +} Elf32_RegInfo; + +/* 64 bit RegInfo entry. */ +typedef struct { + Elf64_Word ri_gprmask; /* Mask of general register used. */ + Elf64_Word ri_pad; /* Padding. */ + Elf64_Word ri_cprmask[4]; /* Mask of coprocessor register used. */ + Elf64_Addr ri_gp_value; /* GP register value. */ +} Elf64_RegInfo; + +/* + * Program Header Table (PHDR) entries. + */ + +/* 32 bit PHDR entry. */ +typedef struct { + Elf32_Word p_type; /* Type of segment. */ + Elf32_Off p_offset; /* File offset to segment. */ + Elf32_Addr p_vaddr; /* Virtual address in memory. */ + Elf32_Addr p_paddr; /* Physical address (if relevant). */ + Elf32_Word p_filesz; /* Size of segment in file. */ + Elf32_Word p_memsz; /* Size of segment in memory. */ + Elf32_Word p_flags; /* Segment flags. */ + Elf32_Word p_align; /* Alignment constraints. */ +} Elf32_Phdr; + +/* 64 bit PHDR entry. 
*/ +typedef struct { + Elf64_Word p_type; /* Type of segment. */ + Elf64_Word p_flags; /* Segment flags (placed second in the 64-bit layout). */ + Elf64_Off p_offset; /* File offset to segment. */ + Elf64_Addr p_vaddr; /* Virtual address in memory. */ + Elf64_Addr p_paddr; /* Physical address (if relevant). */ + Elf64_Xword p_filesz; /* Size of segment in file. */ + Elf64_Xword p_memsz; /* Size of segment in memory. */ + Elf64_Xword p_align; /* Alignment constraints. */ +} Elf64_Phdr; + + +/* + * Move entries, for describing data in COMMON blocks in a compact + * manner. + */ + +/* 32-bit move entry. */ +typedef struct { + Elf32_Lword m_value; /* Initialization value. */ + Elf32_Word m_info; /* Encoded size and index. */ + Elf32_Word m_poffset; /* Offset relative to symbol. */ + Elf32_Half m_repeat; /* Repeat count. */ + Elf32_Half m_stride; /* Number of units to skip. */ +} Elf32_Move; + +/* 64-bit move entry. */ +typedef struct { + Elf64_Lword m_value; /* Initialization value. */ + Elf64_Xword m_info; /* Encoded size and index. */ + Elf64_Xword m_poffset; /* Offset relative to symbol. */ + Elf64_Half m_repeat; /* Repeat count. */ + Elf64_Half m_stride; /* Number of units to skip. */ +} Elf64_Move; +#ifndef ELF32_M_SYM +#define ELF32_M_SYM(I) ((I) >> 8) +#endif +#ifndef ELF32_M_SIZE +#define ELF32_M_SIZE(I) ((unsigned char) (I)) +#endif +#ifndef ELF32_M_INFO +#define ELF32_M_INFO(M, S) (((M) << 8) + (unsigned char) (S)) +#endif + +#ifndef ELF64_M_SYM +#define ELF64_M_SYM(I) ((I) >> 8) +#endif +#ifndef ELF64_M_SIZE +#define ELF64_M_SIZE(I) ((unsigned char) (I)) +#endif +#ifndef ELF64_M_INFO +#define ELF64_M_INFO(M, S) (((M) << 8) + (unsigned char) (S)) +#endif + +/* + * Section Header Table (SHDR) entries.
+ */ + +/* 32 bit SHDR */ +typedef struct { + Elf32_Word sh_name; /* index of section name */ + Elf32_Word sh_type; /* section type */ + Elf32_Word sh_flags; /* section flags */ + Elf32_Addr sh_addr; /* in-memory address of section */ + Elf32_Off sh_offset; /* file offset of section */ + Elf32_Word sh_size; /* section size in bytes */ + Elf32_Word sh_link; /* section header table link */ + Elf32_Word sh_info; /* extra information */ + Elf32_Word sh_addralign; /* alignment constraint */ + Elf32_Word sh_entsize; /* size for fixed-size entries */ +} Elf32_Shdr; + +/* 64 bit SHDR */ +typedef struct { + Elf64_Word sh_name; /* index of section name */ + Elf64_Word sh_type; /* section type */ + Elf64_Xword sh_flags; /* section flags */ + Elf64_Addr sh_addr; /* in-memory address of section */ + Elf64_Off sh_offset; /* file offset of section */ + Elf64_Xword sh_size; /* section size in bytes */ + Elf64_Word sh_link; /* section header table link */ + Elf64_Word sh_info; /* extra information */ + Elf64_Xword sh_addralign; /* alignment constraint */ + Elf64_Xword sh_entsize; /* size for fixed-size entries */ +} Elf64_Shdr; + + +/* + * Symbol table entries. 
+ */ + +typedef struct { + Elf32_Word st_name; /* index of symbol's name */ + Elf32_Addr st_value; /* value for the symbol */ + Elf32_Word st_size; /* size of associated data */ + unsigned char st_info; /* type and binding attributes */ + unsigned char st_other; /* visibility */ + Elf32_Half st_shndx; /* index of related section */ +} Elf32_Sym; + +typedef struct { + Elf64_Word st_name; /* index of symbol's name */ + unsigned char st_info; /* type and binding attributes */ + unsigned char st_other; /* visibility */ + Elf64_Half st_shndx; /* index of related section */ + Elf64_Addr st_value; /* value for the symbol */ + Elf64_Xword st_size; /* size of associated data */ +} Elf64_Sym; +#ifndef ELF32_ST_BIND +#define ELF32_ST_BIND(I) ((I) >> 4) +#endif +#ifndef ELF32_ST_TYPE +#define ELF32_ST_TYPE(I) ((I) & 0xFU) +#endif +#ifndef ELF32_ST_INFO +#define ELF32_ST_INFO(B,T) (((B) << 4) + ((T) & 0xF)) +#endif + +#ifndef ELF64_ST_BIND +#define ELF64_ST_BIND(I) ((I) >> 4) +#endif +#ifndef ELF64_ST_TYPE +#define ELF64_ST_TYPE(I) ((I) & 0xFU) +#endif +#ifndef ELF64_ST_INFO +#define ELF64_ST_INFO(B,T) (((B) << 4) + ((T) & 0xF)) +#endif + +#ifndef ELF32_ST_VISIBILITY +#define ELF32_ST_VISIBILITY(O) ((O) & 0x3) +#endif + +#ifndef ELF64_ST_VISIBILITY +#define ELF64_ST_VISIBILITY(O) ((O) & 0x3) +#endif + +/* + * Syminfo descriptors, containing additional symbol information. + */ + +/* 32-bit entry. */ +typedef struct { + Elf32_Half si_boundto; /* Entry index with additional flags. */ + Elf32_Half si_flags; /* Flags. */ +} Elf32_Syminfo; + +/* 64-bit entry. */ +typedef struct { + Elf64_Half si_boundto; /* Entry index with additional flags. */ + Elf64_Half si_flags; /* Flags. */ +} Elf64_Syminfo; + +/* + * Relocation descriptors.
+ */ + +typedef struct { + Elf32_Addr r_offset; /* location to apply relocation to */ + Elf32_Word r_info; /* type+section for relocation */ +} Elf32_Rel; + +typedef struct { + Elf32_Addr r_offset; /* location to apply relocation to */ + Elf32_Word r_info; /* type+section for relocation */ + Elf32_Sword r_addend; /* constant addend */ +} Elf32_Rela; + +typedef struct { + Elf64_Addr r_offset; /* location to apply relocation to */ + Elf64_Xword r_info; /* type+section for relocation */ +} Elf64_Rel; + +typedef struct { + Elf64_Addr r_offset; /* location to apply relocation to */ + Elf64_Xword r_info; /* type+section for relocation */ + Elf64_Sxword r_addend; /* constant addend */ +} Elf64_Rela; + +#ifndef ELF32_R_SYM +#define ELF32_R_SYM(I) ((I) >> 8) +#endif +#ifndef ELF32_R_TYPE +#define ELF32_R_TYPE(I) ((unsigned char) (I)) +#endif +#ifndef ELF32_R_INFO +#define ELF32_R_INFO(S,T) (((S) << 8) + (unsigned char) (T)) +#endif +#ifndef ELF64_R_SYM +#define ELF64_R_SYM(I) ((I) >> 32) +#endif +#ifndef ELF64_R_TYPE +#define ELF64_R_TYPE(I) ((I) & 0xFFFFFFFFUL) +#endif +#ifndef ELF64_R_INFO +#define ELF64_R_INFO(S,T) (((S) << 32) + ((T) & 0xFFFFFFFFUL)) +#endif + +/* + * Symbol versioning structures. + */ + +/* 32-bit structures. */ +typedef struct +{ + Elf32_Word vda_name; /* Index to name. */ + Elf32_Word vda_next; /* Offset to next entry. */ +} Elf32_Verdaux; + +typedef struct +{ + Elf32_Word vna_hash; /* Hash value of dependency name. */ + Elf32_Half vna_flags; /* Flags. */ + Elf32_Half vna_other; /* Unused. */ + Elf32_Word vna_name; /* Offset to dependency name. */ + Elf32_Word vna_next; /* Offset to next vernaux entry. */ +} Elf32_Vernaux; + +typedef struct +{ + Elf32_Half vd_version; /* Version information. */ + Elf32_Half vd_flags; /* Flags. */ + Elf32_Half vd_ndx; /* Index into the versym section. */ + Elf32_Half vd_cnt; /* Number of aux entries. */ + Elf32_Word vd_hash; /* Hash value of name. */ + Elf32_Word vd_aux; /* Offset to aux entries. 
*/ + Elf32_Word vd_next; /* Offset to next version definition. */ +} Elf32_Verdef; + +typedef struct +{ + Elf32_Half vn_version; /* Version number. */ + Elf32_Half vn_cnt; /* Number of aux entries. */ + Elf32_Word vn_file; /* Offset of associated file name. */ + Elf32_Word vn_aux; /* Offset of vernaux array. */ + Elf32_Word vn_next; /* Offset of next verneed entry. */ +} Elf32_Verneed; + +typedef Elf32_Half Elf32_Versym; + +/* 64-bit structures. */ + +typedef struct { + Elf64_Word vda_name; /* Index to name. */ + Elf64_Word vda_next; /* Offset to next entry. */ +} Elf64_Verdaux; + +typedef struct { + Elf64_Word vna_hash; /* Hash value of dependency name. */ + Elf64_Half vna_flags; /* Flags. */ + Elf64_Half vna_other; /* Unused. */ + Elf64_Word vna_name; /* Offset to dependency name. */ + Elf64_Word vna_next; /* Offset to next vernaux entry. */ +} Elf64_Vernaux; + +typedef struct { + Elf64_Half vd_version; /* Version information. */ + Elf64_Half vd_flags; /* Flags. */ + Elf64_Half vd_ndx; /* Index into the versym section. */ + Elf64_Half vd_cnt; /* Number of aux entries. */ + Elf64_Word vd_hash; /* Hash value of name. */ + Elf64_Word vd_aux; /* Offset to aux entries. */ + Elf64_Word vd_next; /* Offset to next version definition. */ +} Elf64_Verdef; + +typedef struct { + Elf64_Half vn_version; /* Version number. */ + Elf64_Half vn_cnt; /* Number of aux entries. */ + Elf64_Word vn_file; /* Offset of associated file name. */ + Elf64_Word vn_aux; /* Offset of vernaux array. */ + Elf64_Word vn_next; /* Offset of next verneed entry. */ +} Elf64_Verneed; + +typedef Elf64_Half Elf64_Versym; + + +#ifndef LIBELF_CONFIG_GNUHASH +#define LIBELF_CONFIG_GNUHASH 1 + +/* + * The header for GNU-style hash sections. + */ + +typedef struct { + uint32_t gh_nbuckets; /* Number of hash buckets. */ + uint32_t gh_symndx; /* First visible symbol in .dynsym. */ + uint32_t gh_maskwords; /* #maskwords used in bloom filter. */ + uint32_t gh_shift2; /* Bloom filter shift count. 
*/ +} Elf_GNU_Hash_Header; +#endif + +#endif /* _ELFDEFINITIONS_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format new file mode 100644 index 0000000000..af707591d1 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format @@ -0,0 +1,47 @@ +#!/bin/sh +# +# $Id: native-elf-format 2064 2011-10-26 15:12:32Z jkoshy $ +# +# Find the native ELF format for a host platform by compiling a +# test object and examining the resulting object. +# +# This script is used if there is no easy way to determine this +# information statically at compile time. + +program=`basename $0` +tmp_c=`mktemp -u nefXXXXXX`.c +tmp_o=`echo ${tmp_c} | sed -e 's/.c$/.o/'` + +trap "rm -f ${tmp_c} ${tmp_o}" 0 1 2 3 15 + +touch ${tmp_c} + +echo "/* Generated by ${program} on `date` */" + +cc -c ${tmp_c} -o ${tmp_o} +readelf -h ${tmp_o} | awk ' +$1 ~ "Class:" { + sub("ELF","",$2); elfclass = $2; + } +$1 ~ "Data:" { + if (match($0, "little")) { + elfdata = "LSB"; + } else { + elfdata = "MSB"; + } + } +$1 ~ "Machine:" { + if (match($0, "Intel.*386")) { + elfarch = "EM_386"; + } else if (match($0, ".*X86-64")) { + elfarch = "EM_X86_64"; + } else { + elfarch = "unknown"; + } + } +END { + printf("#define ELFTC_CLASS ELFCLASS%s\n", elfclass); + printf("#define ELFTC_ARCH %s\n", elfarch); + printf("#define ELFTC_BYTEORDER ELFDATA2%s\n", elfdata); +}' + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format.h new file mode 100644 index 0000000000..8e24b511ad --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/native-elf-format.h @@ -0,0 +1,14 @@ +/* 
+************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +#if !defined(_LP64) +#define ELFTC_CLASS ELFCLASS32 +#else +#define ELFTC_CLASS ELFCLASS64 +#endif +#define ELFTC_ARCH EM_386 +#define ELFTC_BYTEORDER ELFDATA2LSB diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/uthash.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/uthash.h new file mode 100644 index 0000000000..8428b9c0dd --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/uthash.h @@ -0,0 +1,906 @@ +/* +Copyright (c) 2003-2011, Troy D. Hanson http://uthash.sourceforge.net +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* $Id: uthash.h 2064 2011-10-26 15:12:32Z jkoshy $ */ + +#ifndef UTHASH_H +#define UTHASH_H + +#include /* memcmp,strlen */ +#include /* ptrdiff_t */ +#include /* exit() */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. */ +#ifdef _MSC_VER /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#define DECLTYPE(x) +#endif +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while(0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while(0) +#endif + +/* a number of the hash function use uint32_t which isn't defined on win32 */ +#ifdef _MSC_VER +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#else +#include /* uint32_t */ +#endif + +#define UTHASH_VERSION 1.9.4 + +#define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */ +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ + +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10 /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhe */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - 
((tbl)->hho))) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ +do { \ + unsigned _hf_bkt,_hf_hashv; \ + out=NULL; \ + if (head) { \ + HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], \ + keyptr,keylen,out); \ + } \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN ((HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0)) /* parenthesized so the sum survives embedding in larger expressions */ +#define HASH_BLOOM_MAKE(tbl) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \ + memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ +} while (0) /* no trailing ';' - the caller supplies it, keeping if/else usage safe */ + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0) /* no trailing ';' - the caller supplies it */ + +#define HASH_BLOOM_BITSET(bv,idx) ((bv)[(idx)/8] |= (1U << ((idx)%8))) +#define HASH_BLOOM_BITTEST(bv,idx) ((bv)[(idx)/8] & (1U << ((idx)%8))) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#else +#define HASH_BLOOM_MAKE(tbl) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) (1) +#endif + +#define HASH_MAKE_TABLE(hh,head) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \ + sizeof(UT_hash_table)); \ + if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) -
(char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl->buckets, 0, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ +} while(0) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_bkt; \ + (add)->hh.next = NULL; \ + (add)->hh.key = (char*)(keyptr); \ + (add)->hh.keylen = (keylen_in); \ + if (!(head)) { \ + head = (add); \ + (head)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh,head); \ + } else { \ + (head)->hh.tbl->tail->next = (add); \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail = &((add)->hh); \ + } \ + (head)->hh.tbl->num_items++; \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets, \ + (add)->hh.hashv, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \ + HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \ + HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \ + HASH_FSCK(hh,head); \ +} while(0) + +#define HASH_TO_BKT( hashv, num_bkts, bkt ) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1)); \ +} while(0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process.
Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. + */ +#define HASH_DELETE(hh,head,delptr) \ +do { \ + unsigned _hd_bkt; \ + struct UT_hash_handle *_hd_hh_del; \ + if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + head = NULL; \ + } else { \ + _hd_hh_del = &((delptr)->hh); \ + if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \ + (head)->hh.tbl->tail = \ + (UT_hash_handle*)((char*)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho); \ + } \ + if ((delptr)->hh.prev) { \ + ((UT_hash_handle*)((char*)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho))->next = (delptr)->hh.next; \ + } else { \ + DECLTYPE_ASSIGN(head,(delptr)->hh.next); \ + } \ + if (_hd_hh_del->next) { \ + ((UT_hash_handle*)((char*)_hd_hh_del->next + \ + (head)->hh.tbl->hho))->prev = \ + _hd_hh_del->prev; \ + } \ + HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh,head); \ +} while (0) + + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ + HASH_FIND(hh,head,findstr,strlen(findstr),out) +#define HASH_ADD_STR(head,strfield,add) \ + HASH_ADD(hh,head,strfield,strlen(add->strfield),add) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) \ + HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_DEL(head,delptr) 
\ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head) \ +do { \ + unsigned _bkt_i; \ + unsigned _count, _bkt_count; \ + char *_prev; \ + struct UT_hash_handle *_thh; \ + if (head) { \ + _count = 0; \ + for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \ + _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("invalid hh_prev %p, actual %p\n", \ + _thh->hh_prev, _prev ); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("invalid bucket count %d, actual %d\n", \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid hh item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + /* traverse hh in app order; check next/prev integrity, count */ \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev !=(char*)(_thh->prev)) { \ + HASH_OOPS("invalid prev %p, actual %p\n", \ + _thh->prev, _prev ); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = ( _thh->next ? 
(UT_hash_handle*)((char*)(_thh->next) + \ + (head)->hh.tbl->hho) : NULL ); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid app item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + } \ +} while (0) +#else +#define HASH_FSCK(hh,head) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include to get the prototype for write(2). */ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ +do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, fieldlen); \ +} while (0) +#else +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) +#endif + +/* default to Jenkin's hash unless overridden e.g. DHASH_FUNCTION=HASH_SAX */ +#ifdef HASH_FUNCTION +#define HASH_FCN HASH_FUNCTION +#else +#define HASH_FCN HASH_JEN +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6 */ +#define HASH_BER(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hb_keylen=keylen; \ + char *_hb_key=(char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen--) { (hashv) = ((hashv) * 33) + *_hb_key++; } \ + bkt = (hashv) & (num_bkts-1); \ +} while (0) + + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ +#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _sx_i; \ + char *_hs_key=(char*)(key); \ + hashv = 0; \ + for(_sx_i=0; _sx_i < keylen; _sx_i++) \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + bkt = hashv & (num_bkts-1); \ +} while (0) + +#define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _fn_i; \ + char *_hf_key=(char*)(key); \ + hashv = 2166136261UL; \ + for(_fn_i=0; _fn_i < keylen; _fn_i++) \ + hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \ + bkt = hashv & (num_bkts-1); \ +} while(0); + 
+#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _ho_i; \ + char *_ho_key=(char*)(key); \ + hashv = 0; \ + for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +#define HASH_JEN_MIX(a,b,c) \ +do { \ + a -= b; a -= c; a ^= ( c >> 13 ); \ + b -= c; b -= a; b ^= ( a << 8 ); \ + c -= a; c -= b; c ^= ( b >> 13 ); \ + a -= b; a -= c; a ^= ( c >> 12 ); \ + b -= c; b -= a; b ^= ( a << 16 ); \ + c -= a; c -= b; c ^= ( b >> 5 ); \ + a -= b; a -= c; a ^= ( c >> 3 ); \ + b -= c; b -= a; b ^= ( a << 10 ); \ + c -= a; c -= b; c ^= ( b >> 15 ); \ +} while (0) + +#define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hj_i,_hj_j,_hj_k; \ + char *_hj_key=(char*)(key); \ + hashv = 0xfeedbeef; \ + _hj_i = _hj_j = 0x9e3779b9; \ + _hj_k = keylen; \ + while (_hj_k >= 12) { \ + _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ + + ( (unsigned)_hj_key[2] << 16 ) \ + + ( (unsigned)_hj_key[3] << 24 ) ); \ + _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ + + ( (unsigned)_hj_key[6] << 16 ) \ + + ( (unsigned)_hj_key[7] << 24 ) ); \ + hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ + + ( (unsigned)_hj_key[10] << 16 ) \ + + ( (unsigned)_hj_key[11] << 24 ) ); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12; \ + } \ + hashv += keylen; \ + switch ( _hj_k ) { \ + case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \ + case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \ + case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \ + case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \ + case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \ + case 5: _hj_j += _hj_key[4]; \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); 
\ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \ + case 1: _hj_i += _hj_key[0]; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \ +do { \ + char *_sfh_key=(char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = keylen; \ + \ + int _sfh_rem = _sfh_len & 3; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabe; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = (get16bits (_sfh_key+2) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= _sfh_key[sizeof (uint16_t)] << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + bkt = hashv & (num_bkts-1); \ +} while(0); + +#ifdef HASH_USING_NO_STRICT_ALIASING +/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads. + * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error. + * MurmurHash uses the faster approach only on CPU's where we know it's safe. 
+ * + * Note the preprocessor built-in defines can be emitted using: + * + * gcc -m64 -dM -E - < /dev/null (on gcc) + * cc -## a.c (where a.c is a simple test file) (Sun Studio) + */ +#if (defined(__i386__) || defined(__x86_64__)) +#define MUR_GETBLOCK(p,i) p[i] +#else /* non intel */ +#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0) +#define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1) +#define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2) +#define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3) +#define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) +#if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__)) +#define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8)) +#else /* assume little endian non-intel */ +#define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8)) +#endif +#define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \ + (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE(p) : \ + (MUR_PLUS2_ALIGNED(p) ? 
MUR_TWO_TWO(p) : \ + MUR_ONE_THREE(p)))) +#endif +#define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +#define MUR_FMIX(_h) \ +do { \ + _h ^= _h >> 16; \ + _h *= 0x85ebca6b; \ + _h ^= _h >> 13; \ + _h *= 0xc2b2ae35l; \ + _h ^= _h >> 16; \ +} while(0) + +#define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \ +do { \ + const uint8_t *_mur_data = (const uint8_t*)(key); \ + const int _mur_nblocks = (keylen) / 4; \ + uint32_t _mur_h1 = 0xf88D5353; \ + uint32_t _mur_c1 = 0xcc9e2d51; \ + uint32_t _mur_c2 = 0x1b873593; \ + const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \ + int _mur_i; \ + for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \ + uint32_t _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + \ + _mur_h1 ^= _mur_k1; \ + _mur_h1 = MUR_ROTL32(_mur_h1,13); \ + _mur_h1 = _mur_h1*5+0xe6546b64; \ + } \ + const uint8_t *_mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \ + uint32_t _mur_k1=0; \ + switch((keylen) & 3) { \ + case 3: _mur_k1 ^= _mur_tail[2] << 16; \ + case 2: _mur_k1 ^= _mur_tail[1] << 8; \ + case 1: _mur_k1 ^= _mur_tail[0]; \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + _mur_h1 ^= _mur_k1; \ + } \ + _mur_h1 ^= (keylen); \ + MUR_FMIX(_mur_h1); \ + hashv = _mur_h1; \ + bkt = hashv & (num_bkts-1); \ +} while(0) +#endif /* HASH_USING_NO_STRICT_ALIASING */ + +/* key comparison function; return 0 if keys equal */ +#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \ +do { \ + if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \ + else out=NULL; \ + while (out) { \ + if (out->hh.keylen == keylen_in) { \ + if ((HASH_KEYCMP(out->hh.key,keyptr,keylen_in)) == 0) break; \ + } \ + if (out->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,out->hh.hh_next)); \ + else out = 
NULL; \ + } \ +} while(0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head,addhh) \ +do { \ + head.count++; \ + (addhh)->hh_next = head.hh_head; \ + (addhh)->hh_prev = NULL; \ + if (head.hh_head) { (head).hh_head->hh_prev = (addhh); } \ + (head).hh_head=addhh; \ + if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \ + && (addhh)->tbl->noexpand != 1) { \ + HASH_EXPAND_BUCKETS((addhh)->tbl); \ + } \ +} while(0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(hh,head,hh_del) \ + (head).count--; \ + if ((head).hh_head == hh_del) { \ + (head).hh_head = hh_del->hh_next; \ + } \ + if (hh_del->hh_prev) { \ + hh_del->hh_prev->hh_next = hh_del->hh_next; \ + } \ + if (hh_del->hh_next) { \ + hh_del->hh_next->hh_prev = hh_del->hh_prev; \ + } + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. 
+ * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(tbl) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \ + memset(_he_new_buckets, 0, \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + tbl->ideal_chain_maxlen = \ + (tbl->num_items >> (tbl->log2_num_buckets+1)) + \ + ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0); \ + tbl->nonideal_items = 0; \ + for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \ + { \ + _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \ + if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \ + tbl->nonideal_items++; \ + _he_newbkt->expand_mult = _he_newbkt->count / \ + tbl->ideal_chain_maxlen; \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \ + _he_thh; \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + tbl->num_buckets *= 2; \ + tbl->log2_num_buckets++; \ + tbl->buckets = _he_new_buckets; \ + tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? 
\ + (tbl->ineff_expands+1) : 0; \ + if (tbl->ineff_expands > 1) { \ + tbl->noexpand=1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ +} while(0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. */ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \ + _hs_psize++; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + if (! (_hs_q) ) break; \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \ + if (_hs_psize == 0) { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } else if ( (_hs_qsize == 0) || !(_hs_q) ) { \ + _hs_e = _hs_p; \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_psize--; \ + } else if (( \ + cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \ + ) <= 0) { \ + _hs_e = _hs_p; \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? 
\ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail ) { \ + _hs_tail->next = ((_hs_e) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + _hs_e->prev = ((_hs_tail) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + _hs_tail->next = NULL; \ + if ( _hs_nmerges <= 1 ) { \ + _hs_looping=0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2; \ + } \ + HASH_FSCK(hh,head); \ + } \ +} while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary hash + * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt=NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if (src) { \ + for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh) { _last_elt_hh->next = _elt; } \ + if (!dst) { \ + DECLTYPE_ASSIGN(dst,_elt); \ + HASH_MAKE_TABLE(hh_dst,dst); \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \ + (dst)->hh_dst.tbl->num_items++; \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst,dst); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if (head) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)=NULL; \ + } \ +} while(0) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) +#else +#define HASH_ITER(hh,head,el,tmp) \ +for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL)) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. 
+ */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1 +#define HASH_BLOOM_SIGNATURE 0xb12220f2 + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. When expansion is inhibited + * the hash will still work, albeit no longer in constant time. 
*/ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + char bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/ar.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/ar.h new file mode 100644 index 0000000000..194ee33f1f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/ar.h @@ -0,0 +1,24 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+* +***********************************************************************************************************************/ + +#ifndef _AR_H_ +#define _AR_H_ 1 + +#define ARMAG "!\n" +#define SARMAG 8 +#define ARFMAG "`\n" + +struct ar_hdr { + char ar_name[16]; + char ar_date[12]; + char ar_uid[6], ar_gid[6]; + char ar_mode[8]; + char ar_size[10]; + char ar_fmag[2]; +}; + +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/compat.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/compat.h new file mode 100644 index 0000000000..9980af7d01 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/compat.h @@ -0,0 +1,57 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +#ifndef _COMPAT_H_ +#define _COMPAT_H_ 1 + +#pragma warning(disable:4244 4267) + +#include +#include +#include +#include +#include "sys/mman.h" +typedef signed __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef signed __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef signed __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + +typedef uint8_t u_int8_t; +typedef uint16_t u_int16_t; +typedef uint32_t u_int32_t; +typedef uint64_t u_int64_t; + +#ifndef _WIN64 +typedef __int32 ssize_t; +#else // _WIN64 +typedef __int64 ssize_t; +#endif // _WIN64 + +typedef long uid_t; +typedef long gid_t; +typedef long mode_t; +typedef long off_t; + +#if _MSC_VER < 1900 +#ifndef snprintf +# define snprintf sprintf_s +#endif +#ifndef fstat +# define fstat(A, B) _fstat((A), (struct _stat*)(B)) +#endif +#endif +#ifndef S_ISCHR +#define 
S_ISCHR(A) (((A) & S_IFMT) == S_IFCHR) +#endif +#ifndef S_ISREG +#define S_ISREG(A) (((A) & S_IFMT) == S_IFREG) +#endif +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/cdefs.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/cdefs.h new file mode 100644 index 0000000000..b3c0212639 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/cdefs.h @@ -0,0 +1,19 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +#ifndef _SYS_CDEFS_H_ +#define _SYS_CDEFS_H_ 1 + +#ifdef __cplusplus +# define __BEGIN_DECLS extern "C" { +# define __END_DECLS } +#else +# define __BEGIN_DECLS +# define __END_DECLS +#endif + +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf32.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf32.h new file mode 100644 index 0000000000..72fd64ccb0 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf32.h @@ -0,0 +1,163 @@ +/*- + * Copyright (c) 1996-1998 John D. Polstra. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/sys/elf32.h,v 1.8.14.1 2005/12/30 22:13:58 marcel Exp $ + */ + +#ifndef _SYS_ELF32_H_ +#define _SYS_ELF32_H_ 1 + +#include "elf_common.h" + +/* + * ELF definitions common to all 32-bit architectures. + */ + +typedef uint32_t Elf32_Addr; +typedef uint16_t Elf32_Half; +typedef uint32_t Elf32_Off; +typedef int32_t Elf32_Sword; +typedef uint32_t Elf32_Word; + +typedef Elf32_Word Elf32_Hashelt; + +/* Non-standard class-dependent datatype used for abstraction. */ +typedef Elf32_Word Elf32_Size; +typedef Elf32_Sword Elf32_Ssize; + +/* + * ELF header. + */ + +typedef struct { + unsigned char e_ident[EI_NIDENT]; /* File identification. */ + Elf32_Half e_type; /* File type. */ + Elf32_Half e_machine; /* Machine architecture. */ + Elf32_Word e_version; /* ELF format version. */ + Elf32_Addr e_entry; /* Entry point. */ + Elf32_Off e_phoff; /* Program header file offset. */ + Elf32_Off e_shoff; /* Section header file offset. 
*/ + Elf32_Word e_flags; /* Architecture-specific flags. */ + Elf32_Half e_ehsize; /* Size of ELF header in bytes. */ + Elf32_Half e_phentsize; /* Size of program header entry. */ + Elf32_Half e_phnum; /* Number of program header entries. */ + Elf32_Half e_shentsize; /* Size of section header entry. */ + Elf32_Half e_shnum; /* Number of section header entries. */ + Elf32_Half e_shstrndx; /* Section name strings section. */ +} Elf32_Ehdr; + +/* + * Section header. + */ + +typedef struct { + Elf32_Word sh_name; /* Section name (index into the + section header string table). */ + Elf32_Word sh_type; /* Section type. */ + Elf32_Word sh_flags; /* Section flags. */ + Elf32_Addr sh_addr; /* Address in memory image. */ + Elf32_Off sh_offset; /* Offset in file. */ + Elf32_Word sh_size; /* Size in bytes. */ + Elf32_Word sh_link; /* Index of a related section. */ + Elf32_Word sh_info; /* Depends on section type. */ + Elf32_Word sh_addralign; /* Alignment in bytes. */ + Elf32_Word sh_entsize; /* Size of each entry in section. */ +} Elf32_Shdr; + +/* + * Program header. + */ + +typedef struct { + Elf32_Word p_type; /* Entry type. */ + Elf32_Off p_offset; /* File offset of contents. */ + Elf32_Addr p_vaddr; /* Virtual address in memory image. */ + Elf32_Addr p_paddr; /* Physical address (not used). */ + Elf32_Word p_filesz; /* Size of contents in file. */ + Elf32_Word p_memsz; /* Size of contents in memory. */ + Elf32_Word p_flags; /* Access permission flags. */ + Elf32_Word p_align; /* Alignment in memory and file. */ +} Elf32_Phdr; + +/* + * Dynamic structure. The ".dynamic" section contains an array of them. + */ + +typedef struct { + Elf32_Sword d_tag; /* Entry type. */ + union { + Elf32_Word d_val; /* Integer value. */ + Elf32_Addr d_ptr; /* Address value. */ + } d_un; +} Elf32_Dyn; + +/* + * Relocation entries. + */ + +/* Relocations that don't need an addend field. */ +typedef struct { + Elf32_Addr r_offset; /* Location to be relocated. 
*/ + Elf32_Word r_info; /* Relocation type and symbol index. */ +} Elf32_Rel; + +/* Relocations that need an addend field. */ +typedef struct { + Elf32_Addr r_offset; /* Location to be relocated. */ + Elf32_Word r_info; /* Relocation type and symbol index. */ + Elf32_Sword r_addend; /* Addend. */ +} Elf32_Rela; + +/* Macros for accessing the fields of r_info. */ +#define ELF32_R_SYM(info) ((info) >> 8) +#define ELF32_R_TYPE(info) ((unsigned char)(info)) + +/* Macro for constructing r_info from field values. */ +#define ELF32_R_INFO(sym, type) (((sym) << 8) + (unsigned char)(type)) + +/* + * Symbol table entries. + */ + +typedef struct { + Elf32_Word st_name; /* String table index of name. */ + Elf32_Addr st_value; /* Symbol value. */ + Elf32_Word st_size; /* Size of associated object. */ + unsigned char st_info; /* Type and binding information. */ + unsigned char st_other; /* Reserved (not used). */ + Elf32_Half st_shndx; /* Section index of symbol. */ +} Elf32_Sym; + +/* Macros for accessing the fields of st_info. */ +#define ELF32_ST_BIND(info) ((info) >> 4) +#define ELF32_ST_TYPE(info) ((info) & 0xf) + +/* Macro for constructing st_info from field values. */ +#define ELF32_ST_INFO(bind, type) (((bind) << 4) + ((type) & 0xf)) + +/* Macro for accessing the fields of st_other. */ +#define ELF32_ST_VISIBILITY(oth) ((oth) & 0x3) + +#endif /* !_SYS_ELF32_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf64.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf64.h new file mode 100644 index 0000000000..d2da3b0d4c --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf64.h @@ -0,0 +1,176 @@ +/*- + * Copyright (c) 1996-1998 John D. Polstra. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/sys/elf64.h,v 1.10.14.1 2005/12/30 22:13:58 marcel Exp $ + */ + +#ifndef _SYS_ELF64_H_ +#define _SYS_ELF64_H_ 1 + +#include "elf_common.h" + +/* + * ELF definitions common to all 64-bit architectures. + */ + +typedef uint64_t Elf64_Addr; +typedef uint16_t Elf64_Half; +typedef uint64_t Elf64_Off; +typedef int32_t Elf64_Sword; +typedef int64_t Elf64_Sxword; +typedef uint32_t Elf64_Word; +typedef uint64_t Elf64_Xword; + +/* + * Types of dynamic symbol hash table bucket and chain elements. + * + * This is inconsistent among 64 bit architectures, so a machine dependent + * typedef is required. + */ + +#ifdef __alpha__ +typedef Elf64_Off Elf64_Hashelt; +#else +typedef Elf64_Word Elf64_Hashelt; +#endif + +/* Non-standard class-dependent datatype used for abstraction. 
*/ +typedef Elf64_Xword Elf64_Size; +typedef Elf64_Sxword Elf64_Ssize; + +/* + * ELF header. + */ + +typedef struct { + unsigned char e_ident[EI_NIDENT]; /* File identification. */ + Elf64_Half e_type; /* File type. */ + Elf64_Half e_machine; /* Machine architecture. */ + Elf64_Word e_version; /* ELF format version. */ + Elf64_Addr e_entry; /* Entry point. */ + Elf64_Off e_phoff; /* Program header file offset. */ + Elf64_Off e_shoff; /* Section header file offset. */ + Elf64_Word e_flags; /* Architecture-specific flags. */ + Elf64_Half e_ehsize; /* Size of ELF header in bytes. */ + Elf64_Half e_phentsize; /* Size of program header entry. */ + Elf64_Half e_phnum; /* Number of program header entries. */ + Elf64_Half e_shentsize; /* Size of section header entry. */ + Elf64_Half e_shnum; /* Number of section header entries. */ + Elf64_Half e_shstrndx; /* Section name strings section. */ +} Elf64_Ehdr; + +/* + * Section header. + */ + +typedef struct { + Elf64_Word sh_name; /* Section name (index into the + section header string table). */ + Elf64_Word sh_type; /* Section type. */ + Elf64_Xword sh_flags; /* Section flags. */ + Elf64_Addr sh_addr; /* Address in memory image. */ + Elf64_Off sh_offset; /* Offset in file. */ + Elf64_Xword sh_size; /* Size in bytes. */ + Elf64_Word sh_link; /* Index of a related section. */ + Elf64_Word sh_info; /* Depends on section type. */ + Elf64_Xword sh_addralign; /* Alignment in bytes. */ + Elf64_Xword sh_entsize; /* Size of each entry in section. */ +} Elf64_Shdr; + +/* + * Program header. + */ + +typedef struct { + Elf64_Word p_type; /* Entry type. */ + Elf64_Word p_flags; /* Access permission flags. */ + Elf64_Off p_offset; /* File offset of contents. */ + Elf64_Addr p_vaddr; /* Virtual address in memory image. */ + Elf64_Addr p_paddr; /* Physical address (not used). */ + Elf64_Xword p_filesz; /* Size of contents in file. */ + Elf64_Xword p_memsz; /* Size of contents in memory. 
*/ + Elf64_Xword p_align; /* Alignment in memory and file. */ +} Elf64_Phdr; + +/* + * Dynamic structure. The ".dynamic" section contains an array of them. + */ + +typedef struct { + Elf64_Sxword d_tag; /* Entry type. */ + union { + Elf64_Xword d_val; /* Integer value. */ + Elf64_Addr d_ptr; /* Address value. */ + } d_un; +} Elf64_Dyn; + +/* + * Relocation entries. + */ + +/* Relocations that don't need an addend field. */ +typedef struct { + Elf64_Addr r_offset; /* Location to be relocated. */ + Elf64_Xword r_info; /* Relocation type and symbol index. */ +} Elf64_Rel; + +/* Relocations that need an addend field. */ +typedef struct { + Elf64_Addr r_offset; /* Location to be relocated. */ + Elf64_Xword r_info; /* Relocation type and symbol index. */ + Elf64_Sxword r_addend; /* Addend. */ +} Elf64_Rela; + +/* Macros for accessing the fields of r_info. */ +#define ELF64_R_SYM(info) ((info) >> 32) +#define ELF64_R_TYPE(info) ((info) & 0xffffffffL) + +/* Macro for constructing r_info from field values. sym must already be a 64-bit type (e.g. Elf64_Xword): it is shifted left by 32 without being widened first. */ +#define ELF64_R_INFO(sym, type) (((sym) << 32) + ((type) & 0xffffffffL)) + +/* + * Symbol table entries. + */ + +typedef struct { + Elf64_Word st_name; /* String table index of name. */ + unsigned char st_info; /* Type and binding information. */ + unsigned char st_other; /* Reserved (not used). */ + Elf64_Half st_shndx; /* Section index of symbol. */ + Elf64_Addr st_value; /* Symbol value. */ + Elf64_Xword st_size; /* Size of associated object. */ +} Elf64_Sym; + +/* Macros for accessing the fields of st_info. */ +#define ELF64_ST_BIND(info) ((info) >> 4) +#define ELF64_ST_TYPE(info) ((info) & 0xf) + +/* Macro for constructing st_info from field values. */ +#define ELF64_ST_INFO(bind, type) (((bind) << 4) + ((type) & 0xf)) + +/* Macro for accessing the fields of st_other. 
*/ +#define ELF64_ST_VISIBILITY(oth) ((oth) & 0x3) + +#endif /* !_SYS_ELF64_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf_common.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf_common.h new file mode 100644 index 0000000000..b11f1702f7 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/elf_common.h @@ -0,0 +1,355 @@ +/*- + * Copyright (c) 1998 John D. Polstra. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: src/sys/sys/elf_common.h,v 1.15.8.1 2005/12/30 22:13:58 marcel Exp $ + */ + +#ifndef _SYS_ELF_COMMON_H_ +#define _SYS_ELF_COMMON_H_ 1 + +/* + * ELF definitions that are independent of architecture or word size. + */ + +/* + * Note header. The ".note" section contains an array of notes. Each + * begins with this header, aligned to a word boundary. Immediately + * following the note header is n_namesz bytes of name, padded to the + * next word boundary. Then comes n_descsz bytes of descriptor, again + * padded to a word boundary. The values of n_namesz and n_descsz do + * not include the padding. + */ + +typedef struct { + u_int32_t n_namesz; /* Length of name. */ + u_int32_t n_descsz; /* Length of descriptor. */ + u_int32_t n_type; /* Type of this note. */ +} Elf_Note; + +/* Indexes into the e_ident array. Keep synced with + http://www.sco.com/developer/gabi/ch4.eheader.html */ +#define EI_MAG0 0 /* Magic number, byte 0. */ +#define EI_MAG1 1 /* Magic number, byte 1. */ +#define EI_MAG2 2 /* Magic number, byte 2. */ +#define EI_MAG3 3 /* Magic number, byte 3. */ +#define EI_CLASS 4 /* Class of machine. */ +#define EI_DATA 5 /* Data format. */ +#define EI_VERSION 6 /* ELF format version. */ +#define EI_OSABI 7 /* Operating system / ABI identification */ +#define EI_ABIVERSION 8 /* ABI version */ +#define OLD_EI_BRAND 8 /* Start of architecture identification. */ +#define EI_PAD 9 /* Start of padding (per SVR4 ABI). */ +#define EI_NIDENT 16 /* Size of e_ident array. */ + +/* Values for the magic number bytes. */ +#define ELFMAG0 0x7f +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" /* magic string */ +#define SELFMAG 4 /* magic string size */ + +/* Values for e_ident[EI_VERSION] and e_version. */ +#define EV_NONE 0 +#define EV_CURRENT 1 + +/* Values for e_ident[EI_CLASS]. */ +#define ELFCLASSNONE 0 /* Unknown class. */ +#define ELFCLASS32 1 /* 32-bit architecture. 
*/ +#define ELFCLASS64 2 /* 64-bit architecture. */ + +/* Values for e_ident[EI_DATA]. */ +#define ELFDATANONE 0 /* Unknown data format. */ +#define ELFDATA2LSB 1 /* 2's complement little-endian. */ +#define ELFDATA2MSB 2 /* 2's complement big-endian. */ + +/* Values for e_ident[EI_OSABI]. */ +#define ELFOSABI_NONE 0 /* UNIX System V ABI */ +#define ELFOSABI_HPUX 1 /* HP-UX operating system */ +#define ELFOSABI_NETBSD 2 /* NetBSD */ +#define ELFOSABI_LINUX 3 /* GNU/Linux */ +#define ELFOSABI_HURD 4 /* GNU/Hurd */ +#define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */ +#define ELFOSABI_SOLARIS 6 /* Solaris */ +#define ELFOSABI_AIX 7 /* AIX */ +#define ELFOSABI_IRIX 8 /* IRIX */ +#define ELFOSABI_FREEBSD 9 /* FreeBSD */ +#define ELFOSABI_TRU64 10 /* TRU64 UNIX */ +#define ELFOSABI_MODESTO 11 /* Novell Modesto */ +#define ELFOSABI_OPENBSD 12 /* OpenBSD */ +#define ELFOSABI_OPENVMS 13 /* Open VMS */ +#define ELFOSABI_NSK 14 /* HP Non-Stop Kernel */ +#define ELFOSABI_ARM 97 /* ARM */ +#define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ + +#define ELFOSABI_SYSV ELFOSABI_NONE /* symbol used in old spec */ +#define ELFOSABI_MONTEREY ELFOSABI_AIX /* Monterey */ + +/* e_ident */ +#define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \ + (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \ + (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \ + (ehdr).e_ident[EI_MAG3] == ELFMAG3) + +/* Values for e_type. */ +#define ET_NONE 0 /* Unknown type. */ +#define ET_REL 1 /* Relocatable. */ +#define ET_EXEC 2 /* Executable. */ +#define ET_DYN 3 /* Shared object. */ +#define ET_CORE 4 /* Core file. */ +#define ET_LOOS 0xfe00 /* First operating system specific. */ +#define ET_HIOS 0xfeff /* Last operating system-specific. */ +#define ET_LOPROC 0xff00 /* First processor-specific. */ +#define ET_HIPROC 0xffff /* Last processor-specific. */ + +/* Values for e_machine. */ +#define EM_NONE 0 /* Unknown machine. */ +#define EM_M32 1 /* AT&T WE32100. */ +#define EM_SPARC 2 /* Sun SPARC. 
*/ +#define EM_386 3 /* Intel i386. */ +#define EM_68K 4 /* Motorola 68000. */ +#define EM_88K 5 /* Motorola 88000. */ +#define EM_860 7 /* Intel i860. */ +#define EM_MIPS 8 /* MIPS R3000 Big-Endian only. */ +#define EM_S370 9 /* IBM System/370. */ +#define EM_MIPS_RS3_LE 10 /* MIPS R3000 Little-Endian. */ +#define EM_PARISC 15 /* HP PA-RISC. */ +#define EM_VPP500 17 /* Fujitsu VPP500. */ +#define EM_SPARC32PLUS 18 /* SPARC v8plus. */ +#define EM_960 19 /* Intel 80960. */ +#define EM_PPC 20 /* PowerPC 32-bit. */ +#define EM_PPC64 21 /* PowerPC 64-bit. */ +#define EM_S390 22 /* IBM System/390. */ +#define EM_V800 36 /* NEC V800. */ +#define EM_FR20 37 /* Fujitsu FR20. */ +#define EM_RH32 38 /* TRW RH-32. */ +#define EM_RCE 39 /* Motorola RCE. */ +#define EM_ARM 40 /* ARM. */ +#define EM_SH 42 /* Hitachi SH. */ +#define EM_SPARCV9 43 /* SPARC v9 64-bit. */ +#define EM_TRICORE 44 /* Siemens TriCore embedded processor. */ +#define EM_ARC 45 /* Argonaut RISC Core. */ +#define EM_H8_300 46 /* Hitachi H8/300. */ +#define EM_H8_300H 47 /* Hitachi H8/300H. */ +#define EM_H8S 48 /* Hitachi H8S. */ +#define EM_H8_500 49 /* Hitachi H8/500. */ +#define EM_IA_64 50 /* Intel IA-64 Processor. */ +#define EM_MIPS_X 51 /* Stanford MIPS-X. */ +#define EM_COLDFIRE 52 /* Motorola ColdFire. */ +#define EM_68HC12 53 /* Motorola M68HC12. */ +#define EM_MMA 54 /* Fujitsu MMA. */ +#define EM_PCP 55 /* Siemens PCP. */ +#define EM_NCPU 56 /* Sony nCPU. */ +#define EM_NDR1 57 /* Denso NDR1 microprocessor. */ +#define EM_STARCORE 58 /* Motorola Star*Core processor. */ +#define EM_ME16 59 /* Toyota ME16 processor. */ +#define EM_ST100 60 /* STMicroelectronics ST100 processor. */ +#define EM_TINYJ 61 /* Advanced Logic Corp. TinyJ processor. */ +#define EM_X86_64 62 /* Advanced Micro Devices x86-64 */ + +/* Non-standard or deprecated. */ +#define EM_486 6 /* Intel i486. */ +#define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */ +#define EM_ALPHA_STD 41 /* Digital Alpha (standard value). 
*/ +#define EM_ALPHA 0x9026 /* Alpha (written in the absence of an ABI) */ + +/* Special section indexes. */ +#define SHN_UNDEF 0 /* Undefined, missing, irrelevant. */ +#define SHN_LORESERVE 0xff00 /* First of reserved range. */ +#define SHN_LOPROC 0xff00 /* First processor-specific. */ +#define SHN_HIPROC 0xff1f /* Last processor-specific. */ +#define SHN_LOOS 0xff20 /* First operating system-specific. */ +#define SHN_HIOS 0xff3f /* Last operating system-specific. */ +#define SHN_ABS 0xfff1 /* Absolute values. */ +#define SHN_COMMON 0xfff2 /* Common data. */ +#define SHN_XINDEX 0xffff /* Escape -- index stored elsewhere. */ +#define SHN_HIRESERVE 0xffff /* Last of reserved range. */ + +/* sh_type */ +#define SHT_NULL 0 /* inactive */ +#define SHT_PROGBITS 1 /* program defined information */ +#define SHT_SYMTAB 2 /* symbol table section */ +#define SHT_STRTAB 3 /* string table section */ +#define SHT_RELA 4 /* relocation section with addends */ +#define SHT_HASH 5 /* symbol hash table section */ +#define SHT_DYNAMIC 6 /* dynamic section */ +#define SHT_NOTE 7 /* note section */ +#define SHT_NOBITS 8 /* no space section */ +#define SHT_REL 9 /* relocation section - no addends */ +#define SHT_SHLIB 10 /* reserved - purpose unknown */ +#define SHT_DYNSYM 11 /* dynamic symbol table section */ +#define SHT_INIT_ARRAY 14 /* Initialization function pointers. */ +#define SHT_FINI_ARRAY 15 /* Termination function pointers. */ +#define SHT_PREINIT_ARRAY 16 /* Pre-initialization function ptrs. */ +#define SHT_GROUP 17 /* Section group. */ +#define SHT_SYMTAB_SHNDX 18 /* Section indexes (see SHN_XINDEX). 
*/ +#define SHT_LOOS 0x60000000 /* First of OS specific semantics */ +#define SHT_HIOS 0x6fffffff /* Last of OS specific semantics */ +#define SHT_LOPROC 0x70000000 /* reserved range for processor */ +#define SHT_HIPROC 0x7fffffff /* specific section header types */ +#define SHT_LOUSER 0x80000000 /* reserved range for application */ +#define SHT_HIUSER 0xffffffff /* specific indexes */ + +/* Flags for sh_flags. */ +#define SHF_WRITE 0x1 /* Section contains writable data. */ +#define SHF_ALLOC 0x2 /* Section occupies memory. */ +#define SHF_EXECINSTR 0x4 /* Section contains instructions. */ +#define SHF_MERGE 0x10 /* Section may be merged. */ +#define SHF_STRINGS 0x20 /* Section contains strings. */ +#define SHF_INFO_LINK 0x40 /* sh_info holds section index. */ +#define SHF_LINK_ORDER 0x80 /* Special ordering requirements. */ +#define SHF_OS_NONCONFORMING 0x100 /* OS-specific processing required. */ +#define SHF_GROUP 0x200 /* Member of section group. */ +#define SHF_TLS 0x400 /* Section contains TLS data. */ +#define SHF_MASKOS 0x0ff00000 /* OS-specific semantics. */ +#define SHF_MASKPROC 0xf0000000 /* Processor-specific semantics. */ + +/* Values for p_type. */ +#define PT_NULL 0 /* Unused entry. */ +#define PT_LOAD 1 /* Loadable segment. */ +#define PT_DYNAMIC 2 /* Dynamic linking information segment. */ +#define PT_INTERP 3 /* Pathname of interpreter. */ +#define PT_NOTE 4 /* Auxiliary information. */ +#define PT_SHLIB 5 /* Reserved (not used). */ +#define PT_PHDR 6 /* Location of program header itself. */ +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* First OS-specific. */ +#define PT_HIOS 0x6fffffff /* Last OS-specific. */ +#define PT_LOPROC 0x70000000 /* First processor-specific type. */ +#define PT_HIPROC 0x7fffffff /* Last processor-specific type. */ + +/* Values for p_flags. */ +#define PF_X 0x1 /* Executable. */ +#define PF_W 0x2 /* Writable. */ +#define PF_R 0x4 /* Readable. 
*/ +#define PF_MASKOS 0x0ff00000 /* Operating system-specific. */ +#define PF_MASKPROC 0xf0000000 /* Processor-specific. */ + +/* Values for d_tag. */ +#define DT_NULL 0 /* Terminating entry. */ +#define DT_NEEDED 1 /* String table offset of a needed shared + library. */ +#define DT_PLTRELSZ 2 /* Total size in bytes of PLT relocations. */ +#define DT_PLTGOT 3 /* Processor-dependent address. */ +#define DT_HASH 4 /* Address of symbol hash table. */ +#define DT_STRTAB 5 /* Address of string table. */ +#define DT_SYMTAB 6 /* Address of symbol table. */ +#define DT_RELA 7 /* Address of ElfNN_Rela relocations. */ +#define DT_RELASZ 8 /* Total size of ElfNN_Rela relocations. */ +#define DT_RELAENT 9 /* Size of each ElfNN_Rela relocation entry. */ +#define DT_STRSZ 10 /* Size of string table. */ +#define DT_SYMENT 11 /* Size of each symbol table entry. */ +#define DT_INIT 12 /* Address of initialization function. */ +#define DT_FINI 13 /* Address of finalization function. */ +#define DT_SONAME 14 /* String table offset of shared object + name. */ +#define DT_RPATH 15 /* String table offset of library path. [sup] */ +#define DT_SYMBOLIC 16 /* Indicates "symbolic" linking. [sup] */ +#define DT_REL 17 /* Address of ElfNN_Rel relocations. */ +#define DT_RELSZ 18 /* Total size of ElfNN_Rel relocations. */ +#define DT_RELENT 19 /* Size of each ElfNN_Rel relocation. */ +#define DT_PLTREL 20 /* Type of relocation used for PLT. */ +#define DT_DEBUG 21 /* Reserved (not used). */ +#define DT_TEXTREL 22 /* Indicates there may be relocations in + non-writable segments. [sup] */ +#define DT_JMPREL 23 /* Address of PLT relocations. */ +#define DT_BIND_NOW 24 /* [sup] */ +#define DT_INIT_ARRAY 25 /* Address of the array of pointers to + initialization functions */ +#define DT_FINI_ARRAY 26 /* Address of the array of pointers to + termination functions */ +#define DT_INIT_ARRAYSZ 27 /* Size in bytes of the array of + initialization functions. 
*/ +#define DT_FINI_ARRAYSZ 28 /* Size in bytes of the array of + termination functions. */ +#define DT_RUNPATH 29 /* String table offset of a null-terminated + library search path string. */ +#define DT_FLAGS 30 /* Object specific flag values. */ +#define DT_ENCODING 32 /* Values greater than or equal to DT_ENCODING + and less than DT_LOOS follow the rules for + the interpretation of the d_un union + as follows: even == 'd_ptr', odd == 'd_val' + or none */ +#define DT_PREINIT_ARRAY 32 /* Address of the array of pointers to + pre-initialization functions. */ +#define DT_PREINIT_ARRAYSZ 33 /* Size in bytes of the array of + pre-initialization functions. */ +#define DT_LOOS 0x6000000d /* First OS-specific */ +#define DT_HIOS 0x6ffff000 /* Last OS-specific */ +#define DT_LOPROC 0x70000000 /* First processor-specific type. */ +#define DT_HIPROC 0x7fffffff /* Last processor-specific type. */ + +/* Values for DT_FLAGS */ +#define DF_ORIGIN 0x0001 /* Indicates that the object being loaded may + make reference to the $ORIGIN substitution + string */ +#define DF_SYMBOLIC 0x0002 /* Indicates "symbolic" linking. */ +#define DF_TEXTREL 0x0004 /* Indicates there may be relocations in + non-writable segments. */ +#define DF_BIND_NOW 0x0008 /* Indicates that the dynamic linker should + process all relocations for the object + containing this entry before transferring + control to the program. */ +#define DF_STATIC_TLS 0x0010 /* Indicates that the shared object or + executable contains code using a static + thread-local storage scheme. */ + +/* Values for n_type. Used in core files. */ +#define NT_PRSTATUS 1 /* Process status. */ +#define NT_FPREGSET 2 /* Floating point registers. */ +#define NT_PRPSINFO 3 /* Process state info. 
*/ + +/* Symbol Binding - ELFNN_ST_BIND - st_info */ +#define STB_LOCAL 0 /* Local symbol */ +#define STB_GLOBAL 1 /* Global symbol */ +#define STB_WEAK 2 /* like global - lower precedence */ +#define STB_LOOS 10 /* Reserved range for operating system */ +#define STB_HIOS 12 /* specific semantics. */ +#define STB_LOPROC 13 /* reserved range for processor */ +#define STB_HIPROC 15 /* specific semantics. */ + +/* Symbol type - ELFNN_ST_TYPE - st_info */ +#define STT_NOTYPE 0 /* Unspecified type. */ +#define STT_OBJECT 1 /* Data object. */ +#define STT_FUNC 2 /* Function. */ +#define STT_SECTION 3 /* Section. */ +#define STT_FILE 4 /* Source file. */ +#define STT_COMMON 5 /* Uninitialized common block. */ +#define STT_TLS 6 /* TLS object. */ +#define STT_LOOS 10 /* Reserved range for operating system */ +#define STT_HIOS 12 /* specific semantics. */ +#define STT_LOPROC 13 /* reserved range for processor */ +#define STT_HIPROC 15 /* specific semantics. */ + +/* Symbol visibility - ELFNN_ST_VISIBILITY - st_other */ +#define STV_DEFAULT 0x0 /* Default visibility (see binding). */ +#define STV_INTERNAL 0x1 /* Special meaning in relocatable objects. */ +#define STV_HIDDEN 0x2 /* Not visible. */ +#define STV_PROTECTED 0x3 /* Visible but not preemptible. */ + +/* Special symbol table indexes. */ +#define STN_UNDEF 0 /* Undefined symbol index. */ + +#endif /* !_SYS_ELF_COMMON_H_ */ \ No newline at end of file diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/mman.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/mman.h new file mode 100644 index 0000000000..a53cafc831 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/mman.h @@ -0,0 +1,15 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+* +***********************************************************************************************************************/ + +#ifndef _MMAN_H_ +#define _MMAN_H_ +#if defined(WIN32) + +void *mmap(void*, size_t, int, int, int, unsigned); +int munmap(void*, size_t); +#endif +#endif // _MMAN_H_ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/param.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/param.h new file mode 100644 index 0000000000..1ce245f3e9 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/param.h @@ -0,0 +1,13 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +#ifndef _SYS_PARAM_H_ +#define _SYS_PARAM_H_ 1 + +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) + +#endif diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/queue.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/queue.h new file mode 100644 index 0000000000..1706d876d9 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/common/win32/sys/queue.h @@ -0,0 +1,671 @@ +/* $NetBSD: queue.h,v 1.45.14.2 2009/06/05 16:23:34 snj Exp $ */ + +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + */ + +#ifndef _ELF_SYS_QUEUE_H_ +#define _ELF_SYS_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * lists, simple queues, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The + * elements are singly linked for minimum space and pointer manipulation + * overhead at the expense of O(n) removal for arbitrary elements. New + * elements can be added to the list after an existing element or at the + * head of the list. Elements being removed from the head of the list + * should use the explicit macro for this purpose for optimum + * efficiency. A singly-linked list may only be traversed in the forward + * direction. 
Singly-linked lists are ideal for applications with large + * datasets and few or no removals or for implementing a LIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A simple queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are singly + * linked to save space, so elements can only be removed from the + * head of the list. New elements can be added to the list after + * an existing element, at the head of the list, or at the end of the + * list. A simple queue may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. + * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * List definitions. 
+ */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. + */ +#if defined(_KERNEL) && defined(QUEUEDEBUG) +#define QUEUEDEBUG_LIST_INSERT_HEAD(head, elm, field) \ + if ((head)->lh_first && \ + (head)->lh_first->field.le_prev != &(head)->lh_first) \ + panic("LIST_INSERT_HEAD %p %s:%d", (head), __FILE__, __LINE__); +#define QUEUEDEBUG_LIST_OP(elm, field) \ + if ((elm)->field.le_next && \ + (elm)->field.le_next->field.le_prev != \ + &(elm)->field.le_next) \ + panic("LIST_* forw %p %s:%d", (elm), __FILE__, __LINE__);\ + if (*(elm)->field.le_prev != (elm)) \ + panic("LIST_* back %p %s:%d", (elm), __FILE__, __LINE__); +#define QUEUEDEBUG_LIST_POSTREMOVE(elm, field) \ + (elm)->field.le_next = (void *)1L; \ + (elm)->field.le_prev = (void *)1L; +#else +#define QUEUEDEBUG_LIST_INSERT_HEAD(head, elm, field) +#define QUEUEDEBUG_LIST_OP(elm, field) +#define QUEUEDEBUG_LIST_POSTREMOVE(elm, field) +#endif + +#define LIST_INIT(head) do { \ + (head)->lh_first = NULL; \ +} while (/*CONSTCOND*/0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QUEUEDEBUG_LIST_OP((listelm), field) \ + if (((elm)->field.le_next = (listelm)->field.le_next) != NULL) \ + (listelm)->field.le_next->field.le_prev = \ + &(elm)->field.le_next; \ + (listelm)->field.le_next = (elm); \ + (elm)->field.le_prev = &(listelm)->field.le_next; \ +} while (/*CONSTCOND*/0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QUEUEDEBUG_LIST_OP((listelm), field) \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + (elm)->field.le_next = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &(elm)->field.le_next; \ +} while (/*CONSTCOND*/0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + 
QUEUEDEBUG_LIST_INSERT_HEAD((head), (elm), field) \ + if (((elm)->field.le_next = (head)->lh_first) != NULL) \ + (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ + (head)->lh_first = (elm); \ + (elm)->field.le_prev = &(head)->lh_first; \ +} while (/*CONSTCOND*/0) + +#define LIST_REMOVE(elm, field) do { \ + QUEUEDEBUG_LIST_OP((elm), field) \ + if ((elm)->field.le_next != NULL) \ + (elm)->field.le_next->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = (elm)->field.le_next; \ + QUEUEDEBUG_LIST_POSTREMOVE((elm), field) \ +} while (/*CONSTCOND*/0) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = ((head)->lh_first); \ + (var); \ + (var) = ((var)->field.le_next)) + +/* + * List access methods. + */ +#define LIST_EMPTY(head) ((head)->lh_first == NULL) +#define LIST_FIRST(head) ((head)->lh_first) +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +/* + * Singly-linked List definitions. + */ +#define ELF_SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define ELF_SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define ELF_SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. 
+ */ +#define ELF_SLIST_INIT(head) do { \ + (head)->slh_first = NULL; \ +} while (/*CONSTCOND*/0) + +#define ELF_SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + (elm)->field.sle_next = (slistelm)->field.sle_next; \ + (slistelm)->field.sle_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define ELF_SLIST_INSERT_HEAD(head, elm, field) do { \ + (elm)->field.sle_next = (head)->slh_first; \ + (head)->slh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define ELF_SLIST_REMOVE_HEAD(head, field) do { \ + (head)->slh_first = (head)->slh_first->field.sle_next; \ +} while (/*CONSTCOND*/0) + +#define ELF_SLIST_REMOVE(head, elm, type, field) do { \ + if ((head)->slh_first == (elm)) { \ + ELF_SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = (head)->slh_first; \ + while(curelm->field.sle_next != (elm)) \ + curelm = curelm->field.sle_next; \ + curelm->field.sle_next = \ + curelm->field.sle_next->field.sle_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define ELF_SLIST_FOREACH(var, head, field) \ + for((var) = (head)->slh_first; (var); (var) = (var)->field.sle_next) + +/* + * Singly-linked List access methods. + */ +#define ELF_SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define ELF_SLIST_FIRST(head) ((head)->slh_first) +#define ELF_SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first; /* first element */ \ + struct type **stqh_last; /* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. 
+ */ +#define STAILQ_INIT(head) do { \ + (head)->stqh_first = NULL; \ + (head)->stqh_last = &(head)->stqh_first; \ +} while (/*CONSTCOND*/0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.stqe_next = (head)->stqh_first) == NULL) \ + (head)->stqh_last = &(elm)->field.stqe_next; \ + (head)->stqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.stqe_next = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &(elm)->field.stqe_next; \ +} while (/*CONSTCOND*/0) + +#define STAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if (((elm)->field.stqe_next = (listelm)->field.stqe_next) == NULL)\ + (head)->stqh_last = &(elm)->field.stqe_next; \ + (listelm)->field.stqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if (((head)->stqh_first = (head)->stqh_first->field.stqe_next) == NULL) \ + (head)->stqh_last = &(head)->stqh_first; \ +} while (/*CONSTCOND*/0) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if ((head)->stqh_first == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } else { \ + struct type *curelm = (head)->stqh_first; \ + while (curelm->field.stqe_next != (elm)) \ + curelm = curelm->field.stqe_next; \ + if ((curelm->field.stqe_next = \ + curelm->field.stqe_next->field.stqe_next) == NULL) \ + (head)->stqh_last = &(curelm)->field.stqe_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = ((head)->stqh_first); \ + (var); \ + (var) = ((var)->field.stqe_next)) + +/* + * Singly-linked Tail queue access methods. + */ +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) +#define STAILQ_FIRST(head) ((head)->stqh_first) +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +/* + * Simple queue definitions. 
+ */ +#define SIMPLEQ_HEAD(name, type) \ +struct name { \ + struct type *sqh_first; /* first element */ \ + struct type **sqh_last; /* addr of last next element */ \ +} + +#define SIMPLEQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).sqh_first } + +#define SIMPLEQ_ENTRY(type) \ +struct { \ + struct type *sqe_next; /* next element */ \ +} + +/* + * Simple queue functions. + */ +#define SIMPLEQ_INIT(head) do { \ + (head)->sqh_first = NULL; \ + (head)->sqh_last = &(head)->sqh_first; \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_INSERT_HEAD(head, elm, field) do { \ + if (((elm)->field.sqe_next = (head)->sqh_first) == NULL) \ + (head)->sqh_last = &(elm)->field.sqe_next; \ + (head)->sqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_INSERT_TAIL(head, elm, field) do { \ + (elm)->field.sqe_next = NULL; \ + *(head)->sqh_last = (elm); \ + (head)->sqh_last = &(elm)->field.sqe_next; \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if (((elm)->field.sqe_next = (listelm)->field.sqe_next) == NULL)\ + (head)->sqh_last = &(elm)->field.sqe_next; \ + (listelm)->field.sqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_REMOVE_HEAD(head, field) do { \ + if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL) \ + (head)->sqh_last = &(head)->sqh_first; \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_REMOVE(head, elm, type, field) do { \ + if ((head)->sqh_first == (elm)) { \ + SIMPLEQ_REMOVE_HEAD((head), field); \ + } else { \ + struct type *curelm = (head)->sqh_first; \ + while (curelm->field.sqe_next != (elm)) \ + curelm = curelm->field.sqe_next; \ + if ((curelm->field.sqe_next = \ + curelm->field.sqe_next->field.sqe_next) == NULL) \ + (head)->sqh_last = &(curelm)->field.sqe_next; \ + } \ +} while (/*CONSTCOND*/0) + +#define SIMPLEQ_FOREACH(var, head, field) \ + for ((var) = ((head)->sqh_first); \ + (var); \ + (var) = ((var)->field.sqe_next)) + +/* + * Simple queue access methods. 
+ */ +#define SIMPLEQ_EMPTY(head) ((head)->sqh_first == NULL) +#define SIMPLEQ_FIRST(head) ((head)->sqh_first) +#define SIMPLEQ_NEXT(elm, field) ((elm)->field.sqe_next) + +/* + * Tail queue definitions. + */ +#define _TAILQ_HEAD(name, type, qual) \ +struct name { \ + qual type *tqh_first; /* first element */ \ + qual type *qual *tqh_last; /* addr of last next element */ \ +} +#define TAILQ_HEAD(name, type) _TAILQ_HEAD(name, struct type,) + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define _TAILQ_ENTRY(type, qual) \ +struct { \ + qual type *tqe_next; /* next element */ \ + qual type *qual *tqe_prev; /* address of previous next element */\ +} +#define TAILQ_ENTRY(type) _TAILQ_ENTRY(struct type,) + +/* + * Tail queue functions. + */ +#if defined(_KERNEL) && defined(QUEUEDEBUG) +#define QUEUEDEBUG_TAILQ_INSERT_HEAD(head, elm, field) \ + if ((head)->tqh_first && \ + (head)->tqh_first->field.tqe_prev != &(head)->tqh_first) \ + panic("TAILQ_INSERT_HEAD %p %s:%d", (head), __FILE__, __LINE__); +#define QUEUEDEBUG_TAILQ_INSERT_TAIL(head, elm, field) \ + if (*(head)->tqh_last != NULL) \ + panic("TAILQ_INSERT_TAIL %p %s:%d", (head), __FILE__, __LINE__); +#define QUEUEDEBUG_TAILQ_OP(elm, field) \ + if ((elm)->field.tqe_next && \ + (elm)->field.tqe_next->field.tqe_prev != \ + &(elm)->field.tqe_next) \ + panic("TAILQ_* forw %p %s:%d", (elm), __FILE__, __LINE__);\ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("TAILQ_* back %p %s:%d", (elm), __FILE__, __LINE__); +#define QUEUEDEBUG_TAILQ_PREREMOVE(head, elm, field) \ + if ((elm)->field.tqe_next == NULL && \ + (head)->tqh_last != &(elm)->field.tqe_next) \ + panic("TAILQ_PREREMOVE head %p elm %p %s:%d", \ + (head), (elm), __FILE__, __LINE__); +#define QUEUEDEBUG_TAILQ_POSTREMOVE(elm, field) \ + (elm)->field.tqe_next = (void *)1L; \ + (elm)->field.tqe_prev = (void *)1L; +#else +#define QUEUEDEBUG_TAILQ_INSERT_HEAD(head, elm, field) +#define QUEUEDEBUG_TAILQ_INSERT_TAIL(head, elm, field) +#define 
QUEUEDEBUG_TAILQ_OP(elm, field) +#define QUEUEDEBUG_TAILQ_PREREMOVE(head, elm, field) +#define QUEUEDEBUG_TAILQ_POSTREMOVE(elm, field) +#endif + +#define TAILQ_INIT(head) do { \ + (head)->tqh_first = NULL; \ + (head)->tqh_last = &(head)->tqh_first; \ +} while (/*CONSTCOND*/0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QUEUEDEBUG_TAILQ_INSERT_HEAD((head), (elm), field) \ + if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \ + (head)->tqh_first->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (head)->tqh_first = (elm); \ + (elm)->field.tqe_prev = &(head)->tqh_first; \ +} while (/*CONSTCOND*/0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QUEUEDEBUG_TAILQ_INSERT_TAIL((head), (elm), field) \ + (elm)->field.tqe_next = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &(elm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QUEUEDEBUG_TAILQ_OP((listelm), field) \ + if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\ + (elm)->field.tqe_next->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (listelm)->field.tqe_next = (elm); \ + (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QUEUEDEBUG_TAILQ_OP((listelm), field) \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + (elm)->field.tqe_next = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \ +} while (/*CONSTCOND*/0) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QUEUEDEBUG_TAILQ_PREREMOVE((head), (elm), field) \ + QUEUEDEBUG_TAILQ_OP((elm), field) \ + if (((elm)->field.tqe_next) != NULL) \ + (elm)->field.tqe_next->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = 
(elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = (elm)->field.tqe_next; \ + QUEUEDEBUG_TAILQ_POSTREMOVE((elm), field); \ +} while (/*CONSTCOND*/0) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = ((head)->tqh_first); \ + (var); \ + (var) = ((var)->field.tqe_next)) + +#define TAILQ_FOREACH_SAFE(var, head, field, next) \ + for ((var) = ((head)->tqh_first); \ + (var) != NULL && ((next) = TAILQ_NEXT(var, field), 1); \ + (var) = (next)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = (*(((struct headname *)((head)->tqh_last))->tqh_last)); \ + (var); \ + (var) = (*(((struct headname *)((var)->field.tqe_prev))->tqh_last))) + +/* + * Tail queue access methods. + */ +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) +#define TAILQ_FIRST(head) ((head)->tqh_first) +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +/* + * Circular queue definitions. 
+ */ +#if defined(_KERNEL) && defined(QUEUEDEBUG) +#define QUEUEDEBUG_CIRCLEQ_HEAD(head, field) \ + if ((head)->cqh_first != (void *)(head) && \ + (head)->cqh_first->field.cqe_prev != (void *)(head)) \ + panic("CIRCLEQ head forw %p %s:%d", (head), \ + __FILE__, __LINE__); \ + if ((head)->cqh_last != (void *)(head) && \ + (head)->cqh_last->field.cqe_next != (void *)(head)) \ + panic("CIRCLEQ head back %p %s:%d", (head), \ + __FILE__, __LINE__); +#define QUEUEDEBUG_CIRCLEQ_ELM(head, elm, field) \ + if ((elm)->field.cqe_next == (void *)(head)) { \ + if ((head)->cqh_last != (elm)) \ + panic("CIRCLEQ elm last %p %s:%d", (elm), \ + __FILE__, __LINE__); \ + } else { \ + if ((elm)->field.cqe_next->field.cqe_prev != (elm)) \ + panic("CIRCLEQ elm forw %p %s:%d", (elm), \ + __FILE__, __LINE__); \ + } \ + if ((elm)->field.cqe_prev == (void *)(head)) { \ + if ((head)->cqh_first != (elm)) \ + panic("CIRCLEQ elm first %p %s:%d", (elm), \ + __FILE__, __LINE__); \ + } else { \ + if ((elm)->field.cqe_prev->field.cqe_next != (elm)) \ + panic("CIRCLEQ elm prev %p %s:%d", (elm), \ + __FILE__, __LINE__); \ + } +#define QUEUEDEBUG_CIRCLEQ_POSTREMOVE(elm, field) \ + (elm)->field.cqe_next = (void *)1L; \ + (elm)->field.cqe_prev = (void *)1L; +#else +#define QUEUEDEBUG_CIRCLEQ_HEAD(head, field) +#define QUEUEDEBUG_CIRCLEQ_ELM(head, elm, field) +#define QUEUEDEBUG_CIRCLEQ_POSTREMOVE(elm, field) +#endif + +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } /* empty queue: both ends point at the head itself */ + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_INIT(head) do { \ + (head)->cqh_first = (void *)(head); \ + (head)->cqh_last = (void *)(head); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \ + QUEUEDEBUG_CIRCLEQ_ELM((head), (listelm), field) \ + (elm)->field.cqe_next = (listelm)->field.cqe_next; \ + (elm)->field.cqe_prev = (listelm); \ + if ((listelm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (listelm)->field.cqe_next->field.cqe_prev = (elm); \ + (listelm)->field.cqe_next = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \ + QUEUEDEBUG_CIRCLEQ_ELM((head), (listelm), field) \ + (elm)->field.cqe_next = (listelm); \ + (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ + if ((listelm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (listelm)->field.cqe_prev->field.cqe_next = (elm); \ + (listelm)->field.cqe_prev = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \ + (elm)->field.cqe_next = (head)->cqh_first; \ + (elm)->field.cqe_prev = (void *)(head); \ + if ((head)->cqh_last == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (head)->cqh_first->field.cqe_prev = (elm); \ + (head)->cqh_first = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \ + (elm)->field.cqe_next = (void *)(head); \ + (elm)->field.cqe_prev = (head)->cqh_last; \ + if ((head)->cqh_first == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (head)->cqh_last->field.cqe_next = (elm); \ + (head)->cqh_last = (elm); \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + QUEUEDEBUG_CIRCLEQ_HEAD((head), field) \ + QUEUEDEBUG_CIRCLEQ_ELM((head), (elm), field) \ + if 
((elm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm)->field.cqe_prev; \ + else \ + (elm)->field.cqe_next->field.cqe_prev = \ + (elm)->field.cqe_prev; \ + if ((elm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm)->field.cqe_next; \ + else \ + (elm)->field.cqe_prev->field.cqe_next = \ + (elm)->field.cqe_next; \ + QUEUEDEBUG_CIRCLEQ_POSTREMOVE((elm), field) \ +} while (/*CONSTCOND*/0) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = ((head)->cqh_first); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_next)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = ((head)->cqh_last); \ + (var) != (const void *)(head); \ + (var) = ((var)->field.cqe_prev)) + +/* + * Circular queue access methods. + */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) +#define CIRCLEQ_LAST(head) ((head)->cqh_last) +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_LOOP_NEXT(head, elm, field) \ + (((elm)->field.cqe_next == (void *)(head)) \ + ? ((head)->cqh_first) \ + : ((elm)->field.cqe_next)) /* wraps from the last element to the first */ +#define CIRCLEQ_LOOP_PREV(head, elm, field) \ + (((elm)->field.cqe_prev == (void *)(head)) \ + ? ((head)->cqh_last) \ + : ((elm)->field.cqe_prev)) /* wraps from the first element to the last */ + +#endif /* !_ELF_SYS_QUEUE_H_ */ + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/CMakeLists.txt b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/CMakeLists.txt new file mode 100644 index 0000000000..8c0c0afab2 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/CMakeLists.txt @@ -0,0 +1,29 @@ +## +######################################################################################################################## +# +# Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. 
+# +######################################################################################################################## + +cmake_minimum_required(VERSION 3.10) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../common) +if(WIN32) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../common/win32) +else() +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../common/win32/sys) +endif() + +add_definitions(-DBSD_LIBELF -DUSE_MEMFILE) + +file(GLOB sources + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.c + ${CMAKE_CURRENT_SOURCE_DIR}/*.h +) +add_library(oclelf STATIC ${sources}) +set_target_properties(oclelf PROPERTIES POSITION_INDEPENDENT_CODE ON) + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README new file mode 100644 index 0000000000..726fcc92e4 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README @@ -0,0 +1,12 @@ +# $FreeBSD$ +# $NetBSD$ + +libelf: a BSD-licensed implementation of the ELF(3)/GELF(3) API. + +Documentation: + * Manual page elf.3 contains an overview of the library. Other + manual pages document individual APIs in the library. 
+ * A tutorial "libelf by Example" is available at: + http://people.freebsd.org/~jkoshy/download/libelf/article.html + +For ongoing development please see http://elftoolchain.sourceforge.net/ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README.build b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README.build new file mode 100644 index 0000000000..6fc04c64e9 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/README.build @@ -0,0 +1,64 @@ +: README.build -- notes on the build process + +The "Makefile" in this directory uses BSD make(1) syntax. If you are +trying to build this library on a platform that does not have a +pre-built BSD compatible make(1), then you could try porting NetBSD's +make(1). NetBSD's make(1) is available at: + + http://www.crufty.net/help/sjg/bmake.html + + +: Supporting cross builds + +In the general case, libelf may be built for a target operating system +and machine architecture that is different from the host operating +system and machine architecture that the compilation is happening on. +For example, compilation could be running on a Linux/i386 host, with +target binaries being created for a NetBSD/sparc64 system. + +To support cross building: +- The top-level "Makefile" pulls in the appropriate make rules for the + target system. + + Inside of makefiles, we determine the target OS by looking at the + contents of the ${unix} make variable. The top-level makefile then + includes any target specific makefiles if they exist. + +- Operating systems differ in the names and locations of the headers + where their ELF types are defined. They also differ in the set of + ELF types supported. Inside of libelf's implementation these + differences are abstracted out by the auxiliary header + "_libelf_config.h". 
+ + +: OS Specific Configuration : + +:: Debian :: + +The following packages are needed for the build: +- `build-essential' +- `m4' +- `freebsd-buildutils' or `freebsd5-buildutils' + +You would need to use `freebsd-make' instead of GNU make to build +the tools. You would also need to place /usr/lib/freebsd in the +shell's `PATH', preferably at the beginning. + +:: FreeBSD :: + +libelf should build out of the box on FreeBSD versions later than 6.0. + +:: NetBSD :: + +libelf should build out of the box on NetBSD versions later than 4.0. + +:: Ubuntu :: + +See the section on 'Debian' above. + + +: Porting resources on the 'net + +The 'predef' project [http://predef.sourceforge.net/] has a +comprehensive list of CPP macros predefined by various OSes. + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/Version.map b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/Version.map new file mode 100644 index 0000000000..2c595eafba --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/Version.map @@ -0,0 +1,97 @@ +/* + * $Id: Version.map 2033 2011-10-23 09:21:13Z jkoshy $ + * + * $FreeBSD: src/lib/libelf/Version.map,v 1.3 2007/04/29 14:05:22 deischen Exp $ + */ +FBSD_1.0 { +global: + elf32_checksum; + elf32_fsize; + elf32_getehdr; + elf32_getphdr; + elf32_getshdr; + elf32_newehdr; + elf32_newphdr; + elf32_xlatetof; + elf32_xlatetom; + elf64_checksum; + elf64_fsize; + elf64_getehdr; + elf64_getphdr; + elf64_getshdr; + elf64_newehdr; + elf64_newphdr; + elf64_xlatetof; + elf64_xlatetom; + elf_begin; + elf_cntl; + elf_end; + elf_errmsg; + elf_errno; + elf_fill; + elf_flagarhdr; + elf_flagdata; + elf_flagehdr; + elf_flagelf; + elf_flagphdr; + elf_flagscn; + elf_flagshdr; + elf_getarhdr; + elf_getarsym; + elf_getbase; + elf_getdata; + elf_getident; + elf_getscn; + elf_getphdrnum; + elf_getphnum; + elf_getshdrnum; + elf_getshnum; + elf_getshdrstrndx; + elf_getshstrndx; + elf_hash; + 
elf_kind; + elf_memory; + elf_ndxscn; + elf_newdata; + elf_newscn; + elf_next; + elf_nextscn; + elf_rand; + elf_rawdata; + elf_rawfile; + elf_setshstrndx; + elf_strptr; + elf_update; + elf_version; + gelf_checksum; + gelf_fsize; + gelf_getcap; + gelf_getclass; + gelf_getdyn; + gelf_getehdr; + gelf_getmove; + gelf_getphdr; + gelf_getrel; + gelf_getrela; + gelf_getshdr; + gelf_getsym; + gelf_getsyminfo; + gelf_getsymshndx; + gelf_newehdr; + gelf_newphdr; + gelf_update_cap; + gelf_update_dyn; + gelf_update_ehdr; + gelf_update_move; + gelf_update_phdr; + gelf_update_rel; + gelf_update_rela; + gelf_update_shdr; + gelf_update_sym; + gelf_update_syminfo; + gelf_update_symshndx; + gelf_xlatetof; + gelf_xlatetom; +local: + *; +}; diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf.h new file mode 100644 index 0000000000..8a44198e95 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf.h @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 2006,2008-2011 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: _libelf.h 1921 2011-09-23 08:04:02Z jkoshy $ + */ + +#ifndef __LIBELF_H_ +#define __LIBELF_H_ + +#include +#include + +#include "_libelf_config.h" + +#include "_elftc.h" + +/* + * Library-private data structures. + */ + +#define LIBELF_MSG_SIZE 256 + +struct _libelf_globals { + int libelf_arch; + unsigned int libelf_byteorder; + int libelf_class; + int libelf_error; + int libelf_fillchar; + unsigned int libelf_version; + char libelf_msg[LIBELF_MSG_SIZE]; +}; + +extern struct _libelf_globals _libelf; + +#define LIBELF_PRIVATE(N) (_libelf.libelf_##N) + +#define LIBELF_ELF_ERROR_MASK 0xFF +#define LIBELF_OS_ERROR_SHIFT 8 + +#define LIBELF_SET_ERROR(E, O) do { \ + LIBELF_PRIVATE(error) = ((ELF_E_##E & LIBELF_ELF_ERROR_MASK)| \ + ((O) << LIBELF_OS_ERROR_SHIFT)); \ + } while (0) + +#define LIBELF_ADJUST_AR_SIZE(S) (((S) + 1U) & ~1U) + +/* + * Flags for library internal use. These use the upper 16 bits of the + * `e_flags' field. + */ +#define LIBELF_F_API_MASK 0x00FFFF /* Flags defined by the API. 
*/ +#define LIBELF_F_AR_HEADER 0x010000 /* translated header available */ +#define LIBELF_F_AR_VARIANT_SVR4 0x020000 /* SVR4 style ar(1) archive */ +#define LIBELF_F_DATA_MALLOCED 0x040000 /* whether data was malloc'ed */ +#define LIBELF_F_RAWFILE_MALLOC 0x080000 /* whether e_rawfile was malloc'ed */ +#define LIBELF_F_RAWFILE_MMAP 0x100000 /* whether e_rawfile was mmap'ed */ +#define LIBELF_F_SHDRS_LOADED 0x200000 /* whether all shdrs were read in */ +#define LIBELF_F_SPECIAL_FILE 0x400000 /* non-regular file */ +struct _Elf_Mem { + void (*dealloc)(void*); + void* (*alloc)(size_t); +}; + +struct _Elf { + int e_activations; /* activation count */ + unsigned int e_byteorder; /* ELFDATA* */ + int e_class; /* ELFCLASS* */ + Elf_Cmd e_cmd; /* ELF_C_* used at creation time */ + int e_fd; /* associated file descriptor */ + unsigned int e_flags; /* ELF_F_* & LIBELF_F_* flags */ + Elf_Kind e_kind; /* ELF_K_* */ + Elf *e_parent; /* non-NULL for archive members */ + char *e_rawfile; /* uninterpreted bytes */ + size_t e_rawsize; /* size of uninterpreted bytes */ + unsigned int e_version; /* file version */ + /* AMD Memory interface */ + struct _Elf_Mem e_mem; + + /* + * Header information for archive members. See the + * LIBELF_F_AR_HEADER flag. 
+ */ + union { + Elf_Arhdr *e_arhdr; /* translated header */ + char *e_rawhdr; /* untranslated header */ + } e_hdr; + + union { + struct { /* ar(1) archives */ + off_t e_next; /* set by elf_rand()/elf_next() */ + int e_nchildren; + char *e_rawstrtab; /* file name strings */ + size_t e_rawstrtabsz; + char *e_rawsymtab; /* symbol table */ + size_t e_rawsymtabsz; + Elf_Arsym *e_symtab; + size_t e_symtabsz; + } e_ar; + struct { /* regular ELF files */ + union { + Elf32_Ehdr *e_ehdr32; + Elf64_Ehdr *e_ehdr64; + } e_ehdr; + union { + Elf32_Phdr *e_phdr32; + Elf64_Phdr *e_phdr64; + } e_phdr; + STAILQ_HEAD(, _Elf_Scn) e_scn; /* section list */ + size_t e_nphdr; /* number of Phdr entries */ + size_t e_nscn; /* number of sections */ + size_t e_strndx; /* string table section index */ + } e_elf; + } e_u; +}; + +struct _Elf_Scn { + union { + Elf32_Shdr s_shdr32; + Elf64_Shdr s_shdr64; + } s_shdr; + STAILQ_HEAD(, _Elf_Data) s_data; /* list of Elf_Data descriptors */ + STAILQ_HEAD(, _Elf_Data) s_rawdata; /* raw data for this section */ + STAILQ_ENTRY(_Elf_Scn) s_next; + struct _Elf *s_elf; /* parent ELF descriptor */ + unsigned int s_flags; /* flags for the section as a whole */ + size_t s_ndx; /* index# for this section */ + uint64_t s_offset; /* managed by elf_update() */ + uint64_t s_rawoff; /* original offset in the file */ + uint64_t s_size; /* managed by elf_update() */ +}; + +enum { + ELF_TOFILE, + ELF_TOMEMORY +}; + +#define LIBELF_COPY_U32(DST,SRC,NAME) do { \ + if ((SRC)->NAME > UINT_MAX) { \ + LIBELF_SET_ERROR(RANGE, 0); \ + return (0); \ + } \ + (DST)->NAME = (SRC)->NAME; \ + } while (0) + +#define LIBELF_COPY_S32(DST,SRC,NAME) do { \ + if ((SRC)->NAME > INT_MAX || \ + (SRC)->NAME < INT_MIN) { \ + LIBELF_SET_ERROR(RANGE, 0); \ + return (0); \ + } \ + (DST)->NAME = (SRC)->NAME; \ + } while (0) + +/* + * Function Prototypes. 
+ */ + +__BEGIN_DECLS +Elf_Data *_libelf_allocate_data(Elf_Scn *_s); +Elf *_libelf_allocate_elf(Elf_Mem *mem); +Elf_Scn *_libelf_allocate_scn(Elf *_e, size_t _ndx); +Elf_Arhdr *_libelf_ar_gethdr(Elf *_e); +Elf *_libelf_ar_open(Elf *_e); +Elf *_libelf_ar_open_member(int _fd, Elf_Cmd _c, Elf *_ar, Elf_Mem *mem); +int _libelf_ar_get_member(char *_s, size_t _sz, int _base, size_t *_ret); +Elf_Arsym *_libelf_ar_process_bsd_symtab(Elf *_ar, size_t *_dst); +Elf_Arsym *_libelf_ar_process_svr4_symtab(Elf *_ar, size_t *_dst); +unsigned long _libelf_checksum(Elf *_e, int _elfclass); +void *_libelf_ehdr(Elf *_e, int _elfclass, int _allocate); +int _libelf_falign(Elf_Type _t, int _elfclass); +size_t _libelf_fsize(Elf_Type _t, int _elfclass, unsigned int _version, + size_t count); +int (*_libelf_get_translator(Elf_Type _t, int _direction, int _elfclass)) + (char *_dst, size_t dsz, char *_src, size_t _cnt, int _byteswap); +void *_libelf_getphdr(Elf *_e, int _elfclass); +void *_libelf_getshdr(Elf_Scn *_scn, int _elfclass); +void _libelf_init_elf(Elf *_e, Elf_Kind _kind); +int _libelf_load_section_headers(Elf *e, void *ehdr); +int _libelf_malign(Elf_Type _t, int _elfclass); +size_t _libelf_msize(Elf_Type _t, int _elfclass, unsigned int _version); +void *_libelf_newphdr(Elf *_e, int _elfclass, size_t _count); +Elf_Data *_libelf_release_data(Elf_Data *_d); +Elf *_libelf_release_elf(Elf *_e); +Elf_Scn *_libelf_release_scn(Elf_Scn *_s); +int _libelf_setphnum(Elf *_e, void *_eh, int _elfclass, size_t _phnum); +int _libelf_setshnum(Elf *_e, void *_eh, int _elfclass, size_t _shnum); +int _libelf_setshstrndx(Elf *_e, void *_eh, int _elfclass, + size_t _shstrndx); +Elf_Data *_libelf_xlate(Elf_Data *_d, const Elf_Data *_s, + unsigned int _encoding, int _elfclass, int _direction); +int _libelf_xlate_shtype(uint32_t _sht); +__END_DECLS + +#endif /* __LIBELF_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_ar.h 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_ar.h new file mode 100644 index 0000000000..3bc60cadfc --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_ar.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: _libelf_ar.h 2032 2011-10-23 09:07:00Z jkoshy $ + */ + +#ifndef __LIBELF_AR_H_ +#define __LIBELF_AR_H_ + +/* + * Prototypes and declarations needed by libelf's ar(1) archive + * handling code. 
+ */ + +#include + +#define LIBELF_AR_BSD_EXTENDED_NAME_PREFIX "#1/" +#define LIBELF_AR_BSD_SYMTAB_NAME "__.SYMDEF" +#define LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE \ + (sizeof(LIBELF_AR_BSD_EXTENDED_NAME_PREFIX) - 1) + +#define IS_EXTENDED_BSD_NAME(NAME) \ + (strncmp((NAME), LIBELF_AR_BSD_EXTENDED_NAME_PREFIX, \ + LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE) == 0) + +char *_libelf_ar_get_string(const char *_buf, size_t _sz, int _rawname, + int _svr4names); +char *_libelf_ar_get_raw_name(const struct ar_hdr *_arh); +char *_libelf_ar_get_translated_name(const struct ar_hdr *_arh, Elf *_ar); +int _libelf_ar_get_number(const char *_buf, size_t _sz, int _base, + size_t *_ret); + +#endif /* __LIBELF_AR_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_config.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_config.h new file mode 100644 index 0000000000..28e6943a4f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/_libelf_config.h @@ -0,0 +1,288 @@ +/*- + * Copyright (c) 2008-2011 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: _libelf_config.h 2032 2011-10-23 09:07:00Z jkoshy $ + */ + +#ifdef __FreeBSD__ + +#define LIBELF_VCSID(ID) __FBSDID(ID) + +/* + * Define LIBELF_{ARCH,BYTEORDER,CLASS} based on the machine architecture. + * See also: . + */ + +#if defined(__amd64__) + +#define LIBELF_ARCH EM_X86_64 +#define LIBELF_BYTEORDER ELFDATA2LSB +#define LIBELF_CLASS ELFCLASS64 + +#elif defined(__arm__) + +#define LIBELF_ARCH EM_ARM +#if defined(__ARMEB__) /* Big-endian ARM. */ +#define LIBELF_BYTEORDER ELFDATA2MSB +#else +#define LIBELF_BYTEORDER ELFDATA2LSB +#endif +#define LIBELF_CLASS ELFCLASS32 + +#elif defined(__i386__) + +#define LIBELF_ARCH EM_386 +#define LIBELF_BYTEORDER ELFDATA2LSB +#define LIBELF_CLASS ELFCLASS32 + +#elif defined(__ia64__) + +#define LIBELF_ARCH EM_IA_64 +#define LIBELF_BYTEORDER ELFDATA2LSB +#define LIBELF_CLASS ELFCLASS64 + +#elif defined(__mips__) + +#define LIBELF_ARCH EM_MIPS +#if defined(__MIPSEB__) +#define LIBELF_BYTEORDER ELFDATA2MSB +#else +#define LIBELF_BYTEORDER ELFDATA2LSB +#endif +#define LIBELF_CLASS ELFCLASS32 + +#elif defined(__powerpc__) + +#define LIBELF_ARCH EM_PPC +#define LIBELF_BYTEORDER ELFDATA2MSB +#define LIBELF_CLASS ELFCLASS32 + +#elif defined(__sparc__) + +#define LIBELF_ARCH EM_SPARCV9 +#define LIBELF_BYTEORDER ELFDATA2MSB +#define LIBELF_CLASS ELFCLASS64 + +#else +#error Unknown FreeBSD architecture. 
+#endif +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ + +#include + +#define LIBELF_VCSID(ID) __RCSID(ID) + +#if !defined(ARCH_ELFSIZE) +#error ARCH_ELFSIZE is not defined. +#endif + +#if ARCH_ELFSIZE == 32 +#define LIBELF_ARCH ELF32_MACHDEP_ID +#define LIBELF_BYTEORDER ELF32_MACHDEP_ENDIANNESS +#define LIBELF_CLASS ELFCLASS32 +#define Elf_Note Elf32_Nhdr +#else +#define LIBELF_ARCH ELF64_MACHDEP_ID +#define LIBELF_BYTEORDER ELF64_MACHDEP_ENDIANNESS +#define LIBELF_CLASS ELFCLASS64 +#define Elf_Note Elf64_Nhdr +#endif + +#endif /* __NetBSD__ */ + +/* + * GNU & Linux compatibility. + * + * `__linux__' is defined in an environment runs the Linux kernel and glibc. + * `__GNU__' is defined in an environment runs a GNU kernel (Hurd) and glibc. + * `__GLIBC__' is defined for an environment that runs glibc over a non-GNU + * kernel such as GNU/kFreeBSD. + */ + +#if defined(__linux__) || defined(__GNU__) || defined(__GLIBC__) + +#if defined(__linux__) + +#include "native-elf-format.h" + +#define LIBELF_CLASS ELFTC_CLASS +#define LIBELF_ARCH ELFTC_ARCH +#define LIBELF_BYTEORDER ELFTC_BYTEORDER + +#endif /* defined(__linux__) */ + +#define LIBELF_VCSID(ID) + +#if LIBELF_CLASS == ELFCLASS32 +#define Elf_Note Elf32_Nhdr +#elif LIBELF_CLASS == ELFCLASS64 +#define Elf_Note Elf64_Nhdr +#else +#error LIBELF_CLASS needs to be one of ELFCLASS32 or ELFCLASS64 +#endif + +#define roundup2 roundup + +#endif /* defined(__linux__) || defined(__GNU__) || defined(__GLIBC__) */ +/* + * Common configuration for the GNU environment. 
+ */
+
+#define LIBELF_CONFIG_ADDR 1
+#define LIBELF_CONFIG_BYTE 1
+#define LIBELF_CONFIG_DYN 1
+#define LIBELF_CONFIG_EHDR 1
+#define LIBELF_CONFIG_HALF 1
+#define LIBELF_CONFIG_MOVEP 1
+#define LIBELF_CONFIG_NOTE 1
+#define LIBELF_CONFIG_OFF 1
+#define LIBELF_CONFIG_PHDR 1
+#define LIBELF_CONFIG_REL 1
+#define LIBELF_CONFIG_RELA 1
+#define LIBELF_CONFIG_SHDR 1
+#define LIBELF_CONFIG_SWORD 1
+#define LIBELF_CONFIG_SXWORD 1
+#define LIBELF_CONFIG_SYM 1
+#define LIBELF_CONFIG_VDEF 1
+#define LIBELF_CONFIG_VNEED 1
+#define LIBELF_CONFIG_WORD 1
+#define LIBELF_CONFIG_XWORD 1
+
+#if defined(WIN32)
+
+#include "native-elf-format.h"
+
+#define LIBELF_CLASS ELFTC_CLASS
+#define LIBELF_ARCH ELFTC_ARCH
+#define LIBELF_BYTEORDER ELFTC_BYTEORDER
+
+#define LIBELF_CONFIG_ADDR 1
+#define LIBELF_CONFIG_BYTE 1
+#define LIBELF_CONFIG_DYN 1
+#define LIBELF_CONFIG_EHDR 1
+#define LIBELF_CONFIG_HALF 1
+#define LIBELF_CONFIG_MOVEP 1
+#define LIBELF_CONFIG_OFF 1
+#define LIBELF_CONFIG_PHDR 1
+#define LIBELF_CONFIG_REL 1
+#define LIBELF_CONFIG_RELA 1
+#define LIBELF_CONFIG_SHDR 1
+#define LIBELF_CONFIG_SWORD 1
+#define LIBELF_CONFIG_SXWORD 1
+#define LIBELF_CONFIG_SYM 1
+#define LIBELF_CONFIG_WORD 1
+#define LIBELF_CONFIG_XWORD 1
+
+#define LIBELF_VCSID(ID)
+
+#define roundup2 roundup
+#endif // defined(WIN32)
+#ifndef LIBELF_CONFIG_GNUHASH
+#define LIBELF_CONFIG_GNUHASH 1
+
+/*
+ * The header for GNU-style hash sections.
+ */
+
+typedef struct {
+	u_int32_t gh_nbuckets; /* Number of hash buckets. */
+	u_int32_t gh_symndx; /* First visible symbol in .dynsym. */
+	u_int32_t gh_maskwords; /* #maskwords used in bloom filter. */
+	u_int32_t gh_shift2; /* Bloom filter shift count. */
+} Elf_GNU_Hash_Header;
+#endif
+
+/*
+ * File-access redirection.
+ *
+ * With USE_MEMFILE defined, the file primitives named below are mapped
+ * onto the mem_*() functions declared in "memfile.h".  Every mapping is
+ * skipped when a macro of the same name already exists, so a translation
+ * unit can pre-define its own replacement.
+ */
+#if defined(USE_MEMFILE)
+#include "memfile.h"
+
+#if !defined(read)
+#define read(f, b, l) mem_read((f), (b), (l))
+#endif
+
+#if !defined(write)
+#define write(f, b, l) mem_write((f), (b), (l))
+#endif
+
+#if !defined(lseek)
+#define lseek(f, l, w) mem_lseek((f), (l), (w))
+#endif
+
+#if !defined(fstat)
+#define fstat(f, b) mem_fstat((f), (struct stat*)(b))
+#endif
+
+#if !defined(_fstat64i32)
+#define _fstat64i32(f, b) mem_fstat((f), (struct stat*)(b))
+#endif
+
+#if !defined(_fstat32i64)
+#define _fstat32i64(f, b) mem_fstat((f), (struct stat*)(b))
+#endif
+
+#if !defined(_fstat32)
+#define _fstat32(f, b) mem_fstat((f), (struct stat*)(b))
+#endif
+
+#if !defined(_fstat64)
+#define _fstat64(f, b) mem_fstat((f), (struct stat*)(b))
+#endif
+
+#if !defined(ftruncate)
+#define ftruncate(f, l) mem_ftruncate((f), (size_t)(l))
+#endif
+
+#if !defined(_chsize)
+#define _chsize(f, l) mem_ftruncate((f), (size_t)(l))
+#endif
+
+#if !defined(mmap)
+#define mmap mem_mmap
+#endif
+
+/* Guarded on the macro being defined, like every redirection above. */
+#if !defined(munmap)
+#define munmap mem_munmap
+#endif
+
+#else // !USE_MEMFILE
+
+/*
+ * Without USE_MEMFILE, mmap/munmap resolve to w32_mmap/w32_munmap on
+ * WIN32 and to themselves everywhere else.
+ */
+#if !defined(mmap)
+#if defined(WIN32)
+#define mmap w32_mmap
+#else
+#define mmap mmap
+#endif
+#endif
+
+#if !defined(munmap)
+#if defined(WIN32)
+#define munmap w32_munmap
+#else
+#define munmap munmap
+#endif
+#endif
+
+#endif //USE_MEMFILE
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf.c
new file mode 100644
index 0000000000..9028aed367
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf.c
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2006,2008,2011 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: elf.c 1345 2011-01-01 11:17:52Z jkoshy $");
+
+/*
+ * The single file-scope instance of the library's global state.
+ *
+ * The initializer is positional; the commented-out designators name the
+ * member each value is meant for.  Architecture, byte order and class
+ * come from the build-time LIBELF_* configuration macros, the error and
+ * fill-character members start at zero, and the API version starts at
+ * EV_NONE.  The trailing `{ 0 }' zero-fills the last member
+ * (NOTE(review): presumably the error-message buffer used by
+ * elf_errmsg() -- confirm against the struct declaration).
+ */
+struct _libelf_globals _libelf = {
+	/*.libelf_arch = */LIBELF_ARCH,
+	/*.libelf_byteorder = */LIBELF_BYTEORDER,
+	/*.libelf_class = */LIBELF_CLASS,
+	/*.libelf_error = */0,
+	/*.libelf_fillchar = */0,
+	/*.libelf_version = */EV_NONE,
+	{ 0 }
+};
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_begin.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_begin.c
new file mode 100644
index 0000000000..0fb5ae9610
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_begin.c
@@ -0,0 +1,334 @@
+/*-
+ * Copyright (c) 2006,2008-2011 Joseph Koshy
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#if !defined(WIN32) +#include +#include +#else +#ifndef PROT_READ +#define PROT_READ FILE_MAP_READ +#endif +#ifndef MAP_PRIVATE +#define MAP_PRIVATE FILE_MAP_COPY +#endif +#ifndef MAP_FAILED +#define MAP_FAILED NULL +#endif +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#if !defined(WIN32) +#include +#else +#include "compat.h" +#endif + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_begin.c 1923 2011-09-23 09:01:13Z jkoshy $"); + +#define _LIBELF_INITSIZE (64*1024) + +/* + * Read from a device file, pipe or socket. 
+ */
+/*
+ * The whole stream behind `fd' is read into a buffer obtained from
+ * malloc(3)/realloc(3); the buffer starts at _LIBELF_INITSIZE bytes and
+ * doubles whenever it fills up.
+ *
+ * On success the buffer is returned, trimmed to the number of bytes
+ * read, and that count is stored in `*fsz'.  NULL is returned with the
+ * libelf error set to IO (read failure), RESOURCE (allocation failure)
+ * or ARGUMENT (the stream was empty).  The caller owns the returned
+ * buffer and must release it with free(3).
+ */
+static void *
+_libelf_read_special_file(int fd, size_t *fsz)
+{
+	ssize_t readsz;
+	size_t bufsz, datasz;
+	unsigned char *buf, *t;
+
+	datasz = 0;
+	readsz = 0;
+	bufsz = _LIBELF_INITSIZE;
+	if ((buf = malloc(bufsz)) == NULL)
+		goto resourceerror;
+
+	/*
+	 * Read data from the file descriptor till we reach EOF, or
+	 * till an error is encountered.
+	 */
+	do {
+		/* Check if we need to expand the data buffer. */
+		if (datasz == bufsz) {
+			bufsz *= 2;
+			if ((t = realloc(buf, bufsz)) == NULL)
+				goto resourceerror;
+			buf = t;
+		}
+
+		/* Fill the free tail of the buffer; short reads are retried. */
+		do {
+			readsz = bufsz - datasz;
+			t = buf + datasz;
+			if ((readsz = read(fd, t, readsz)) <= 0)
+				break;
+			datasz += readsz;
+		} while (datasz < bufsz);
+
+	} while (readsz > 0);
+
+	if (readsz < 0) {
+		LIBELF_SET_ERROR(IO, errno);
+		goto error;
+	}
+
+	assert(readsz == 0);
+
+	/*
+	 * Free up extra buffer space.
+	 */
+	if (bufsz > datasz) {
+		if (datasz > 0) {
+			if ((t = realloc(buf, datasz)) == NULL)
+				goto resourceerror;
+			buf = t;
+		} else { /* Zero bytes read. */
+			LIBELF_SET_ERROR(ARGUMENT, 0);
+			free(buf);
+			buf = NULL;
+		}
+	}
+
+	*fsz = datasz;
+	return (buf);
+
+resourceerror:
+	LIBELF_SET_ERROR(RESOURCE, 0);
+error:
+	if (buf != NULL)
+		free(buf);
+	return (NULL);
+}
+
+/*
+ * Create an ELF descriptor for file descriptor `fd' opened with command
+ * `c' (one of ELF_C_READ, ELF_C_RDWR or ELF_C_WRITE).  `mem' is handed
+ * on to _libelf_allocate_elf() and elf_memory().
+ *
+ * Regular files are mapped read-only; other accepted file types are
+ * read completely into a malloc(3)'ed buffer.  Returns the new
+ * descriptor, or NULL with the libelf error set.
+ */
+static Elf *
+_libelf_open_object(int fd, Elf_Cmd c, Elf_Mem *mem)
+{
+	Elf *e;
+	void *m;
+	mode_t mode;
+	size_t fsize;
+	struct stat sb;
+	unsigned int flags;
+
+	assert(c == ELF_C_READ || c == ELF_C_RDWR || c == ELF_C_WRITE);
+
+	if (fstat(fd, &sb) < 0) {
+		LIBELF_SET_ERROR(IO, errno);
+		return (NULL);
+	}
+
+	mode = sb.st_mode;
+	fsize = (size_t) sb.st_size;
+
+	/*
+	 * Reject unsupported file types.
+	 */
+	if (!S_ISREG(mode) && !S_ISCHR(mode)
+#if !defined(WIN32)
+	    && !S_ISFIFO(mode) &&
+	    !S_ISSOCK(mode)
+#endif
+	    ) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	/*
+	 * For ELF_C_WRITE mode, allocate and return a descriptor.
+	 * For ELF_C_RDWR mode, if the file is empty, allocate and return.
+	 */
+	if (c == ELF_C_WRITE || (c == ELF_C_RDWR && !fsize)) {
+		if ((e = _libelf_allocate_elf(mem)) != NULL) {
+			_libelf_init_elf(e, ELF_K_ELF);
+			e->e_byteorder = LIBELF_PRIVATE(byteorder);
+			e->e_fd = fd;
+			e->e_cmd = c;
+			if (!S_ISREG(mode))
+				e->e_flags |= LIBELF_F_SPECIAL_FILE;
+		}
+
+		return (e);
+	}
+
+	/*
+	 * ELF_C_READ and ELF_C_RDWR mode.
+	 */
+	m = NULL;
+	flags = 0;
+	if (S_ISREG(mode)) {
+		/*
+		 * Always map regular files in with 'PROT_READ'
+		 * permissions.
+		 *
+		 * For objects opened in ELF_C_RDWR mode, when
+		 * elf_update(3) is called, we remove this mapping,
+		 * write file data out using write(2), and map the new
+		 * contents back.
+		 */
+		if ((m = mmap(NULL, fsize, PROT_READ, MAP_PRIVATE, fd,
+		    (off_t) 0)) == MAP_FAILED) {
+			LIBELF_SET_ERROR(IO, errno);
+			return (NULL);
+		}
+		flags = LIBELF_F_RAWFILE_MMAP;
+	} else if ((m = _libelf_read_special_file(fd, &fsize)) != NULL)
+		flags = LIBELF_F_RAWFILE_MALLOC | LIBELF_F_SPECIAL_FILE;
+	else
+		return (NULL);
+
+	if ((e = elf_memory(m, fsize, mem)) == NULL) {
+		assert((flags & LIBELF_F_RAWFILE_MALLOC) ||
+		    (flags & LIBELF_F_RAWFILE_MMAP));
+		/*
+		 * No descriptor exists on this path, so nothing may be
+		 * reached through `e'.  A buffer that was not mapped
+		 * came from _libelf_read_special_file(), i.e. from
+		 * malloc(3), and is released with free(3).
+		 */
+		if (flags & LIBELF_F_RAWFILE_MMAP)
+			(void) munmap(m, fsize);
+		else
+			free(m);
+		return (NULL);
+	}
+
+	/* ar(1) archives aren't supported in RDWR mode. */
+	if (c == ELF_C_RDWR && e->e_kind == ELF_K_AR) {
+		/*
+		 * NOTE(review): `flags' has not been merged into
+		 * e->e_flags yet, so elf_end() presumably leaves `m'
+		 * mapped/allocated here -- confirm and release it if so.
+		 */
+		(void) elf_end(e);
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	e->e_flags |= flags;
+	e->e_fd = fd;
+	e->e_cmd = c;
+
+	return (e);
+}
+
+/*
+ * Open an ELF descriptor.
+ *
+ * `fd' is the file descriptor to use and `c' the access command.  `a'
+ * is an optional existing descriptor: with no `a' a new object is
+ * opened from `fd'; with an ar(1) archive in `a' an archive member is
+ * opened; otherwise `a' itself is returned with its activation count
+ * incremented.  `mem' is passed on to the routines that create the
+ * descriptor.
+ *
+ * Returns NULL for ELF_C_NULL, and NULL with the libelf error set to
+ * SEQUENCE when the library version is still EV_NONE, or to ARGUMENT
+ * for an unknown command or an `a' that does not match `fd'/`c'.
+ */
+Elf *
+elf_begin(int fd, Elf_Cmd c, Elf *a, Elf_Mem *mem)
+{
+	Elf *e;
+
+	e = NULL;
+
+	if (LIBELF_PRIVATE(version) == EV_NONE) {
+		LIBELF_SET_ERROR(SEQUENCE, 0);
+		return (NULL);
+	}
+
+	switch (c) {
+	case ELF_C_NULL:
+		return (NULL);
+
+	case ELF_C_WRITE:
+		/*
+		 * The ELF_C_WRITE command is required to ignore the
+		 * descriptor passed in.
+		 */
+		a = NULL;
+		break;
+
+	case ELF_C_RDWR:
+		if (a != NULL && a->e_kind == ELF_K_AR) { /* not allowed for ar(1) archives. */
+			LIBELF_SET_ERROR(ARGUMENT, 0);
+			return (NULL);
+		}
+		/*FALLTHROUGH*/
+	case ELF_C_READ:
+		/*
+		 * Descriptor `a' could be for a regular ELF file, or
+		 * for an ar(1) archive. If descriptor `a' was opened
+		 * using a valid file descriptor, we need to check if
+		 * the passed in `fd' value matches the original one.
+		 */
+		if (a &&
+		    ((a->e_fd != -1 && a->e_fd != fd) || c != a->e_cmd)) {
+			LIBELF_SET_ERROR(ARGUMENT, 0);
+			return (NULL);
+		}
+		break;
+
+	default:
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+
+	}
+
+	if (a == NULL)
+		e = _libelf_open_object(fd, c, mem);
+	else if (a->e_kind == ELF_K_AR)
+		e = _libelf_ar_open_member(a->e_fd, c, a, mem);
+	else
+		(e = a)->e_activations++;
+
+	return (e);
+}
+#if defined(WIN32)
+// This code taken from:
+// http://git.661346.n2.nabble.com/PATCH-mmap-implementation-for-mingw-td1560056.html
+// This code is in public domain according to the FAQ here:
+// http://www.mingw.org/wiki/FAQ
+// http://www.mingw.org/license
+// FIXME: This needs to be more robust to the protection and flag options.
+//
+// w32_mmap: map `length' bytes of the file behind CRT descriptor `fd',
+// starting at byte `offset', using `flags' as the view access mode.
+// Only start == NULL together with MAP_PRIVATE is supported; `prot' is
+// not consulted.  Returns the address of the view, or NULL (MAP_FAILED
+// as defined for WIN32 in this file) when the mapping cannot be created.
+void *w32_mmap(void *start, size_t length, int prot, int flags, int fd,
+unsigned offset)
+{
+	HANDLE handle;
+	void *view = NULL;
+
+	(void) prot;
+
+	if (start != NULL || !(flags & MAP_PRIVATE))
+		assert(!"Invalid usage of mingw_mmap");
+
+	handle = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, PAGE_READONLY, 0, 0, NULL);
+	if (handle != NULL) {
+		view = MapViewOfFile(handle, flags, 0, offset,
+length);
+		// The view stays valid after the mapping handle is closed.
+		CloseHandle(handle);
+	}
+	// The result is kept apart from `start', so a failure is never
+	// reported as the caller's address hint.
+	return view;
+}
+
+// w32_munmap: release a view obtained from w32_mmap().  `length' is not
+// used.  Returns 0 on success and -1 when UnmapViewOfFile() fails.
+int w32_munmap(void *start, size_t length) {
+	(void) length;
+	return UnmapViewOfFile(start) ? 0 : -1;
+}
+#endif
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_cntl.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_cntl.c
new file mode 100644
index 0000000000..2021917437
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_cntl.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2006,2008 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: elf_cntl.c 189 2008-07-20 10:38:08Z jkoshy $");
+
+/*
+ * Control the file descriptor associated with descriptor `e'.
+ *
+ * ELF_C_FDDONE detaches the file descriptor by setting e_fd to -1.
+ * ELF_C_FDREAD succeeds without further action unless the descriptor
+ * was opened with ELF_C_WRITE, which is a MODE error.
+ *
+ * Returns 0 on success.  Returns -1 with the libelf error set to
+ * ARGUMENT (NULL descriptor or any other command), ARCHIVE (descriptor
+ * has a parent) or MODE.
+ */
+int
+elf_cntl(Elf *e, Elf_Cmd c)
+{
+	const int known_cmd = (c == ELF_C_FDDONE || c == ELF_C_FDREAD);
+
+	if (e == NULL || !known_cmd) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (-1);
+	}
+
+	/* Descriptors that have a parent are refused. */
+	if (e->e_parent) {
+		LIBELF_SET_ERROR(ARCHIVE, 0);
+		return (-1);
+	}
+
+	if (c == ELF_C_FDDONE) {
+		e->e_fd = -1;
+		return 0;
+	}
+
+	/* Only ELF_C_FDREAD is left at this point. */
+	if (e->e_cmd != ELF_C_WRITE)
+		return (0);
+
+	LIBELF_SET_ERROR(MODE, 0);
+	return (-1);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_data.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_data.c
new file mode 100644
index 0000000000..45b503a6b7
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_data.c
@@ -0,0 +1,254 @@
+/*-
+ * Copyright (c) 2006,2008,2011 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: elf_data.c 1765 2011-08-22 05:59:05Z jkoshy $");
+
+/*
+ * Return a translated data descriptor of section `s'.
+ *
+ * With d == NULL the first descriptor on the section's data list is
+ * returned; if the list is empty, one is created from the raw file
+ * contents and converted to in-memory form.  With d != NULL the
+ * descriptor following `d' on the list is returned.
+ *
+ * Returns NULL with the libelf error set to ARGUMENT (bad arguments, or
+ * no raw file to read from), SECTION (SHT_NULL section, untranslatable
+ * type, contents outside the raw file, or a size that is not a multiple
+ * of the element size), UNIMPL, RESOURCE or DATA.
+ */
+Elf_Data *
+elf_getdata(Elf_Scn *s, Elf_Data *d)
+{
+	Elf *e;
+	size_t fsz, msz, count;
+	int elfclass, elftype;
+	unsigned int sh_type;
+	uint64_t sh_align, sh_offset, sh_size;
+	int (*xlate)(char *_d, size_t _dsz, char *_s, size_t _c, int _swap);
+
+	if (s == NULL || (e = s->s_elf) == NULL ||
+	    (d != NULL && s != d->d_scn)) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	assert(e->e_kind == ELF_K_ELF);
+
+	if (d == NULL && (d = STAILQ_FIRST(&s->s_data)) != NULL)
+		return (d);
+
+	if (d != NULL)
+		return (STAILQ_NEXT(d, d_next));
+
+	if (e->e_rawfile == NULL) {
+		/*
+		 * In the ELF_C_WRITE case, there is no source that
+		 * can provide data for the section.
+		 */
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	elfclass = e->e_class;
+
+	assert(elfclass == ELFCLASS32 || elfclass == ELFCLASS64);
+
+	if (elfclass == ELFCLASS32) {
+		sh_type = s->s_shdr.s_shdr32.sh_type;
+		sh_offset = (uint64_t) s->s_shdr.s_shdr32.sh_offset;
+		sh_size = (uint64_t) s->s_shdr.s_shdr32.sh_size;
+		sh_align = (uint64_t) s->s_shdr.s_shdr32.sh_addralign;
+	} else {
+		sh_type = s->s_shdr.s_shdr64.sh_type;
+		sh_offset = s->s_shdr.s_shdr64.sh_offset;
+		sh_size = s->s_shdr.s_shdr64.sh_size;
+		sh_align = s->s_shdr.s_shdr64.sh_addralign;
+	}
+
+	if (sh_type == SHT_NULL) {
+		LIBELF_SET_ERROR(SECTION, 0);
+		return (NULL);
+	}
+
+	/*
+	 * The extent test is written as two comparisons because the sum
+	 * sh_offset + sh_size can wrap around in 64 bits for a malformed
+	 * header and would then appear to lie inside the file.
+	 */
+	if ((elftype = _libelf_xlate_shtype(sh_type)) < ELF_T_FIRST ||
+	    elftype > ELF_T_LAST || (sh_type != SHT_NOBITS &&
+	    (sh_offset > (uint64_t) e->e_rawsize ||
+	    sh_size > (uint64_t) e->e_rawsize - sh_offset))) {
+		LIBELF_SET_ERROR(SECTION, 0);
+		return (NULL);
+	}
+
+	if ((fsz = (elfclass == ELFCLASS32 ? elf32_fsize : elf64_fsize)
+	    (elftype, (size_t) 1, e->e_version)) == 0) {
+		LIBELF_SET_ERROR(UNIMPL, 0);
+		return (NULL);
+	}
+
+	if (sh_size % fsz) {
+		LIBELF_SET_ERROR(SECTION, 0);
+		return (NULL);
+	}
+
+	count = sh_size / fsz;
+
+	msz = _libelf_msize(elftype, elfclass, e->e_version);
+
+	assert(msz > 0);
+
+	if ((d = _libelf_allocate_data(s)) == NULL)
+		return (NULL);
+
+	d->d_buf = NULL;
+	d->d_off = 0;
+	d->d_align = sh_align;
+	d->d_size = msz * count;
+	d->d_type = elftype;
+	d->d_version = e->e_version;
+
+	/* Nothing to translate: the descriptor keeps a NULL buffer. */
+	if (sh_type == SHT_NOBITS || sh_size == 0) {
+		STAILQ_INSERT_TAIL(&s->s_data, d, d_next);
+		return (d);
+	}
+
+	if ((d->d_buf = e->e_mem.alloc(msz*count)) == NULL) {
+		(void) _libelf_release_data(d);
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	d->d_flags |= LIBELF_F_DATA_MALLOCED;
+
+	/* Byte-swap only when file and host byte orders differ. */
+	xlate = _libelf_get_translator(elftype, ELF_TOMEMORY, elfclass);
+	if (!(*xlate)(d->d_buf, d->d_size, e->e_rawfile + sh_offset, count,
+	    e->e_byteorder != LIBELF_PRIVATE(byteorder))) {
+		_libelf_release_data(d);
+		LIBELF_SET_ERROR(DATA, 0);
+		return (NULL);
+	}
+
+	STAILQ_INSERT_TAIL(&s->s_data, d, d_next);
+
+	return (d);
+}
+
+/*
+ * Append a new, empty data descriptor to section `s' and mark the
+ * section dirty.  The descriptor starts with alignment 1, a NULL
+ * buffer, size 0, type ELF_T_BYTE and the library's current version.
+ *
+ * Returns the new descriptor, or NULL on failure (ARGUMENT for a NULL
+ * section or a section without an owning descriptor).
+ */
+Elf_Data *
+elf_newdata(Elf_Scn *s)
+{
+	Elf *e;
+	Elf_Data *d;
+
+	if (s == NULL || (e = s->s_elf) == NULL) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	assert(e->e_kind == ELF_K_ELF);
+
+	/*
+	 * elf_newdata() has to append a data descriptor, so
+	 * bring in existing section data if not already present.
+	 */
+	if (e->e_rawfile && s->s_size > 0 && STAILQ_EMPTY(&s->s_data))
+		if (elf_getdata(s, NULL) == NULL)
+			return (NULL);
+
+	if ((d = _libelf_allocate_data(s)) == NULL)
+		return (NULL);
+
+	STAILQ_INSERT_TAIL(&s->s_data, d, d_next);
+
+	d->d_align = 1;
+	d->d_buf = NULL;
+	d->d_off = (uint64_t) ~0;
+	d->d_size = 0;
+	d->d_type = ELF_T_BYTE;
+	d->d_version = LIBELF_PRIVATE(version);
+
+	(void) elf_flagscn(s, ELF_C_SET, ELF_F_DIRTY);
+
+	return (d);
+}
+
+/*
+ * Retrieve a data descriptor for raw (untranslated) data for section
+ * `s'.
+ *
+ * With d == NULL the first raw descriptor is returned, creating it on
+ * first use; with d != NULL the descriptor following `d' is returned.
+ * The buffer of a created descriptor points directly into the raw file
+ * image and is NULL for SHT_NOBITS or empty sections.
+ *
+ * Returns NULL for an SHT_NULL section, and NULL with the libelf error
+ * set to ARGUMENT (bad section or no raw file) or SECTION (non-empty
+ * contents that do not lie inside the raw file).
+ */
+
+Elf_Data *
+elf_rawdata(Elf_Scn *s, Elf_Data *d)
+{
+	Elf *e;
+	int elf_class;
+	uint32_t sh_type;
+	uint64_t sh_align, sh_offset, sh_size;
+
+	if (s == NULL || (e = s->s_elf) == NULL || e->e_rawfile == NULL) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	assert(e->e_kind == ELF_K_ELF);
+
+	if (d == NULL && (d = STAILQ_FIRST(&s->s_rawdata)) != NULL)
+		return (d);
+
+	if (d != NULL)
+		return (STAILQ_NEXT(d, d_next));
+
+	elf_class = e->e_class;
+
+	assert(elf_class == ELFCLASS32 || elf_class == ELFCLASS64);
+
+	if (elf_class == ELFCLASS32) {
+		sh_type = s->s_shdr.s_shdr32.sh_type;
+		sh_offset = (uint64_t) s->s_shdr.s_shdr32.sh_offset;
+		sh_size = (uint64_t) s->s_shdr.s_shdr32.sh_size;
+		sh_align = (uint64_t) s->s_shdr.s_shdr32.sh_addralign;
+	} else {
+		sh_type = s->s_shdr.s_shdr64.sh_type;
+		sh_offset = s->s_shdr.s_shdr64.sh_offset;
+		sh_size = s->s_shdr.s_shdr64.sh_size;
+		sh_align = s->s_shdr.s_shdr64.sh_addralign;
+	}
+
+	if (sh_type == SHT_NULL)
+		return (NULL);
+
+	/*
+	 * The buffer handed out below aliases the raw file image, so
+	 * contents that extend past the image are refused.  Sections
+	 * that get a NULL buffer are not restricted.
+	 */
+	if (sh_type != SHT_NOBITS && sh_size != 0 &&
+	    (sh_offset > (uint64_t) e->e_rawsize ||
+	    sh_size > (uint64_t) e->e_rawsize - sh_offset)) {
+		LIBELF_SET_ERROR(SECTION, 0);
+		return (NULL);
+	}
+
+	if ((d = _libelf_allocate_data(s)) == NULL)
+		return (NULL);
+
+	d->d_buf = (sh_type == SHT_NOBITS || sh_size == 0) ? NULL :
+	    e->e_rawfile + sh_offset;
+	d->d_off = 0;
+	d->d_align = sh_align;
+	d->d_size = sh_size;
+	d->d_type = ELF_T_BYTE;
+	d->d_version = e->e_version;
+
+	STAILQ_INSERT_TAIL(&s->s_rawdata, d, d_next);
+
+	return (d);
+}
+
+/*
+ * Unlink data descriptor `d' from the translated-data list of section
+ * `s' and release it.  A NULL `s' or `d' sets the ARGUMENT error and
+ * leaves everything untouched.  `d' must be on that list.
+ */
+void
+elf_removedata(Elf_Scn *s, Elf_Data *d)
+{
+	if (s == NULL || d == NULL) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return;
+	}
+
+	STAILQ_REMOVE(&s->s_data, d, _Elf_Data, d_next);
+	(void) _libelf_release_data(d);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_end.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_end.c
new file mode 100644
index 0000000000..6f954af321
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_end.c
@@ -0,0 +1,99 @@
+/*-
+ * Copyright (c) 2006,2008-2009,2011 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#if !defined(WIN32)
+#include
+#endif
+
+#include
+#include
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: elf_end.c 1922 2011-09-23 08:04:33Z jkoshy $");
+
+#if defined(WIN32) && !defined(USE_MEMFILE)
+/* Defined in elf_begin.c; the munmap macro selects it in this configuration. */
+int w32_munmap(void *start, size_t length);
+#endif
+
+/*
+ * Drop one activation of descriptor `e'.
+ *
+ * While activations remain, the remaining count is returned.  When the
+ * count reaches zero the descriptor's sections and raw file image are
+ * released, followed by the descriptor itself; the walk then continues
+ * with the parent archive, which is released too once it has neither
+ * activations nor open children.  Returns 0 in all those cases and for
+ * a NULL or already inactive descriptor.
+ */
+int
+elf_end(Elf *e)
+{
+	Elf *sv;
+	Elf_Scn *scn, *tscn;
+
+	if (e == NULL || e->e_activations == 0)
+		return (0);
+
+	if (--e->e_activations > 0)
+		return (e->e_activations);
+
+	assert(e->e_activations == 0);
+
+	while (e && e->e_activations == 0) {
+		switch (e->e_kind) {
+		case ELF_K_AR:
+			/*
+			 * If we still have open child descriptors, we
+			 * need to defer reclaiming resources till all
+			 * the child descriptors for the archive are
+			 * closed.
+			 */
+			if (e->e_u.e_ar.e_nchildren > 0)
+				return (0);
+			break;
+		case ELF_K_ELF:
+			/*
+			 * Reclaim all section descriptors.
+			 */
+			STAILQ_FOREACH_SAFE(scn, &e->e_u.e_elf.e_scn, s_next,
+			    tscn)
+				scn = _libelf_release_scn(scn);
+			break;
+		case ELF_K_NUM:
+			assert(0);
+			/*FALLTHROUGH*/
+		default:
+			break;
+		}
+
+		if (e->e_rawfile) {
+			/*
+			 * Mapped images are unmapped through the munmap
+			 * macro on every platform, so that descriptors
+			 * mapped by elf_begin() on WIN32 can be closed.
+			 */
+			if (e->e_flags & LIBELF_F_RAWFILE_MMAP)
+				(void) munmap(e->e_rawfile, e->e_rawsize);
+			else if (e->e_flags & LIBELF_F_RAWFILE_MALLOC)
+				e->e_mem.dealloc(e->e_rawfile);
+		}
+
+		/* Step to the parent before the child is released. */
+		sv = e;
+		if ((e = e->e_parent) != NULL)
+			e->e_u.e_ar.e_nchildren--;
+		sv = _libelf_release_elf(sv);
+	}
+
+	return (0);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errmsg.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errmsg.c
new file mode 100644
index 0000000000..225ec8d5f0
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errmsg.c
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2006,2008,2011 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include
+#include
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: elf_errmsg.c 1345 2011-01-01 11:17:52Z jkoshy $");
+
+/*
+ * Retrieve a human readable translation for an error message.
+ */
+#if defined(__GNUC__)
+#define DEFINE_ERROR(N,S) [ELF_E_##N] = S
+#else
+#define DEFINE_ERROR(N,S) S
+#endif
+const char *_libelf_errors[] = {
+	DEFINE_ERROR(NONE, "No Error"),
+	DEFINE_ERROR(ARCHIVE, "Malformed ar(1) archive"),
+	DEFINE_ERROR(ARGUMENT, "Invalid argument"),
+	DEFINE_ERROR(CLASS, "ELF class mismatch"),
+	DEFINE_ERROR(DATA, "Invalid data buffer descriptor"),
+	DEFINE_ERROR(HEADER, "Missing or malformed ELF header"),
+	DEFINE_ERROR(IO, "I/O error"),
+	DEFINE_ERROR(LAYOUT, "Layout constraint violation"),
+	DEFINE_ERROR(MODE, "Incorrect ELF descriptor mode"),
+	DEFINE_ERROR(RANGE, "Value out of range of target"),
+	DEFINE_ERROR(RESOURCE, "Resource exhaustion"),
+	DEFINE_ERROR(SECTION, "Invalid section descriptor"),
+	DEFINE_ERROR(SEQUENCE, "API calls out of sequence"),
+	DEFINE_ERROR(UNIMPL, "Unimplemented feature"),
+	DEFINE_ERROR(VERSION, "Unknown ELF API version"),
+	DEFINE_ERROR(NUM, "Unknown error")
+};
+#undef DEFINE_ERROR
+
+/*
+ * Translate a libelf error value into a message string.
+ *
+ * ELF_E_NONE and -1 both select the error currently recorded by the
+ * library; for ELF_E_NONE, NULL is returned when nothing is recorded.
+ * The bits above LIBELF_OS_ERROR_SHIFT carry an OS error number.  When
+ * it is non-zero the result is "<libelf message>: <strerror text>",
+ * formatted into the library's private message buffer; otherwise an
+ * entry of _libelf_errors[] is returned.  Values outside the known
+ * range yield the ELF_E_NUM entry.
+ */
+const char *
+elf_errmsg(int error)
+{
+	int oserr;
+
+	if (error == ELF_E_NONE) {
+		error = LIBELF_PRIVATE(error);
+		if (error == 0)
+			return NULL;
+	}
+
+	if (error == -1)
+		error = LIBELF_PRIVATE(error);
+
+	/* Split the value into its OS part and its libelf part. */
+	oserr = error >> LIBELF_OS_ERROR_SHIFT;
+	error &= LIBELF_ELF_ERROR_MASK;
+
+	if (error < ELF_E_NONE || error >= ELF_E_NUM)
+		return _libelf_errors[ELF_E_NUM];
+
+	if (!oserr)
+		return _libelf_errors[error];
+
+	(void) snprintf(LIBELF_PRIVATE(msg),
+	    sizeof(LIBELF_PRIVATE(msg)), "%s: %s",
+	    _libelf_errors[error], strerror(oserr));
+	return (const char *)&LIBELF_PRIVATE(msg);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errno.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errno.c
new file mode 100644
index 0000000000..95e91b93ff
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_errno.c
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2006,2008,2011 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_errno.c 1345 2011-01-01 11:17:52Z jkoshy $"); + +int +elf_errno(void) +{ + int old; + + old = LIBELF_PRIVATE(error); + LIBELF_PRIVATE(error) = 0; + return (old & LIBELF_ELF_ERROR_MASK); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_fill.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_fill.c new file mode 100644 index 0000000000..ac9e02ead0 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_fill.c @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_fill.c 189 2008-07-20 10:38:08Z jkoshy $"); + +void +elf_fill(int fill) +{ + LIBELF_PRIVATE(fillchar) = fill; +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_flag.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_flag.c new file mode 100644 index 0000000000..9d3171952f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_flag.c @@ -0,0 +1,195 @@ +/*- + * Copyright (c) 2006,2008-2009,2011 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_flag.c 1918 2011-09-22 10:42:06Z jkoshy $"); + +unsigned int +elf_flagarhdr(Elf_Arhdr *a, Elf_Cmd c, unsigned int flags) +{ + unsigned int r; + + if (a == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (flags & ~ELF_F_DIRTY) != 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (c == ELF_C_SET) + r = a->ar_flags |= flags; + else + r = a->ar_flags &= ~flags; + + return (r & LIBELF_F_API_MASK); +} + +unsigned int +elf_flagdata(Elf_Data *d, Elf_Cmd c, unsigned int flags) +{ + unsigned int r; + + if (d == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (flags & ~ELF_F_DIRTY) != 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (c == ELF_C_SET) + r = d->d_flags |= flags; + else + r = d->d_flags &= ~flags; + + return (r & LIBELF_F_API_MASK); +} + +unsigned int +elf_flagehdr(Elf *e, Elf_Cmd c, unsigned int flags) +{ + int ec; + void *ehdr; + + if (e == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (e->e_kind != ELF_K_ELF) || (flags & ~ELF_F_DIRTY) != 0 || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) + ehdr = e->e_u.e_elf.e_ehdr.e_ehdr32; + else + ehdr = e->e_u.e_elf.e_ehdr.e_ehdr64; + + if (ehdr == NULL) { + LIBELF_SET_ERROR(SEQUENCE, 0); + return (0); + } + + return (elf_flagelf(e, c, flags)); 
+} + +unsigned int +elf_flagelf(Elf *e, Elf_Cmd c, unsigned int flags) +{ + int r; + + if (e == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (e->e_kind != ELF_K_ELF) || + (flags & ~(ELF_F_ARCHIVE | ELF_F_ARCHIVE_SYSV | + ELF_F_DIRTY | ELF_F_LAYOUT)) != 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if ((flags & ELF_F_ARCHIVE_SYSV) && (flags & ELF_F_ARCHIVE) == 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if ((flags & ELF_F_ARCHIVE) && e->e_cmd != ELF_C_WRITE) { + LIBELF_SET_ERROR(MODE, 0); + return (0); + } + + if (c == ELF_C_SET) + r = e->e_flags |= flags; + else + r = e->e_flags &= ~flags; + return (r & LIBELF_F_API_MASK); +} + +unsigned int +elf_flagphdr(Elf *e, Elf_Cmd c, unsigned int flags) +{ + int ec; + void *phdr; + + if (e == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (e->e_kind != ELF_K_ELF) || (flags & ~ELF_F_DIRTY) != 0 || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) + phdr = e->e_u.e_elf.e_phdr.e_phdr32; + else + phdr = e->e_u.e_elf.e_phdr.e_phdr64; + + if (phdr == NULL) { + LIBELF_SET_ERROR(SEQUENCE, 0); + return (0); + } + + return (elf_flagelf(e, c, flags)); +} + +unsigned int +elf_flagscn(Elf_Scn *s, Elf_Cmd c, unsigned int flags) +{ + int r; + + if (s == NULL) + return (0); + + if ((c != ELF_C_SET && c != ELF_C_CLR) || + (flags & ~ELF_F_DIRTY) != 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (c == ELF_C_SET) + r = s->s_flags |= flags; + else + r = s->s_flags &= ~flags; + return (r & LIBELF_F_API_MASK); +} + +unsigned int +elf_flagshdr(Elf_Scn *s, Elf_Cmd c, unsigned int flags) +{ + return (elf_flagscn(s, c, flags)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarhdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarhdr.c new file mode 100644 index 0000000000..43ceafda85 --- 
/dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarhdr.c @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2006,2008,2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_getarhdr.c 1341 2011-01-01 04:28:29Z jkoshy $"); + +Elf_Arhdr * +elf_getarhdr(Elf *e) +{ + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (e->e_flags & LIBELF_F_AR_HEADER) + return (e->e_hdr.e_arhdr); + + return (_libelf_ar_gethdr(e)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarsym.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarsym.c new file mode 100644 index 0000000000..1852262bdb --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getarsym.c @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_getarsym.c 1360 2011-01-08 08:27:41Z jkoshy $"); + +Elf_Arsym * +elf_getarsym(Elf *ar, size_t *ptr) +{ + size_t n; + Elf_Arsym *symtab; + + n = 0; + symtab = NULL; + + if (ar == NULL || ar->e_kind != ELF_K_AR) + LIBELF_SET_ERROR(ARGUMENT, 0); + else if ((symtab = ar->e_u.e_ar.e_symtab) != NULL) + n = ar->e_u.e_ar.e_symtabsz; + else if (ar->e_u.e_ar.e_rawsymtab) + symtab = (ar->e_flags & LIBELF_F_AR_VARIANT_SVR4) ? + _libelf_ar_process_svr4_symtab(ar, &n) : + _libelf_ar_process_bsd_symtab(ar, &n); + else + LIBELF_SET_ERROR(ARCHIVE, 0); + + if (ptr) + *ptr = n; + return (symtab); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getbase.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getbase.c new file mode 100644 index 0000000000..30058ca70c --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getbase.c @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_getbase.c 977 2010-06-06 11:50:31Z jkoshy $"); + +off_t +elf_getbase(Elf *e) +{ + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return ((off_t) -1); + } + + if (e->e_parent == NULL) + return ((off_t) 0); + + return ((off_t) ((uintptr_t) e->e_rawfile - + (uintptr_t) e->e_parent->e_rawfile)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getident.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getident.c new file mode 100644 index 0000000000..c17f3a56ef --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_getident.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_getident.c 189 2008-07-20 10:38:08Z jkoshy $"); + +char * +elf_getident(Elf *e, size_t *sz) +{ + + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + goto error; + } + + if (e->e_cmd == ELF_C_WRITE && e->e_rawfile == NULL) { + LIBELF_SET_ERROR(SEQUENCE, 0); + goto error; + } + + assert(e->e_kind != ELF_K_AR || e->e_cmd == ELF_C_READ); + + if (sz) { + if (e->e_kind == ELF_K_AR) + *sz = SARMAG; + else if (e->e_kind == ELF_K_ELF) + *sz = EI_NIDENT; + else + *sz = e->e_rawsize; + } + + return ((char *) e->e_rawfile); + + error: + if (sz) + *sz = 0; + return (NULL); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_hash.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_hash.c new file mode 100644 index 0000000000..12c764d54a --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_hash.c @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/*
 * Compute the System V ABI hash of the NUL-terminated symbol name `name'.
 *
 * The ABI defines the hash over 32-bit arithmetic.  `unsigned long' is
 * wider than 32 bits on LP64 targets, so the running value is masked to
 * 32 bits after every step; without the mask the carry out of bit 31 is
 * retained and the result no longer matches the hash stored in ELF
 * objects.
 *
 * `name' must point to a valid string.  The returned value always fits in
 * 28 bits.
 */
unsigned long
elf_hash(const char *name)
{
	unsigned long h, t;
	const unsigned char *s;

	/* Hash bytes as unsigned so values above 0x7F are not sign-extended. */
	s = (const unsigned char *) name;
	h = 0;

	while (*s != '\0') {
		h = ((h << 4) + *s++) & 0xFFFFFFFFUL;
		/* Fold the top nibble back into the low bits, then drop it. */
		t = h & 0xF0000000UL;
		if (t != 0)
			h ^= t >> 24;
		h &= ~t;
	}

	return (h);
}
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_kind.c 189 2008-07-20 10:38:08Z jkoshy $"); + +Elf_Kind +elf_kind(Elf *e) +{ + if (e == NULL) + return (ELF_K_NONE); + if (e->e_kind == ELF_K_AR || + e->e_kind == ELF_K_ELF) + return (e->e_kind); + return (ELF_K_NONE); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_memory.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_memory.c new file mode 100644 index 0000000000..e310b0107a --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_memory.c @@ -0,0 +1,92 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_memory.c 189 2008-07-20 10:38:08Z jkoshy $"); + +Elf * +elf_memory(char *image, size_t sz, Elf_Mem *mem) +{ + Elf *e; + + if (LIBELF_PRIVATE(version) == EV_NONE) { + LIBELF_SET_ERROR(SEQUENCE, 0); + return (NULL); + } + + if (image == NULL || sz == 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((e = _libelf_allocate_elf(mem)) == NULL) + return (NULL); + + e->e_cmd = ELF_C_READ; + e->e_rawfile = image; + e->e_rawsize = sz; + +#undef LIBELF_IS_ELF +#define LIBELF_IS_ELF(P) ((P)[EI_MAG0] == ELFMAG0 && \ + (P)[EI_MAG1] == ELFMAG1 && (P)[EI_MAG2] == ELFMAG2 && \ + (P)[EI_MAG3] == ELFMAG3) + + if (sz > EI_NIDENT && LIBELF_IS_ELF(image)) { + _libelf_init_elf(e, ELF_K_ELF); + e->e_class = image[EI_CLASS]; + e->e_byteorder = image[EI_DATA]; + e->e_version = image[EI_VERSION]; + + if (e->e_version > EV_CURRENT) { + e = _libelf_release_elf(e); + LIBELF_SET_ERROR(VERSION, 0); + 
return (NULL); + } + + if ((e->e_byteorder != ELFDATA2LSB && e->e_byteorder != + ELFDATA2MSB) || (e->e_class != ELFCLASS32 && e->e_class != + ELFCLASS64)) { + e = _libelf_release_elf(e); + LIBELF_SET_ERROR(HEADER, 0); + return (NULL); + } + + } else if (sz >= SARMAG && + strncmp(image, ARMAG, (size_t) SARMAG) == 0) { + _libelf_init_elf(e, ELF_K_AR); + e = _libelf_ar_open(e); + } else + _libelf_init_elf(e, ELF_K_NONE); + + return (e); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_next.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_next.c new file mode 100644 index 0000000000..d6ca552e5f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_next.c @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_next.c 1678 2011-07-28 04:36:34Z jkoshy $"); + +Elf_Cmd +elf_next(Elf *e) +{ + off_t next; + Elf *parent; + + if (e == NULL) + return (ELF_C_NULL); + + if ((parent = e->e_parent) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (ELF_C_NULL); + } + + assert (parent->e_kind == ELF_K_AR); + assert (parent->e_cmd == ELF_C_READ); + assert(e->e_rawfile > parent->e_rawfile); + + next = e->e_rawfile - parent->e_rawfile + e->e_rawsize; + next = (next + 1) & ~1; /* round up to an even boundary */ + + parent->e_u.e_ar.e_next = (next >= (off_t) parent->e_rawsize) ? + (off_t) 0 : next; + + return (ELF_C_READ); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_phnum.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_phnum.c new file mode 100644 index 0000000000..d63c490b24 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_phnum.c @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_phnum.c 466 2009-08-04 17:17:42Z jkoshy $"); + +static int +_libelf_getphdrnum(Elf *e, size_t *phnum) +{ + void *eh; + int ec; + + if (e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (-1); + } + + if ((eh = _libelf_ehdr(e, ec, 0)) == NULL) + return (-1); + + *phnum = e->e_u.e_elf.e_nphdr; + + return (0); +} + +int +elf_getphdrnum(Elf *e, size_t *phnum) +{ + return (_libelf_getphdrnum(e, phnum)); +} + +/* Deprecated API */ +int +elf_getphnum(Elf *e, size_t *phnum) +{ + return (_libelf_getphdrnum(e, phnum) >= 0); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rand.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rand.c new file mode 100644 index 0000000000..2e7328a53e --- /dev/null +++ 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rand.c @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_rand.c 189 2008-07-20 10:38:08Z jkoshy $"); + +off_t +elf_rand(Elf *ar, off_t offset) +{ + struct ar_hdr *arh; + + if (ar == NULL || ar->e_kind != ELF_K_AR || + (offset & 1) || offset < SARMAG || + offset + sizeof(struct ar_hdr) >= ar->e_rawsize) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return 0; + } + + arh = (struct ar_hdr *) (ar->e_rawfile + offset); + + /* a too simple sanity check */ + if (arh->ar_fmag[0] != '`' || arh->ar_fmag[1] != '\n') { + LIBELF_SET_ERROR(ARCHIVE, 0); + return 0; + } + + ar->e_u.e_ar.e_next = offset; + + return (offset); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rawfile.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rawfile.c new file mode 100644 index 0000000000..22a9f95d23 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_rawfile.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_rawfile.c 189 2008-07-20 10:38:08Z jkoshy $"); + +char * +elf_rawfile(Elf *e, size_t *sz) +{ + char *ptr; + size_t size; + + size = e ? e->e_rawsize : 0; + ptr = NULL; + + if (e == NULL) + LIBELF_SET_ERROR(ARGUMENT, 0); + else if ((ptr = e->e_rawfile) == NULL && e->e_cmd == ELF_C_WRITE) + LIBELF_SET_ERROR(SEQUENCE, 0); + + if (sz) + *sz = size; + + return (ptr); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_scn.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_scn.c new file mode 100644 index 0000000000..112bc545e1 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_scn.c @@ -0,0 +1,245 @@ +/*- + * Copyright (c) 2006,2008-2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_scn.c 1077 2010-08-09 15:37:40Z jkoshy $"); + +/* + * Load an ELF section table and create a list of Elf_Scn structures. 
+ */ +int +_libelf_load_section_headers(Elf *e, void *ehdr) +{ + int ec, swapbytes; + size_t fsz, i, shnum; + uint64_t shoff; + char *src; + Elf32_Ehdr *eh32; + Elf64_Ehdr *eh64; + Elf_Scn *scn; + int (*xlator)(char *_d, size_t _dsz, char *_s, size_t _c, int _swap); + + assert(e != NULL); + assert(ehdr != NULL); + assert((e->e_flags & LIBELF_F_SHDRS_LOADED) == 0); + +#define CHECK_EHDR(E,EH) do { \ + if (fsz != (EH)->e_shentsize || \ + (e->e_rawfile && shoff + fsz * shnum > e->e_rawsize)) { \ + LIBELF_SET_ERROR(HEADER, 0); \ + return (0); \ + } \ + } while (0) + + ec = e->e_class; + fsz = _libelf_fsize(ELF_T_SHDR, ec, e->e_version, (size_t) 1); + assert(fsz > 0); + + shnum = e->e_u.e_elf.e_nscn; + + if (ec == ELFCLASS32) { + eh32 = (Elf32_Ehdr *) ehdr; + shoff = (uint64_t) eh32->e_shoff; + CHECK_EHDR(e, eh32); + } else { + eh64 = (Elf64_Ehdr *) ehdr; + shoff = eh64->e_shoff; + CHECK_EHDR(e, eh64); + } + + xlator = _libelf_get_translator(ELF_T_SHDR, ELF_TOMEMORY, ec); + + swapbytes = e->e_byteorder != LIBELF_PRIVATE(byteorder); + + // If we aren't editing a rawfile, then we don't need to + // load any sections. + if (!e->e_rawfile) { + return 1; + } + src = e->e_rawfile + shoff; + + /* + * If the file is using extended numbering then section #0 + * would have already been read in. 
+ */ + + i = 0; + if (!STAILQ_EMPTY(&e->e_u.e_elf.e_scn)) { + assert(STAILQ_FIRST(&e->e_u.e_elf.e_scn) == + STAILQ_LAST(&e->e_u.e_elf.e_scn, _Elf_Scn, s_next)); + + i = 1; + src += fsz; + } + + for (; i < shnum; i++, src += fsz) { + if ((scn = _libelf_allocate_scn(e, i)) == NULL) + return (0); + + (*xlator)((char *) &scn->s_shdr, sizeof(scn->s_shdr), src, + (size_t) 1, swapbytes); + + if (ec == ELFCLASS32) { + scn->s_offset = + scn->s_shdr.s_shdr32.sh_offset; + scn->s_size = scn->s_shdr.s_shdr32.sh_size; + } else { + scn->s_offset = + scn->s_shdr.s_shdr64.sh_offset; + scn->s_size = scn->s_shdr.s_shdr64.sh_size; + } + // If we have a true read/write elf, we cannot trust the + // raw offset and we need to pull in the data also when + // the section headers are loaded. + if (e->e_cmd != ELF_C_RDWR) { + scn->s_rawoff = scn->s_offset; + } else { + elf_getdata(scn, NULL); + } + } + + e->e_flags |= LIBELF_F_SHDRS_LOADED; + + return (1); +} + +Elf_Scn * +elf_getscn(Elf *e, size_t index) +{ + int ec; + void *ehdr; + Elf_Scn *s; + + if (e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL) + return (NULL); + + if (e->e_cmd != ELF_C_WRITE && + (e->e_flags & LIBELF_F_SHDRS_LOADED) == 0 && + _libelf_load_section_headers(e, ehdr) == 0) + return (NULL); + + STAILQ_FOREACH(s, &e->e_u.e_elf.e_scn, s_next) + if (s->s_ndx == index) + return (s); + + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); +} + +size_t +elf_ndxscn(Elf_Scn *s) +{ + if (s == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (SHN_UNDEF); + } + return (s->s_ndx); +} + +Elf_Scn * +elf_newscn(Elf *e) +{ + int ec; + void *ehdr; + Elf_Scn *scn; + + if (e == NULL || e->e_kind != ELF_K_ELF) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64) { + LIBELF_SET_ERROR(CLASS, 0); + return (NULL); + } + + if ((ehdr = 
_libelf_ehdr(e, ec, 0)) == NULL) + return (NULL); + + /* + * The application may be asking for a new section descriptor + * on an ELF object opened with ELF_C_RDWR or ELF_C_READ. We + * need to bring in the existing section information before + * appending a new one to the list. + * + * Per the ELF(3) API, an application is allowed to open a + * file using ELF_C_READ, mess with its internal structure and + * use elf_update(...,ELF_C_NULL) to compute its new layout. + */ + if (e->e_cmd != ELF_C_WRITE && + (e->e_flags & LIBELF_F_SHDRS_LOADED) == 0 && + _libelf_load_section_headers(e, ehdr) == 0) + return (NULL); + + if (STAILQ_EMPTY(&e->e_u.e_elf.e_scn)) { + assert(e->e_u.e_elf.e_nscn == 0); + if ((scn = _libelf_allocate_scn(e, (size_t) SHN_UNDEF)) == + NULL) + return (NULL); + e->e_u.e_elf.e_nscn++; + } + + assert(e->e_u.e_elf.e_nscn > 0); + + if ((scn = _libelf_allocate_scn(e, e->e_u.e_elf.e_nscn)) == NULL) + return (NULL); + + e->e_u.e_elf.e_nscn++; + + (void) elf_flagscn(scn, ELF_C_SET, ELF_F_DIRTY); + + return (scn); +} + +Elf_Scn * +elf_nextscn(Elf *e, Elf_Scn *s) +{ + if (e == NULL || (e->e_kind != ELF_K_ELF) || + (s && s->s_elf != e)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + return (s == NULL ? elf_getscn(e, (size_t) 1) : + STAILQ_NEXT(s, s_next)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shnum.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shnum.c new file mode 100644 index 0000000000..515027a083 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shnum.c @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_shnum.c 466 2009-08-04 17:17:42Z jkoshy $"); + +static int +_libelf_getshdrnum(Elf *e, size_t *shnum) +{ + void *eh; + int ec; + + if (e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (-1); + } + + if ((eh = _libelf_ehdr(e, ec, 0)) == NULL) + return (-1); + + *shnum = e->e_u.e_elf.e_nscn; + + return (0); +} + +int +elf_getshdrnum(Elf *e, size_t *shnum) +{ + return (_libelf_getshdrnum(e, shnum)); +} + +/* Deprecated API. 
*/ +int +elf_getshnum(Elf *e, size_t *shnum) +{ + return (_libelf_getshdrnum(e, shnum) >= 0); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shstrndx.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shstrndx.c new file mode 100644 index 0000000000..bac14b48f1 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_shstrndx.c @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_shstrndx.c 466 2009-08-04 17:17:42Z jkoshy $"); + +static int +_libelf_getshdrstrndx(Elf *e, size_t *strndx) +{ + void *eh; + int ec; + + if (e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (-1); + } + + if ((eh = _libelf_ehdr(e, ec, 0)) == NULL) + return (-1); + + *strndx = e->e_u.e_elf.e_strndx; + + return (0); +} + +int +elf_getshdrstrndx(Elf *e, size_t *strndx) +{ + return (_libelf_getshdrstrndx(e, strndx)); +} + +int +elf_getshstrndx(Elf *e, size_t *strndx) /* Deprecated API. */ +{ + return (_libelf_getshdrstrndx(e, strndx) >= 0); +} + +int +elf_setshstrndx(Elf *e, size_t strndx) +{ + void *eh; + int ec; + + if (e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64) || + ((eh = _libelf_ehdr(e, ec, 0)) == NULL)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + return (_libelf_setshstrndx(e, eh, ec, strndx)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_strptr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_strptr.c new file mode 100644 index 0000000000..c9ee0525d9 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_strptr.c @@ -0,0 +1,136 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include "_libelf.h" + +#ifdef ANDROID +#include "roundup.h" +#else +#include +#endif + +LIBELF_VCSID("$Id: elf_strptr.c 189 2008-07-20 10:38:08Z jkoshy $"); + +/* + * Convert an ELF section#,offset pair to a string pointer. + */ + +char * +elf_strptr(Elf *e, size_t scndx, size_t offset) +{ + Elf_Scn *s; + Elf_Data *d; + size_t alignment, count; + GElf_Shdr shdr; + + if (e == NULL || e->e_kind != ELF_K_ELF) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((s = elf_getscn(e, scndx)) == NULL || + gelf_getshdr(s, &shdr) == NULL) + return (NULL); + + if (shdr.sh_type != SHT_STRTAB || + offset >= shdr.sh_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + d = NULL; + if (e->e_flags & ELF_F_LAYOUT) { + + /* + * The application is taking responsibility for the + * ELF object's layout, so we can directly translate + * an offset to a `char *' address using the `d_off' + * members of Elf_Data descriptors. 
+ */ + while ((d = elf_getdata(s, d)) != NULL) { + + if (d->d_buf == 0 || d->d_size == 0) + continue; + + if (d->d_type != ELF_T_BYTE) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + if (offset >= d->d_off && + offset < d->d_off + d->d_size) + return ((char *) d->d_buf + offset - d->d_off); + } + } else { + /* + * Otherwise, the `d_off' members are not useable and + * we need to compute offsets ourselves, taking into + * account 'holes' in coverage of the section introduced + * by alignment requirements. + */ + count = (size_t) 0; /* cumulative count of bytes seen */ + while ((d = elf_getdata(s, d)) != NULL && count <= offset) { + + if (d->d_buf == NULL || d->d_size == 0) + continue; + + if (d->d_type != ELF_T_BYTE) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + if ((alignment = d->d_align) > 1) { + if ((alignment & (alignment - 1)) != 0) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + count = roundup2(count, alignment); + } + + if (offset < count) { + /* offset starts in the 'hole' */ + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (offset < count + d->d_size) { + if (d->d_buf != NULL) + return ((char *) d->d_buf + + offset - count); + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + count += d->d_size; + } + } + + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_update.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_update.c new file mode 100644 index 0000000000..9fae5cebd5 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_update.c @@ -0,0 +1,1264 @@ +/*- + * Copyright (c) 2006-2011 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#if !defined(WIN32) +#include +#include "queue.h" +#endif +#include + +#include +#include +#include +#include +#include +#include +#if !defined(WIN32) +#include +#else +#include +#ifndef PROT_READ +#define PROT_READ FILE_MAP_READ +#endif +#ifndef MAP_PRIVATE +#define MAP_PRIVATE FILE_MAP_COPY +#endif +#ifndef MAP_FAILED +#define MAP_FAILED NULL +#endif +#endif + +#include "_libelf.h" + +#ifdef ANDROID +#include "roundup.h" +#else +#include +#endif + +LIBELF_VCSID("$Id: elf_update.c 1922 2011-09-23 08:04:33Z jkoshy $"); + +/* + * Layout strategy: + * + * - Case 1: ELF_F_LAYOUT is asserted + * In this case the application has full control over where the + * section header table, program header table, and section data + * will reside. The library only perform error checks. 
+ * + * - Case 2: ELF_F_LAYOUT is not asserted + * + * The library will do the object layout using the following + * ordering: + * - The executable header is placed first, are required by the + * ELF specification. + * - The program header table is placed immediately following the + * executable header. + * - Section data, if any, is placed after the program header + * table, aligned appropriately. + * - The section header table, if needed, is placed last. + * + * There are two sub-cases to be taken care of: + * + * - Case 2a: e->e_cmd == ELF_C_READ or ELF_C_RDWR + * + * In this sub-case, the underlying ELF object may already have + * content in it, which the application may have modified. The + * library will retrieve content from the existing object as + * needed. + * + * - Case 2b: e->e_cmd == ELF_C_WRITE + * + * The ELF object is being created afresh in this sub-case; + * there is no pre-existing content in the underlying ELF + * object. + */ + +/* + * The types of extents in an ELF object. Each value tags one kind of + * contiguous file region tracked while laying out the object. + */ +enum elf_extent { + ELF_EXTENT_EHDR, /* The ELF executable header. */ + ELF_EXTENT_PHDR, /* The program header table. */ + ELF_EXTENT_SECTION, /* The contents of one section. */ + ELF_EXTENT_SHDR /* Presumably the section header table. */ +}; + +/* + * An extent descriptor, used when laying out an ELF object. It records + * one region as a start offset plus a size, and is kept on a singly + * linked list. + */ +struct _Elf_Extent { + ELF_SLIST_ENTRY(_Elf_Extent) ex_next; /* Link to the next extent on the list. */ + uint64_t ex_start; /* Start of the region. */ + uint64_t ex_size; /* The size of the region. */ + enum elf_extent ex_type; /* Type of region. */ + void *ex_desc; /* Associated descriptor: the header, table or section the region holds. */ +}; + +/* The list head type for a list of extent descriptors. */ +ELF_SLIST_HEAD(_Elf_Extent_List, _Elf_Extent); + +/* + * Compute the extents of a section, by looking at the data + * descriptors associated with it. The function returns 1 + * if successful, or zero if an error was detected. 
+ */ +static int +_libelf_compute_section_extents(Elf *e, Elf_Scn *s, off_t rc) +{ + int ec; + size_t fsz, msz; + Elf_Data *d; + Elf32_Shdr *shdr32; + Elf64_Shdr *shdr64; + uint32_t sh_type; + uint64_t d_align; + unsigned int elftype; + uint64_t scn_size, scn_alignment; + uint64_t sh_align, sh_entsize, sh_offset, sh_size; + + ec = e->e_class; + + shdr32 = &s->s_shdr.s_shdr32; + shdr64 = &s->s_shdr.s_shdr64; + if (ec == ELFCLASS32) { + sh_type = shdr32->sh_type; + sh_align = (uint64_t) shdr32->sh_addralign; + sh_entsize = (uint64_t) shdr32->sh_entsize; + sh_offset = (uint64_t) shdr32->sh_offset; + sh_size = (uint64_t) shdr32->sh_size; + } else { + sh_type = shdr64->sh_type; + sh_align = shdr64->sh_addralign; + sh_entsize = shdr64->sh_entsize; + sh_offset = shdr64->sh_offset; + sh_size = shdr64->sh_size; + } + + assert(sh_type != SHT_NULL && sh_type != SHT_NOBITS); + + elftype = _libelf_xlate_shtype(sh_type); + if (elftype > ELF_T_LAST) { + LIBELF_SET_ERROR(SECTION, 0); + return (0); + } + + if (sh_align == 0) + sh_align = _libelf_falign(elftype, ec); + + /* + * Compute the section's size and alignment using the data + * descriptors associated with the section. + */ + if (STAILQ_EMPTY(&s->s_data)) { + /* + * The section's content (if any) has not been read in + * yet. If section is not dirty marked dirty, we can + * reuse the values in the 'sh_size' and 'sh_offset' + * fields of the section header. + */ + if ((s->s_flags & ELF_F_DIRTY) == 0) { + /* + * If the library is doing the layout, then we + * compute the new start offset for the + * section based on the current offset and the + * section's alignment needs. + * + * If the application is doing the layout, we + * can use the value in the 'sh_offset' field + * in the section header directly. + */ + if (e->e_flags & ELF_F_LAYOUT) + goto updatedescriptor; + else + goto computeoffset; + } + + /* + * Otherwise, we need to bring in the section's data + * from the underlying ELF object. 
+ */ + if (e->e_cmd != ELF_C_WRITE && elf_getdata(s, NULL) == NULL) + return (0); + } + + /* + * Loop through the section's data descriptors. + */ + scn_size = 0L; + scn_alignment = 0; + STAILQ_FOREACH(d, &s->s_data, d_next) { + + /* + * The data buffer's type is known. + */ + if (d->d_type >= ELF_T_NUM) { + LIBELF_SET_ERROR(DATA, 0); + return (0); + } + + /* + * The data buffer's version is supported. + */ + if (d->d_version != e->e_version) { + LIBELF_SET_ERROR(VERSION, 0); + return (0); + } + + /* + * The buffer's alignment is non-zero and a power of + * two. + */ + if ((d_align = d->d_align) == 0 || + (d_align & (d_align - 1))) { + LIBELF_SET_ERROR(DATA, 0); + return (0); + } + + /* + * The buffer's size should be a multiple of the + * memory size of the underlying type. + */ + msz = _libelf_msize(d->d_type, ec, e->e_version); + if (d->d_size % msz) { + LIBELF_SET_ERROR(DATA, 0); + return (0); + } + + /* + * If the application is controlling layout, then the + * d_offset field should be compatible with the + * buffer's specified alignment. + */ + if ((e->e_flags & ELF_F_LAYOUT) && + (d->d_off & (d_align - 1))) { + LIBELF_SET_ERROR(LAYOUT, 0); + return (0); + } + + /* + * Compute the section's size. + */ + if (e->e_flags & ELF_F_LAYOUT) { + if ((uint64_t) d->d_off + d->d_size > scn_size) + scn_size = d->d_off + d->d_size; + } else { + scn_size = roundup2(scn_size, d->d_align); + d->d_off = scn_size; + fsz = _libelf_fsize(d->d_type, ec, d->d_version, + d->d_size / msz); + scn_size += fsz; + } + + /* + * The section's alignment is the maximum alignment + * needed for its data buffers. + */ + if (d_align > scn_alignment) + scn_alignment = d_align; + } + + /* + * If the application is requesting full control over the + * layout of the section, check the section's specified size, + * offsets and alignment for sanity. 
+ */ + if (e->e_flags & ELF_F_LAYOUT) { + if (scn_alignment > sh_align || sh_offset % sh_align || + sh_size < scn_size) { + LIBELF_SET_ERROR(LAYOUT, 0); + return (0); + } + goto updatedescriptor; + } + + /* + * Otherwise, compute the values in the section header. + * + * The section alignment is the maximum alignment for any of + * its contained data descriptors. + */ + if (scn_alignment > sh_align) + sh_align = scn_alignment; + + /* + * If the section entry size is zero, try and fill in an + * appropriate entry size. Per the elf(5) manual page + * sections without fixed-size entries should have their + * 'sh_entsize' field set to zero. + */ + if (sh_entsize == 0 && + (sh_entsize = _libelf_fsize(elftype, ec, e->e_version, + (size_t) 1)) == 1) + sh_entsize = 0; + + sh_size = scn_size; + +computeoffset: + /* + * Compute the new offset for the section based on + * the section's alignment needs. + */ + sh_offset = roundup(rc, sh_align); + + /* + * Update the section header. + */ + if (ec == ELFCLASS32) { + shdr32->sh_addralign = (uint32_t) sh_align; + shdr32->sh_entsize = (uint32_t) sh_entsize; + shdr32->sh_offset = (uint32_t) sh_offset; + shdr32->sh_size = (uint32_t) sh_size; + } else { + shdr64->sh_addralign = sh_align; + shdr64->sh_entsize = sh_entsize; + shdr64->sh_offset = sh_offset; + shdr64->sh_size = sh_size; + } + +updatedescriptor: + /* + * Update the section descriptor. + */ + s->s_size = sh_size; + s->s_offset = sh_offset; + + return (1); +} + +/* + * Free a list of extent descriptors. + */ + +static void +_libelf_release_extents(struct _Elf_Extent_List *extents) +{ + struct _Elf_Extent *ex; + + while ((ex = ELF_SLIST_FIRST(extents)) != NULL) { + ELF_SLIST_REMOVE_HEAD(extents, ex_next); + free(ex); + } +} + +/* + * Check if an extent 's' defined by [start..start+size) is free. + * This routine assumes that the given extent list is sorted in order + * of ascending extent offsets. 
+ */ + +static int +_libelf_extent_is_unused(struct _Elf_Extent_List *extents, + const uint64_t start, const uint64_t size, struct _Elf_Extent **prevt) +{ + uint64_t tmax, tmin; + struct _Elf_Extent *t, *pt; + const uint64_t smax = start + size; + + /* First, look for overlaps with existing extents. */ + pt = NULL; + ELF_SLIST_FOREACH(t, extents, ex_next) { + tmin = t->ex_start; + tmax = tmin + t->ex_size; + + if (tmax <= start) { + /* + * 't' lies entirely before 's': ...| t |...| s |... + */ + pt = t; + continue; + } else if (smax <= tmin) { + /* + * 's' lies entirely before 't', and after 'pt': + * ...| pt |...| s |...| t |... + */ + assert(pt == NULL || + pt->ex_start + pt->ex_size <= start); + break; + } else + /* 's' and 't' overlap. */ + return (0); + } + + if (prevt) + *prevt = pt; + return (1); +} + +/* + * Insert an extent into the list of extents. + */ + +static int +_libelf_insert_extent(struct _Elf_Extent_List *extents, int type, + uint64_t start, uint64_t size, void *desc) +{ + struct _Elf_Extent *ex, *prevt; + + assert(type >= ELF_EXTENT_EHDR && type <= ELF_EXTENT_SHDR); + + prevt = NULL; + + /* + * If the requested range overlaps with an existing extent, + * signal an error. + */ + if (!_libelf_extent_is_unused(extents, start, size, &prevt)) { + LIBELF_SET_ERROR(LAYOUT, 0); + return (0); + } + + /* Allocate and fill in a new extent descriptor. */ + if ((ex = malloc(sizeof(struct _Elf_Extent))) == NULL) { + LIBELF_SET_ERROR(RESOURCE, errno); + return (0); + } + ex->ex_start = start; + ex->ex_size = size; + ex->ex_desc = desc; + ex->ex_type = type; + + /* Insert the region descriptor into the list. */ + if (prevt) + ELF_SLIST_INSERT_AFTER(prevt, ex, ex_next); + else + ELF_SLIST_INSERT_HEAD(extents, ex, ex_next); + return (1); +} + +/* + * Recompute section layout. 
+ */ + +static off_t +_libelf_resync_sections(Elf *e, off_t rc, struct _Elf_Extent_List *extents) +{ + int ec; + Elf_Scn *s; + size_t sh_type; + + ec = e->e_class; + + /* + * Make a pass through sections, computing the extent of each + * section. + */ + STAILQ_FOREACH(s, &e->e_u.e_elf.e_scn, s_next) { + if (ec == ELFCLASS32) + sh_type = s->s_shdr.s_shdr32.sh_type; + else + sh_type = s->s_shdr.s_shdr64.sh_type; + + if (sh_type == SHT_NOBITS || sh_type == SHT_NULL) + continue; + + if (_libelf_compute_section_extents(e, s, rc) == 0) + return ((off_t) -1); + + if (s->s_size == 0) + continue; + + if (!_libelf_insert_extent(extents, ELF_EXTENT_SECTION, + s->s_offset, s->s_size, s)) + return ((off_t) -1); + + if ((size_t) rc < s->s_offset + s->s_size) + rc = s->s_offset + s->s_size; + } + + return (rc); +} + +/* + * Recompute the layout of the ELF object and update the internal data + * structures associated with the ELF descriptor. + * + * Returns the size in bytes the ELF object would occupy in its file + * representation. + * + * After a successful call to this function, the following structures + * are updated: + * + * - The ELF header is updated. + * - All extents in the ELF object are sorted in order of ascending + * addresses. Sections have their section header table entries + * updated. An error is signalled if an overlap was detected among + * extents. + * - Data descriptors associated with sections are checked for valid + * types, offsets and alignment. + * + * After a resync_elf() successfully returns, the ELF descriptor is + * ready for being handed over to _libelf_write_elf(). 
+ *
+ * Returns (off_t) -1 on failure.  'extents' receives one entry each
+ * for the executable header, the program header table (if any), every
+ * file-backed section and the section header table (if any).
+ */
+
+static off_t
+_libelf_resync_elf(Elf *e, struct _Elf_Extent_List *extents)
+{
+	int ec, eh_class, eh_type;
+	unsigned int eh_byteorder, eh_version;
+	size_t align, fsz;
+	size_t phnum, shnum;
+	off_t rc, phoff, shoff;
+	void *ehdr, *phdr;
+	Elf32_Ehdr *eh32;
+	Elf64_Ehdr *eh64;
+
+	rc = 0;		/* Running size of the file image. */
+
+	ec = e->e_class;
+
+	assert(ec == ELFCLASS32 || ec == ELFCLASS64);
+
+	/*
+	 * Prepare the EHDR.
+	 */
+	if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL)
+		return ((off_t) -1);
+
+	/* Both views alias the same header; only the one matching 'ec' is used. */
+	eh32 = ehdr;
+	eh64 = ehdr;
+
+	/* Snapshot the header fields that are checked or reused below. */
+	if (ec == ELFCLASS32) {
+		eh_byteorder = eh32->e_ident[EI_DATA];
+		eh_class     = eh32->e_ident[EI_CLASS];
+		phoff        = (uint64_t) eh32->e_phoff;
+		shoff        = (uint64_t) eh32->e_shoff;
+		eh_type      = eh32->e_type;	/* Not read again in this function. */
+		eh_version   = eh32->e_version;
+	} else {
+		eh_byteorder = eh64->e_ident[EI_DATA];
+		eh_class     = eh64->e_ident[EI_CLASS];
+		phoff        = eh64->e_phoff;
+		shoff        = eh64->e_shoff;
+		eh_type      = eh64->e_type;	/* Not read again in this function. */
+		eh_version   = eh64->e_version;
+	}
+
+	/* An unset header version is treated as the current version. */
+	if (eh_version == EV_NONE)
+		eh_version = EV_CURRENT;
+
+	if (eh_version != e->e_version) {	/* always EV_CURRENT */
+		LIBELF_SET_ERROR(VERSION, 0);
+		return ((off_t) -1);
+	}
+
+	if (eh_class != e->e_class) {
+		LIBELF_SET_ERROR(CLASS, 0);
+		return ((off_t) -1);
+	}
+
+	/*
+	 * For descriptors opened for reading, or for read/write with a
+	 * raw image, the header byte order must match the descriptor's.
+	 */
+	if ((e->e_cmd == ELF_C_READ
+	     || (e->e_cmd == ELF_C_RDWR && e->e_rawfile))
+	    && eh_byteorder != e->e_byteorder) {
+		LIBELF_SET_ERROR(HEADER, 0);
+		return ((off_t) -1);
+	}
+
+	shnum = e->e_u.e_elf.e_nscn;
+	phnum = e->e_u.e_elf.e_nphdr;
+
+	e->e_byteorder = eh_byteorder;
+
+	/*
+	 * Fill in the magic number, class, version and the header and
+	 * table entry sizes.  The macro reads the local 'phnum'.
+	 */
+#define	INITIALIZE_EHDR(E,EC,V)	do {					\
+		(E)->e_ident[EI_MAG0] = ELFMAG0;			\
+		(E)->e_ident[EI_MAG1] = ELFMAG1;			\
+		(E)->e_ident[EI_MAG2] = ELFMAG2;			\
+		(E)->e_ident[EI_MAG3] = ELFMAG3;			\
+		(E)->e_ident[EI_CLASS] = (EC);				\
+		(E)->e_ident[EI_VERSION] = (V);				\
+		(E)->e_ehsize = _libelf_fsize(ELF_T_EHDR, (EC), (V),	\
+		    (size_t) 1);					\
+		(E)->e_phentsize = (phnum == 0) ? 0 : _libelf_fsize(	\
+		    ELF_T_PHDR, (EC), (V), (size_t) 1);			\
+		(E)->e_shentsize = _libelf_fsize(ELF_T_SHDR, (EC), (V),	\
+		    (size_t) 1);					\
+	} while (0)
+
+	if (ec == ELFCLASS32)
+		INITIALIZE_EHDR(eh32, ec, eh_version);
+	else
+		INITIALIZE_EHDR(eh64, ec, eh_version);
+
+	(void) elf_flagehdr(e, ELF_C_SET, ELF_F_DIRTY);
+
+	rc += _libelf_fsize(ELF_T_EHDR, ec, eh_version, (size_t) 1);
+
+	/* The executable header occupies [0, rc). */
+	if (!_libelf_insert_extent(extents, ELF_EXTENT_EHDR, 0, rc, ehdr))
+		return ((off_t) -1);
+
+	/*
+	 * Compute the layout of the program header table, if one is
+	 * present.  The program header table needs to be aligned to a
+	 * `natural' boundary.
+	 */
+	if (phnum) {
+		fsz = _libelf_fsize(ELF_T_PHDR, ec, eh_version, phnum);
+		align = _libelf_falign(ELF_T_PHDR, ec);
+
+		if (e->e_flags & ELF_F_LAYOUT) {
+			/*
+			 * Check offsets for sanity.
+			 */
+			if (rc > phoff) {
+				LIBELF_SET_ERROR(LAYOUT, 0);
+				return ((off_t) -1);
+			}
+
+			if (phoff % align) {
+				LIBELF_SET_ERROR(LAYOUT, 0);
+				return ((off_t) -1);
+			}
+
+		} else
+			phoff = roundup(rc, align);
+
+		rc = phoff + fsz;
+
+		phdr = _libelf_getphdr(e, ec);
+
+		if (!_libelf_insert_extent(extents, ELF_EXTENT_PHDR, phoff,
+			fsz, phdr))
+			return ((off_t) -1);
+	} else
+		phoff = 0;
+
+	/*
+	 * Compute the layout of the sections associated with the
+	 * file.
+	 */
+	/*
+	 * If we are a read-only ELF that has not had its section
+	 * headers loaded, or a read/write ELF that has a raw image,
+	 * has no file descriptor and has not had its section headers
+	 * loaded, then load the headers now.
+	 * If the loading of the headers fails, return -1.
+	 */
+	if ((e->e_cmd == ELF_C_READ
+	     || (e->e_cmd == ELF_C_RDWR && e->e_rawfile
+	         && e->e_fd == -1)) &&
+	    (e->e_flags & LIBELF_F_SHDRS_LOADED) == 0 &&
+	    _libelf_load_section_headers(e, ehdr) == 0)
+		return ((off_t) -1);
+
+	if ((rc = _libelf_resync_sections(e, rc, extents)) < 0)
+		return ((off_t) -1);
+
+	/*
+	 * Compute the space taken up by the section header table, if
+	 * one is needed.
+	 *
+	 * If ELF_F_LAYOUT has been asserted, the application may have
+	 * placed the section header table in between existing
+	 * sections, so the net size of the file need not increase due
+	 * to the presence of the section header table.
+	 *
+	 * If the library is responsible for laying out the object,
+	 * the section header table is placed after section data.
+	 */
+	if (shnum) {
+		fsz = _libelf_fsize(ELF_T_SHDR, ec, eh_version, shnum);
+		align = _libelf_falign(ELF_T_SHDR, ec);
+
+		if (e->e_flags & ELF_F_LAYOUT) {
+			if (shoff % align) {
+				LIBELF_SET_ERROR(LAYOUT, 0);
+				return ((off_t) -1);
+			}
+		} else
+			shoff = roundup(rc, align);
+
+		/* Grow the image only if the table extends past its end. */
+		if (shoff + fsz > (size_t) rc)
+			rc = shoff + fsz;
+
+		if (!_libelf_insert_extent(extents, ELF_EXTENT_SHDR, shoff,
+			fsz, NULL))
+			return ((off_t) -1);
+	} else
+		shoff = 0;
+
+	/*
+	 * Set the fields of the Executable Header that could potentially use
+	 * extended numbering.
+	 */
+	_libelf_setphnum(e, ehdr, ec, phnum);
+	_libelf_setshnum(e, ehdr, ec, shnum);
+
+	/*
+	 * Update the `e_phoff' and `e_shoff' fields if the library is
+	 * doing the layout.
+	 */
+	if ((e->e_flags & ELF_F_LAYOUT) == 0) {
+		if (ec == ELFCLASS32) {
+			eh32->e_phoff = (uint32_t) phoff;
+			eh32->e_shoff = (uint32_t) shoff;
+		} else {
+			eh64->e_phoff = (uint64_t) phoff;
+			eh64->e_shoff = (uint64_t) shoff;
+		}
+	}
+
+	return (rc);
+}
+
+/*
+ * Write out the contents of an ELF section.
+ */ + +static size_t +_libelf_write_scn(Elf *e, char *nf, struct _Elf_Extent *ex) +{ + int ec; + size_t fsz, msz, nobjects, rc; + uint32_t sh_type; + uint64_t sh_off, sh_size; + int elftype; + Elf_Scn *s; + Elf_Data *d, dst; + + assert(ex->ex_type == ELF_EXTENT_SECTION); + + s = ex->ex_desc; + rc = ex->ex_start; + + if ((ec = e->e_class) == ELFCLASS32) { + sh_type = s->s_shdr.s_shdr32.sh_type; + sh_size = (uint64_t) s->s_shdr.s_shdr32.sh_size; + } else { + sh_type = s->s_shdr.s_shdr64.sh_type; + sh_size = s->s_shdr.s_shdr64.sh_size; + } + + /* + * Ignore sections that do not allocate space in the file. + */ + if (sh_type == SHT_NOBITS || sh_type == SHT_NULL || sh_size == 0) + return (rc); + + elftype = _libelf_xlate_shtype(sh_type); + assert(elftype >= ELF_T_FIRST && elftype <= ELF_T_LAST); + + sh_off = s->s_offset; + assert(sh_off % _libelf_falign(elftype, ec) == 0); + + /* + * If the section has a `rawdata' descriptor, and the section + * contents have not been modified, use its contents directly. + * The `s_rawoff' member contains the offset into the original + * file, while `s_offset' contains its new location in the + * destination. + */ + + /* If we are a read/write elf, we cannot trust the rawdata. */ + if (e->e_cmd != ELF_C_RDWR) { + if (STAILQ_EMPTY(&s->s_data)) { + + if ((d = elf_rawdata(s, NULL)) == NULL) + return ((off_t) -1); + + STAILQ_FOREACH(d, &s->s_rawdata, d_next) { + if ((uint64_t) rc < sh_off + d->d_off) + (void) memset(nf + rc, + LIBELF_PRIVATE(fillchar), sh_off + + d->d_off - rc); + rc = sh_off + d->d_off; + + assert(d->d_buf != NULL); + assert(d->d_type == ELF_T_BYTE); + assert(d->d_version == e->e_version); + + (void) memcpy(nf + rc, + e->e_rawfile + s->s_rawoff + d->d_off, d->d_size); + + rc += d->d_size; + } + + return (rc); + } + } + + /* + * Iterate over the set of data descriptors for this section. + * The prior call to _libelf_resync_elf() would have setup the + * descriptors for this step. 
+ */ + + dst.d_version = e->e_version; + + STAILQ_FOREACH(d, &s->s_data, d_next) { + + msz = _libelf_msize(d->d_type, ec, e->e_version); + + if ((uint64_t) rc < sh_off + d->d_off) + (void) memset(nf + rc, + LIBELF_PRIVATE(fillchar), sh_off + d->d_off - rc); + + rc = sh_off + d->d_off; + + assert(d->d_buf != NULL); + assert(d->d_version == e->e_version); + assert(d->d_size % msz == 0); + + nobjects = d->d_size / msz; + + fsz = _libelf_fsize(d->d_type, ec, e->e_version, nobjects); + + dst.d_buf = nf + rc; + dst.d_size = fsz; + + if (_libelf_xlate(&dst, d, e->e_byteorder, ec, ELF_TOFILE) == + NULL) + return ((off_t) -1); + + rc += fsz; + } + + return ((off_t) rc); +} + +/* + * Write out an ELF Executable Header. + */ + +static off_t +_libelf_write_ehdr(Elf *e, char *nf, struct _Elf_Extent *ex) +{ + int ec; + void *ehdr; + size_t fsz, msz; + Elf_Data dst, src; + + assert(ex->ex_type == ELF_EXTENT_EHDR); + assert(ex->ex_start == 0); /* Ehdr always comes first. */ + + ec = e->e_class; + + ehdr = _libelf_ehdr(e, ec, 0); + assert(ehdr != NULL); + + fsz = _libelf_fsize(ELF_T_EHDR, ec, e->e_version, (size_t) 1); + msz = _libelf_msize(ELF_T_EHDR, ec, e->e_version); + + (void) memset(&dst, 0, sizeof(dst)); + (void) memset(&src, 0, sizeof(src)); + + src.d_buf = ehdr; + src.d_size = msz; + src.d_type = ELF_T_EHDR; + src.d_version = dst.d_version = e->e_version; + + dst.d_buf = nf; + dst.d_size = fsz; + + if (_libelf_xlate(&dst, &src, e->e_byteorder, ec, ELF_TOFILE) == + NULL) + return ((off_t) -1); + + return ((off_t) fsz); +} + +/* + * Write out an ELF program header table. 
+ *
+ * The table is translated to its file representation at offset
+ * 'ex->ex_start' of 'nf'.  Returns the offset just past the table, or
+ * (off_t) -1 if translation fails.
+ */
+
+static off_t
+_libelf_write_phdr(Elf *e, char *nf, struct _Elf_Extent *ex)
+{
+	Elf_Data in, out;
+	uint64_t table_off;
+	size_t count, table_fsz;
+	void *hdr;
+	int elfclass;
+
+	assert(ex->ex_type == ELF_EXTENT_PHDR);
+
+	elfclass = e->e_class;
+	hdr = _libelf_ehdr(e, elfclass, 0);
+	count = e->e_u.e_elf.e_nphdr;
+
+	assert(count > 0);
+
+	/* Fetch the table offset recorded in the executable header. */
+	if (elfclass == ELFCLASS32)
+		table_off = (uint64_t) ((Elf32_Ehdr *) hdr)->e_phoff;
+	else
+		table_off = ((Elf64_Ehdr *) hdr)->e_phoff;
+
+	assert(table_off > 0);
+	assert(ex->ex_start == table_off);
+	assert(table_off % _libelf_falign(ELF_T_PHDR, elfclass) == 0);
+
+	(void) memset(&out, 0, sizeof(out));
+	(void) memset(&in, 0, sizeof(in));
+
+	table_fsz = _libelf_fsize(ELF_T_PHDR, elfclass, e->e_version, count);
+	assert(table_fsz > 0);
+
+	/* Source: the in-memory table. */
+	in.d_buf = _libelf_getphdr(e, elfclass);
+	in.d_version = out.d_version = e->e_version;
+	in.d_type = ELF_T_PHDR;
+	in.d_size = count * _libelf_msize(ELF_T_PHDR, elfclass,
+	    e->e_version);
+
+	/* Destination: the table's slot in the new image. */
+	out.d_size = table_fsz;
+	out.d_buf = nf + ex->ex_start;
+
+	if (_libelf_xlate(&out, &in, e->e_byteorder, elfclass,
+	    ELF_TOFILE) == NULL)
+		return ((off_t) -1);
+
+	return (table_off + table_fsz);
+}
+
+/*
+ * Write out an ELF section header table.
+ *
+ * Each section's header is translated into the slot selected by its
+ * 's_ndx' within the table at 'ex->ex_start' of 'nf'.  Returns the
+ * offset just past the table, or (off_t) -1 if translation fails.
+ */
+
+static off_t
+_libelf_write_shdr(Elf *e, char *nf, struct _Elf_Extent *ex)
+{
+	int ec;
+	void *ehdr;
+	Elf_Scn *scn;
+	uint64_t shoff;
+	Elf32_Ehdr *eh32;
+	Elf64_Ehdr *eh64;
+	size_t fsz, nscn;
+	Elf_Data dst, src;
+
+	assert(ex->ex_type == ELF_EXTENT_SHDR);
+
+	ec = e->e_class;
+	ehdr = _libelf_ehdr(e, ec, 0);
+	nscn = e->e_u.e_elf.e_nscn;
+
+	if (ec == ELFCLASS32) {
+		eh32 = (Elf32_Ehdr *) ehdr;
+		shoff = (uint64_t) eh32->e_shoff;
+	} else {
+		eh64 = (Elf64_Ehdr *) ehdr;
+		shoff = eh64->e_shoff;
+	}
+
+	assert(nscn > 0);
+	assert(shoff % _libelf_falign(ELF_T_SHDR, ec) == 0);
+	assert(ex->ex_start == shoff);
+
+	(void) memset(&dst, 0, sizeof(dst));
+	(void) memset(&src, 0, sizeof(src));
+
+	/* One header is translated per iteration, so sizes are per entry. */
+	src.d_type = ELF_T_SHDR;
+	src.d_size = _libelf_msize(ELF_T_SHDR, ec, e->e_version);
+	src.d_version = dst.d_version = e->e_version;
+
+	fsz = _libelf_fsize(ELF_T_SHDR, ec, e->e_version, (size_t) 1);
+
+	STAILQ_FOREACH(scn, &e->e_u.e_elf.e_scn, s_next) {
+		if (ec == ELFCLASS32)
+			src.d_buf = &scn->s_shdr.s_shdr32;
+		else
+			src.d_buf = &scn->s_shdr.s_shdr64;
+
+		dst.d_size = fsz;
+		dst.d_buf = nf + ex->ex_start + scn->s_ndx * fsz;
+
+		if (_libelf_xlate(&dst, &src, e->e_byteorder, ec,
+			ELF_TOFILE) == NULL)
+			return ((off_t) -1);
+	}
+
+	return (ex->ex_start + nscn * fsz);
+}
+
+/*
+ * Update the elf file image.
+ *
+ * The original file could have been mapped in with an ELF_C_RDWR
+ * command and the application could have added new content or
+ * re-arranged its sections before calling elf_update().  Consequently
+ * it is not safe to work `in place' on the original file.  So we
+ * allocate the required space for the updated ELF object, build
+ * the object there and write it out to the underlying file at the
+ * end.  Note that the application may have opened the underlying file
+ * in ELF_C_RDWR and only retrieved/modified a few sections.  We take
+ * care to avoid translating file sections unnecessarily.
+ *
+ * Gaps in the coverage of the file by the file's sections will be
+ * filled with the fill character set by elf_fill(3).
+ *
+ * 'newsize' is the size of the image to build and 'extents' the list
+ * of its extents.  Returns the number of bytes in the new image, or
+ * (off_t) -1 on failure.
+ */
+
+static off_t
+_libelf_update_elf(Elf *e, off_t newsize, struct _Elf_Extent_List *extents)
+{
+	off_t nrc, rc;
+	char *newfile;
+	struct _Elf_Extent *ex;
+
+	assert(e->e_kind == ELF_K_ELF);
+	// There are two types of ELF_C_RDWR files, one that is based in
+	// memory and has a raw file and one that is based on a file
+	// descriptor and does not have a raw_file. Both are equally
+	// valid, so we don't special case here.
+	assert(e->e_cmd == ELF_C_RDWR || (e->e_cmd == ELF_C_WRITE && e->e_fd != -1));
+
+	if ((newfile = e->e_mem.alloc((size_t) newsize)) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, errno);
+		return ((off_t) -1);
+	}
+
+	/* 'rc' is the end of the image built so far, 'nrc' the end of the current extent. */
+	nrc = rc = 0;
+	ELF_SLIST_FOREACH(ex, extents, ex_next) {
+
+		/* Fill inter-extent gaps. */
+		if (ex->ex_start > (size_t) rc)
+			(void) memset(newfile + rc, LIBELF_PRIVATE(fillchar),
+			    ex->ex_start - rc);
+
+		/* Each writer returns the offset just past what it wrote. */
+		switch (ex->ex_type) {
+		case ELF_EXTENT_EHDR:
+			if ((nrc = _libelf_write_ehdr(e, newfile, ex)) < 0)
+				goto error;
+			break;
+
+		case ELF_EXTENT_PHDR:
+			if ((nrc = _libelf_write_phdr(e, newfile, ex)) < 0)
+				goto error;
+			break;
+
+		case ELF_EXTENT_SECTION:
+			if ((nrc = _libelf_write_scn(e, newfile, ex)) < 0)
+				goto error;
+			break;
+
+		case ELF_EXTENT_SHDR:
+			if ((nrc = _libelf_write_shdr(e, newfile, ex)) < 0)
+				goto error;
+			break;
+
+		default:
+			assert(0);
+			break;
+		}
+
+		assert(ex->ex_start + ex->ex_size == (size_t) nrc);
+		assert(rc < nrc);
+
+		rc = nrc;
+	}
+
+	assert(rc == newsize);
+
+	/*
+	 * For regular files, throw away existing file content and
+	 * unmap any existing mappings.
+	 */
+	if ((e->e_flags & LIBELF_F_SPECIAL_FILE) == 0 && e->e_fd != -1) {
+#if !defined(WIN32)
+#define FTRUNC(A, B) ftruncate(A, (off_t)B)
+#else
+#define FTRUNC(A, B) _chsize(A, B)
+#endif
+		/* lseek() to offset 0 returns 0 on success, so non-zero is failure. */
+		if (FTRUNC(e->e_fd, 0) < 0 ||
+		    lseek(e->e_fd, (off_t) 0, SEEK_SET)) {
+			LIBELF_SET_ERROR(IO, errno);
+			goto error;
+		}
+		if (e->e_flags & LIBELF_F_RAWFILE_MMAP) {
+			assert(e->e_rawfile != NULL);
+			assert(e->e_cmd == ELF_C_RDWR);
+			if (munmap(e->e_rawfile, e->e_rawsize) < 0) {
+				LIBELF_SET_ERROR(IO, errno);
+				goto error;
+			}
+		}
+	}
+
+	/*
+	 * Write out the new contents.
+	 */
+	if (e->e_fd != -1 && write(e->e_fd, newfile, (size_t) newsize) != newsize) {
+		LIBELF_SET_ERROR(IO, errno);
+		goto error;
+	}
+
+	/*
+	 * For files opened in ELF_C_RDWR mode, set up the new 'raw'
+	 * contents.
+	 */
+	if (e->e_cmd == ELF_C_RDWR) {
+		if (e->e_rawfile) {
+			if (e->e_flags & LIBELF_F_RAWFILE_MMAP && e->e_fd >= 0) {
+				/*
+				 * NOTE(review): on failure 'e_rawfile' is left
+				 * holding MAP_FAILED rather than NULL -- confirm
+				 * that later users of the descriptor cope.
+				 */
+				if ((e->e_rawfile = mmap(NULL, (size_t) newsize,
+					 PROT_READ, MAP_PRIVATE, e->e_fd, (off_t) 0)) ==
+				    MAP_FAILED) {
+					LIBELF_SET_ERROR(IO, errno);
+					goto error;
+				}
+			} else if (e->e_flags & LIBELF_F_RAWFILE_MALLOC) {
+				/* The new image replaces the old heap copy. */
+				e->e_mem.dealloc(e->e_rawfile);
+				e->e_rawfile = newfile;
+				newfile = NULL;
+			}
+		} else {
+			/* No raw image yet: adopt the new buffer as one. */
+			e->e_rawfile = newfile;
+			newfile = NULL;
+			e->e_flags |= LIBELF_F_RAWFILE_MALLOC;
+		}
+		/* Record the new size of the file. */
+		e->e_rawsize = newsize;
+	} else {
+		/* File opened in ELF_C_WRITE mode. */
+		assert(e->e_rawfile == NULL);
+	}
+
+	/* Free the temporary buffer. */
+	if (newfile)
+		e->e_mem.dealloc(newfile);
+
+	return (rc);
+
+error:
+	e->e_mem.dealloc(newfile);
+
+	return ((off_t) -1);
+}
+
+/*
+ * Write out the file image.
+ *
+ * The original file could have been mapped in with an ELF_C_RDWR
+ * command and the application could have added new content or
+ * re-arranged its sections before calling elf_update(). Consequently
+ * its not safe to work `in place' on the original file.
So we + * malloc() the required space for the updated ELF object and build + * the object there and write it out to the underlying file at the + * end. Note that the application may have opened the underlying file + * in ELF_C_RDWR and only retrieved/modified a few sections. We take + * care to avoid translating file sections unnecessarily. + * + * Gaps in the coverage of the file by the file's sections will be + * filled with the fill character set by elf_fill(3). + */ + +static off_t +_libelf_write_elf(Elf *e, off_t newsize, struct _Elf_Extent_List *extents) +{ + off_t rc; + Elf_Scn *scn, *tscn; + + rc = _libelf_update_elf(e, newsize, extents); + if (rc == (off_t)-1) + return (rc); + + /* + * Reset flags, remove existing section descriptors and + * {E,P}HDR pointers so that a subsequent elf_get{e,p}hdr() + * and elf_getscn() will function correctly. + */ + + e->e_flags &= ~ELF_F_DIRTY; + + STAILQ_FOREACH_SAFE(scn, &e->e_u.e_elf.e_scn, s_next, tscn) + _libelf_release_scn(scn); + + if (e->e_class == ELFCLASS32) { + e->e_mem.dealloc(e->e_u.e_elf.e_ehdr.e_ehdr32); + if (e->e_u.e_elf.e_phdr.e_phdr32) + e->e_mem.dealloc(e->e_u.e_elf.e_phdr.e_phdr32); + + e->e_u.e_elf.e_ehdr.e_ehdr32 = NULL; + e->e_u.e_elf.e_phdr.e_phdr32 = NULL; + } else { + e->e_mem.dealloc(e->e_u.e_elf.e_ehdr.e_ehdr64); + if (e->e_u.e_elf.e_phdr.e_phdr64) + e->e_mem.dealloc(e->e_u.e_elf.e_phdr.e_phdr64); + + e->e_u.e_elf.e_ehdr.e_ehdr64 = NULL; + e->e_u.e_elf.e_phdr.e_phdr64 = NULL; + } + + return (rc); +} + +/* + * Update an ELF object. 
+ */ + +off_t +elf_update(Elf *e, Elf_Cmd c) +{ + int ec; + off_t rc; + struct _Elf_Extent_List extents; + + rc = (off_t) -1; + + if (e == NULL || e->e_kind != ELF_K_ELF || + (c != ELF_C_NULL && c != ELF_C_WRITE + && c != ELF_C_RDWR)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (rc); + } + + if ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64) { + LIBELF_SET_ERROR(CLASS, 0); + return (rc); + } + + if (e->e_version == EV_NONE) + e->e_version = EV_CURRENT; + + if (c == ELF_C_WRITE && e->e_cmd == ELF_C_READ) { + LIBELF_SET_ERROR(MODE, 0); + return (rc); + } + + ELF_SLIST_INIT(&extents); + + if ((rc = _libelf_resync_elf(e, &extents)) < 0) + goto done; + + if (c == ELF_C_NULL) + goto done; + + if (c == ELF_C_WRITE && e->e_fd == -1) { + rc = (off_t) -1; + LIBELF_SET_ERROR(SEQUENCE, 0); + goto done; + } + + if (c == ELF_C_RDWR) { + rc = _libelf_update_elf(e, rc, &extents); + } else { + rc = _libelf_write_elf(e, rc, &extents); + } + +done: + _libelf_release_extents(&extents); + return (rc); +} + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_version.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_version.c new file mode 100644 index 0000000000..48950f407d --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/elf_version.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: elf_version.c 189 2008-07-20 10:38:08Z jkoshy $"); + +unsigned int +elf_version(unsigned int v) +{ + unsigned int old; + + if ((old = LIBELF_PRIVATE(version)) == EV_NONE) + old = EV_CURRENT; + + if (v == EV_NONE) + return old; + if (v > EV_CURRENT) { + LIBELF_SET_ERROR(VERSION, 0); + return EV_NONE; + } + + LIBELF_PRIVATE(version) = v; + return (old); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf.h new file mode 100644 index 0000000000..152973824f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf.h @@ -0,0 +1,114 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: gelf.h 1168 2010-09-04 01:03:25Z jkoshy $ + */ + +#ifndef _GELF_H_ +#define _GELF_H_ + +#include + +#include + +typedef Elf64_Addr GElf_Addr; /* Addresses */ +typedef Elf64_Half GElf_Half; /* Half words (16 bit) */ +typedef Elf64_Off GElf_Off; /* Offsets */ +typedef Elf64_Sword GElf_Sword; /* Signed words (32 bit) */ +typedef Elf64_Sxword GElf_Sxword; /* Signed long words (64 bit) */ +typedef Elf64_Word GElf_Word; /* Unsigned words (32 bit) */ +typedef Elf64_Xword GElf_Xword; /* Unsigned long words (64 bit) */ + +typedef Elf64_Dyn GElf_Dyn; /* ".dynamic" section entries */ +typedef Elf64_Ehdr GElf_Ehdr; /* ELF header */ +typedef Elf64_Phdr GElf_Phdr; /* Program header */ +typedef Elf64_Shdr GElf_Shdr; /* Section header */ +typedef Elf64_Sym GElf_Sym; /* Symbol table entries */ +typedef Elf64_Rel GElf_Rel; /* Relocation entries */ +typedef Elf64_Rela GElf_Rela; /* Relocation entries with addend */ + +#if (defined(__FreeBSD_version) && __FreeBSD_version >= 700025) || \ + 
(defined(__NetBSD_Version) && __NetBSD_Version > 400000003) +typedef Elf64_Cap GElf_Cap; /* SW/HW capabilities */ +typedef Elf64_Move GElf_Move; /* Move entries */ +typedef Elf64_Syminfo GElf_Syminfo; /* Symbol information */ +#endif + +#define GELF_M_INFO ELF64_M_INFO +#define GELF_M_SIZE ELF64_M_SIZE +#define GELF_M_SYM ELF64_M_SYM + +#define GELF_R_INFO ELF64_R_INFO +#define GELF_R_SYM ELF64_R_SYM +#define GELF_R_TYPE ELF64_R_TYPE +#define GELF_R_TYPE_DATA ELF64_R_TYPE_DATA +#define GELF_R_TYPE_ID ELF64_R_TYPE_ID +#define GELF_R_TYPE_INFO ELF64_R_TYPE_INFO + +#define GELF_ST_BIND ELF64_ST_BIND +#define GELF_ST_INFO ELF64_ST_INFO +#define GELF_ST_TYPE ELF64_ST_TYPE +#define GELF_ST_VISIBILITY ELF64_ST_VISIBILITY + +__BEGIN_DECLS +long gelf_checksum(Elf *_elf); +size_t gelf_fsize(Elf *_elf, Elf_Type _type, size_t _count, + unsigned int _version); +int gelf_getclass(Elf *_elf); +GElf_Dyn *gelf_getdyn(Elf_Data *_data, int _index, GElf_Dyn *_dst); +GElf_Ehdr *gelf_getehdr(Elf *_elf, GElf_Ehdr *_dst); +GElf_Phdr *gelf_getphdr(Elf *_elf, int _index, GElf_Phdr *_dst); +GElf_Rel *gelf_getrel(Elf_Data *_src, int _index, GElf_Rel *_dst); +GElf_Rela *gelf_getrela(Elf_Data *_src, int _index, GElf_Rela *_dst); +GElf_Shdr *gelf_getshdr(Elf_Scn *_scn, GElf_Shdr *_dst); +GElf_Sym *gelf_getsym(Elf_Data *_src, int _index, GElf_Sym *_dst); +GElf_Sym *gelf_getsymshndx(Elf_Data *_src, Elf_Data *_shindexsrc, + int _index, GElf_Sym *_dst, Elf32_Word *_shindexdst); +void * gelf_newehdr(Elf *_elf, int _class); +void * gelf_newphdr(Elf *_elf, size_t _phnum); +int gelf_update_dyn(Elf_Data *_dst, int _index, GElf_Dyn *_src); +int gelf_update_ehdr(Elf *_elf, GElf_Ehdr *_src); +int gelf_update_phdr(Elf *_elf, int _index, GElf_Phdr *_src); +int gelf_update_rel(Elf_Data *_dst, int _index, GElf_Rel *_src); +int gelf_update_rela(Elf_Data *_dst, int _index, GElf_Rela *_src); +int gelf_update_shdr(Elf_Scn *_dst, GElf_Shdr *_src); +int gelf_update_sym(Elf_Data *_dst, int _index, GElf_Sym *_src); 
+int gelf_update_symshndx(Elf_Data *_symdst, Elf_Data *_shindexdst, + int _index, GElf_Sym *_symsrc, Elf32_Word _shindexsrc); +Elf_Data *gelf_xlatetof(Elf *_elf, Elf_Data *_dst, const Elf_Data *_src, unsigned int _encode); +Elf_Data *gelf_xlatetom(Elf *_elf, Elf_Data *_dst, const Elf_Data *_src, unsigned int _encode); + +#if (defined(__FreeBSD_version) && __FreeBSD_version >= 700025) || \ + (defined(__NetBSD_Version) && __NetBSD_Version > 400000003) +GElf_Cap *gelf_getcap(Elf_Data *_data, int _index, GElf_Cap *_cap); +GElf_Move *gelf_getmove(Elf_Data *_src, int _index, GElf_Move *_dst); +GElf_Syminfo *gelf_getsyminfo(Elf_Data *_src, int _index, GElf_Syminfo *_dst); +int gelf_update_cap(Elf_Data *_dst, int _index, GElf_Cap *_src); +int gelf_update_move(Elf_Data *_dst, int _index, GElf_Move *_src); +int gelf_update_syminfo(Elf_Data *_dst, int _index, GElf_Syminfo *_src); +#endif +__END_DECLS + +#endif /* _GELF_H_ */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_cap.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_cap.c new file mode 100644 index 0000000000..68d34ca67d --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_cap.c @@ -0,0 +1,148 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_cap.c 1166 2010-09-04 00:54:36Z jkoshy $"); + +#if LIBELF_CONFIG_CAP + +GElf_Cap * +gelf_getcap(Elf_Data *d, int ndx, GElf_Cap *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Cap *cap32; + Elf64_Cap *cap64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_CAP) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_CAP, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + + cap32 = (Elf32_Cap *) d->d_buf + ndx; + + dst->c_tag = cap32->c_tag; + dst->c_un.c_val = (Elf64_Xword) cap32->c_un.c_val; + + } else { + + cap64 = (Elf64_Cap *) d->d_buf + ndx; + + *dst = *cap64; + } + + return (dst); +} + +int 
+gelf_update_cap(Elf_Data *d, int ndx, GElf_Cap *gc) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Cap *cap32; + Elf64_Cap *cap64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || gc == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_CAP) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_CAP, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + cap32 = (Elf32_Cap *) d->d_buf + ndx; + + LIBELF_COPY_U32(cap32, gc, c_tag); + LIBELF_COPY_U32(cap32, gc, c_un.c_val); + } else { + cap64 = (Elf64_Cap *) d->d_buf + ndx; + + *cap64 = *gc; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_CAP */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_checksum.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_checksum.c new file mode 100644 index 0000000000..30fbb9784f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_checksum.c @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_checksum.c 189 2008-07-20 10:38:08Z jkoshy $"); + +long +elf32_checksum(Elf *e) +{ + return (_libelf_checksum(e, ELFCLASS32)); +} + +long +elf64_checksum(Elf *e) +{ + return (_libelf_checksum(e, ELFCLASS64)); +} + +long +gelf_checksum(Elf *e) +{ + int ec; + if (e == NULL || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0L); + } + return (_libelf_checksum(e, ec)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_dyn.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_dyn.c new file mode 100644 index 0000000000..6a2885cd8f --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_dyn.c @@ -0,0 +1,143 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_dyn.c 189 2008-07-20 10:38:08Z jkoshy $"); + +GElf_Dyn * +gelf_getdyn(Elf_Data *d, int ndx, GElf_Dyn *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Dyn *dyn32; + Elf64_Dyn *dyn64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_DYN) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_DYN, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + dyn32 = (Elf32_Dyn *) d->d_buf + ndx; + + dst->d_tag = dyn32->d_tag; + dst->d_un.d_val = (Elf64_Xword) dyn32->d_un.d_val; + + } else { + + dyn64 = (Elf64_Dyn *) d->d_buf + ndx; + + *dst = *dyn64; + } + + return (dst); +} + +int +gelf_update_dyn(Elf_Data *d, int ndx, GElf_Dyn *ds) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Dyn *dyn32; + Elf64_Dyn *dyn64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || ds == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_DYN) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_DYN, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + dyn32 = 
(Elf32_Dyn *) d->d_buf + ndx; + + LIBELF_COPY_S32(dyn32, ds, d_tag); + LIBELF_COPY_U32(dyn32, ds, d_un.d_val); + } else { + dyn64 = (Elf64_Dyn *) d->d_buf + ndx; + + *dyn64 = *ds; + } + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_ehdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_ehdr.c new file mode 100644 index 0000000000..37ccce8c6a --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_ehdr.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_ehdr.c 1678 2011-07-28 04:36:34Z jkoshy $"); + +Elf32_Ehdr * +elf32_getehdr(Elf *e) +{ + return (_libelf_ehdr(e, ELFCLASS32, 0)); +} + +Elf64_Ehdr * +elf64_getehdr(Elf *e) +{ + return (_libelf_ehdr(e, ELFCLASS64, 0)); +} + +GElf_Ehdr * +gelf_getehdr(Elf *e, GElf_Ehdr *d) +{ + int ec; + Elf32_Ehdr *eh32; + Elf64_Ehdr *eh64; + + if (d == NULL || e == NULL || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + if ((eh32 = _libelf_ehdr(e, ELFCLASS32, 0)) == NULL) + return (NULL); + + (void) memcpy(d->e_ident, eh32->e_ident, + sizeof(eh32->e_ident)); + d->e_type = eh32->e_type; + d->e_machine = eh32->e_machine; + d->e_version = eh32->e_version; + d->e_entry = eh32->e_entry; + d->e_phoff = eh32->e_phoff; + d->e_shoff = eh32->e_shoff; + d->e_flags = eh32->e_flags; + d->e_ehsize = eh32->e_ehsize; + d->e_phentsize = eh32->e_phentsize; + d->e_phnum = eh32->e_phnum; + d->e_shentsize = eh32->e_shentsize; + d->e_shnum = eh32->e_shnum; + d->e_shstrndx = eh32->e_shstrndx; + + return (d); + } + + assert(ec == ELFCLASS64); + + if ((eh64 = _libelf_ehdr(e, ELFCLASS64, 0)) == NULL) + return (NULL); + *d = *eh64; + + return (d); +} + +Elf32_Ehdr * +elf32_newehdr(Elf *e) +{ + return (_libelf_ehdr(e, ELFCLASS32, 1)); +} + +Elf64_Ehdr * +elf64_newehdr(Elf *e) +{ + return (_libelf_ehdr(e, ELFCLASS64, 1)); +} + +void * +gelf_newehdr(Elf *e, int ec) +{ + if (e != NULL && + (ec == ELFCLASS32 || ec == ELFCLASS64)) + return (_libelf_ehdr(e, ec, 1)); + + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); +} + +int +gelf_update_ehdr(Elf *e, GElf_Ehdr *s) +{ + int ec; + void *ehdr; + Elf32_Ehdr *eh32; + Elf64_Ehdr *eh64; + + if (s== NULL || e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (e->e_cmd 
== ELF_C_READ) { + LIBELF_SET_ERROR(MODE, 0); + return (0); + } + + if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL) + return (0); + + (void) elf_flagehdr(e, ELF_C_SET, ELF_F_DIRTY); + + if (ec == ELFCLASS64) { + eh64 = (Elf64_Ehdr *) ehdr; + *eh64 = *s; + return (1); + } + + eh32 = (Elf32_Ehdr *) ehdr; + + (void) memcpy(eh32->e_ident, s->e_ident, sizeof(eh32->e_ident)); + + eh32->e_type = s->e_type; + eh32->e_machine = s->e_machine; + eh32->e_version = s->e_version; + LIBELF_COPY_U32(eh32, s, e_entry); + LIBELF_COPY_U32(eh32, s, e_phoff); + LIBELF_COPY_U32(eh32, s, e_shoff); + eh32->e_flags = s->e_flags; + eh32->e_ehsize = s->e_ehsize; + eh32->e_phentsize = s->e_phentsize; + eh32->e_phnum = s->e_phnum; + eh32->e_shentsize = s->e_shentsize; + eh32->e_shnum = s->e_shnum; + eh32->e_shstrndx = s->e_shstrndx; + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_fsize.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_fsize.c new file mode 100644 index 0000000000..0e38d1462e --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_fsize.c @@ -0,0 +1,62 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_fsize.c 189 2008-07-20 10:38:08Z jkoshy $"); + +size_t +elf32_fsize(Elf_Type t, size_t c, unsigned int v) +{ + return (_libelf_fsize(t, ELFCLASS32, v, c)); +} + +size_t +elf64_fsize(Elf_Type t, size_t c, unsigned int v) +{ + return (_libelf_fsize(t, ELFCLASS64, v, c)); +} + +size_t +gelf_fsize(Elf *e, Elf_Type t, size_t c, unsigned int v) +{ + + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (e->e_class == ELFCLASS32 || e->e_class == ELFCLASS64) + return (_libelf_fsize(t, e->e_class, v, c)); + + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_getclass.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_getclass.c new file mode 100644 index 0000000000..349a9cd535 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_getclass.c @@ -0,0 +1,39 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_getclass.c 189 2008-07-20 10:38:08Z jkoshy $"); + +int +gelf_getclass(Elf *e) +{ + return (e != NULL ? e->e_class : ELFCLASSNONE); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_move.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_move.c new file mode 100644 index 0000000000..359b021ddc --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_move.c @@ -0,0 +1,154 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_move.c 1166 2010-09-04 00:54:36Z jkoshy $"); + +#if LIBELF_CONFIG_MOVE + +GElf_Move * +gelf_getmove(Elf_Data *d, int ndx, GElf_Move *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Move *move32; + Elf64_Move *move64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_MOVE) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_MOVE, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + + move32 = (Elf32_Move *) d->d_buf + ndx; + + dst->m_value = move32->m_value; + dst->m_info = (Elf64_Xword) move32->m_info; + dst->m_poffset = (Elf64_Xword) move32->m_poffset; + dst->m_repeat = move32->m_repeat; + dst->m_stride = move32->m_stride; + } else { + + move64 = (Elf64_Move *) d->d_buf + ndx; + + *dst = *move64; + } + + return (dst); +} + +int +gelf_update_move(Elf_Data *d, int ndx, GElf_Move *gm) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Move *move32; + Elf64_Move *move64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || gm == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_MOVE) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = 
_libelf_msize(ELF_T_MOVE, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + move32 = (Elf32_Move *) d->d_buf + ndx; + + move32->m_value = gm->m_value; + LIBELF_COPY_U32(move32, gm, m_info); + LIBELF_COPY_U32(move32, gm, m_poffset); + move32->m_repeat = gm->m_repeat; + move32->m_stride = gm->m_stride; + + } else { + move64 = (Elf64_Move *) d->d_buf + ndx; + + *move64 = *gm; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_MOVE */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_phdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_phdr.c new file mode 100644 index 0000000000..47000d84ec --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_phdr.c @@ -0,0 +1,177 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_phdr.c 189 2008-07-20 10:38:08Z jkoshy $"); + +Elf32_Phdr * +elf32_getphdr(Elf *e) +{ + return (_libelf_getphdr(e, ELFCLASS32)); +} + +Elf64_Phdr * +elf64_getphdr(Elf *e) +{ + return (_libelf_getphdr(e, ELFCLASS64)); +} + +GElf_Phdr * +gelf_getphdr(Elf *e, int index, GElf_Phdr *d) +{ + int ec; + Elf32_Ehdr *eh32; + Elf64_Ehdr *eh64; + Elf32_Phdr *ep32; + Elf64_Phdr *ep64; + + if (d == NULL || e == NULL || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64) || + (e->e_kind != ELF_K_ELF) || index < 0) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + if ((eh32 = _libelf_ehdr(e, ELFCLASS32, 0)) == NULL || + ((ep32 = _libelf_getphdr(e, ELFCLASS32)) == NULL)) + return (NULL); + + if (index >= eh32->e_phnum) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ep32 += index; + + d->p_type = ep32->p_type; + d->p_offset = ep32->p_offset; + d->p_vaddr = (Elf64_Addr) ep32->p_vaddr; + d->p_paddr = (Elf64_Addr) ep32->p_paddr; + d->p_filesz = (Elf64_Xword) ep32->p_filesz; + d->p_memsz = (Elf64_Xword) ep32->p_memsz; + d->p_flags = ep32->p_flags; + d->p_align = (Elf64_Xword) ep32->p_align; + + } else { + if ((eh64 = _libelf_ehdr(e, ELFCLASS64, 0)) == NULL || + (ep64 = _libelf_getphdr(e, ELFCLASS64)) == NULL) + return (NULL); + + if (index >= eh64->e_phnum) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ep64 += 
index; + + *d = *ep64; + } + + return (d); +} + +Elf32_Phdr * +elf32_newphdr(Elf *e, size_t count) +{ + return (_libelf_newphdr(e, ELFCLASS32, count)); +} + +Elf64_Phdr * +elf64_newphdr(Elf *e, size_t count) +{ + return (_libelf_newphdr(e, ELFCLASS64, count)); +} + +void * +gelf_newphdr(Elf *e, size_t count) +{ + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + return (_libelf_newphdr(e, e->e_class, count)); +} + +int +gelf_update_phdr(Elf *e, int ndx, GElf_Phdr *s) +{ + int ec, phnum; + void *ehdr; + Elf32_Phdr *ph32; + Elf64_Phdr *ph64; + + if (s == NULL || e == NULL || e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (e->e_cmd == ELF_C_READ) { + LIBELF_SET_ERROR(MODE, 0); + return (0); + } + + if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL) + return (0); + + if (ec == ELFCLASS32) + phnum = ((Elf32_Ehdr *) ehdr)->e_phnum; + else + phnum = ((Elf64_Ehdr *) ehdr)->e_phnum; + + if (ndx < 0 || ndx > phnum) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + (void) elf_flagphdr(e, ELF_C_SET, ELF_F_DIRTY); + + if (ec == ELFCLASS64) { + ph64 = e->e_u.e_elf.e_phdr.e_phdr64 + ndx; + *ph64 = *s; + return (1); + } + + ph32 = e->e_u.e_elf.e_phdr.e_phdr32 + ndx; + + ph32->p_type = s->p_type; + ph32->p_flags = s->p_flags; + LIBELF_COPY_U32(ph32, s, p_offset); + LIBELF_COPY_U32(ph32, s, p_vaddr); + LIBELF_COPY_U32(ph32, s, p_paddr); + LIBELF_COPY_U32(ph32, s, p_filesz); + LIBELF_COPY_U32(ph32, s, p_memsz); + LIBELF_COPY_U32(ph32, s, p_align); + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rel.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rel.c new file mode 100644 index 0000000000..7d0b6af7fe --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rel.c @@ -0,0 +1,152 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_rel.c 189 2008-07-20 10:38:08Z jkoshy $"); + +GElf_Rel * +gelf_getrel(Elf_Data *d, int ndx, GElf_Rel *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Rel *rel32; + Elf64_Rel *rel64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_REL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_REL, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + rel32 = (Elf32_Rel *) d->d_buf + ndx; + + dst->r_offset = (Elf64_Addr) rel32->r_offset; + dst->r_info = ELF64_R_INFO( + (Elf64_Xword) ELF32_R_SYM(rel32->r_info), + ELF32_R_TYPE(rel32->r_info)); + + } else { + + rel64 = (Elf64_Rel *) d->d_buf + ndx; + + *dst = *rel64; + } + + return (dst); +} + +int +gelf_update_rel(Elf_Data *d, int ndx, GElf_Rel *dr) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Rel *rel32; + Elf64_Rel *rel64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dr == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_REL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_REL, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + 
LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + rel32 = (Elf32_Rel *) d->d_buf + ndx; + + LIBELF_COPY_U32(rel32, dr, r_offset); + + if (ELF64_R_SYM(dr->r_info) > ELF32_R_SYM(~0UL) || + ELF64_R_TYPE(dr->r_info) > ELF32_R_TYPE(~0U)) { + LIBELF_SET_ERROR(RANGE, 0); + return (0); + } + rel32->r_info = ELF32_R_INFO(ELF64_R_SYM(dr->r_info), + ELF64_R_TYPE(dr->r_info)); + } else { + rel64 = (Elf64_Rel *) d->d_buf + ndx; + + *rel64 = *dr; + } + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rela.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rela.c new file mode 100644 index 0000000000..722c1add83 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_rela.c @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_rela.c 189 2008-07-20 10:38:08Z jkoshy $"); + +GElf_Rela * +gelf_getrela(Elf_Data *d, int ndx, GElf_Rela *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Rela *rela32; + Elf64_Rela *rela64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_RELA) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_RELA, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + rela32 = (Elf32_Rela *) d->d_buf + ndx; + + dst->r_offset = (Elf64_Addr) rela32->r_offset; + dst->r_info = ELF64_R_INFO( + (Elf64_Xword) ELF32_R_SYM(rela32->r_info), + ELF32_R_TYPE(rela32->r_info)); + dst->r_addend = (Elf64_Sxword) rela32->r_addend; + + } else { + + rela64 = (Elf64_Rela *) d->d_buf + ndx; + + *dst = *rela64; + } + + return (dst); +} + +int +gelf_update_rela(Elf_Data *d, int ndx, GElf_Rela *dr) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Rela *rela32; + Elf64_Rela *rela64; + 
size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dr == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_RELA) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_RELA, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + rela32 = (Elf32_Rela *) d->d_buf + ndx; + + LIBELF_COPY_U32(rela32, dr, r_offset); + + if (ELF64_R_SYM(dr->r_info) > ELF32_R_SYM(~0UL) || + ELF64_R_TYPE(dr->r_info) > ELF32_R_TYPE(~0U)) { + LIBELF_SET_ERROR(RANGE, 0); + return (0); + } + rela32->r_info = ELF32_R_INFO(ELF64_R_SYM(dr->r_info), + ELF64_R_TYPE(dr->r_info)); + + LIBELF_COPY_S32(rela32, dr, r_addend); + } else { + rela64 = (Elf64_Rela *) d->d_buf + ndx; + + *rela64 = *dr; + } + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_shdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_shdr.c new file mode 100644 index 0000000000..7404bcbac3 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_shdr.c @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_shdr.c 189 2008-07-20 10:38:08Z jkoshy $"); + +Elf32_Shdr * +elf32_getshdr(Elf_Scn *s) +{ + return (_libelf_getshdr(s, ELFCLASS32)); +} + +Elf64_Shdr * +elf64_getshdr(Elf_Scn *s) +{ + return (_libelf_getshdr(s, ELFCLASS64)); +} + +GElf_Shdr * +gelf_getshdr(Elf_Scn *s, GElf_Shdr *d) +{ + int ec; + void *sh; + Elf32_Shdr *sh32; + Elf64_Shdr *sh64; + + if (d == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((sh = _libelf_getshdr(s, ELFCLASSNONE)) == NULL) + return (NULL); + + ec = s->s_elf->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) { + sh32 = (Elf32_Shdr *) sh; + + d->sh_name = sh32->sh_name; + d->sh_type = sh32->sh_type; + d->sh_flags = (Elf64_Xword) sh32->sh_flags; + d->sh_addr = (Elf64_Addr) sh32->sh_addr; + d->sh_offset = (Elf64_Off) sh32->sh_offset; + d->sh_size = (Elf64_Xword) sh32->sh_size; + d->sh_link = sh32->sh_link; + 
d->sh_info = sh32->sh_info; + d->sh_addralign = (Elf64_Xword) sh32->sh_addralign; + d->sh_entsize = (Elf64_Xword) sh32->sh_entsize; + } else { + sh64 = (Elf64_Shdr *) sh; + *d = *sh64; + } + + return (d); +} + +int +gelf_update_shdr(Elf_Scn *scn, GElf_Shdr *s) +{ + int ec; + Elf *e; + Elf32_Shdr *sh32; + + if (s == NULL || scn == NULL || (e = scn->s_elf) == NULL || + e->e_kind != ELF_K_ELF || + ((ec = e->e_class) != ELFCLASS32 && ec != ELFCLASS64)) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (e->e_cmd == ELF_C_READ) { + LIBELF_SET_ERROR(MODE, 0); + return (0); + } + + (void) elf_flagscn(scn, ELF_C_SET, ELF_F_DIRTY); + + if (ec == ELFCLASS64) { + scn->s_shdr.s_shdr64 = *s; + return (1); + } + + sh32 = &scn->s_shdr.s_shdr32; + + sh32->sh_name = s->sh_name; + sh32->sh_type = s->sh_type; + LIBELF_COPY_U32(sh32, s, sh_flags); + LIBELF_COPY_U32(sh32, s, sh_addr); + LIBELF_COPY_U32(sh32, s, sh_offset); + LIBELF_COPY_U32(sh32, s, sh_size); + sh32->sh_link = s->sh_link; + sh32->sh_info = s->sh_info; + LIBELF_COPY_U32(sh32, s, sh_addralign); + LIBELF_COPY_U32(sh32, s, sh_entsize); + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_sym.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_sym.c new file mode 100644 index 0000000000..3f84a178eb --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_sym.c @@ -0,0 +1,153 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_sym.c 189 2008-07-20 10:38:08Z jkoshy $"); + +GElf_Sym * +gelf_getsym(Elf_Data *d, int ndx, GElf_Sym *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Sym *sym32; + Elf64_Sym *sym64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_SYM) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_SYM, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + + sym32 = (Elf32_Sym *) d->d_buf + ndx; + + 
dst->st_name = sym32->st_name; + dst->st_value = (Elf64_Addr) sym32->st_value; + dst->st_size = (Elf64_Xword) sym32->st_size; + dst->st_info = ELF64_ST_INFO(ELF32_ST_BIND(sym32->st_info), + ELF32_ST_TYPE(sym32->st_info)); + dst->st_other = sym32->st_other; + dst->st_shndx = sym32->st_shndx; + } else { + + sym64 = (Elf64_Sym *) d->d_buf + ndx; + + *dst = *sym64; + } + + return (dst); +} + +int +gelf_update_sym(Elf_Data *d, int ndx, GElf_Sym *gs) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Sym *sym32; + Elf64_Sym *sym64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || gs == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_SYM) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_SYM, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + sym32 = (Elf32_Sym *) d->d_buf + ndx; + + sym32->st_name = gs->st_name; + sym32->st_info = gs->st_info; + sym32->st_other = gs->st_other; + sym32->st_shndx = gs->st_shndx; + + LIBELF_COPY_U32(sym32, gs, st_value); + LIBELF_COPY_U32(sym32, gs, st_size); + } else { + sym64 = (Elf64_Sym *) d->d_buf + ndx; + + *sym64 = *gs; + } + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_syminfo.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_syminfo.c new file mode 100644 index 0000000000..09b3744be8 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_syminfo.c @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_syminfo.c 1166 2010-09-04 00:54:36Z jkoshy $"); + +#if LIBELF_CONFIG_SYMINFO +GElf_Syminfo * +gelf_getsyminfo(Elf_Data *d, int ndx, GElf_Syminfo *dst) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Syminfo *syminfo32; + Elf64_Syminfo *syminfo64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || dst == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_SYMINFO) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_SYMINFO, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASS32) { + + syminfo32 = (Elf32_Syminfo *) d->d_buf + ndx; + + dst->si_boundto = syminfo32->si_boundto; + dst->si_flags = syminfo32->si_flags; + + } else { + + syminfo64 = (Elf64_Syminfo *) d->d_buf + ndx; + + *dst = *syminfo64; + } + + return (dst); +} + +int +gelf_update_syminfo(Elf_Data *d, int ndx, GElf_Syminfo *gs) +{ + int ec; + Elf *e; + Elf_Scn *scn; + Elf32_Syminfo *syminfo32; + Elf64_Syminfo *syminfo64; + size_t msz; + uint32_t sh_type; + + if (d == NULL || ndx < 0 || gs == NULL || + (scn = d->d_scn) == NULL || + (e = scn->s_elf) == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_SYMINFO) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_SYMINFO, ec, e->e_version); + assert(msz > 0); + + 
if (msz * ndx >= d->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + if (ec == ELFCLASS32) { + syminfo32 = (Elf32_Syminfo *) d->d_buf + ndx; + + syminfo32->si_boundto = gs->si_boundto; + syminfo32->si_flags = gs->si_flags; + + } else { + syminfo64 = (Elf64_Syminfo *) d->d_buf + ndx; + + *syminfo64 = *gs; + } + + return (1); +} +#endif /* LIBELF_CONFIG_SYMINFO */ diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_symshndx.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_symshndx.c new file mode 100644 index 0000000000..ab3549cdfd --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_symshndx.c @@ -0,0 +1,128 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_symshndx.c 189 2008-07-20 10:38:08Z jkoshy $"); + +GElf_Sym * +gelf_getsymshndx(Elf_Data *d, Elf_Data *id, int ndx, GElf_Sym *dst, + Elf32_Word *shindex) +{ + int ec; + Elf *e; + Elf_Scn *scn; + size_t msz; + uint32_t sh_type; + + if (gelf_getsym(d, ndx, dst) == 0) + return (NULL); + + if (id == NULL || (scn = id->d_scn) == NULL || + (e = scn->s_elf) == NULL || (e != d->d_scn->s_elf) || + shindex == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_WORD || + id->d_type != ELF_T_WORD) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_WORD, ec, e->e_version); + + assert(msz > 0); + + if (msz * ndx >= id->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + *shindex = ((Elf32_Word *) id->d_buf)[ndx]; + + return (dst); +} + +int +gelf_update_symshndx(Elf_Data *d, Elf_Data *id, int ndx, GElf_Sym *gs, + Elf32_Word xindex) +{ + int ec; + Elf *e; + Elf_Scn *scn; + size_t msz; + uint32_t sh_type; + + if (gelf_update_sym(d, ndx, gs) == 0) + return (0); + + if (id == NULL || (scn = id->d_scn) == NULL || + (e = scn->s_elf) == NULL || (e != d->d_scn->s_elf)) { + 
LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + ec = e->e_class; + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (ec == ELFCLASS32) + sh_type = scn->s_shdr.s_shdr32.sh_type; + else + sh_type = scn->s_shdr.s_shdr64.sh_type; + + if (_libelf_xlate_shtype(sh_type) != ELF_T_WORD || + d->d_type != ELF_T_WORD) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + msz = _libelf_msize(ELF_T_WORD, ec, e->e_version); + assert(msz > 0); + + if (msz * ndx >= id->d_size) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (0); + } + + *(((Elf32_Word *) id->d_buf) + ndx) = xindex; + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_xlate.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_xlate.c new file mode 100644 index 0000000000..6cdf70536a --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/gelf_xlate.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: gelf_xlate.c 1678 2011-07-28 04:36:34Z jkoshy $"); + +Elf_Data * +elf32_xlatetof(Elf_Data *dst, const Elf_Data *src, unsigned int encoding) +{ + return _libelf_xlate(dst, src, encoding, ELFCLASS32, ELF_TOFILE); +} + +Elf_Data * +elf64_xlatetof(Elf_Data *dst, const Elf_Data *src, unsigned int encoding) +{ + return _libelf_xlate(dst, src, encoding, ELFCLASS64, ELF_TOFILE); +} + +Elf_Data * +elf32_xlatetom(Elf_Data *dst, const Elf_Data *src, unsigned int encoding) +{ + return _libelf_xlate(dst, src, encoding, ELFCLASS32, ELF_TOMEMORY); +} + +Elf_Data * +elf64_xlatetom(Elf_Data *dst, const Elf_Data *src, unsigned int encoding) +{ + return _libelf_xlate(dst, src, encoding, ELFCLASS64, ELF_TOMEMORY); +} + +Elf_Data * +gelf_xlatetom(Elf *e, Elf_Data *dst, const Elf_Data *src, + unsigned int encoding) +{ + if (e != NULL) + return (_libelf_xlate(dst, src, encoding, e->e_class, + ELF_TOMEMORY)); + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); +} + +Elf_Data * +gelf_xlatetof(Elf *e, Elf_Data *dst, const Elf_Data *src, + unsigned int encoding) +{ + if (e != NULL) + return (_libelf_xlate(dst, src, encoding, e->e_class, + ELF_TOFILE)); + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf.h 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf.h new file mode 100644 index 0000000000..a13c9d45c4 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf.h @@ -0,0 +1,262 @@ +/*- + * Copyright (c) 2006,2008-2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $Id: libelf.h 1345 2011-01-01 11:17:52Z jkoshy $
+ */
+
+#ifndef _LIBELF_H_
+#define _LIBELF_H_
+
+#include
+#include
+#include
+#ifndef EM_HSAIL
+#define EM_HSAIL 0xAF5A	/* fallback: only defined when no earlier header provided EM_HSAIL */
+#endif
+
+/* Library private data structures */
+typedef struct _Elf Elf;
+typedef struct _Elf_Scn Elf_Scn;
+typedef struct _Elf_Mem Elf_Mem;
+
+/* File types */
+typedef enum {
+	ELF_K_NONE = 0,
+	ELF_K_AR, /* `ar' archives */
+	ELF_K_COFF, /* COFF files (unsupported) */
+	ELF_K_ELF, /* ELF files */
+	ELF_K_NUM	/* number of kinds; not a file kind itself */
+} Elf_Kind;
+
+#define ELF_K_FIRST ELF_K_NONE
+#define ELF_K_LAST ELF_K_NUM	/* note: this is the ELF_K_NUM sentinel, not ELF_K_ELF */
+
+/* Data types */
+typedef enum {
+	ELF_T_ADDR,
+	ELF_T_BYTE,
+	ELF_T_CAP,
+	ELF_T_DYN,
+	ELF_T_EHDR,
+	ELF_T_HALF,
+	ELF_T_LWORD,
+	ELF_T_MOVE,
+	ELF_T_MOVEP,
+	ELF_T_NOTE,
+	ELF_T_OFF,
+	ELF_T_PHDR,
+	ELF_T_REL,
+	ELF_T_RELA,
+	ELF_T_SHDR,
+	ELF_T_SWORD,
+	ELF_T_SXWORD,
+	ELF_T_SYMINFO,	/* comes before ELF_T_SYM; positional per-type tables must use this order */
+	ELF_T_SYM,
+	ELF_T_VDEF,
+	ELF_T_VNEED,
+	ELF_T_WORD,
+	ELF_T_XWORD,
+	ELF_T_GNUHASH, /* GNU style hash tables. */
+	ELF_T_NUM	/* number of types; used as the length of the alignment tables in libelf_align.c */
+} Elf_Type;
+
+#define ELF_T_FIRST ELF_T_ADDR
+#define ELF_T_LAST ELF_T_GNUHASH	/* last real type; unlike ELF_K_LAST/ELF_C_LAST this is not the *_NUM sentinel */
+
+/* Commands */
+typedef enum {
+	ELF_C_NULL = 0,
+	ELF_C_CLR,
+	ELF_C_FDDONE,
+	ELF_C_FDREAD,
+	ELF_C_RDWR,
+	ELF_C_READ,
+	ELF_C_SET,
+	ELF_C_WRITE,
+	ELF_C_NUM	/* number of commands; not a command itself */
+} Elf_Cmd;
+
+#define ELF_C_FIRST ELF_C_NULL
+#define ELF_C_LAST ELF_C_NUM	/* note: this is the ELF_C_NUM sentinel, not ELF_C_WRITE */
+
+/*
+ * An `Elf_Data' structure describes data in an
+ * ELF section.
+ */
+typedef struct _Elf_Data {
+	/*
+	 * `Public' members that are part of the ELF(3) API.
+	 */
+	uint64_t d_align;
+	void *d_buf;
+	uint64_t d_off;
+	uint64_t d_size;	/* gelf_*() accessors bound their entry index by this value */
+	Elf_Type d_type;
+	unsigned int d_version;
+
+	/*
+	 * Members that are not part of the public API.
+	 */
+	Elf_Scn *d_scn; /* containing section */
+	unsigned int d_flags;	/* library-private flags, e.g. LIBELF_F_DATA_MALLOCED */
+	STAILQ_ENTRY(_Elf_Data) d_next;	/* linkage for a section's STAILQ data lists */
+} Elf_Data;
+
+/*
+ * An `Elf_Arhdr' structure describes an archive
+ * header.
+ */
+typedef struct {
+	time_t ar_date;
+	char *ar_name; /* archive member name */
+	gid_t ar_gid;
+	mode_t ar_mode;
+	char *ar_rawname; /* 'raw' member name */
+	size_t ar_size;
+	uid_t ar_uid;
+
+	/*
+	 * Members that are not part of the public API.
+	 */
+	int ar_flags;
+} Elf_Arhdr;
+
+/*
+ * An `Elf_Arsym' describes an entry in the archive
+ * symbol table.
+ */
+typedef struct {
+	off_t as_off; /* byte offset to member's header */
+	unsigned long as_hash; /* elf_hash() value for name */
+	char *as_name; /* null terminated symbol name */
+} Elf_Arsym;
+
+/*
+ * Error numbers.
+ */
+
+enum Elf_Error {
+	ELF_E_NONE, /* No error */
+	ELF_E_ARCHIVE, /* Malformed ar(1) archive */
+	ELF_E_ARGUMENT, /* Invalid argument */
+	ELF_E_CLASS, /* Mismatched ELF class */
+	ELF_E_DATA, /* Invalid data descriptor */
+	ELF_E_HEADER, /* Missing or malformed ELF header */
+	ELF_E_IO, /* I/O error */
+	ELF_E_LAYOUT, /* Layout constraint violation */
+	ELF_E_MODE, /* Wrong mode for ELF descriptor */
+	ELF_E_RANGE, /* Value out of range */
+	ELF_E_RESOURCE, /* Resource exhaustion */
+	ELF_E_SECTION, /* Invalid section descriptor */
+	ELF_E_SEQUENCE, /* API calls out of sequence */
+	ELF_E_UNIMPL, /* Feature is unimplemented */
+	ELF_E_VERSION, /* Unknown API version */
+	ELF_E_NUM /* Max error number */
+};
+
+/*
+ * Flags defined by the API.
+ */
+
+#define ELF_F_LAYOUT 0x001U /* application will layout the file */
+#define ELF_F_DIRTY 0x002U /* a section or ELF file is dirty */
+
+/* ELF(3) API extensions. */
+#define ELF_F_ARCHIVE 0x100U /* archive creation */
+#define ELF_F_ARCHIVE_SYSV 0x200U /* SYSV style archive */
+
+__BEGIN_DECLS
+Elf *elf_begin(int _fd, Elf_Cmd _cmd, Elf *_elf, Elf_Mem *_mem);
+int elf_cntl(Elf *_elf, Elf_Cmd _cmd);
+int elf_end(Elf *_elf);
+const char *elf_errmsg(int _error);
+int elf_errno(void);
+void elf_fill(int _fill);
+unsigned int elf_flagarhdr(Elf_Arhdr *_arh, Elf_Cmd _cmd,
+    unsigned int _flags);
+unsigned int elf_flagdata(Elf_Data *_data, Elf_Cmd _cmd,
+    unsigned int _flags);
+unsigned int elf_flagehdr(Elf *_elf, Elf_Cmd _cmd, unsigned int _flags);
+unsigned int elf_flagelf(Elf *_elf, Elf_Cmd _cmd, unsigned int _flags);
+unsigned int elf_flagphdr(Elf *_elf, Elf_Cmd _cmd, unsigned int _flags);
+unsigned int elf_flagscn(Elf_Scn *_scn, Elf_Cmd _cmd, unsigned int _flags);
+unsigned int elf_flagshdr(Elf_Scn *_scn, Elf_Cmd _cmd, unsigned int _flags);
+Elf_Arhdr *elf_getarhdr(Elf *_elf);
+Elf_Arsym *elf_getarsym(Elf *_elf, size_t *_ptr);
+off_t elf_getbase(Elf *_elf);
+Elf_Data *elf_getdata(Elf_Scn *, Elf_Data *);
+void elf_removedata(Elf_Scn *s, Elf_Data *d);
+char *elf_getident(Elf *_elf, size_t *_ptr);
+int elf_getphdrnum(Elf *_elf, size_t *_dst);
+int elf_getphnum(Elf *_elf, size_t *_dst); /* Deprecated */
+Elf_Scn *elf_getscn(Elf *_elf, size_t _index);
+int elf_getshdrnum(Elf *_elf, size_t *_dst);
+int elf_getshnum(Elf *_elf, size_t *_dst); /* Deprecated */
+int elf_getshdrstrndx(Elf *_elf, size_t *_dst);
+int elf_getshstrndx(Elf *_elf, size_t *_dst); /* Deprecated */
+unsigned long elf_hash(const char *_name);
+Elf_Kind elf_kind(Elf *_elf);
+Elf *elf_memory(char *_image, size_t _size, Elf_Mem *mem);
+size_t elf_ndxscn(Elf_Scn *_scn);
+Elf_Data *elf_newdata(Elf_Scn *_scn);
+Elf_Scn *elf_newscn(Elf *_elf);
+Elf_Scn *elf_nextscn(Elf *_elf, Elf_Scn *_scn);
+Elf_Cmd elf_next(Elf *_elf);
+off_t elf_rand(Elf *_elf, off_t _off);
+Elf_Data *elf_rawdata(Elf_Scn *_scn, Elf_Data *_data);
+char *elf_rawfile(Elf *_elf, size_t *_size);
+int elf_setshstrndx(Elf *_elf, size_t _shnum);
+char *elf_strptr(Elf *_elf, size_t _section, size_t _offset);
+off_t elf_update(Elf *_elf, Elf_Cmd _cmd);
+unsigned int elf_version(unsigned int _version);
+
+long elf32_checksum(Elf *_elf);
+size_t elf32_fsize(Elf_Type _type, size_t _count,
+    unsigned int _version);
+Elf32_Ehdr *elf32_getehdr(Elf *_elf);
+Elf32_Phdr *elf32_getphdr(Elf *_elf);
+Elf32_Shdr *elf32_getshdr(Elf_Scn *_scn);
+Elf32_Ehdr *elf32_newehdr(Elf *_elf);
+Elf32_Phdr *elf32_newphdr(Elf *_elf, size_t _count);
+Elf_Data *elf32_xlatetof(Elf_Data *_dst, const Elf_Data *_src,
+    unsigned int _enc);
+Elf_Data *elf32_xlatetom(Elf_Data *_dst, const Elf_Data *_src,
+    unsigned int _enc);
+
+long elf64_checksum(Elf *_elf);
+size_t elf64_fsize(Elf_Type _type, size_t _count,
+    unsigned int _version);
+Elf64_Ehdr *elf64_getehdr(Elf *_elf);
+Elf64_Phdr *elf64_getphdr(Elf *_elf);
+Elf64_Shdr *elf64_getshdr(Elf_Scn *_scn);
+Elf64_Ehdr *elf64_newehdr(Elf *_elf);
+Elf64_Phdr *elf64_newphdr(Elf *_elf, size_t _count);
+Elf_Data *elf64_xlatetof(Elf_Data *_dst, const Elf_Data *_src,
+    unsigned int _enc);
+Elf_Data *elf64_xlatetom(Elf_Data *_dst, const Elf_Data *_src,
+    unsigned int _enc);
+__END_DECLS
+
+#endif /* _LIBELF_H_ */
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_align.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_align.c
new file mode 100644
index 0000000000..7328ca5ff0
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_align.c
@@ -0,0 +1,185 @@
+/*-
+ * Copyright (c) 2006,2008 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_align.c 1169 2010-09-04 01:06:31Z jkoshy $"); + +struct align { + int a32; + int a64; +}; + +#ifdef __GNUC__ +#define MALIGN(N) { \ + .a32 = __alignof__(Elf32_##N), \ + .a64 = __alignof__(Elf64_##N) \ + } +#define MALIGN64(V) { \ + .a32 = 0, \ + .a64 = __alignof__(Elf64_##V) \ + } +#define MALIGN_WORD() { \ + .a32 = __alignof__(int32_t), \ + .a64 = __alignof__(int64_t) \ + } +#elif !defined(_MSC_VER) +#error Need the __alignof__ builtin. 
+#endif +#define UNSUPPORTED() { \ + .a32 = 0, \ + .a64 = 0 \ + } + +static struct align malign[ELF_T_NUM] = { +#if defined(__GNUC__) + [ELF_T_ADDR] = MALIGN(Addr), + [ELF_T_BYTE] = { .a32 = 1, .a64 = 1 }, +#if LIBELF_CONFIG_CAP + [ELF_T_CAP] = MALIGN(Cap), +#endif + [ELF_T_DYN] = MALIGN(Dyn), + [ELF_T_EHDR] = MALIGN(Ehdr), + [ELF_T_HALF] = MALIGN(Half), +#if LIBELF_CONFIG_LWORD + [ELF_T_LWORD] = MALIGN(Lword), +#endif +#if LIBELF_CONFIG_MOVE + [ELF_T_MOVE] = MALIGN(Move), +#endif + [ELF_T_MOVEP] = UNSUPPORTED(), +#if LIBELF_CONFIG_NOTE + [ELF_T_NOTE] = MALIGN(Nhdr), +#endif + [ELF_T_OFF] = MALIGN(Off), + [ELF_T_PHDR] = MALIGN(Phdr), + [ELF_T_REL] = MALIGN(Rel), + [ELF_T_RELA] = MALIGN(Rela), + [ELF_T_SHDR] = MALIGN(Shdr), + [ELF_T_SWORD] = MALIGN(Sword), + [ELF_T_SXWORD] = MALIGN64(Sxword), + [ELF_T_SYM] = MALIGN(Sym), +#if LIBELF_CONFIG_SYMINFO + [ELF_T_SYMINFO] = MALIGN(Syminfo), +#endif +#if LIBELF_CONFIG_VDEF + [ELF_T_VDEF] = MALIGN(Verdef), +#endif +#if LIBELF_CONFIG_VNEED + [ELF_T_VNEED] = MALIGN(Verneed), +#endif + [ELF_T_WORD] = MALIGN(Word), + [ELF_T_XWORD] = MALIGN64(Xword), + [ELF_T_GNUHASH] = MALIGN_WORD() +#elif defined(_MSC_VER) + { 4, 8 }, { 1, 1 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, + { 2, 2 }, { 8, 8 }, { 8, 8 }, { 0, 0 }, { 4, 4 }, + { 4, 8 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, + { 4, 4 }, { 0, 8 }, { 4, 8 }, { 2, 2 }, { 4, 4 }, + { 4, 4 }, { 4, 4 }, { 0, 8 }, { 4, 8 } +#else +#error +#endif +}; + +int +_libelf_malign(Elf_Type t, int elfclass) +{ + if (t >= ELF_T_NUM || (int) t < 0) + return (0); + + return (elfclass == ELFCLASS32 ? 
malign[t].a32 : + malign[t].a64); +} + +#define FALIGN(A32,A64) { .a32 = (A32), .a64 = (A64) } + +static struct align falign[ELF_T_NUM] = { +#if defined(__GNUC__) + [ELF_T_ADDR] = FALIGN(4,8), + [ELF_T_BYTE] = FALIGN(1,1), +#if LIBELF_CONFIG_CAP + [ELF_T_CAP] = FALIGN(4,8), +#endif + [ELF_T_DYN] = FALIGN(4,8), + [ELF_T_EHDR] = FALIGN(4,8), + [ELF_T_HALF] = FALIGN(2,2), +#if LIBELF_CONFIG_LWORD + [ELF_T_LWORD] = FALIGN(8,8), +#endif +#if LIBELF_CONFIG_MOVE + [ELF_T_MOVE] = FALIGN(8,8), +#endif + [ELF_T_MOVEP] = UNSUPPORTED(), +#if LIBELF_CONFIG_NOTE + [ELF_T_NOTE] = FALIGN(4,4), +#endif + [ELF_T_OFF] = FALIGN(4,8), + [ELF_T_PHDR] = FALIGN(4,8), + [ELF_T_REL] = FALIGN(4,8), + [ELF_T_RELA] = FALIGN(4,8), + [ELF_T_SHDR] = FALIGN(4,8), + [ELF_T_SWORD] = FALIGN(4,4), + [ELF_T_SXWORD] = FALIGN(0,8), + [ELF_T_SYM] = FALIGN(4,8), +#if LIBELF_CONFIG_SYMINFO + [ELF_T_SYMINFO] = FALIGN(2,2), +#endif +#if LIBELF_CONFIG_VDEF + [ELF_T_VDEF] = FALIGN(4,4), +#endif +#if LIBELF_CONFIG_VNEED + [ELF_T_VNEED] = FALIGN(4,4), +#endif + [ELF_T_WORD] = FALIGN(4,4), + [ELF_T_XWORD] = FALIGN(0,8), + [ELF_T_GNUHASH] = FALIGN(4,8) +#elif defined(_MSC_VER) + { 4, 8 }, { 1, 1 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, + { 2, 2 }, { 8, 8 }, { 8, 8 }, { 0, 0 }, { 4, 4 }, + { 4, 8 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, { 4, 8 }, + { 4, 4 }, { 0, 8 }, { 4, 8 }, { 2, 2 }, { 4, 4 }, + { 4, 4 }, { 4, 4 }, { 0, 8 }, { 4, 8 } +#else +#error +#endif +}; + +int +_libelf_falign(Elf_Type t, int elfclass) +{ + if (t >= ELF_T_NUM || (int) t < 0) + return (0); + + return (elfclass == ELFCLASS32 ? 
falign[t].a32 : + falign[t].a64); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_allocate.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_allocate.c new file mode 100644 index 0000000000..2447fb73f9 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_allocate.c @@ -0,0 +1,228 @@ +/*- + * Copyright (c) 2006,2008,2010 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+/*
+ * Internal APIs
+ */
+
+#include
+
+#if !defined(WIN32)
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: libelf_allocate.c 1341 2011-01-01 04:28:29Z jkoshy $");
+
+/*
+ * Allocate and initialise an Elf descriptor.
+ *
+ * When `mem' is non-NULL its alloc/dealloc callbacks are used for the
+ * descriptor and recorded in e->e_mem; otherwise malloc()/free() are
+ * recorded.  Returns NULL (with a RESOURCE error set) when the
+ * allocation fails.
+ */
+Elf *
+_libelf_allocate_elf(Elf_Mem *mem)
+{
+	Elf *e;
+
+	if (mem != NULL) {
+		if ((e = mem->alloc(sizeof(*e))) == NULL) {
+			LIBELF_SET_ERROR(RESOURCE, errno);
+			return NULL;
+		}
+		e->e_mem.dealloc = mem->dealloc;
+		e->e_mem.alloc   = mem->alloc;
+	} else {
+		if ((e = malloc(sizeof(*e))) == NULL) {
+			LIBELF_SET_ERROR(RESOURCE, errno);
+			return NULL;
+		}
+		e->e_mem.dealloc = &free;
+		e->e_mem.alloc   = &malloc;
+	}
+
+	e->e_activations = 1;
+	e->e_hdr.e_rawhdr = NULL;
+	e->e_byteorder   = ELFDATANONE;
+	e->e_class       = ELFCLASSNONE;
+	e->e_cmd         = ELF_C_NULL;
+	e->e_fd          = -1;
+	e->e_flags	 = 0;
+	e->e_kind        = ELF_K_NONE;
+	e->e_parent      = NULL;
+	e->e_rawfile     = NULL;
+	e->e_rawsize     = 0;
+	e->e_version     = LIBELF_PRIVATE(version);
+
+	(void) memset(&e->e_u, 0, sizeof(e->e_u));
+
+	return (e);
+}
+
+/*
+ * Set the kind of a freshly allocated descriptor.  For ELF_K_ELF the
+ * section list head is initialised; other kinds need no extra setup.
+ */
+void
+_libelf_init_elf(Elf *e, Elf_Kind kind)
+{
+	assert(e != NULL);
+	assert(e->e_kind == ELF_K_NONE);
+
+	e->e_kind = kind;
+
+	switch (kind) {
+	case ELF_K_ELF:
+		STAILQ_INIT(&e->e_u.e_elf.e_scn);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Release `P' through the descriptor's deallocator if it is non-NULL. */
+#define	FREE(E, P)		do {				\
+		if (P)						\
+			E->e_mem.dealloc(P);			\
+	} while (0)
+
+/*
+ * Release an Elf descriptor and the header storage it owns.
+ * The section list of an ELF object must already be empty.
+ * Always returns NULL.
+ */
+Elf *
+_libelf_release_elf(Elf *e)
+{
+	Elf_Arhdr *arh;
+
+	switch (e->e_kind) {
+	case ELF_K_AR:
+		FREE(e, e->e_u.e_ar.e_symtab);
+		break;
+
+	case ELF_K_ELF:
+		switch (e->e_class) {
+		case ELFCLASS32:
+			FREE(e, e->e_u.e_elf.e_ehdr.e_ehdr32);
+			FREE(e, e->e_u.e_elf.e_phdr.e_phdr32);
+			break;
+		case ELFCLASS64:
+			FREE(e, e->e_u.e_elf.e_ehdr.e_ehdr64);
+			FREE(e, e->e_u.e_elf.e_phdr.e_phdr64);
+			break;
+		}
+
+		assert(STAILQ_EMPTY(&e->e_u.e_elf.e_scn));
+
+		if (e->e_flags & LIBELF_F_AR_HEADER) {
+			arh = e->e_hdr.e_arhdr;
+			FREE(e, arh->ar_name);
+			/*
+			 * The raw name is produced by
+			 * _libelf_ar_get_raw_name(), which uses plain
+			 * malloc(); it must therefore be returned with
+			 * free() and not with a caller-supplied
+			 * deallocator.
+			 * NOTE(review): confirm no other code path stores
+			 * an e_mem-allocated buffer in ar_rawname.
+			 */
+			free(arh->ar_rawname);
+			e->e_mem.dealloc(arh);
+		}
+
+		break;
+
+	default:
+		break;
+	}
+
+	e->e_mem.dealloc(e);
+
+	return (NULL);
+}
+
+/*
+ * Allocate a zero-filled Elf_Data descriptor attached to section `s',
+ * using the allocator of the owning Elf descriptor.
+ */
+Elf_Data *
+_libelf_allocate_data(Elf_Scn *s)
+{
+	Elf_Data *d;
+
+	if ((d = s->s_elf->e_mem.alloc(sizeof(Elf_Data))) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+	memset(d, 0, sizeof(*d));
+
+	d->d_scn = s;
+
+	return (d);
+}
+
+/*
+ * Release a data descriptor, and its buffer when the descriptor is
+ * flagged LIBELF_F_DATA_MALLOCED.  Always returns NULL.
+ */
+Elf_Data *
+_libelf_release_data(Elf_Data *d)
+{
+
+	if (d->d_flags & LIBELF_F_DATA_MALLOCED)
+		d->d_scn->s_elf->e_mem.dealloc(d->d_buf);
+
+	d->d_scn->s_elf->e_mem.dealloc(d);
+
+	return (NULL);
+}
+
+/*
+ * Allocate a zero-filled section descriptor with index `ndx' and
+ * append it to the section list of `e'.
+ */
+Elf_Scn *
+_libelf_allocate_scn(Elf *e, size_t ndx)
+{
+	Elf_Scn *s;
+
+	if ((s = e->e_mem.alloc(sizeof(Elf_Scn))) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, errno);
+		return (NULL);
+	}
+	memset(s, 0, sizeof(*s));
+
+	s->s_elf = e;
+	s->s_ndx = ndx;
+
+	STAILQ_INIT(&s->s_data);
+	STAILQ_INIT(&s->s_rawdata);
+
+	STAILQ_INSERT_TAIL(&e->e_u.e_elf.e_scn, s, s_next);
+
+	return (s);
+}
+
+/*
+ * Release a section descriptor: free every entry on its data and raw
+ * data lists, unlink it from the owning Elf descriptor and free it.
+ * Always returns NULL.
+ */
+Elf_Scn *
+_libelf_release_scn(Elf_Scn *s)
+{
+	Elf *e;
+	Elf_Data *d, *td;
+
+	assert(s != NULL);
+
+	STAILQ_FOREACH_SAFE(d, &s->s_data, d_next, td) {
+		STAILQ_REMOVE(&s->s_data, d, _Elf_Data, d_next);
+		d = _libelf_release_data(d);
+	}
+
+	STAILQ_FOREACH_SAFE(d, &s->s_rawdata, d_next, td) {
+		assert((d->d_flags & LIBELF_F_DATA_MALLOCED) == 0);
+		STAILQ_REMOVE(&s->s_rawdata, d, _Elf_Data, d_next);
+		d = _libelf_release_data(d);
+	}
+
+	e = s->s_elf;
+
+	assert(e != NULL);
+
+	STAILQ_REMOVE(&e->e_u.e_elf.e_scn, s, _Elf_Scn, s_next);
+
+	e->e_mem.dealloc(s);
+
+	return (NULL);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar.c
new file mode 100644
index 0000000000..e6bb261d17
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar.c
@@ -0,0 +1,458 @@
+/*-
+ * Copyright (c) 2006,2008,2010 Joseph Koshy
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include + +#include "_libelf.h" +#include "_libelf_ar.h" + +LIBELF_VCSID("$Id: libelf_ar.c 1341 2011-01-01 04:28:29Z jkoshy $"); + +#define LIBELF_NALLOC_SIZE 16 + +/* + * `ar' archive handling. + * + * `ar' archives start with signature `ARMAG'. Each archive member is + * preceded by a header containing meta-data for the member. This + * header is described in (struct ar_hdr). The header always + * starts on an even address. File data is padded with "\n" + * characters to keep this invariant. + * + * Special considerations for `ar' archives: + * + * There are two variants of the `ar' archive format: traditional BSD + * and SVR4. 
These differ in the way long file names are treated, and + * in the layout of the archive symbol table. + * + * The `ar' header only has space for a 16 character file name. + * + * In the SVR4 format, file names are terminated with a '/', so this + * effectively leaves 15 characters for the actual file name. Longer + * file names are stored in a separate 'string table' and referenced + * indirectly from the name field. The string table itself appears as + * an archive member with name "// ". An `indirect' file name in an + * `ar' header matches the pattern "/[0-9]+". The digits form a + * decimal number that corresponds to a byte offset into the string + * table where the actual file name of the object starts. Strings in + * the string table are padded to start on even addresses. + * + * In the BSD format, file names can be up to 16 characters. File + * names shorter than 16 characters are padded to 16 characters using + * (ASCII) space characters. File names with embedded spaces and file + * names longer than 16 characters are stored immediately after the + * archive header and the name field is set to a special indirect name + * matching the pattern "#1/[0-9]+". The digits form a decimal number + * that corresponds to the actual length of the file name following + * the archive header. The content of the archive member immediately + * follows the file name, and the size field of the archive member + * holds the sum of the sizes of the member and of the appended file + * name. + * + * Archives may also have a symbol table (see ranlib(1)), mapping + * program symbols to object files inside the archive. + * + * In the SVR4 format, a symbol table uses a file name of "/ " in its + * archive header. The symbol table is structured as: + * - a 4-byte count of entries stored as a binary value, MSB first + * - 'n' 4-byte offsets, stored as binary values, MSB first + * - 'n' NUL-terminated strings, for ELF symbol names, stored unpadded.
+ *
+ * In the BSD format, the symbol table uses a file name of "__.SYMDEF".
+ * It is structured as two parts:
+ * - The first part is an array of "ranlib" structures preceded by
+ *   the size of the array in bytes.  Each "ranlib" structure
+ *   describes one symbol.  Each structure contains an offset into
+ *   the string table for the symbol name, and a file offset into the
+ *   archive for the member defining the symbol.
+ * - The second part is a string table containing NUL-terminated
+ *   strings, preceded by the size of the string table in bytes.
+ *
+ * If the symbol table and string table are present in an archive
+ * they must be the very first objects and in that order.
+ */
+
+/*
+ * Retrieve an archive header descriptor.
+ *
+ * Parses the raw `struct ar_hdr' that e->e_hdr.e_rawhdr points at into
+ * a newly allocated Elf_Arhdr.  On success the descriptor is stored in
+ * e->e_hdr.e_arhdr and LIBELF_F_AR_HEADER is set in e->e_flags.  On
+ * failure NULL is returned and the raw header pointer and flags are
+ * restored.
+ */
+
+Elf_Arhdr *
+_libelf_ar_gethdr(Elf *e)
+{
+	Elf *parent;
+	char *namelen;
+	Elf_Arhdr *eh;
+	size_t n, nlen;
+	struct ar_hdr *arh;
+
+	if ((parent = e->e_parent) == NULL) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (NULL);
+	}
+
+	assert((e->e_flags & LIBELF_F_AR_HEADER) == 0);
+
+	arh = (struct ar_hdr *) (uintptr_t) e->e_hdr.e_rawhdr;
+
+	assert((uintptr_t) arh >= (uintptr_t) parent->e_rawfile + SARMAG);
+	assert((uintptr_t) arh <= (uintptr_t) parent->e_rawfile +
+	    parent->e_rawsize - sizeof(struct ar_hdr));
+
+	if ((eh = e->e_mem.alloc(sizeof(Elf_Arhdr))) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	e->e_hdr.e_arhdr = eh;
+	e->e_flags |= LIBELF_F_AR_HEADER;
+
+	eh->ar_name = eh->ar_rawname = NULL;
+
+	/* The name helpers report their own errors. */
+	if ((eh->ar_name = _libelf_ar_get_translated_name(arh, parent)) ==
+	    NULL)
+		goto error;
+
+	if (_libelf_ar_get_number(arh->ar_uid, sizeof(arh->ar_uid), 10,
+	    &n) == 0)
+		goto badheader;
+	eh->ar_uid = (uid_t) n;
+
+	if (_libelf_ar_get_number(arh->ar_gid, sizeof(arh->ar_gid), 10,
+	    &n) == 0)
+		goto badheader;
+	eh->ar_gid = (gid_t) n;
+
+	if (_libelf_ar_get_number(arh->ar_mode, sizeof(arh->ar_mode), 8,
+	    &n) == 0)
+		goto badheader;
+	eh->ar_mode = (mode_t) n;
+
+	if (_libelf_ar_get_number(arh->ar_size, sizeof(arh->ar_size), 10,
+	    &n) == 0)
+		goto badheader;
+
+	/*
+	 * Get the true size of the member if extended naming is being used.
+	 */
+	if (IS_EXTENDED_BSD_NAME(arh->ar_name)) {
+		namelen = arh->ar_name +
+		    LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE;
+		if (_libelf_ar_get_number(namelen, sizeof(arh->ar_name) -
+		    LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE, 10, &nlen) == 0)
+			goto badheader;
+		/* The appended name cannot be longer than the member. */
+		if (nlen > n)
+			goto badheader;
+		n -= nlen;
+	}
+
+	eh->ar_size = n;
+
+	if ((eh->ar_rawname = _libelf_ar_get_raw_name(arh)) == NULL)
+		goto error;
+
+	eh->ar_flags = 0;
+
+	return (eh);
+
+ badheader:
+	/* A numeric header field was malformed. */
+	LIBELF_SET_ERROR(ARCHIVE, 0);
+
+ error:
+	if (eh) {
+		if (eh->ar_name)
+			e->e_mem.dealloc(eh->ar_name);
+		if (eh->ar_rawname)
+			e->e_mem.dealloc(eh->ar_rawname);
+		e->e_mem.dealloc(eh);
+	}
+
+	e->e_flags &= ~LIBELF_F_AR_HEADER;
+	e->e_hdr.e_rawhdr = (char *) arh;
+
+	return (NULL);
+}
+
+/*
+ * Open the archive member located at the archive's `next' offset.
+ *
+ * Returns a new descriptor for the member, created with elf_memory()
+ * over the member's bytes inside the parent's raw image, or NULL when
+ * there is no further member or the member header is malformed or
+ * does not fit inside the archive (ARCHIVE error).
+ */
+Elf *
+_libelf_ar_open_member(int fd, Elf_Cmd c, Elf *elf, Elf_Mem* mem)
+{
+	Elf *e;
+	char *member, *namelen;
+	size_t avail, nsz, sz;
+	off_t next;
+	struct ar_hdr *arh;
+
+	assert(elf->e_kind == ELF_K_AR);
+
+	next = elf->e_u.e_ar.e_next;
+
+	/*
+	 * `next' is only set to zero by elf_next() when the last
+	 * member of an archive is processed.
+	 */
+	if (next == (off_t) 0)
+		return (NULL);
+
+	assert((next & 1) == 0);
+
+	/* The member header must lie entirely inside the archive. */
+	if (next < 0 || (size_t) next > elf->e_rawsize ||
+	    elf->e_rawsize - (size_t) next < sizeof(struct ar_hdr)) {
+		LIBELF_SET_ERROR(ARCHIVE, 0);
+		return (NULL);
+	}
+
+	arh = (struct ar_hdr *) (elf->e_rawfile + next);
+
+	/* Bytes available for member data after the header. */
+	avail = elf->e_rawsize - (size_t) next - sizeof(struct ar_hdr);
+
+	/*
+	 * Retrieve the size of the member.
+	 */
+	if (_libelf_ar_get_number(arh->ar_size, sizeof(arh->ar_size), 10,
+	    &sz) == 0 || sz > avail) {
+		LIBELF_SET_ERROR(ARCHIVE, 0);
+		return (NULL);
+	}
+
+	/*
+	 * Adjust the size field for members in BSD archives using
+	 * extended naming.
+	 */
+	if (IS_EXTENDED_BSD_NAME(arh->ar_name)) {
+		namelen = arh->ar_name +
+		    LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE;
+		if (_libelf_ar_get_number(namelen, sizeof(arh->ar_name) -
+		    LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE, 10, &nsz) == 0 ||
+		    nsz > sz) {
+			LIBELF_SET_ERROR(ARCHIVE, 0);
+			return (NULL);
+		}
+
+		member = (char *) (arh + 1) + nsz;
+		sz -= nsz;
+	} else
+		member = (char *) (arh + 1);
+
+	if ((e = elf_memory((char *) member, sz, mem)) == NULL)
+		return (NULL);
+
+	e->e_fd = fd;
+	e->e_cmd = c;
+	e->e_hdr.e_rawhdr = (char *) arh;
+
+	elf->e_u.e_ar.e_nchildren++;
+	e->e_parent = elf;
+
+	return (e);
+}
+
+/*
+ * A BSD-style ar(1) symbol table has the following layout:
+ *
+ * - A count of bytes used by the following array of 'ranlib'
+ *   structures, stored as a 'long'.
+ * - An array of 'ranlib' structures.  Each array element is
+ *   two 'long's in size.
+ * - A count of bytes used for the following symbol table.
+ * - The symbol table itself.
+ */
+
+/*
+ * A helper macro to read in a 'long' value from the archive.  We use
+ * memcpy() since the source pointer may be misaligned with respect to
+ * the natural alignment for a C 'long'.
+ */
+#define	GET_LONG(P, V) do {				\
+		memcpy(&(V), (P), sizeof(long));	\
+		(P) += sizeof(long);			\
+	} while (0)
+
+/*
+ * Build the Elf_Arsym array for a BSD-style archive symbol table.
+ *
+ * On success the array (nentries + 1 elements, the last one a
+ * sentinel) is stored in the descriptor, its element count is written
+ * to `*count', and the array is returned.  Every size and offset read
+ * from the archive is validated against the raw symbol table extent;
+ * a malformed table yields NULL with an ARCHIVE error.
+ */
+Elf_Arsym *
+_libelf_ar_process_bsd_symtab(Elf *e, size_t *count)
+{
+	Elf_Arsym *symtab, *sym;
+	unsigned char *end, *p, *p0, *s, *s0;
+	const unsigned int entrysize = 2 * sizeof(long);
+	long arraysize, fileoffset, n, nentries, stroffset, strtabsize;
+
+	assert(e != NULL);
+	assert(count != NULL);
+	assert(e->e_u.e_ar.e_symtab == NULL);
+
+	symtab = NULL;
+
+	/*
+	 * The BSD symbol table always contains the count fields even
+	 * if there are no entries in it.
+	 */
+	if (e->e_u.e_ar.e_rawsymtabsz < 2 * sizeof(long))
+		goto symtaberror;
+
+	p = p0 = (unsigned char *) e->e_u.e_ar.e_rawsymtab;
+	end = p0 + e->e_u.e_ar.e_rawsymtabsz;
+
+	/*
+	 * Retrieve the size of the array of ranlib descriptors and
+	 * check it for validity.  After the array there must still be
+	 * room for the string table size field.
+	 */
+	GET_LONG(p, arraysize);
+
+	if (arraysize < 0 ||
+	    (size_t) arraysize > (size_t) (end - p) - sizeof(long) ||
+	    ((size_t) arraysize % entrysize != 0))
+		goto symtaberror;
+
+	/*
+	 * Check the value of the string table size.
+	 */
+	s = p + arraysize;
+	GET_LONG(s, strtabsize);
+
+	s0 = s;			/* Start of string table. */
+	if (strtabsize < 0 || (size_t) strtabsize > (size_t) (end - s0))
+		goto symtaberror;
+
+	nentries = (long) ((size_t) arraysize / entrysize);
+
+	/*
+	 * Allocate space for the returned Elf_Arsym array.
+	 */
+	if ((symtab = e->e_mem.alloc(sizeof(Elf_Arsym) * (nentries + 1))) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	/* Read in symbol table entries. */
+	for (n = 0, sym = symtab; n < nentries; n++, sym++) {
+		GET_LONG(p, stroffset);
+		GET_LONG(p, fileoffset);
+
+		/* Reject offsets that point outside the table. */
+		if (stroffset < 0 || fileoffset < 0 ||
+		    (size_t) stroffset >= (size_t) (end - s0))
+			goto symtaberror;
+
+		s = s0 + stroffset;
+
+		/* The name must be NUL-terminated before `end'. */
+		if (memchr(s, '\0', (size_t) (end - s)) == NULL)
+			goto symtaberror;
+
+		sym->as_off = fileoffset;
+		sym->as_hash = elf_hash((char *) s);
+		sym->as_name = (char *) s;
+	}
+
+	/* Fill up the sentinel entry. */
+	sym->as_name = NULL;
+	sym->as_hash = ~0UL;
+	sym->as_off = (off_t) 0;
+
+	/* Remember the processed symbol table. */
+	e->e_u.e_ar.e_symtab = symtab;
+
+	*count = e->e_u.e_ar.e_symtabsz = nentries + 1;
+
+	return (symtab);
+
+symtaberror:
+	if (symtab)
+		e->e_mem.dealloc(symtab);
+	LIBELF_SET_ERROR(ARCHIVE, 0);
+	return (NULL);
+}
+
+/*
+ * An SVR4-style ar(1) symbol table has the following layout:
+ *
+ * - The first 4 bytes are a binary count of the number of entries in the
+ *   symbol table, stored MSB-first.
+ * - Then there are 'n' 4-byte binary offsets, also stored MSB first.
+ * - Following this, there are 'n' null-terminated strings.
+ */
+
+/*
+ * Assemble a 4-byte MSB-first value from the bytes at P into V.
+ * Each byte is converted to `unsigned char' first: P may be a plain
+ * `char *', and on targets where `char' is signed a byte >= 0x80
+ * would otherwise be sign-extended and corrupt the result.
+ */
+#define	GET_WORD(P, V) do {						\
+		(V) = 0;						\
+		(V) = (unsigned char) (P)[0]; (V) <<= 8;		\
+		(V) += (unsigned char) (P)[1]; (V) <<= 8;		\
+		(V) += (unsigned char) (P)[2]; (V) <<= 8;		\
+		(V) += (unsigned char) (P)[3];				\
+	} while (0)
+
+#define	INTSZ	4
+
+/*
+ * Build the Elf_Arsym array for an SVR4-style archive symbol table.
+ *
+ * On success the array (nentries + 1 elements, the last one a
+ * sentinel) is stored in the descriptor, its element count is written
+ * to `*count', and the array is returned.  An empty or malformed
+ * table yields NULL with an ARCHIVE error.
+ */
+Elf_Arsym *
+_libelf_ar_process_svr4_symtab(Elf *e, size_t *count)
+{
+	size_t n, nentries, off, room;
+	Elf_Arsym *symtab, *sym;
+	char *p, *s, *end;
+
+	assert(e != NULL);
+	assert(count != NULL);
+	assert(e->e_u.e_ar.e_symtab == NULL);
+
+	symtab = NULL;
+
+	if (e->e_u.e_ar.e_rawsymtabsz < INTSZ)
+		goto symtaberror;
+
+	p = e->e_u.e_ar.e_rawsymtab;
+	end = p + e->e_u.e_ar.e_rawsymtabsz;
+
+	GET_WORD(p, nentries);
+	p += INTSZ;
+
+	/*
+	 * The offset array must end strictly before `end', leaving
+	 * room for at least one byte of string data.  The test is
+	 * written as a division so that `nentries * INTSZ' cannot
+	 * overflow.
+	 */
+	room = (size_t) (end - p);
+	if (nentries == 0 || room == 0 || nentries > (room - 1) / INTSZ)
+		goto symtaberror;
+
+	/* Allocate space for a nentries + a sentinel. */
+	if ((symtab = e->e_mem.alloc(sizeof(Elf_Arsym) * (nentries+1))) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	s = p + (nentries * INTSZ); /* start of the string table. */
+
+	for (n = nentries, sym = symtab; n > 0; n--) {
+
+		if (s >= end)
+			goto symtaberror;
+
+		/* The name must be NUL-terminated before `end'. */
+		if (memchr(s, '\0', (size_t) (end - s)) == NULL)
+			goto symtaberror;
+
+		off = 0;
+
+		GET_WORD(p, off);
+
+		sym->as_off = off;
+		sym->as_hash = elf_hash((char *) s);
+		sym->as_name = (char *) s;
+
+		p += INTSZ;
+		sym++;
+
+		for (; s < end && *s++ != '\0';) /* skip to next string */
+			;
+	}
+
+	/* Fill up the sentinel entry. */
+	sym->as_name = NULL;
+	sym->as_hash = ~0UL;
+	sym->as_off = (off_t) 0;
+
+	*count = e->e_u.e_ar.e_symtabsz = nentries + 1;
+	e->e_u.e_ar.e_symtab = symtab;
+
+	return (symtab);
+
+symtaberror:
+	if (symtab)
+		e->e_mem.dealloc(symtab);
+	LIBELF_SET_ERROR(ARCHIVE, 0);
+	return (NULL);
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar_util.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar_util.c
new file mode 100644
index 0000000000..70681aacda
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ar_util.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 2006,2009,2010 Joseph Koshy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "_libelf.h"
+#include "_libelf_ar.h"
+
+LIBELF_VCSID("$Id: libelf_ar_util.c 2066 2011-10-26 15:40:28Z jkoshy $");
+
+/*
+ * Convert a string bounded by `start' and `start+sz' (exclusive) to a
+ * number in the specified base.
+ *
+ * Leading blanks are skipped and parsing stops at the first blank, at
+ * the first digit that is not valid in `base', or at the end of the
+ * field.  Returns 1 and stores the value in `*ret' on success.
+ * Returns 0, leaving `*ret' untouched, if a non-digit character is
+ * encountered or the value does not fit in a size_t.
+ */
+int
+_libelf_ar_get_number(const char *s, size_t sz, int base, size_t *ret)
+{
+	int c, v;
+	size_t r;
+	const char *e;
+
+	assert(base >= 2 && base <= 10);
+
+	e = s + sz;
+
+	/* skip leading blanks */
+	for (;s < e && (c = *s) == ' '; s++)
+		;
+
+	r = 0L;
+	for (;s < e; s++) {
+		if ((c = *s) == ' ')
+			break;
+		if (c < '0' || c > '9')
+			return (0);
+		v = c - '0';
+		if (v >= base)		/* Illegal digit. */
+			break;
+		/* Refuse values that would wrap around a size_t. */
+		if (r > ((size_t) -1 - (size_t) v) / (size_t) base)
+			return (0);
+		r *= base;
+		r += v;
+	}
+
+	*ret = r;
+
+	return (1);
+}
+
+/*
+ * Return the translated name for an archive member.
+ *
+ * The result is a NUL-terminated copy obtained from the archive
+ * descriptor's allocator (ar->e_mem.alloc).  Returns NULL with an
+ * ARCHIVE error for a malformed name field and with a RESOURCE error
+ * when allocation fails.
+ */
+char *
+_libelf_ar_get_translated_name(const struct ar_hdr *arh, Elf *ar)
+{
+	char c, *s;
+	size_t len, offset;
+	const char *buf, *p, *q, *r;
+	const size_t bufsize = sizeof(arh->ar_name);
+
+	assert(arh != NULL);
+	assert(ar->e_kind == ELF_K_AR);
+	assert((const char *) arh >= ar->e_rawfile &&
+	    (const char *) arh < ar->e_rawfile + ar->e_rawsize);
+
+	buf = arh->ar_name;
+
+	/*
+	 * Check for extended naming.
+	 *
+	 * If the name matches the pattern "^/[0-9]+", it is an
+	 * SVR4-style extended name.  If the name matches the pattern
+	 * "#1/[0-9]+", the entry uses BSD style extended naming.
+	 */
+	if (buf[0] == '/' && (c = buf[1]) >= '0' && c <= '9') {
+		/*
+		 * The value in field ar_name is a decimal offset into
+		 * the archive string table where the actual name
+		 * resides.
+		 */
+		if (_libelf_ar_get_number(buf + 1, bufsize - 1, 10,
+		    &offset) == 0) {
+			LIBELF_SET_ERROR(ARCHIVE, 0);
+			return (NULL);
+		}
+
+		/* An indirect name needs a string table to refer to. */
+		if (ar->e_u.e_ar.e_rawstrtab == NULL ||
+		    offset > ar->e_u.e_ar.e_rawstrtabsz) {
+			LIBELF_SET_ERROR(ARCHIVE, 0);
+			return (NULL);
+		}
+
+		p = q = ar->e_u.e_ar.e_rawstrtab + offset;
+		r = ar->e_u.e_ar.e_rawstrtab + ar->e_u.e_ar.e_rawstrtabsz;
+
+		for (; p < r && *p != '/'; p++)
+			;
+		len = p - q + 1; /* space for the trailing NUL */
+
+		if ((s = ar->e_mem.alloc(len)) == NULL) {
+			LIBELF_SET_ERROR(RESOURCE, 0);
+			return (NULL);
+		}
+
+		(void) strncpy(s, q, len - 1);
+		s[len - 1] = '\0';
+
+		return (s);
+	} else if (IS_EXTENDED_BSD_NAME(buf)) {
+		r = buf + LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE;
+
+		if (_libelf_ar_get_number(r, bufsize -
+		    LIBELF_AR_BSD_EXTENDED_NAME_PREFIX_SIZE, 10,
+		    &len) == 0) {
+			LIBELF_SET_ERROR(ARCHIVE, 0);
+			return (NULL);
+		}
+
+		/*
+		 * The file name follows the archive header.  The
+		 * length comes from the archive itself, so make sure
+		 * the name lies inside the archive image before
+		 * copying it.
+		 */
+		q = (const char *) (arh + 1);
+		r = ar->e_rawfile + ar->e_rawsize;
+
+		if (q > r || len > (size_t) (r - q)) {
+			LIBELF_SET_ERROR(ARCHIVE, 0);
+			return (NULL);
+		}
+
+		/*
+		 * Allocate space for the file name plus a
+		 * trailing NUL.
+		 */
+		if ((s = ar->e_mem.alloc(len + 1)) == NULL) {
+			LIBELF_SET_ERROR(RESOURCE, 0);
+			return (NULL);
+		}
+
+		(void) strncpy(s, q, len);
+		s[len] = '\0';
+
+		return (s);
+	}
+
+	/*
+	 * A 'normal' name.
+	 *
+	 * Skip back over trailing blanks from the end of the field.
+	 * In the SVR4 format, a '/' is used as a terminator for
+	 * non-special names.
+	 */
+	for (q = buf + bufsize - 1; q >= buf && *q == ' '; --q)
+		;
+
+	if (q >= buf) {
+		if (*q == '/') {
+			/*
+			 * SVR4 style names: ignore the trailing
+			 * character '/', but only if the name is not
+			 * one of the special names "/" and "//".
+			 */
+			if (q > buf + 1 ||
+			    (q == (buf + 1) && *buf != '/'))
+				q--;
+		}
+
+		len = q - buf + 2; /* Add space for a trailing NUL. */
+	} else {
+		/* The buffer only had blanks. */
+		buf = "";
+		len = 1;
+	}
+
+	if ((s = ar->e_mem.alloc(len)) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	(void) strncpy(s, buf, len - 1);
+	s[len - 1] = '\0';
+
+	return (s);
+}
+
+/*
+ * Return the raw name for an archive member, inclusive of any
+ * formatting characters.
+ *
+ * NOTE(review): unlike the translated name, this buffer comes from
+ * plain malloc() (no Elf descriptor is available here), so it has to
+ * be released with free() rather than with an Elf_Mem deallocator.
+ */
+char *
+_libelf_ar_get_raw_name(const struct ar_hdr *arh)
+{
+	char *rawname;
+	const size_t namesz = sizeof(arh->ar_name);
+
+	if ((rawname = malloc(namesz + 1)) == NULL) {
+		LIBELF_SET_ERROR(RESOURCE, 0);
+		return (NULL);
+	}
+
+	(void) strncpy(rawname, arh->ar_name, namesz);
+	rawname[namesz] = '\0';
+	return (rawname);
+}
+
+/*
+ * Open an 'ar' archive.
+ *
+ * Marks `e' as an archive, records the location and size of the
+ * symbol table and string table members when present, and sets the
+ * `next' offset to the first member after them.  Returns `e', or
+ * NULL with an ARCHIVE error if a special member header is malformed
+ * or a special member does not fit inside the archive.
+ */
+Elf *
+_libelf_ar_open(Elf *e)
+{
+	int scanahead;
+	char *s, *end;
+	size_t sz;
+	struct ar_hdr arh;
+
+	e->e_kind = ELF_K_AR;
+	e->e_u.e_ar.e_nchildren = 0;
+	e->e_u.e_ar.e_next = (off_t) -1;
+
+	/*
+	 * Look for special members.
+	 */
+
+	s = e->e_rawfile + SARMAG;
+	end = e->e_rawfile + e->e_rawsize;
+
+	assert(e->e_rawsize > 0);
+
+	/*
+	 * We use heuristics to determine the flavor of the archive we
+	 * are examining.
+	 *
+	 * SVR4 flavor archives use the name "/ " and "// " for
+	 * special members.
+	 *
+	 * In BSD flavor archives the symbol table, if present, is the
+	 * first archive with name "__.SYMDEF".
+	 */
+
+	/*
+	 * Copy the header at S into ARH and parse its size field into
+	 * SZ.  The remaining space is compared as a byte count so the
+	 * check cannot be defeated by pointer wrap-around.
+	 */
+#define	READ_AR_HEADER(S, ARH, SZ, END)					\
+	do {								\
+		if ((S) > (END) ||					\
+		    (size_t) ((END) - (S)) < sizeof((ARH)))		\
+			goto error;					\
+		(void) memcpy(&(ARH), (S), sizeof((ARH)));		\
+		if ((ARH).ar_fmag[0] != '`' || (ARH).ar_fmag[1] != '\n') \
+			goto error;					\
+		if (_libelf_ar_get_number((ARH).ar_size,		\
+		    sizeof((ARH).ar_size), 10, &(SZ)) == 0)		\
+			goto error;					\
+	} while (0)
+
+	READ_AR_HEADER(s, arh, sz, end);
+
+	/*
+	 * Handle special archive members for the SVR4 format.
+	 */
+	if (arh.ar_name[0] == '/') {
+
+		/* The size comes from the file: reject, do not assert. */
+		if (sz == 0)
+			goto error;
+
+		e->e_flags |= LIBELF_F_AR_VARIANT_SVR4;
+
+		scanahead = 0;
+
+		/*
+		 * The symbol table (file name "/ ") always comes before the
+		 * string table (file name "// ").
+		 */
+		if (arh.ar_name[1] == ' ') {
+			/* "/ " => symbol table. */
+			scanahead = 1;	/* The string table to follow. */
+
+			s += sizeof(arh);
+			if (sz > (size_t) (end - s))
+				goto error;
+			e->e_u.e_ar.e_rawsymtab = s;
+			e->e_u.e_ar.e_rawsymtabsz = sz;
+
+			sz = LIBELF_ADJUST_AR_SIZE(sz);
+			s += sz;
+
+		} else if (arh.ar_name[1] == '/' && arh.ar_name[2] == ' ') {
+			/* "// " => string table for long file names. */
+			s += sizeof(arh);
+			if (sz > (size_t) (end - s))
+				goto error;
+			e->e_u.e_ar.e_rawstrtab = s;
+			e->e_u.e_ar.e_rawstrtabsz = sz;
+
+			sz = LIBELF_ADJUST_AR_SIZE(sz);
+			s += sz;
+		}
+
+		/*
+		 * If the string table hasn't been seen yet, look for
+		 * it in the next member.
+		 */
+		if (scanahead) {
+			READ_AR_HEADER(s, arh, sz, end);
+
+			/* "// " => string table for long file names. */
+			if (arh.ar_name[0] == '/' && arh.ar_name[1] == '/' &&
+			    arh.ar_name[2] == ' ') {
+
+				s += sizeof(arh);
+				if (sz > (size_t) (end - s))
+					goto error;
+
+				e->e_u.e_ar.e_rawstrtab = s;
+				e->e_u.e_ar.e_rawstrtabsz = sz;
+
+				sz = LIBELF_ADJUST_AR_SIZE(sz);
+				s += sz;
+			}
+		}
+	} else if (strncmp(arh.ar_name, LIBELF_AR_BSD_SYMTAB_NAME,
+		sizeof(LIBELF_AR_BSD_SYMTAB_NAME) - 1) == 0) {
+		/*
+		 * BSD style archive symbol table.
+		 */
+		s += sizeof(arh);
+		if (sz > (size_t) (end - s))
+			goto error;
+		e->e_u.e_ar.e_rawsymtab = s;
+		e->e_u.e_ar.e_rawsymtabsz = sz;
+
+		sz = LIBELF_ADJUST_AR_SIZE(sz);
+		s += sz;
+	}
+
+	/*
+	 * Update the 'next' offset, so that a subsequent elf_begin()
+	 * works as expected.
+	 */
+	e->e_u.e_ar.e_next = (off_t) (s - e->e_rawfile);
+
+	return (e);
+
+error:
+	LIBELF_SET_ERROR(ARCHIVE, 0);
+	return (NULL);
+
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_checksum.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_checksum.c
new file mode 100644
index 0000000000..0bece9a7e3
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_checksum.c
@@ -0,0 +1,100 @@
+/*-
+ * Copyright (c) 2006,2008 Joseph Koshy
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+
+#include
+
+#include "_libelf.h"
+
+LIBELF_VCSID("$Id: libelf_checksum.c 189 2008-07-20 10:38:08Z jkoshy $");
+
+/*
+ * Add the value of each of the `size' bytes at `s' to the running sum
+ * `c' and return the result.  A NULL or empty buffer leaves the sum
+ * unchanged.
+ */
+static unsigned long
+_libelf_sum(unsigned long c, const unsigned char *s, size_t size)
+{
+	const unsigned char *stop;
+
+	if (s == NULL || size == 0)
+		return (c);
+
+	for (stop = s + size; s != stop; s++)
+		c += *s;
+
+	return (c);
+}
+
+/*
+ * Compute the checksum of an ELF object of class `elfclass'.
+ *
+ * Returns 0 with an ARGUMENT error for a NULL descriptor, 0 with a
+ * CLASS error on a class mismatch, and 0 when a header or section
+ * lookup fails.
+ */
+unsigned long
+_libelf_checksum(Elf *e, int elfclass)
+{
+	GElf_Ehdr ehdr;
+	GElf_Shdr sh;
+	Elf_Scn *scn;
+	Elf_Data *data;
+	unsigned long sum;
+	size_t ndx;
+
+	if (e == NULL) {
+		LIBELF_SET_ERROR(ARGUMENT, 0);
+		return (0L);
+	}
+
+	if (e->e_class != elfclass) {
+		LIBELF_SET_ERROR(CLASS, 0);
+		return (0L);
+	}
+
+	if (gelf_getehdr(e, &ehdr) == NULL)
+		return (0);
+
+	/*
+	 * Walk every section, starting at index 1 because the first
+	 * section is always SHN_UNDEF.  Only allocatable sections
+	 * contribute; SHT_DYNAMIC and SHT_DYNSYM sections are left
+	 * out since utilities such as strip(1) could affect them.
+	 */
+	sum = 0;
+	for (ndx = 1; ndx < e->e_u.e_elf.e_nscn; ndx++) {
+		scn = elf_getscn(e, ndx);
+		if (scn == NULL)
+			return (0);
+		if (gelf_getshdr(scn, &sh) == NULL)
+			return (0);
+
+		if ((sh.sh_flags & SHF_ALLOC) != 0 &&
+		    sh.sh_type != SHT_DYNAMIC &&
+		    sh.sh_type != SHT_DYNSYM) {
+			for (data = elf_rawdata(scn, NULL); data != NULL;
+			    data = elf_rawdata(scn, data))
+				sum = _libelf_sum(sum,
+				    (unsigned char *) data->d_buf,
+				    data->d_size);
+		}
+	}
+
+	/*
+	 * Return a 16-bit checksum compatible with Solaris.
+	 */
+	return (((sum >> 16) & 0xFFFFUL) + (sum & 0xFFFFUL));
+}
diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_convert.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_convert.c
new file mode 100644
index 0000000000..a530e9f269
--- /dev/null
+++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_convert.c
@@ -0,0 +1,3100 @@
+/*-
+ * Copyright (c) 2006-2008 Joseph Koshy
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_convert.m4 1048 2010-07-22 02:32:57Z jkoshy $"); + +/* WARNING: GENERATED FROM libelf_convert.m4. */ + +/* + * Macros to swap various integral quantities. 
+ */
+
+/*
+ * Each SWAP_* macro reverses the byte order of the lvalue X in place:
+ * the value is copied to a fixed-width temporary, rebuilt one byte at
+ * a time from the low end, and assigned back to X.  X is evaluated
+ * twice, so it must not have side effects.
+ */
+#define	SWAP_HALF(X)	do {						\
+		uint16_t _x = (uint16_t) (X);				\
+		uint16_t _t = _x & 0xFF;				\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		(X) = _t;						\
+	} while (0)
+#define	SWAP_WORD(X)	do {						\
+		uint32_t _x = (uint32_t) (X);				\
+		uint32_t _t = _x & 0xFF;				\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		(X) = _t;						\
+	} while (0)
+/* 32-bit ELF types that share the 4-byte swap. */
+#define	SWAP_ADDR32(X)	SWAP_WORD(X)
+#define	SWAP_OFF32(X)	SWAP_WORD(X)
+#define	SWAP_SWORD(X)	SWAP_WORD(X)
+#define	SWAP_WORD64(X)	do {						\
+		uint64_t _x = (uint64_t) (X);				\
+		uint64_t _t = _x & 0xFF;				\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		_t <<= 8; _x >>= 8; _t |= _x & 0xFF;			\
+		(X) = _t;						\
+	} while (0)
+/* 64-bit ELF types that share the 8-byte swap. */
+#define	SWAP_ADDR64(X)	SWAP_WORD64(X)
+#define	SWAP_LWORD(X)	SWAP_WORD64(X)
+#define	SWAP_OFF64(X)	SWAP_WORD64(X)
+#define	SWAP_SXWORD(X)	SWAP_WORD64(X)
+#define	SWAP_XWORD(X)	SWAP_WORD64(X)
+
+/*
+ * Write out various integral values.  The destination pointer could
+ * be unaligned.  Values are written out in native byte order.  The
+ * destination pointer is incremented after the write.
+ */
+/*
+ * Each WRITE_* macro stores X at P one byte at a time, by way of a
+ * fixed-width temporary, and then advances P past the bytes written.
+ */
+#define	WRITE_BYTE(P,X) do {						\
+		char *const _p = (char *) (P);	\
+		_p[0]		= (char) (X);			\
+		(P)		= _p + 1;				\
+	} while (0)
+#define	WRITE_HALF(P,X)	do {						\
+		uint16_t _t	= (X);					\
+		char *const _p	= (char *) (P);	\
+		const char *const _q = (char *) &_t;	\
+		_p[0]		= _q[0];				\
+		_p[1]		= _q[1];				\
+		(P)		= _p + 2;				\
+	} while (0)
+#define	WRITE_WORD(P,X)	do {						\
+		uint32_t _t	= (X);					\
+		char *const _p	= (char *) (P);	\
+		const char *const _q = (char *) &_t;	\
+		_p[0]		= _q[0];				\
+		_p[1]		= _q[1];				\
+		_p[2]		= _q[2];				\
+		_p[3]		= _q[3];				\
+		(P)		= _p + 4;				\
+	} while (0)
+/* 32-bit ELF types that share the 4-byte write. */
+#define	WRITE_ADDR32(P,X)	WRITE_WORD(P,X)
+#define	WRITE_OFF32(P,X)	WRITE_WORD(P,X)
+#define	WRITE_SWORD(P,X)	WRITE_WORD(P,X)
+#define	WRITE_WORD64(P,X)	do {					\
+		uint64_t _t	= (X);					\
+		char *const _p	= (char *) (P);	\
+		const char *const _q = (char *) &_t;	\
+		_p[0]		= _q[0];				\
+		_p[1]		= _q[1];				\
+		_p[2]		= _q[2];				\
+		_p[3]		= _q[3];				\
+		_p[4]		= _q[4];				\
+		_p[5]		= _q[5];				\
+		_p[6]		= _q[6];				\
+		_p[7]		= _q[7];				\
+		(P)		= _p + 8;				\
+	} while (0)
+/* 64-bit ELF types that share the 8-byte write. */
+#define	WRITE_ADDR64(P,X)	WRITE_WORD64(P,X)
+#define	WRITE_LWORD(P,X)	WRITE_WORD64(P,X)
+#define	WRITE_OFF64(P,X)	WRITE_WORD64(P,X)
+#define	WRITE_SXWORD(P,X)	WRITE_WORD64(P,X)
+#define	WRITE_XWORD(P,X)	WRITE_WORD64(P,X)
+/*
+ * Copy sizeof(X) bytes from X to P and advance P by EI_NIDENT.
+ * NOTE(review): sizeof(X) only gives the identification size when X
+ * is an array rather than a pointer; confirm at the call sites.
+ */
+#define	WRITE_IDENT(P,X)	do {					\
+		(void) memcpy((P), (X), sizeof((X)));			\
+		(P)		= (P) + EI_NIDENT;			\
+	} while (0)
+
+/*
+ * Read in various integral values.  The source pointer could be
+ * unaligned.  Values are read in native byte order.  The source
+ * pointer is incremented appropriately.
+ */
+
+/*
+ * Each READ_* macro copies the bytes at P into a fixed-width
+ * temporary one byte at a time, advances P past them, and then
+ * assigns the temporary to X.
+ */
+#define	READ_BYTE(P,X)	do {						\
+		const char *const _p =				\
+			(const char *) (P);			\
+		(X)		= _p[0];				\
+		(P)		= (P) + 1;				\
+	} while (0)
+#define	READ_HALF(P,X)	do {						\
+		uint16_t _t;						\
+		char *const _q = (char *) &_t;	\
+		const char *const _p =				\
+			(const char *) (P);			\
+		_q[0]		= _p[0];				\
+		_q[1]		= _p[1];				\
+		(P)		= (P) + 2;				\
+		(X)		= _t;					\
+	} while (0)
+#define	READ_WORD(P,X)	do {						\
+		uint32_t _t;						\
+		char *const _q = (char *) &_t;	\
+		const char *const _p =				\
+			(const char *) (P);			\
+		_q[0]		= _p[0];				\
+		_q[1]		= _p[1];				\
+		_q[2]		= _p[2];				\
+		_q[3]		= _p[3];				\
+		(P)		= (P) + 4;				\
+		(X)		= _t;					\
+	} while (0)
+/* 32-bit ELF types that share the 4-byte read. */
+#define	READ_ADDR32(P,X)	READ_WORD(P,X)
+#define	READ_OFF32(P,X)		READ_WORD(P,X)
+#define	READ_SWORD(P,X)		READ_WORD(P,X)
+#define	READ_WORD64(P,X)	do {					\
+		uint64_t _t;						\
+		char *const _q = (char *) &_t;	\
+		const char *const _p =				\
+			(const char *) (P);			\
+		_q[0]		= _p[0];				\
+		_q[1]		= _p[1];				\
+		_q[2]		= _p[2];				\
+		_q[3]		= _p[3];				\
+		_q[4]		= _p[4];				\
+		_q[5]		= _p[5];				\
+		_q[6]		= _p[6];				\
+		_q[7]		= _p[7];				\
+		(P)		= (P) + 8;				\
+		(X)		= _t;					\
+	} while (0)
+/* 64-bit ELF types that share the 8-byte read. */
+#define	READ_ADDR64(P,X)	READ_WORD64(P,X)
+#define	READ_LWORD(P,X)		READ_WORD64(P,X)
+#define	READ_OFF64(P,X)		READ_WORD64(P,X)
+#define	READ_SXWORD(P,X)	READ_WORD64(P,X)
+#define	READ_XWORD(P,X)		READ_WORD64(P,X)
+/*
+ * Copy sizeof(X) bytes from P into X and advance P by EI_NIDENT.
+ * NOTE(review): as with WRITE_IDENT, X is presumably an array;
+ * confirm at the call sites.
+ */
+#define	READ_IDENT(P,X)		do {					\
+		(void) memcpy((X), (P), sizeof((X)));			\
+		(P)		= (P) + EI_NIDENT;			\
+	} while (0)
+
+/*
+ * Round V up to a multiple of N and assign the result back to V.
+ * The mask arithmetic is only correct when N is a power of two.
+ */
+#define	ROUNDUP2(V,N)	(V) = ((((V) + (N) - 1)) & ~((N) - 1))
+
+/*
+ * Sections of type ELF_T_BYTE are never byteswapped, consequently a
+ * simple memcpy suffices for both directions of conversion.
+ */ + +static int +libelf_cvt_BYTE_tox(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + (void) byteswap; + if (dsz < count) + return (0); + if (dst != src) + (void) memcpy(dst, src, count); + return (1); +} + +#if LIBELF_CONFIG_ADDR + +static int +libelf_cvt_ADDR32_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Addr t, *s = (Elf32_Addr *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_ADDR32(t); + WRITE_ADDR32(dst,t); + } + + return (1); +} + +static int +libelf_cvt_ADDR32_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Addr t, *d = (Elf32_Addr *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf32_Addr)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_ADDR32(src,t); + SWAP_ADDR32(t); + *d++ = t; + } + + return (1); +} + +static int +libelf_cvt_ADDR64_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Addr t, *s = (Elf64_Addr *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_ADDR64(t); + WRITE_ADDR64(dst,t); + } + + return (1); +} + +static int +libelf_cvt_ADDR64_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Addr t, *d = (Elf64_Addr *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Addr)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_ADDR64(src,t); + SWAP_ADDR64(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_ADDR */ +#if LIBELF_CONFIG_BYTE + +#endif /* LIBELF_CONFIG_BYTE */ +#if LIBELF_CONFIG_CAP + +static int 
+libelf_cvt32_CAP_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Cap t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Cap *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Cap */ + SWAP_WORD(t.c_tag); + SWAP_WORD(t.c_un.c_val); + /**/ + } + /* Write an Elf32_Cap */ + WRITE_WORD(dst,t.c_tag); + WRITE_WORD(dst,t.c_un.c_val); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_CAP_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Cap t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Cap *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Cap */ + SWAP_XWORD(t.c_tag); + SWAP_XWORD(t.c_un.c_val); + /**/ + } + /* Write an Elf64_Cap */ + WRITE_XWORD(dst,t.c_tag); + WRITE_XWORD(dst,t.c_un.c_val); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_CAP_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Cap t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_CAP, (size_t) 1, EV_CURRENT); + d = ((Elf32_Cap *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Cap)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Cap */ + READ_WORD(s,t.c_tag); + READ_WORD(s,t.c_un.c_val); + /**/ + if (byteswap) { + /* Swap an Elf32_Cap */ + SWAP_WORD(t.c_tag); + SWAP_WORD(t.c_un.c_val); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_CAP_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Cap t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_CAP, (size_t) 1, EV_CURRENT); + d = ((Elf64_Cap *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Cap)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Cap */ + READ_XWORD(s,t.c_tag); + READ_XWORD(s,t.c_un.c_val); + /**/ 
+ if (byteswap) { + /* Swap an Elf64_Cap */ + SWAP_XWORD(t.c_tag); + SWAP_XWORD(t.c_un.c_val); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_CAP */ +#if LIBELF_CONFIG_DYN + +static int +libelf_cvt32_DYN_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Dyn t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Dyn *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Dyn */ + SWAP_SWORD(t.d_tag); + SWAP_WORD(t.d_un.d_ptr); + /**/ + } + /* Write an Elf32_Dyn */ + WRITE_SWORD(dst,t.d_tag); + WRITE_WORD(dst,t.d_un.d_ptr); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_DYN_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Dyn t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Dyn *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Dyn */ + SWAP_SXWORD(t.d_tag); + SWAP_XWORD(t.d_un.d_ptr); + /**/ + } + /* Write an Elf64_Dyn */ + WRITE_SXWORD(dst,t.d_tag); + WRITE_XWORD(dst,t.d_un.d_ptr); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_DYN_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Dyn t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_DYN, (size_t) 1, EV_CURRENT); + d = ((Elf32_Dyn *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Dyn)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Dyn */ + READ_SWORD(s,t.d_tag); + READ_WORD(s,t.d_un.d_ptr); + /**/ + if (byteswap) { + /* Swap an Elf32_Dyn */ + SWAP_SWORD(t.d_tag); + SWAP_WORD(t.d_un.d_ptr); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_DYN_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Dyn t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_DYN, (size_t) 1, EV_CURRENT); + d = ((Elf64_Dyn *) (uintptr_t) dst) + 
(count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Dyn)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Dyn */ + READ_SXWORD(s,t.d_tag); + READ_XWORD(s,t.d_un.d_ptr); + /**/ + if (byteswap) { + /* Swap an Elf64_Dyn */ + SWAP_SXWORD(t.d_tag); + SWAP_XWORD(t.d_un.d_ptr); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_DYN */ +#if LIBELF_CONFIG_EHDR + +static int +libelf_cvt32_EHDR_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Ehdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Ehdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Ehdr */ + SWAP_HALF(t.e_type); + SWAP_HALF(t.e_machine); + SWAP_WORD(t.e_version); + SWAP_ADDR32(t.e_entry); + SWAP_OFF32(t.e_phoff); + SWAP_OFF32(t.e_shoff); + SWAP_WORD(t.e_flags); + SWAP_HALF(t.e_ehsize); + SWAP_HALF(t.e_phentsize); + SWAP_HALF(t.e_phnum); + SWAP_HALF(t.e_shentsize); + SWAP_HALF(t.e_shnum); + SWAP_HALF(t.e_shstrndx); + /**/ + } + /* Write an Elf32_Ehdr */ + WRITE_IDENT(dst,t.e_ident); + WRITE_HALF(dst,t.e_type); + WRITE_HALF(dst,t.e_machine); + WRITE_WORD(dst,t.e_version); + WRITE_ADDR32(dst,t.e_entry); + WRITE_OFF32(dst,t.e_phoff); + WRITE_OFF32(dst,t.e_shoff); + WRITE_WORD(dst,t.e_flags); + WRITE_HALF(dst,t.e_ehsize); + WRITE_HALF(dst,t.e_phentsize); + WRITE_HALF(dst,t.e_phnum); + WRITE_HALF(dst,t.e_shentsize); + WRITE_HALF(dst,t.e_shnum); + WRITE_HALF(dst,t.e_shstrndx); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_EHDR_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Ehdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Ehdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Ehdr */ + SWAP_HALF(t.e_type); + SWAP_HALF(t.e_machine); + SWAP_WORD(t.e_version); + SWAP_ADDR64(t.e_entry); + SWAP_OFF64(t.e_phoff); + SWAP_OFF64(t.e_shoff); + 
SWAP_WORD(t.e_flags); + SWAP_HALF(t.e_ehsize); + SWAP_HALF(t.e_phentsize); + SWAP_HALF(t.e_phnum); + SWAP_HALF(t.e_shentsize); + SWAP_HALF(t.e_shnum); + SWAP_HALF(t.e_shstrndx); + /**/ + } + /* Write an Elf64_Ehdr */ + WRITE_IDENT(dst,t.e_ident); + WRITE_HALF(dst,t.e_type); + WRITE_HALF(dst,t.e_machine); + WRITE_WORD(dst,t.e_version); + WRITE_ADDR64(dst,t.e_entry); + WRITE_OFF64(dst,t.e_phoff); + WRITE_OFF64(dst,t.e_shoff); + WRITE_WORD(dst,t.e_flags); + WRITE_HALF(dst,t.e_ehsize); + WRITE_HALF(dst,t.e_phentsize); + WRITE_HALF(dst,t.e_phnum); + WRITE_HALF(dst,t.e_shentsize); + WRITE_HALF(dst,t.e_shnum); + WRITE_HALF(dst,t.e_shstrndx); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_EHDR_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Ehdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_EHDR, (size_t) 1, EV_CURRENT); + d = ((Elf32_Ehdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Ehdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Ehdr */ + READ_IDENT(s,t.e_ident); + READ_HALF(s,t.e_type); + READ_HALF(s,t.e_machine); + READ_WORD(s,t.e_version); + READ_ADDR32(s,t.e_entry); + READ_OFF32(s,t.e_phoff); + READ_OFF32(s,t.e_shoff); + READ_WORD(s,t.e_flags); + READ_HALF(s,t.e_ehsize); + READ_HALF(s,t.e_phentsize); + READ_HALF(s,t.e_phnum); + READ_HALF(s,t.e_shentsize); + READ_HALF(s,t.e_shnum); + READ_HALF(s,t.e_shstrndx); + /**/ + if (byteswap) { + /* Swap an Elf32_Ehdr */ + SWAP_HALF(t.e_type); + SWAP_HALF(t.e_machine); + SWAP_WORD(t.e_version); + SWAP_ADDR32(t.e_entry); + SWAP_OFF32(t.e_phoff); + SWAP_OFF32(t.e_shoff); + SWAP_WORD(t.e_flags); + SWAP_HALF(t.e_ehsize); + SWAP_HALF(t.e_phentsize); + SWAP_HALF(t.e_phnum); + SWAP_HALF(t.e_shentsize); + SWAP_HALF(t.e_shnum); + SWAP_HALF(t.e_shstrndx); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_EHDR_tom(char *dst, size_t dsz, char *src, size_t 
count, + int byteswap) +{ + Elf64_Ehdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_EHDR, (size_t) 1, EV_CURRENT); + d = ((Elf64_Ehdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Ehdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Ehdr */ + READ_IDENT(s,t.e_ident); + READ_HALF(s,t.e_type); + READ_HALF(s,t.e_machine); + READ_WORD(s,t.e_version); + READ_ADDR64(s,t.e_entry); + READ_OFF64(s,t.e_phoff); + READ_OFF64(s,t.e_shoff); + READ_WORD(s,t.e_flags); + READ_HALF(s,t.e_ehsize); + READ_HALF(s,t.e_phentsize); + READ_HALF(s,t.e_phnum); + READ_HALF(s,t.e_shentsize); + READ_HALF(s,t.e_shnum); + READ_HALF(s,t.e_shstrndx); + /**/ + if (byteswap) { + /* Swap an Elf64_Ehdr */ + SWAP_HALF(t.e_type); + SWAP_HALF(t.e_machine); + SWAP_WORD(t.e_version); + SWAP_ADDR64(t.e_entry); + SWAP_OFF64(t.e_phoff); + SWAP_OFF64(t.e_shoff); + SWAP_WORD(t.e_flags); + SWAP_HALF(t.e_ehsize); + SWAP_HALF(t.e_phentsize); + SWAP_HALF(t.e_phnum); + SWAP_HALF(t.e_shentsize); + SWAP_HALF(t.e_shnum); + SWAP_HALF(t.e_shstrndx); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_EHDR */ +#if LIBELF_CONFIG_GNUHASH + +#endif /* LIBELF_CONFIG_GNUHASH */ +#if LIBELF_CONFIG_HALF + +static int +libelf_cvt_HALF_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Half t, *s = (Elf64_Half *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_HALF(t); + WRITE_HALF(dst,t); + } + + return (1); +} + +static int +libelf_cvt_HALF_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Half t, *d = (Elf64_Half *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Half)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < 
count; c++) { + READ_HALF(src,t); + SWAP_HALF(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_HALF */ +#if LIBELF_CONFIG_LWORD + +static int +libelf_cvt_LWORD_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Lword t, *s = (Elf64_Lword *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_LWORD(t); + WRITE_LWORD(dst,t); + } + + return (1); +} + +static int +libelf_cvt_LWORD_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Lword t, *d = (Elf64_Lword *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Lword)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_LWORD(src,t); + SWAP_LWORD(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_LWORD */ +#if LIBELF_CONFIG_MOVE + +static int +libelf_cvt32_MOVE_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Move t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Move *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Move */ + SWAP_LWORD(t.m_value); + SWAP_WORD(t.m_info); + SWAP_WORD(t.m_poffset); + SWAP_HALF(t.m_repeat); + SWAP_HALF(t.m_stride); + /**/ + } + /* Write an Elf32_Move */ + WRITE_LWORD(dst,t.m_value); + WRITE_WORD(dst,t.m_info); + WRITE_WORD(dst,t.m_poffset); + WRITE_HALF(dst,t.m_repeat); + WRITE_HALF(dst,t.m_stride); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_MOVE_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Move t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Move *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Move */ + SWAP_LWORD(t.m_value); + SWAP_XWORD(t.m_info); + SWAP_XWORD(t.m_poffset); + 
SWAP_HALF(t.m_repeat); + SWAP_HALF(t.m_stride); + /**/ + } + /* Write an Elf64_Move */ + WRITE_LWORD(dst,t.m_value); + WRITE_XWORD(dst,t.m_info); + WRITE_XWORD(dst,t.m_poffset); + WRITE_HALF(dst,t.m_repeat); + WRITE_HALF(dst,t.m_stride); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_MOVE_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Move t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_MOVE, (size_t) 1, EV_CURRENT); + d = ((Elf32_Move *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Move)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Move */ + READ_LWORD(s,t.m_value); + READ_WORD(s,t.m_info); + READ_WORD(s,t.m_poffset); + READ_HALF(s,t.m_repeat); + READ_HALF(s,t.m_stride); + /**/ + if (byteswap) { + /* Swap an Elf32_Move */ + SWAP_LWORD(t.m_value); + SWAP_WORD(t.m_info); + SWAP_WORD(t.m_poffset); + SWAP_HALF(t.m_repeat); + SWAP_HALF(t.m_stride); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_MOVE_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Move t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_MOVE, (size_t) 1, EV_CURRENT); + d = ((Elf64_Move *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Move)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Move */ + READ_LWORD(s,t.m_value); + READ_XWORD(s,t.m_info); + READ_XWORD(s,t.m_poffset); + READ_HALF(s,t.m_repeat); + READ_HALF(s,t.m_stride); + /**/ + if (byteswap) { + /* Swap an Elf64_Move */ + SWAP_LWORD(t.m_value); + SWAP_XWORD(t.m_info); + SWAP_XWORD(t.m_poffset); + SWAP_HALF(t.m_repeat); + SWAP_HALF(t.m_stride); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_MOVE */ +#if LIBELF_CONFIG_MOVEP + +#endif /* LIBELF_CONFIG_MOVEP */ +#if LIBELF_CONFIG_NOTE + +#endif /* 
LIBELF_CONFIG_NOTE */ +#if LIBELF_CONFIG_OFF + +static int +libelf_cvt_OFF32_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Off t, *s = (Elf32_Off *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_OFF32(t); + WRITE_OFF32(dst,t); + } + + return (1); +} + +static int +libelf_cvt_OFF32_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Off t, *d = (Elf32_Off *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf32_Off)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_OFF32(src,t); + SWAP_OFF32(t); + *d++ = t; + } + + return (1); +} + +static int +libelf_cvt_OFF64_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Off t, *s = (Elf64_Off *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_OFF64(t); + WRITE_OFF64(dst,t); + } + + return (1); +} + +static int +libelf_cvt_OFF64_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Off t, *d = (Elf64_Off *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Off)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_OFF64(src,t); + SWAP_OFF64(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_OFF */ +#if LIBELF_CONFIG_PHDR + +static int +libelf_cvt32_PHDR_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Phdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Phdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Phdr */ + SWAP_WORD(t.p_type); + 
SWAP_OFF32(t.p_offset); + SWAP_ADDR32(t.p_vaddr); + SWAP_ADDR32(t.p_paddr); + SWAP_WORD(t.p_filesz); + SWAP_WORD(t.p_memsz); + SWAP_WORD(t.p_flags); + SWAP_WORD(t.p_align); + /**/ + } + /* Write an Elf32_Phdr */ + WRITE_WORD(dst,t.p_type); + WRITE_OFF32(dst,t.p_offset); + WRITE_ADDR32(dst,t.p_vaddr); + WRITE_ADDR32(dst,t.p_paddr); + WRITE_WORD(dst,t.p_filesz); + WRITE_WORD(dst,t.p_memsz); + WRITE_WORD(dst,t.p_flags); + WRITE_WORD(dst,t.p_align); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_PHDR_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Phdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Phdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Phdr */ + SWAP_WORD(t.p_type); + SWAP_WORD(t.p_flags); + SWAP_OFF64(t.p_offset); + SWAP_ADDR64(t.p_vaddr); + SWAP_ADDR64(t.p_paddr); + SWAP_XWORD(t.p_filesz); + SWAP_XWORD(t.p_memsz); + SWAP_XWORD(t.p_align); + /**/ + } + /* Write an Elf64_Phdr */ + WRITE_WORD(dst,t.p_type); + WRITE_WORD(dst,t.p_flags); + WRITE_OFF64(dst,t.p_offset); + WRITE_ADDR64(dst,t.p_vaddr); + WRITE_ADDR64(dst,t.p_paddr); + WRITE_XWORD(dst,t.p_filesz); + WRITE_XWORD(dst,t.p_memsz); + WRITE_XWORD(dst,t.p_align); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_PHDR_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Phdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_PHDR, (size_t) 1, EV_CURRENT); + d = ((Elf32_Phdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Phdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Phdr */ + READ_WORD(s,t.p_type); + READ_OFF32(s,t.p_offset); + READ_ADDR32(s,t.p_vaddr); + READ_ADDR32(s,t.p_paddr); + READ_WORD(s,t.p_filesz); + READ_WORD(s,t.p_memsz); + READ_WORD(s,t.p_flags); + READ_WORD(s,t.p_align); + /**/ + if (byteswap) { + /* Swap an Elf32_Phdr */ + SWAP_WORD(t.p_type); + 
SWAP_OFF32(t.p_offset); + SWAP_ADDR32(t.p_vaddr); + SWAP_ADDR32(t.p_paddr); + SWAP_WORD(t.p_filesz); + SWAP_WORD(t.p_memsz); + SWAP_WORD(t.p_flags); + SWAP_WORD(t.p_align); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_PHDR_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Phdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_PHDR, (size_t) 1, EV_CURRENT); + d = ((Elf64_Phdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Phdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Phdr */ + READ_WORD(s,t.p_type); + READ_WORD(s,t.p_flags); + READ_OFF64(s,t.p_offset); + READ_ADDR64(s,t.p_vaddr); + READ_ADDR64(s,t.p_paddr); + READ_XWORD(s,t.p_filesz); + READ_XWORD(s,t.p_memsz); + READ_XWORD(s,t.p_align); + /**/ + if (byteswap) { + /* Swap an Elf64_Phdr */ + SWAP_WORD(t.p_type); + SWAP_WORD(t.p_flags); + SWAP_OFF64(t.p_offset); + SWAP_ADDR64(t.p_vaddr); + SWAP_ADDR64(t.p_paddr); + SWAP_XWORD(t.p_filesz); + SWAP_XWORD(t.p_memsz); + SWAP_XWORD(t.p_align); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_PHDR */ +#if LIBELF_CONFIG_REL + +static int +libelf_cvt32_REL_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Rel t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Rel *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Rel */ + SWAP_ADDR32(t.r_offset); + SWAP_WORD(t.r_info); + /**/ + } + /* Write an Elf32_Rel */ + WRITE_ADDR32(dst,t.r_offset); + WRITE_WORD(dst,t.r_info); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_REL_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Rel t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Rel *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Rel */ + SWAP_ADDR64(t.r_offset); + 
SWAP_XWORD(t.r_info); + /**/ + } + /* Write an Elf64_Rel */ + WRITE_ADDR64(dst,t.r_offset); + WRITE_XWORD(dst,t.r_info); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_REL_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Rel t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_REL, (size_t) 1, EV_CURRENT); + d = ((Elf32_Rel *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Rel)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Rel */ + READ_ADDR32(s,t.r_offset); + READ_WORD(s,t.r_info); + /**/ + if (byteswap) { + /* Swap an Elf32_Rel */ + SWAP_ADDR32(t.r_offset); + SWAP_WORD(t.r_info); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_REL_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Rel t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_REL, (size_t) 1, EV_CURRENT); + d = ((Elf64_Rel *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Rel)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Rel */ + READ_ADDR64(s,t.r_offset); + READ_XWORD(s,t.r_info); + /**/ + if (byteswap) { + /* Swap an Elf64_Rel */ + SWAP_ADDR64(t.r_offset); + SWAP_XWORD(t.r_info); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_REL */ +#if LIBELF_CONFIG_RELA + +static int +libelf_cvt32_RELA_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Rela t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Rela *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Rela */ + SWAP_ADDR32(t.r_offset); + SWAP_WORD(t.r_info); + SWAP_SWORD(t.r_addend); + /**/ + } + /* Write an Elf32_Rela */ + WRITE_ADDR32(dst,t.r_offset); + WRITE_WORD(dst,t.r_info); + WRITE_SWORD(dst,t.r_addend); + /**/ + } + + return (1); +} + +static int 
+libelf_cvt64_RELA_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Rela t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Rela *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Rela */ + SWAP_ADDR64(t.r_offset); + SWAP_XWORD(t.r_info); + SWAP_SXWORD(t.r_addend); + /**/ + } + /* Write an Elf64_Rela */ + WRITE_ADDR64(dst,t.r_offset); + WRITE_XWORD(dst,t.r_info); + WRITE_SXWORD(dst,t.r_addend); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_RELA_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Rela t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_RELA, (size_t) 1, EV_CURRENT); + d = ((Elf32_Rela *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Rela)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Rela */ + READ_ADDR32(s,t.r_offset); + READ_WORD(s,t.r_info); + READ_SWORD(s,t.r_addend); + /**/ + if (byteswap) { + /* Swap an Elf32_Rela */ + SWAP_ADDR32(t.r_offset); + SWAP_WORD(t.r_info); + SWAP_SWORD(t.r_addend); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_RELA_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Rela t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_RELA, (size_t) 1, EV_CURRENT); + d = ((Elf64_Rela *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Rela)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Rela */ + READ_ADDR64(s,t.r_offset); + READ_XWORD(s,t.r_info); + READ_SXWORD(s,t.r_addend); + /**/ + if (byteswap) { + /* Swap an Elf64_Rela */ + SWAP_ADDR64(t.r_offset); + SWAP_XWORD(t.r_info); + SWAP_SXWORD(t.r_addend); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_RELA */ +#if LIBELF_CONFIG_SHDR + +static int +libelf_cvt32_SHDR_tof(char *dst, 
size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Shdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Shdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Shdr */ + SWAP_WORD(t.sh_name); + SWAP_WORD(t.sh_type); + SWAP_WORD(t.sh_flags); + SWAP_ADDR32(t.sh_addr); + SWAP_OFF32(t.sh_offset); + SWAP_WORD(t.sh_size); + SWAP_WORD(t.sh_link); + SWAP_WORD(t.sh_info); + SWAP_WORD(t.sh_addralign); + SWAP_WORD(t.sh_entsize); + /**/ + } + /* Write an Elf32_Shdr */ + WRITE_WORD(dst,t.sh_name); + WRITE_WORD(dst,t.sh_type); + WRITE_WORD(dst,t.sh_flags); + WRITE_ADDR32(dst,t.sh_addr); + WRITE_OFF32(dst,t.sh_offset); + WRITE_WORD(dst,t.sh_size); + WRITE_WORD(dst,t.sh_link); + WRITE_WORD(dst,t.sh_info); + WRITE_WORD(dst,t.sh_addralign); + WRITE_WORD(dst,t.sh_entsize); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_SHDR_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Shdr t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Shdr *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Shdr */ + SWAP_WORD(t.sh_name); + SWAP_WORD(t.sh_type); + SWAP_XWORD(t.sh_flags); + SWAP_ADDR64(t.sh_addr); + SWAP_OFF64(t.sh_offset); + SWAP_XWORD(t.sh_size); + SWAP_WORD(t.sh_link); + SWAP_WORD(t.sh_info); + SWAP_XWORD(t.sh_addralign); + SWAP_XWORD(t.sh_entsize); + /**/ + } + /* Write an Elf64_Shdr */ + WRITE_WORD(dst,t.sh_name); + WRITE_WORD(dst,t.sh_type); + WRITE_XWORD(dst,t.sh_flags); + WRITE_ADDR64(dst,t.sh_addr); + WRITE_OFF64(dst,t.sh_offset); + WRITE_XWORD(dst,t.sh_size); + WRITE_WORD(dst,t.sh_link); + WRITE_WORD(dst,t.sh_info); + WRITE_XWORD(dst,t.sh_addralign); + WRITE_XWORD(dst,t.sh_entsize); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_SHDR_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Shdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_SHDR, (size_t) 1, EV_CURRENT); + d = 
((Elf32_Shdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Shdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Shdr */ + READ_WORD(s,t.sh_name); + READ_WORD(s,t.sh_type); + READ_WORD(s,t.sh_flags); + READ_ADDR32(s,t.sh_addr); + READ_OFF32(s,t.sh_offset); + READ_WORD(s,t.sh_size); + READ_WORD(s,t.sh_link); + READ_WORD(s,t.sh_info); + READ_WORD(s,t.sh_addralign); + READ_WORD(s,t.sh_entsize); + /**/ + if (byteswap) { + /* Swap an Elf32_Shdr */ + SWAP_WORD(t.sh_name); + SWAP_WORD(t.sh_type); + SWAP_WORD(t.sh_flags); + SWAP_ADDR32(t.sh_addr); + SWAP_OFF32(t.sh_offset); + SWAP_WORD(t.sh_size); + SWAP_WORD(t.sh_link); + SWAP_WORD(t.sh_info); + SWAP_WORD(t.sh_addralign); + SWAP_WORD(t.sh_entsize); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_SHDR_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Shdr t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_SHDR, (size_t) 1, EV_CURRENT); + d = ((Elf64_Shdr *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Shdr)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Shdr */ + READ_WORD(s,t.sh_name); + READ_WORD(s,t.sh_type); + READ_XWORD(s,t.sh_flags); + READ_ADDR64(s,t.sh_addr); + READ_OFF64(s,t.sh_offset); + READ_XWORD(s,t.sh_size); + READ_WORD(s,t.sh_link); + READ_WORD(s,t.sh_info); + READ_XWORD(s,t.sh_addralign); + READ_XWORD(s,t.sh_entsize); + /**/ + if (byteswap) { + /* Swap an Elf64_Shdr */ + SWAP_WORD(t.sh_name); + SWAP_WORD(t.sh_type); + SWAP_XWORD(t.sh_flags); + SWAP_ADDR64(t.sh_addr); + SWAP_OFF64(t.sh_offset); + SWAP_XWORD(t.sh_size); + SWAP_WORD(t.sh_link); + SWAP_WORD(t.sh_info); + SWAP_XWORD(t.sh_addralign); + SWAP_XWORD(t.sh_entsize); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_SHDR */ +#if LIBELF_CONFIG_SWORD + +static int 
+libelf_cvt_SWORD_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sword t, *s = (Elf64_Sword *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_SWORD(t); + WRITE_SWORD(dst,t); + } + + return (1); +} + +static int +libelf_cvt_SWORD_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sword t, *d = (Elf64_Sword *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Sword)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_SWORD(src,t); + SWAP_SWORD(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_SWORD */ +#if LIBELF_CONFIG_SXWORD + +static int +libelf_cvt_SXWORD_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sxword t, *s = (Elf64_Sxword *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_SXWORD(t); + WRITE_SXWORD(dst,t); + } + + return (1); +} + +static int +libelf_cvt_SXWORD_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sxword t, *d = (Elf64_Sxword *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Sxword)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_SXWORD(src,t); + SWAP_SXWORD(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_SXWORD */ +#if LIBELF_CONFIG_SYMINFO + +static int +libelf_cvt32_SYMINFO_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Syminfo t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Syminfo *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an 
Elf32_Syminfo */ + SWAP_HALF(t.si_boundto); + SWAP_HALF(t.si_flags); + /**/ + } + /* Write an Elf32_Syminfo */ + WRITE_HALF(dst,t.si_boundto); + WRITE_HALF(dst,t.si_flags); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_SYMINFO_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Syminfo t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Syminfo *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Syminfo */ + SWAP_HALF(t.si_boundto); + SWAP_HALF(t.si_flags); + /**/ + } + /* Write an Elf64_Syminfo */ + WRITE_HALF(dst,t.si_boundto); + WRITE_HALF(dst,t.si_flags); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_SYMINFO_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Syminfo t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_SYMINFO, (size_t) 1, EV_CURRENT); + d = ((Elf32_Syminfo *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Syminfo)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Syminfo */ + READ_HALF(s,t.si_boundto); + READ_HALF(s,t.si_flags); + /**/ + if (byteswap) { + /* Swap an Elf32_Syminfo */ + SWAP_HALF(t.si_boundto); + SWAP_HALF(t.si_flags); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_SYMINFO_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Syminfo t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_SYMINFO, (size_t) 1, EV_CURRENT); + d = ((Elf64_Syminfo *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Syminfo)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Syminfo */ + READ_HALF(s,t.si_boundto); + READ_HALF(s,t.si_flags); + /**/ + if (byteswap) { + /* Swap an Elf64_Syminfo */ + SWAP_HALF(t.si_boundto); + SWAP_HALF(t.si_flags); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); 
+} + +#endif /* LIBELF_CONFIG_SYMINFO */ +#if LIBELF_CONFIG_SYM + +static int +libelf_cvt32_SYM_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Sym t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Sym *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Sym */ + SWAP_WORD(t.st_name); + SWAP_ADDR32(t.st_value); + SWAP_WORD(t.st_size); + SWAP_HALF(t.st_shndx); + /**/ + } + /* Write an Elf32_Sym */ + WRITE_WORD(dst,t.st_name); + WRITE_ADDR32(dst,t.st_value); + WRITE_WORD(dst,t.st_size); + WRITE_BYTE(dst,t.st_info); + WRITE_BYTE(dst,t.st_other); + WRITE_HALF(dst,t.st_shndx); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_SYM_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sym t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Sym *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Sym */ + SWAP_WORD(t.st_name); + SWAP_HALF(t.st_shndx); + SWAP_ADDR64(t.st_value); + SWAP_XWORD(t.st_size); + /**/ + } + /* Write an Elf64_Sym */ + WRITE_WORD(dst,t.st_name); + WRITE_BYTE(dst,t.st_info); + WRITE_BYTE(dst,t.st_other); + WRITE_HALF(dst,t.st_shndx); + WRITE_ADDR64(dst,t.st_value); + WRITE_XWORD(dst,t.st_size); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_SYM_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Sym t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_SYM, (size_t) 1, EV_CURRENT); + d = ((Elf32_Sym *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Sym)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Sym */ + READ_WORD(s,t.st_name); + READ_ADDR32(s,t.st_value); + READ_WORD(s,t.st_size); + READ_BYTE(s,t.st_info); + READ_BYTE(s,t.st_other); + READ_HALF(s,t.st_shndx); + /**/ + if (byteswap) { + /* Swap an Elf32_Sym */ + SWAP_WORD(t.st_name); + SWAP_ADDR32(t.st_value); + 
SWAP_WORD(t.st_size); + SWAP_HALF(t.st_shndx); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_SYM_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Sym t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_SYM, (size_t) 1, EV_CURRENT); + d = ((Elf64_Sym *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Sym)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Sym */ + READ_WORD(s,t.st_name); + READ_BYTE(s,t.st_info); + READ_BYTE(s,t.st_other); + READ_HALF(s,t.st_shndx); + READ_ADDR64(s,t.st_value); + READ_XWORD(s,t.st_size); + /**/ + if (byteswap) { + /* Swap an Elf64_Sym */ + SWAP_WORD(t.st_name); + SWAP_HALF(t.st_shndx); + SWAP_ADDR64(t.st_value); + SWAP_XWORD(t.st_size); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_SYM */ +#if LIBELF_CONFIG_VDEF + +static int +libelf_cvt32_VDEF_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Verdef t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Verdef *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Verdef */ + SWAP_HALF(t.vd_version); + SWAP_HALF(t.vd_flags); + SWAP_HALF(t.vd_ndx); + SWAP_HALF(t.vd_cnt); + SWAP_WORD(t.vd_hash); + SWAP_WORD(t.vd_aux); + SWAP_WORD(t.vd_next); + /**/ + } + /* Write an Elf32_Verdef */ + WRITE_HALF(dst,t.vd_version); + WRITE_HALF(dst,t.vd_flags); + WRITE_HALF(dst,t.vd_ndx); + WRITE_HALF(dst,t.vd_cnt); + WRITE_WORD(dst,t.vd_hash); + WRITE_WORD(dst,t.vd_aux); + WRITE_WORD(dst,t.vd_next); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_VDEF_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Verdef t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Verdef *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Verdef */ + SWAP_HALF(t.vd_version); + 
SWAP_HALF(t.vd_flags); + SWAP_HALF(t.vd_ndx); + SWAP_HALF(t.vd_cnt); + SWAP_WORD(t.vd_hash); + SWAP_WORD(t.vd_aux); + SWAP_WORD(t.vd_next); + /**/ + } + /* Write an Elf64_Verdef */ + WRITE_HALF(dst,t.vd_version); + WRITE_HALF(dst,t.vd_flags); + WRITE_HALF(dst,t.vd_ndx); + WRITE_HALF(dst,t.vd_cnt); + WRITE_WORD(dst,t.vd_hash); + WRITE_WORD(dst,t.vd_aux); + WRITE_WORD(dst,t.vd_next); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_VDEF_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Verdef t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_VDEF, (size_t) 1, EV_CURRENT); + d = ((Elf32_Verdef *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Verdef)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Verdef */ + READ_HALF(s,t.vd_version); + READ_HALF(s,t.vd_flags); + READ_HALF(s,t.vd_ndx); + READ_HALF(s,t.vd_cnt); + READ_WORD(s,t.vd_hash); + READ_WORD(s,t.vd_aux); + READ_WORD(s,t.vd_next); + /**/ + if (byteswap) { + /* Swap an Elf32_Verdef */ + SWAP_HALF(t.vd_version); + SWAP_HALF(t.vd_flags); + SWAP_HALF(t.vd_ndx); + SWAP_HALF(t.vd_cnt); + SWAP_WORD(t.vd_hash); + SWAP_WORD(t.vd_aux); + SWAP_WORD(t.vd_next); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_VDEF_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Verdef t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_VDEF, (size_t) 1, EV_CURRENT); + d = ((Elf64_Verdef *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Verdef)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Verdef */ + READ_HALF(s,t.vd_version); + READ_HALF(s,t.vd_flags); + READ_HALF(s,t.vd_ndx); + READ_HALF(s,t.vd_cnt); + READ_WORD(s,t.vd_hash); + READ_WORD(s,t.vd_aux); + READ_WORD(s,t.vd_next); + /**/ + if (byteswap) { + /* Swap an Elf64_Verdef */ + 
SWAP_HALF(t.vd_version); + SWAP_HALF(t.vd_flags); + SWAP_HALF(t.vd_ndx); + SWAP_HALF(t.vd_cnt); + SWAP_WORD(t.vd_hash); + SWAP_WORD(t.vd_aux); + SWAP_WORD(t.vd_next); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_VDEF */ +#if LIBELF_CONFIG_VNEED + +static int +libelf_cvt32_VNEED_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Verneed t, *s; + size_t c; + + (void) dsz; + + s = (Elf32_Verneed *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf32_Verneed */ + SWAP_HALF(t.vn_version); + SWAP_HALF(t.vn_cnt); + SWAP_WORD(t.vn_file); + SWAP_WORD(t.vn_aux); + SWAP_WORD(t.vn_next); + /**/ + } + /* Write an Elf32_Verneed */ + WRITE_HALF(dst,t.vn_version); + WRITE_HALF(dst,t.vn_cnt); + WRITE_WORD(dst,t.vn_file); + WRITE_WORD(dst,t.vn_aux); + WRITE_WORD(dst,t.vn_next); + /**/ + } + + return (1); +} + +static int +libelf_cvt64_VNEED_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Verneed t, *s; + size_t c; + + (void) dsz; + + s = (Elf64_Verneed *) (uintptr_t) src; + for (c = 0; c < count; c++) { + t = *s++; + if (byteswap) { + /* Swap an Elf64_Verneed */ + SWAP_HALF(t.vn_version); + SWAP_HALF(t.vn_cnt); + SWAP_WORD(t.vn_file); + SWAP_WORD(t.vn_aux); + SWAP_WORD(t.vn_next); + /**/ + } + /* Write an Elf64_Verneed */ + WRITE_HALF(dst,t.vn_version); + WRITE_HALF(dst,t.vn_cnt); + WRITE_WORD(dst,t.vn_file); + WRITE_WORD(dst,t.vn_aux); + WRITE_WORD(dst,t.vn_next); + /**/ + } + + return (1); +} + +static int +libelf_cvt32_VNEED_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf32_Verneed t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf32_fsize(ELF_T_VNEED, (size_t) 1, EV_CURRENT); + d = ((Elf32_Verneed *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf32_Verneed)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf32_Verneed */ + 
READ_HALF(s,t.vn_version); + READ_HALF(s,t.vn_cnt); + READ_WORD(s,t.vn_file); + READ_WORD(s,t.vn_aux); + READ_WORD(s,t.vn_next); + /**/ + if (byteswap) { + /* Swap an Elf32_Verneed */ + SWAP_HALF(t.vn_version); + SWAP_HALF(t.vn_cnt); + SWAP_WORD(t.vn_file); + SWAP_WORD(t.vn_aux); + SWAP_WORD(t.vn_next); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +static int +libelf_cvt64_VNEED_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Verneed t, *d; + char *s,*s0; + size_t fsz; + + fsz = elf64_fsize(ELF_T_VNEED, (size_t) 1, EV_CURRENT); + d = ((Elf64_Verneed *) (uintptr_t) dst) + (count - 1); + s0 = (char *) src + (count - 1) * fsz; + + if (dsz < count * sizeof(Elf64_Verneed)) + return (0); + + while (count--) { + s = s0; + /* Read an Elf64_Verneed */ + READ_HALF(s,t.vn_version); + READ_HALF(s,t.vn_cnt); + READ_WORD(s,t.vn_file); + READ_WORD(s,t.vn_aux); + READ_WORD(s,t.vn_next); + /**/ + if (byteswap) { + /* Swap an Elf64_Verneed */ + SWAP_HALF(t.vn_version); + SWAP_HALF(t.vn_cnt); + SWAP_WORD(t.vn_file); + SWAP_WORD(t.vn_aux); + SWAP_WORD(t.vn_next); + /**/ + } + *d-- = t; s0 -= fsz; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_VNEED */ +#if LIBELF_CONFIG_WORD + +static int +libelf_cvt_WORD_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Word t, *s = (Elf64_Word *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_WORD(t); + WRITE_WORD(dst,t); + } + + return (1); +} + +static int +libelf_cvt_WORD_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Word t, *d = (Elf64_Word *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Word)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_WORD(src,t); + SWAP_WORD(t); + *d++ = t; + } + + 
return (1); +} + +#endif /* LIBELF_CONFIG_WORD */ +#if LIBELF_CONFIG_XWORD + +static int +libelf_cvt_XWORD_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Xword t, *s = (Elf64_Xword *) (uintptr_t) src; + size_t c; + + (void) dsz; + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*s)); + return (1); + } + + for (c = 0; c < count; c++) { + t = *s++; + SWAP_XWORD(t); + WRITE_XWORD(dst,t); + } + + return (1); +} + +static int +libelf_cvt_XWORD_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + Elf64_Xword t, *d = (Elf64_Xword *) (uintptr_t) dst; + size_t c; + + if (dsz < count * sizeof(Elf64_Xword)) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count * sizeof(*d)); + return (1); + } + + for (c = 0; c < count; c++) { + READ_XWORD(src,t); + SWAP_XWORD(t); + *d++ = t; + } + + return (1); +} + +#endif /* LIBELF_CONFIG_XWORD */ + +#if LIBELF_CONFIG_GNUHASH +/* + * Sections of type ELF_T_GNUHASH start with a header containing 4 32-bit + * words. Bloom filter data comes next, followed by hash buckets and the + * hash chain. + * + * Bloom filter words are 64 bit wide on ELFCLASS64 objects and are 32 bit + * wide on ELFCLASS32 objects. The other objects in this section are 32 + * bits wide. + * + * Argument srcsz denotes the number of bytes to be converted. In the + * 32-bit case we need to translate srcsz to a count of 32-bit words. 
+ */ + +static int +libelf_cvt32_GNUHASH_tom(char *dst, size_t dsz, char *src, size_t srcsz, + int byteswap) +{ + return (libelf_cvt_WORD_tom(dst, dsz, src, srcsz / sizeof(uint32_t), + byteswap)); +} + +static int +libelf_cvt32_GNUHASH_tof(char *dst, size_t dsz, char *src, size_t srcsz, + int byteswap) +{ + return (libelf_cvt_WORD_tof(dst, dsz, src, srcsz / sizeof(uint32_t), + byteswap)); +} + +static int +libelf_cvt64_GNUHASH_tom(char *dst, size_t dsz, char *src, size_t srcsz, + int byteswap) +{ + size_t sz; + uint64_t t64, *bloom64; + Elf_GNU_Hash_Header *gh; + uint32_t n, nbuckets, nchains, maskwords, shift2, symndx, t32; + uint32_t *buckets, *chains; + + sz = 4 * sizeof(uint32_t); /* File header is 4 words long. */ + if (dsz < sizeof(Elf_GNU_Hash_Header) || srcsz < sz) + return (0); + + /* Read in the section header and byteswap if needed. */ + READ_WORD(src, nbuckets); + READ_WORD(src, symndx); + READ_WORD(src, maskwords); + READ_WORD(src, shift2); + + srcsz -= sz; + + if (byteswap) { + SWAP_WORD(nbuckets); + SWAP_WORD(symndx); + SWAP_WORD(maskwords); + SWAP_WORD(shift2); + } + + /* Check source buffer and destination buffer sizes. */ + sz = nbuckets * sizeof(uint32_t) + maskwords * sizeof(uint64_t); + if (srcsz < sz || dsz < sz + sizeof(Elf_GNU_Hash_Header)) + return (0); + + gh = (Elf_GNU_Hash_Header *) (uintptr_t) dst; + gh->gh_nbuckets = nbuckets; + gh->gh_symndx = symndx; + gh->gh_maskwords = maskwords; + gh->gh_shift2 = shift2; + + dsz -= sizeof(Elf_GNU_Hash_Header); + dst += sizeof(Elf_GNU_Hash_Header); + + bloom64 = (uint64_t *) (uintptr_t) dst; + + /* Copy bloom filter data. */ + for (n = 0; n < maskwords; n++) { + READ_XWORD(src, t64); + if (byteswap) + SWAP_XWORD(t64); + bloom64[n] = t64; + } + + /* The hash buckets follows the bloom filter. 
*/ + dst += maskwords * sizeof(uint64_t); + buckets = (uint32_t *) (uintptr_t) dst; + + for (n = 0; n < nbuckets; n++) { + READ_WORD(src, t32); + if (byteswap) + SWAP_WORD(t32); + buckets[n] = t32; + } + + dst += nbuckets * sizeof(uint32_t); + + /* The hash chain follows the hash buckets. */ + dsz -= sz; + srcsz -= sz; + + if (dsz < srcsz) /* Destination lacks space. */ + return (0); + + nchains = srcsz / sizeof(uint32_t); + chains = (uint32_t *) (uintptr_t) dst; + + for (n = 0; n < nchains; n++) { + READ_WORD(src, t32); + if (byteswap) + SWAP_WORD(t32); + *chains++ = t32; + } + + return (1); +} + +static int +libelf_cvt64_GNUHASH_tof(char *dst, size_t dsz, char *src, size_t srcsz, + int byteswap) +{ + uint32_t *s32; + size_t sz, hdrsz; + uint64_t *s64, t64; + Elf_GNU_Hash_Header *gh; + uint32_t maskwords, n, nbuckets, nchains, t0, t1, t2, t3, t32; + + hdrsz = 4 * sizeof(uint32_t); /* Header is 4x32 bits. */ + if (dsz < hdrsz || srcsz < sizeof(Elf_GNU_Hash_Header)) + return (0); + + gh = (Elf_GNU_Hash_Header *) (uintptr_t) src; + + t0 = nbuckets = gh->gh_nbuckets; + t1 = gh->gh_symndx; + t2 = maskwords = gh->gh_maskwords; + t3 = gh->gh_shift2; + + src += sizeof(Elf_GNU_Hash_Header); + srcsz -= sizeof(Elf_GNU_Hash_Header); + dsz -= hdrsz; + + sz = gh->gh_nbuckets * sizeof(uint32_t) + gh->gh_maskwords * + sizeof(uint64_t); + + if (srcsz < sz || dsz < sz) + return (0); + + /* Write out the header. */ + if (byteswap) { + SWAP_WORD(t0); + SWAP_WORD(t1); + SWAP_WORD(t2); + SWAP_WORD(t3); + } + + WRITE_WORD(dst, t0); + WRITE_WORD(dst, t1); + WRITE_WORD(dst, t2); + WRITE_WORD(dst, t3); + + /* Copy the bloom filter and the hash table. 
*/ + s64 = (uint64_t *) (uintptr_t) src; + for (n = 0; n < maskwords; n++) { + t64 = *s64++; + if (byteswap) + SWAP_XWORD(t64); + WRITE_WORD64(dst, t64); + } + + s32 = (uint32_t *) s64; + for (n = 0; n < nbuckets; n++) { + t32 = *s32++; + if (byteswap) + SWAP_WORD(t32); + WRITE_WORD(dst, t32); + } + + srcsz -= sz; + dsz -= sz; + + /* Copy out the hash chains. */ + if (dsz < srcsz) + return (0); + + nchains = srcsz / sizeof(uint32_t); + for (n = 0; n < nchains; n++) { + t32 = *s32++; + if (byteswap) + SWAP_WORD(t32); + WRITE_WORD(dst, t32); + } + + return (1); +} +#endif /* LIBELF_CONFIG_GNUHASH */ + +#if LIBELF_CONFIG_NOTE +/* + * Elf_Note structures comprise a fixed size header followed by variable + * length strings. The fixed size header needs to be byte swapped, but + * not the strings. + * + * Argument count denotes the total number of bytes to be converted. + * The destination buffer needs to be at least count bytes in size. + */ +static int +libelf_cvt_NOTE_tom(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + uint32_t namesz, descsz, type; + Elf_Note *en; + size_t sz, hdrsz; + + if (dsz < count) /* Destination buffer is too small. */ + return (0); + + hdrsz = 3 * sizeof(uint32_t); + if (count < hdrsz) /* Source too small. */ + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count); + return (1); + } + + /* Process all notes in the section. */ + while (count > hdrsz) { + /* Read the note header. */ + READ_WORD(src, namesz); + READ_WORD(src, descsz); + READ_WORD(src, type); + + /* Translate. */ + SWAP_WORD(namesz); + SWAP_WORD(descsz); + SWAP_WORD(type); + + /* Copy out the translated note header. */ + en = (Elf_Note *) (uintptr_t) dst; + en->n_namesz = namesz; + en->n_descsz = descsz; + en->n_type = type; + + dsz -= sizeof(Elf_Note); + dst += sizeof(Elf_Note); + count -= hdrsz; + + ROUNDUP2(namesz, 4); + ROUNDUP2(descsz, 4); + + sz = namesz + descsz; + + if (count < sz || dsz < sz) /* Buffers are too small. 
*/ + return (0); + + (void) memcpy(dst, src, sz); + + src += sz; + dst += sz; + + count -= sz; + dsz -= sz; + } + + return (1); +} + +static int +libelf_cvt_NOTE_tof(char *dst, size_t dsz, char *src, size_t count, + int byteswap) +{ + uint32_t namesz, descsz, type; + Elf_Note *en; + size_t sz; + + if (dsz < count) + return (0); + + if (!byteswap) { + (void) memcpy(dst, src, count); + return (1); + } + + while (count > sizeof(Elf_Note)) { + + en = (Elf_Note *) (uintptr_t) src; + namesz = en->n_namesz; + descsz = en->n_descsz; + type = en->n_type; + + SWAP_WORD(namesz); + SWAP_WORD(descsz); + SWAP_WORD(type); + + WRITE_WORD(dst, namesz); + WRITE_WORD(dst, descsz); + WRITE_WORD(dst, type); + + src += sizeof(Elf_Note); + + ROUNDUP2(namesz, 4); + ROUNDUP2(descsz, 4); + + sz = namesz + descsz; + + if (count < sz) + sz = count; + + (void) memcpy(dst, src, sz); + + src += sz; + dst += sz; + count -= sz; + } + + return (1); +} +#endif /* LIBELF_CONFIG_NOTE */ + +struct converters { + int (*tof32)(char *dst, size_t dsz, char *src, size_t cnt, + int byteswap); + int (*tom32)(char *dst, size_t dsz, char *src, size_t cnt, + int byteswap); + int (*tof64)(char *dst, size_t dsz, char *src, size_t cnt, + int byteswap); + int (*tom64)(char *dst, size_t dsz, char *src, size_t cnt, + int byteswap); +}; + +static struct converters cvt[ELF_T_NUM] = { +#if defined(__GNUC__) +#if LIBELF_CONFIG_ADDR + [ELF_T_ADDR] = { + .tof32 = libelf_cvt_ADDR32_tof, .tom32 = libelf_cvt_ADDR32_tom, + .tof64 = libelf_cvt_ADDR64_tof, .tom64 = libelf_cvt_ADDR64_tom }, +#endif +#if LIBELF_CONFIG_CAP + [ELF_T_CAP] = { + .tof32 = libelf_cvt32_CAP_tof, .tom32 = libelf_cvt32_CAP_tom, + .tof64 = libelf_cvt64_CAP_tof, .tom64 = libelf_cvt64_CAP_tom }, +#endif +#if LIBELF_CONFIG_DYN + [ELF_T_DYN] = { + .tof32 = libelf_cvt32_DYN_tof, .tom32 = libelf_cvt32_DYN_tom, + .tof64 = libelf_cvt64_DYN_tof, .tom64 = libelf_cvt64_DYN_tom }, +#endif +#if LIBELF_CONFIG_EHDR + [ELF_T_EHDR] = { + .tof32 = libelf_cvt32_EHDR_tof, 
.tom32 = libelf_cvt32_EHDR_tom, + .tof64 = libelf_cvt64_EHDR_tof, .tom64 = libelf_cvt64_EHDR_tom }, +#endif +#if LIBELF_CONFIG_HALF + [ELF_T_HALF] = { + .tof32 = libelf_cvt_HALF_tof, .tom32 = libelf_cvt_HALF_tom, + .tof64 = libelf_cvt_HALF_tof, .tom64 = libelf_cvt_HALF_tom }, +#endif +#if LIBELF_CONFIG_LWORD + [ELF_T_LWORD] = { + .tof32 = libelf_cvt_LWORD_tof, .tom32 = libelf_cvt_LWORD_tom, + .tof64 = libelf_cvt_LWORD_tof, .tom64 = libelf_cvt_LWORD_tom }, +#endif +#if LIBELF_CONFIG_MOVE + [ELF_T_MOVE] = { + .tof32 = libelf_cvt32_MOVE_tof, .tom32 = libelf_cvt32_MOVE_tom, + .tof64 = libelf_cvt64_MOVE_tof, .tom64 = libelf_cvt64_MOVE_tom }, +#endif +#if LIBELF_CONFIG_MOVEP + [ELF_T_MOVEP] = { + .tof32 = NULL, .tom32 = NULL, + .tof64 = NULL, .tom64 = NULL }, +#endif +#if LIBELF_CONFIG_OFF + [ELF_T_OFF] = { + .tof32 = libelf_cvt_OFF32_tof, .tom32 = libelf_cvt_OFF32_tom, + .tof64 = libelf_cvt_OFF64_tof, .tom64 = libelf_cvt_OFF64_tom }, +#endif +#if LIBELF_CONFIG_PHDR + [ELF_T_PHDR] = { + .tof32 = libelf_cvt32_PHDR_tof, .tom32 = libelf_cvt32_PHDR_tom, + .tof64 = libelf_cvt64_PHDR_tof, .tom64 = libelf_cvt64_PHDR_tom }, +#endif +#if LIBELF_CONFIG_REL + [ELF_T_REL] = { + .tof32 = libelf_cvt32_REL_tof, .tom32 = libelf_cvt32_REL_tom, + .tof64 = libelf_cvt64_REL_tof, .tom64 = libelf_cvt64_REL_tom }, +#endif +#if LIBELF_CONFIG_RELA + [ELF_T_RELA] = { + .tof32 = libelf_cvt32_RELA_tof, .tom32 = libelf_cvt32_RELA_tom, + .tof64 = libelf_cvt64_RELA_tof, .tom64 = libelf_cvt64_RELA_tom }, +#endif +#if LIBELF_CONFIG_SHDR + [ELF_T_SHDR] = { + .tof32 = libelf_cvt32_SHDR_tof, .tom32 = libelf_cvt32_SHDR_tom, + .tof64 = libelf_cvt64_SHDR_tof, .tom64 = libelf_cvt64_SHDR_tom }, +#endif +#if LIBELF_CONFIG_SWORD + [ELF_T_SWORD] = { + .tof32 = libelf_cvt_SWORD_tof, .tom32 = libelf_cvt_SWORD_tom, + .tof64 = libelf_cvt_SWORD_tof, .tom64 = libelf_cvt_SWORD_tom }, +#endif +#if LIBELF_CONFIG_SXWORD + [ELF_T_SXWORD] = { + .tof32 = NULL, .tom32 = NULL, + .tof64 = libelf_cvt_SXWORD_tof, .tom64 = 
libelf_cvt_SXWORD_tom }, +#endif +#if LIBELF_CONFIG_SYMINFO + [ELF_T_SYMINFO] = { + .tof32 = libelf_cvt32_SYMINFO_tof, .tom32 = libelf_cvt32_SYMINFO_tom, + .tof64 = libelf_cvt64_SYMINFO_tof, .tom64 = libelf_cvt64_SYMINFO_tom }, +#endif +#if LIBELF_CONFIG_SYM + [ELF_T_SYM] = { + .tof32 = libelf_cvt32_SYM_tof, .tom32 = libelf_cvt32_SYM_tom, + .tof64 = libelf_cvt64_SYM_tof, .tom64 = libelf_cvt64_SYM_tom }, +#endif +#if LIBELF_CONFIG_VDEF + [ELF_T_VDEF] = { + .tof32 = libelf_cvt32_VDEF_tof, .tom32 = libelf_cvt32_VDEF_tom, + .tof64 = libelf_cvt64_VDEF_tof, .tom64 = libelf_cvt64_VDEF_tom }, +#endif +#if LIBELF_CONFIG_VNEED + [ELF_T_VNEED] = { + .tof32 = libelf_cvt32_VNEED_tof, .tom32 = libelf_cvt32_VNEED_tom, + .tof64 = libelf_cvt64_VNEED_tof, .tom64 = libelf_cvt64_VNEED_tom }, +#endif +#if LIBELF_CONFIG_WORD + [ELF_T_WORD] = { + .tof32 = libelf_cvt_WORD_tof, .tom32 = libelf_cvt_WORD_tom, + .tof64 = libelf_cvt_WORD_tof, .tom64 = libelf_cvt_WORD_tom }, +#endif +#if LIBELF_CONFIG_XWORD + [ELF_T_XWORD] = { + .tof32 = NULL, .tom32 = NULL, + .tof64 = libelf_cvt_XWORD_tof, .tom64 = libelf_cvt_XWORD_tom }, +#endif + + /* + * Types that needs hand-coded converters follow. 
+ */ + + [ELF_T_BYTE] = { + .tof32 = libelf_cvt_BYTE_tox, + .tom32 = libelf_cvt_BYTE_tox, + .tof64 = libelf_cvt_BYTE_tox, + .tom64 = libelf_cvt_BYTE_tox + }, + +#if LIBELF_CONFIG_GNUHASH + [ELF_T_GNUHASH] = { + .tof32 = libelf_cvt32_GNUHASH_tof, + .tom32 = libelf_cvt32_GNUHASH_tom, + .tof64 = libelf_cvt64_GNUHASH_tof, + .tom64 = libelf_cvt64_GNUHASH_tom + }, +#endif /* LIBELF_CONFIG_GNUHASH */ + +#if LIBELF_CONFIG_NOTE + [ELF_T_NOTE] = { + .tof32 = libelf_cvt_NOTE_tof, + .tom32 = libelf_cvt_NOTE_tom, + .tof64 = libelf_cvt_NOTE_tof, + .tom64 = libelf_cvt_NOTE_tom + } +#endif /* LIBELF_CONFIG_NOTE */ +#else + { libelf_cvt_ADDR32_tof, + libelf_cvt_ADDR32_tom, + libelf_cvt_ADDR64_tof, + libelf_cvt_ADDR64_tom + }, + { libelf_cvt_BYTE_tox, + libelf_cvt_BYTE_tox, + libelf_cvt_BYTE_tox, + libelf_cvt_BYTE_tox + }, + { 0, 0, 0, 0} , + { libelf_cvt32_DYN_tof, + libelf_cvt32_DYN_tom, + libelf_cvt64_DYN_tof, + libelf_cvt64_DYN_tom + }, + { libelf_cvt32_EHDR_tof, + libelf_cvt32_EHDR_tom, + libelf_cvt64_EHDR_tof, + libelf_cvt64_EHDR_tom + }, + { + libelf_cvt_HALF_tof, + libelf_cvt_HALF_tom, + libelf_cvt_HALF_tof, + libelf_cvt_HALF_tom + }, + {0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}, +#if LIBELF_CONFIG_NOTE + { libelf_cvt_NOTE_tof, + libelf_cvt_NOTE_tom, + libelf_cvt_NOTE_tof, + libelf_cvt_NOTE_tom + }, +#else + {0, 0, 0, 0}, +#endif /* LIBELF_CONFIG_NOTE */ + { libelf_cvt_OFF32_tof, + libelf_cvt_OFF32_tom, + libelf_cvt_OFF64_tof, + libelf_cvt_OFF64_tom + }, + { libelf_cvt32_PHDR_tof, + libelf_cvt32_PHDR_tom, + libelf_cvt64_PHDR_tof, + libelf_cvt64_PHDR_tom + }, + { libelf_cvt32_REL_tof, + libelf_cvt32_REL_tom, + libelf_cvt64_REL_tof, + libelf_cvt64_REL_tom + }, + { libelf_cvt32_RELA_tof, + libelf_cvt32_RELA_tom, + libelf_cvt64_RELA_tof, + libelf_cvt64_RELA_tom + }, + { libelf_cvt32_SHDR_tof, + libelf_cvt32_SHDR_tom, + libelf_cvt64_SHDR_tof, + libelf_cvt64_SHDR_tom + }, + { libelf_cvt_SWORD_tof, + libelf_cvt_SWORD_tom, + libelf_cvt_SWORD_tof, + libelf_cvt_SWORD_tom + }, + { 0, 
0, + libelf_cvt_SXWORD_tof, + libelf_cvt_SXWORD_tom + }, + {0, 0, 0, 0}, + { libelf_cvt32_SYM_tof, + libelf_cvt32_SYM_tom, + libelf_cvt64_SYM_tof, + libelf_cvt64_SYM_tom + }, + {0, 0, 0, 0}, {0, 0, 0, 0}, + { libelf_cvt_WORD_tof, + libelf_cvt_WORD_tom, + libelf_cvt_WORD_tof, + libelf_cvt_WORD_tom + }, + { 0, 0, + libelf_cvt_XWORD_tof, + libelf_cvt_XWORD_tom + }, + { libelf_cvt32_GNUHASH_tof, + libelf_cvt32_GNUHASH_tom, + libelf_cvt64_GNUHASH_tof, + libelf_cvt64_GNUHASH_tom + } +#endif +}; + +int (*_libelf_get_translator(Elf_Type t, int direction, int elfclass)) + (char *_dst, size_t dsz, char *_src, size_t _cnt, int _byteswap) +{ + assert(elfclass == ELFCLASS32 || elfclass == ELFCLASS64); + assert(direction == ELF_TOFILE || direction == ELF_TOMEMORY); + + if (t >= ELF_T_NUM || + (elfclass != ELFCLASS32 && elfclass != ELFCLASS64) || + (direction != ELF_TOFILE && direction != ELF_TOMEMORY)) + return (NULL); + + return ((elfclass == ELFCLASS32) ? + (direction == ELF_TOFILE ? cvt[t].tof32 : cvt[t].tom32) : + (direction == ELF_TOFILE ? cvt[t].tof64 : cvt[t].tom64)); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_data.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_data.c new file mode 100644 index 0000000000..85039f0dba --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_data.c @@ -0,0 +1,114 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_data.c 1264 2010-11-12 14:53:23Z jkoshy $"); + +int +_libelf_xlate_shtype(uint32_t sht) +{ + switch (sht) { + case SHT_DYNAMIC: + return (ELF_T_DYN); + case SHT_DYNSYM: + return (ELF_T_SYM); +#if defined(SHT_FINI_ARRAY) + case SHT_FINI_ARRAY: + return (ELF_T_ADDR); +#endif +#if defined(SHT_GNU_HASH) + case SHT_GNU_HASH: + return (ELF_T_GNUHASH); +#endif +#if defined(SHT_GNU_LIBLIST) + case SHT_GNU_LIBLIST: + return (ELF_T_WORD); +#endif +#if defined(SHT_GROUP) + case SHT_GROUP: + return (ELF_T_WORD); +#endif + case SHT_HASH: + return (ELF_T_WORD); +#if defined(SHT_INIT_ARRAY) + case SHT_INIT_ARRAY: + return (ELF_T_ADDR); +#endif + case SHT_NOBITS: + return (ELF_T_BYTE); + case SHT_NOTE: + return (ELF_T_NOTE); +#if defined(SHT_PREINIT_ARRAY) + case SHT_PREINIT_ARRAY: + return (ELF_T_ADDR); +#endif + case SHT_PROGBITS: + return (ELF_T_BYTE); + case SHT_REL: + return (ELF_T_REL); + case SHT_RELA: + return 
(ELF_T_RELA); + case SHT_STRTAB: + return (ELF_T_BYTE); + case SHT_SYMTAB: + return (ELF_T_SYM); +#if defined(SHT_SYMTAB_SHNDX) + case SHT_SYMTAB_SHNDX: + return (ELF_T_WORD); +#endif +#if defined(SHT_SUNW_dof) + case SHT_SUNW_dof: + return (ELF_T_BYTE); +#endif +#if defined(SHT_SUNW_move) + case SHT_SUNW_move: + return (ELF_T_MOVE); +#endif +#if defined(SHT_SUNW_syminfo) + case SHT_SUNW_syminfo: + return (ELF_T_SYMINFO); +#endif +#if defined(SHT_SUNW_verdef) + case SHT_SUNW_verdef: /* == SHT_GNU_verdef */ + return (ELF_T_VDEF); +#endif +#if defined(SHT_SUNW_verneed) + case SHT_SUNW_verneed: /* == SHT_GNU_verneed */ + return (ELF_T_VNEED); +#endif +#if defined(SHT_SUNW_versym) + case SHT_SUNW_versym: /* == SHT_GNU_versym */ + return (ELF_T_HALF); +#endif + default: + return (-1); + } +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ehdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ehdr.c new file mode 100644 index 0000000000..4a3da5c43e --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_ehdr.c @@ -0,0 +1,211 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_ehdr.c 1677 2011-07-28 04:35:53Z jkoshy $"); + +/* + * Retrieve counts for sections, phdrs and the section string table index + * from section header #0 of the ELF object. + */ +static int +_libelf_load_extended(Elf *e, int ec, uint64_t shoff, uint16_t phnum, + uint16_t strndx) +{ + Elf_Scn *scn; + size_t fsz; + int (*xlator)(char *_d, size_t _dsz, char *_s, size_t _c, int _swap); + uint32_t shtype; + + assert(STAILQ_EMPTY(&e->e_u.e_elf.e_scn)); + + fsz = _libelf_fsize(ELF_T_SHDR, ec, e->e_version, 1); + assert(fsz > 0); + + if (e->e_rawsize < shoff + fsz) { /* raw file too small */ + LIBELF_SET_ERROR(HEADER, 0); + return (0); + } + + if ((scn = _libelf_allocate_scn(e, (size_t) 0)) == NULL) + return (0); + + xlator = _libelf_get_translator(ELF_T_SHDR, ELF_TOMEMORY, ec); + (*xlator)((char *) &scn->s_shdr, sizeof(scn->s_shdr), + e->e_rawfile + shoff, (size_t) 1, + e->e_byteorder != LIBELF_PRIVATE(byteorder)); + +#define GET_SHDR_MEMBER(M) ((ec == ELFCLASS32) ? 
scn->s_shdr.s_shdr32.M : \ + scn->s_shdr.s_shdr64.M) + + if ((shtype = GET_SHDR_MEMBER(sh_type)) != SHT_NULL) { + LIBELF_SET_ERROR(SECTION, 0); + return (0); + } + + e->e_u.e_elf.e_nscn = GET_SHDR_MEMBER(sh_size); + e->e_u.e_elf.e_nphdr = (phnum != PN_XNUM) ? phnum : + GET_SHDR_MEMBER(sh_info); + e->e_u.e_elf.e_strndx = (strndx != SHN_XINDEX) ? strndx : + GET_SHDR_MEMBER(sh_link); +#undef GET_SHDR_MEMBER + + return (1); +} + +#define EHDR_INIT(E,SZ) do { \ + Elf##SZ##_Ehdr *eh = (E); \ + eh->e_ident[EI_MAG0] = ELFMAG0; \ + eh->e_ident[EI_MAG1] = ELFMAG1; \ + eh->e_ident[EI_MAG2] = ELFMAG2; \ + eh->e_ident[EI_MAG3] = ELFMAG3; \ + eh->e_ident[EI_CLASS] = ELFCLASS##SZ; \ + eh->e_ident[EI_DATA] = ELFDATANONE; \ + eh->e_ident[EI_VERSION] = LIBELF_PRIVATE(version); \ + eh->e_machine = EM_NONE; \ + eh->e_type = ELF_K_NONE; \ + eh->e_version = LIBELF_PRIVATE(version); \ + } while (0) + +void * +_libelf_ehdr(Elf *e, int ec, int allocate) +{ + void *ehdr; + size_t fsz, msz; + uint16_t phnum, shnum, strndx; + uint64_t shoff; + int (*xlator)(char *_d, size_t _dsz, char *_s, size_t _c, int _swap); + + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (e == NULL || e->e_kind != ELF_K_ELF) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (e->e_class != ELFCLASSNONE && e->e_class != ec) { + LIBELF_SET_ERROR(CLASS, 0); + return (NULL); + } + + if (e->e_version != EV_CURRENT) { + LIBELF_SET_ERROR(VERSION, 0); + return (NULL); + } + + if (e->e_class == ELFCLASSNONE) + e->e_class = ec; + + if (ec == ELFCLASS32) + ehdr = (void *) e->e_u.e_elf.e_ehdr.e_ehdr32; + else + ehdr = (void *) e->e_u.e_elf.e_ehdr.e_ehdr64; + + if (ehdr != NULL) /* already have a translated ehdr */ + return (ehdr); + + fsz = _libelf_fsize(ELF_T_EHDR, ec, e->e_version, (size_t) 1); + assert(fsz > 0); + + // If we have a file that is attached to a read/write elf + // or the elf is a read only elf and the size is smaller than the + // file, then error. 
+ if ((e->e_cmd == ELF_C_READ || (e->e_cmd == ELF_C_RDWR && e->e_rawfile)) && e->e_rawsize < fsz) { + LIBELF_SET_ERROR(HEADER, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_EHDR, ec, EV_CURRENT); + + assert(msz > 0); + + if ((ehdr = e->e_mem.alloc(msz)) == NULL) { + LIBELF_SET_ERROR(RESOURCE, 0); + return (NULL); + } + memset(ehdr, 0, msz); + + if (ec == ELFCLASS32) { + e->e_u.e_elf.e_ehdr.e_ehdr32 = ehdr; + EHDR_INIT(ehdr,32); + } else { + e->e_u.e_elf.e_ehdr.e_ehdr64 = ehdr; + EHDR_INIT(ehdr,64); + } + + if (allocate) + e->e_flags |= ELF_F_DIRTY; + + // If there is no raw file, return as we are done and don't need to + // process the file on the disk. + if (!e->e_rawfile) + return (ehdr); + + xlator = _libelf_get_translator(ELF_T_EHDR, ELF_TOMEMORY, ec); + (*xlator)(ehdr, msz, e->e_rawfile, (size_t) 1, + e->e_byteorder != LIBELF_PRIVATE(byteorder)); + + /* + * If extended numbering is being used, read the correct + * number of sections and program header entries. + */ + if (ec == ELFCLASS32) { + phnum = ((Elf32_Ehdr *) ehdr)->e_phnum; + shnum = ((Elf32_Ehdr *) ehdr)->e_shnum; + shoff = ((Elf32_Ehdr *) ehdr)->e_shoff; + strndx = ((Elf32_Ehdr *) ehdr)->e_shstrndx; + } else { + phnum = ((Elf64_Ehdr *) ehdr)->e_phnum; + shnum = ((Elf64_Ehdr *) ehdr)->e_shnum; + shoff = ((Elf64_Ehdr *) ehdr)->e_shoff; + strndx = ((Elf64_Ehdr *) ehdr)->e_shstrndx; + } + + if (shnum >= SHN_LORESERVE || + (shoff == 0LL && (shnum != 0 || phnum == PN_XNUM || + strndx == SHN_XINDEX))) { + LIBELF_SET_ERROR(HEADER, 0); + return (NULL); + } + + if (shnum != 0 || shoff == 0LL) { /* not using extended numbering */ + e->e_u.e_elf.e_nphdr = phnum; + e->e_u.e_elf.e_nscn = shnum; + e->e_u.e_elf.e_strndx = strndx; + } else if (_libelf_load_extended(e, ec, shoff, phnum, strndx) == 0) + return (NULL); + + return (ehdr); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_extended.c 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_extended.c new file mode 100644 index 0000000000..f110a28fe7 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_extended.c @@ -0,0 +1,135 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_extended.c 1360 2011-01-08 08:27:41Z jkoshy $"); + +/* + * Retrieve section #0, allocating a new section if needed. 
+ */ +static Elf_Scn * +_libelf_getscn0(Elf *e) +{ + Elf_Scn *s; + + if ((s = STAILQ_FIRST(&e->e_u.e_elf.e_scn)) != NULL) + return (s); + + return (_libelf_allocate_scn(e, (size_t) SHN_UNDEF)); +} + +int +_libelf_setshnum(Elf *e, void *eh, int ec, size_t shnum) +{ + Elf_Scn *scn; + + if (shnum >= SHN_LORESERVE) { + if ((scn = _libelf_getscn0(e)) == NULL) + return (0); + + assert(scn->s_ndx == SHN_UNDEF); + + if (ec == ELFCLASS32) + scn->s_shdr.s_shdr32.sh_size = shnum; + else + scn->s_shdr.s_shdr64.sh_size = shnum; + + (void) elf_flagshdr(scn, ELF_C_SET, ELF_F_DIRTY); + + shnum = 0; + } + + if (ec == ELFCLASS32) + ((Elf32_Ehdr *) eh)->e_shnum = shnum; + else + ((Elf64_Ehdr *) eh)->e_shnum = shnum; + + return (1); +} + +int +_libelf_setshstrndx(Elf *e, void *eh, int ec, size_t shstrndx) +{ + Elf_Scn *scn; + + if (shstrndx >= SHN_LORESERVE) { + if ((scn = _libelf_getscn0(e)) == NULL) + return (0); + + assert(scn->s_ndx == SHN_UNDEF); + + if (ec == ELFCLASS32) + scn->s_shdr.s_shdr32.sh_link = shstrndx; + else + scn->s_shdr.s_shdr64.sh_link = shstrndx; + + (void) elf_flagshdr(scn, ELF_C_SET, ELF_F_DIRTY); + + shstrndx = SHN_XINDEX; + } + + if (ec == ELFCLASS32) + ((Elf32_Ehdr *) eh)->e_shstrndx = shstrndx; + else + ((Elf64_Ehdr *) eh)->e_shstrndx = shstrndx; + + return (1); +} + +int +_libelf_setphnum(Elf *e, void *eh, int ec, size_t phnum) +{ + Elf_Scn *scn; + + if (phnum >= PN_XNUM) { + if ((scn = _libelf_getscn0(e)) == NULL) + return (0); + + assert(scn->s_ndx == SHN_UNDEF); + + if (ec == ELFCLASS32) + scn->s_shdr.s_shdr32.sh_info = phnum; + else + scn->s_shdr.s_shdr64.sh_info = phnum; + + (void) elf_flagshdr(scn, ELF_C_SET, ELF_F_DIRTY); + + phnum = PN_XNUM; + } + + if (ec == ELFCLASS32) + ((Elf32_Ehdr *) eh)->e_phnum = phnum; + else + ((Elf64_Ehdr *) eh)->e_phnum = phnum; + + return (1); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_fsize.c 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_fsize.c new file mode 100644 index 0000000000..729e4a4019 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_fsize.c @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_fsize.m4 320 2009-03-07 16:37:53Z jkoshy $"); + +/* WARNING: GENERATED FROM libelf_fsize.m4. 
*/ + +/* + * Create an array of file sizes from the elf_type definitions + */ + +struct fsize { + size_t fsz32; + size_t fsz64; +}; + +static struct fsize fsize[ELF_T_NUM] = { +#if defined(__GNUC__) +#if LIBELF_CONFIG_ADDR + [ELF_T_ADDR] = { .fsz32 = sizeof(Elf32_Addr), .fsz64 = sizeof(Elf64_Addr) }, +#endif +#if LIBELF_CONFIG_BYTE + [ELF_T_BYTE] = { .fsz32 = 1, .fsz64 = 1 }, +#endif +#if LIBELF_CONFIG_CAP + [ELF_T_CAP] = { .fsz32 = sizeof(Elf32_Word)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Xword)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_DYN + [ELF_T_DYN] = { .fsz32 = sizeof(Elf32_Sword)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Sxword)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_EHDR + [ELF_T_EHDR] = { .fsz32 = EI_NIDENT+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Word)+sizeof(Elf32_Addr)+sizeof(Elf32_Off)+sizeof(Elf32_Off)+sizeof(Elf32_Word)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+0, .fsz64 = EI_NIDENT+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Word)+sizeof(Elf64_Addr)+sizeof(Elf64_Off)+sizeof(Elf64_Off)+sizeof(Elf64_Word)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+0 }, +#endif +#if LIBELF_CONFIG_GNUHASH + [ELF_T_GNUHASH] = { .fsz32 = 1, .fsz64 = 1 }, +#endif +#if LIBELF_CONFIG_HALF + [ELF_T_HALF] = { .fsz32 = sizeof(Elf32_Half), .fsz64 = sizeof(Elf64_Half) }, +#endif +#if LIBELF_CONFIG_LWORD + [ELF_T_LWORD] = { .fsz32 = sizeof(Elf32_Lword), .fsz64 = sizeof(Elf64_Lword) }, +#endif +#if LIBELF_CONFIG_MOVE + [ELF_T_MOVE] = { .fsz32 = sizeof(Elf32_Lword)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+0, .fsz64 = sizeof(Elf64_Lword)+sizeof(Elf64_Xword)+sizeof(Elf64_Xword)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+0 }, +#endif +#if LIBELF_CONFIG_MOVEP + [ELF_T_MOVEP] = { .fsz32 = 0, .fsz64 = 0 }, +#endif +#if LIBELF_CONFIG_NOTE + [ELF_T_NOTE] = { .fsz32 = 1, 
.fsz64 = 1 }, +#endif +#if LIBELF_CONFIG_OFF + [ELF_T_OFF] = { .fsz32 = sizeof(Elf32_Off), .fsz64 = sizeof(Elf64_Off) }, +#endif +#if LIBELF_CONFIG_PHDR + [ELF_T_PHDR] = { .fsz32 = sizeof(Elf32_Word)+sizeof(Elf32_Off)+sizeof(Elf32_Addr)+sizeof(Elf32_Addr)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Word)+sizeof(Elf64_Word)+sizeof(Elf64_Off)+sizeof(Elf64_Addr)+sizeof(Elf64_Addr)+sizeof(Elf64_Xword)+sizeof(Elf64_Xword)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_REL + [ELF_T_REL] = { .fsz32 = sizeof(Elf32_Addr)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Addr)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_RELA + [ELF_T_RELA] = { .fsz32 = sizeof(Elf32_Addr)+sizeof(Elf32_Word)+sizeof(Elf32_Sword)+0, .fsz64 = sizeof(Elf64_Addr)+sizeof(Elf64_Xword)+sizeof(Elf64_Sxword)+0 }, +#endif +#if LIBELF_CONFIG_SHDR + [ELF_T_SHDR] = { .fsz32 = sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Addr)+sizeof(Elf32_Off)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Word)+sizeof(Elf64_Word)+sizeof(Elf64_Xword)+sizeof(Elf64_Addr)+sizeof(Elf64_Off)+sizeof(Elf64_Xword)+sizeof(Elf64_Word)+sizeof(Elf64_Word)+sizeof(Elf64_Xword)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_SWORD + [ELF_T_SWORD] = { .fsz32 = sizeof(Elf32_Sword), .fsz64 = sizeof(Elf64_Sword) }, +#endif +#if LIBELF_CONFIG_SXWORD + [ELF_T_SXWORD] = { .fsz32 = 0, .fsz64 = sizeof(Elf64_Sxword) }, +#endif +#if LIBELF_CONFIG_SYMINFO + [ELF_T_SYMINFO] = { .fsz32 = sizeof(Elf32_Half)+sizeof(Elf32_Half)+0, .fsz64 = sizeof(Elf64_Half)+sizeof(Elf64_Half)+0 }, +#endif +#if LIBELF_CONFIG_SYM + [ELF_T_SYM] = { .fsz32 = sizeof(Elf32_Word)+sizeof(Elf32_Addr)+sizeof(Elf32_Word)+1+1+sizeof(Elf32_Half)+0, .fsz64 = sizeof(Elf64_Word)+1+1+sizeof(Elf64_Half)+sizeof(Elf64_Addr)+sizeof(Elf64_Xword)+0 }, +#endif +#if LIBELF_CONFIG_VDEF + [ELF_T_VDEF] = { .fsz32 = 
sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Word)+sizeof(Elf64_Word)+sizeof(Elf64_Word)+0 }, +#endif +#if LIBELF_CONFIG_VNEED + [ELF_T_VNEED] = { .fsz32 = sizeof(Elf32_Half)+sizeof(Elf32_Half)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+sizeof(Elf32_Word)+0, .fsz64 = sizeof(Elf64_Half)+sizeof(Elf64_Half)+sizeof(Elf64_Word)+sizeof(Elf64_Word)+sizeof(Elf64_Word)+0 }, +#endif +#if LIBELF_CONFIG_WORD + [ELF_T_WORD] = { .fsz32 = sizeof(Elf32_Word), .fsz64 = sizeof(Elf64_Word) }, +#endif +#if LIBELF_CONFIG_XWORD + [ELF_T_XWORD] = { .fsz32 = 0, .fsz64 = sizeof(Elf64_Xword) }, +#endif +#elif defined(_MSC_VER) + {4, 8}, {1, 1}, {0, 0}, {8, 16}, {52, 64}, + {2, 2}, {0, 0}, {0, 0}, {0, 0}, {1, 1}, + {4, 8}, {32, 56}, {8, 16}, {12, 24}, {40, 64}, + {4, 4}, {0, 8}, {0, 0}, {16, 24}, {20, 20}, + {16, 16}, {4, 4}, {0, 8}, {1, 1} +#else +#error +#endif +}; + +size_t +_libelf_fsize(Elf_Type t, int ec, unsigned int v, size_t c) +{ + size_t sz; + + sz = 0; + if (v != EV_CURRENT) + LIBELF_SET_ERROR(VERSION, 0); + else if ((int) t < ELF_T_FIRST || t > ELF_T_LAST) + LIBELF_SET_ERROR(ARGUMENT, 0); + else { + sz = ec == ELFCLASS64 ? fsize[t].fsz64 : fsize[t].fsz32; + if (sz == 0) + LIBELF_SET_ERROR(UNIMPL, 0); + } + + return (sz*c); +} + diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_msize.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_msize.c new file mode 100644 index 0000000000..ce3447393b --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_msize.c @@ -0,0 +1,145 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_msize.m4 311 2009-02-26 16:46:31Z jkoshy $"); + +/* WARNING: GENERATED FROM libelf_msize.m4. 
*/ + +struct msize { + size_t msz32; + size_t msz64; +}; + +static struct msize msize[ELF_T_NUM] = { +#if defined(__GNUC__) +#if LIBELF_CONFIG_ADDR + [ELF_T_ADDR] = { .msz32 = sizeof(Elf32_Addr), .msz64 = sizeof(Elf64_Addr) }, +#endif +#if LIBELF_CONFIG_BYTE + [ELF_T_BYTE] = { .msz32 = 1, .msz64 = 1 }, +#endif +#if LIBELF_CONFIG_CAP + [ELF_T_CAP] = { .msz32 = sizeof(Elf32_Cap), .msz64 = sizeof(Elf64_Cap) }, +#endif +#if LIBELF_CONFIG_DYN + [ELF_T_DYN] = { .msz32 = sizeof(Elf32_Dyn), .msz64 = sizeof(Elf64_Dyn) }, +#endif +#if LIBELF_CONFIG_EHDR + [ELF_T_EHDR] = { .msz32 = sizeof(Elf32_Ehdr), .msz64 = sizeof(Elf64_Ehdr) }, +#endif +#if LIBELF_CONFIG_GNUHASH + [ELF_T_GNUHASH] = { .msz32 = 1, .msz64 = 1 }, +#endif +#if LIBELF_CONFIG_HALF + [ELF_T_HALF] = { .msz32 = sizeof(Elf32_Half), .msz64 = sizeof(Elf64_Half) }, +#endif +#if LIBELF_CONFIG_LWORD + [ELF_T_LWORD] = { .msz32 = sizeof(Elf32_Lword), .msz64 = sizeof(Elf64_Lword) }, +#endif +#if LIBELF_CONFIG_MOVE + [ELF_T_MOVE] = { .msz32 = sizeof(Elf32_Move), .msz64 = sizeof(Elf64_Move) }, +#endif +#if LIBELF_CONFIG_MOVEP + [ELF_T_MOVEP] = { .msz32 = 0, .msz64 = 0 }, +#endif +#if LIBELF_CONFIG_NOTE + [ELF_T_NOTE] = { .msz32 = 1, .msz64 = 1 }, +#endif +#if LIBELF_CONFIG_OFF + [ELF_T_OFF] = { .msz32 = sizeof(Elf32_Off), .msz64 = sizeof(Elf64_Off) }, +#endif +#if LIBELF_CONFIG_PHDR + [ELF_T_PHDR] = { .msz32 = sizeof(Elf32_Phdr), .msz64 = sizeof(Elf64_Phdr) }, +#endif +#if LIBELF_CONFIG_REL + [ELF_T_REL] = { .msz32 = sizeof(Elf32_Rel), .msz64 = sizeof(Elf64_Rel) }, +#endif +#if LIBELF_CONFIG_RELA + [ELF_T_RELA] = { .msz32 = sizeof(Elf32_Rela), .msz64 = sizeof(Elf64_Rela) }, +#endif +#if LIBELF_CONFIG_SHDR + [ELF_T_SHDR] = { .msz32 = sizeof(Elf32_Shdr), .msz64 = sizeof(Elf64_Shdr) }, +#endif +#if LIBELF_CONFIG_SWORD + [ELF_T_SWORD] = { .msz32 = sizeof(Elf32_Sword), .msz64 = sizeof(Elf64_Sword) }, +#endif +#if LIBELF_CONFIG_SXWORD + [ELF_T_SXWORD] = { .msz32 = 0, .msz64 = sizeof(Elf64_Sxword) }, +#endif +#if 
LIBELF_CONFIG_SYMINFO + [ELF_T_SYMINFO] = { .msz32 = sizeof(Elf32_Syminfo), .msz64 = sizeof(Elf64_Syminfo) }, +#endif +#if LIBELF_CONFIG_SYM + [ELF_T_SYM] = { .msz32 = sizeof(Elf32_Sym), .msz64 = sizeof(Elf64_Sym) }, +#endif +#if LIBELF_CONFIG_VDEF + [ELF_T_VDEF] = { .msz32 = sizeof(Elf32_Verdef), .msz64 = sizeof(Elf64_Verdef) }, +#endif +#if LIBELF_CONFIG_VNEED + [ELF_T_VNEED] = { .msz32 = sizeof(Elf32_Verneed), .msz64 = sizeof(Elf64_Verneed) }, +#endif +#if LIBELF_CONFIG_WORD + [ELF_T_WORD] = { .msz32 = sizeof(Elf32_Word), .msz64 = sizeof(Elf64_Word) }, +#endif +#if LIBELF_CONFIG_XWORD + [ELF_T_XWORD] = { .msz32 = 0, .msz64 = sizeof(Elf64_Xword) }, +#endif +#elif defined(_MSC_VER) + {4, 8}, {1, 1}, {0, 0}, {8, 16}, {52, 64}, + {2, 2}, {0, 0}, {0, 0}, {0, 0}, {1, 1}, + {4, 8}, {32, 56}, {8, 16}, {12, 24}, {40, 64}, + {4, 4}, {0, 8}, {0, 0}, {16, 24}, {20, 20}, + {16, 16}, {4, 4}, {0, 8}, {1, 1} +#else +#error +#endif +}; + +size_t +_libelf_msize(Elf_Type t, int elfclass, unsigned int version) +{ + size_t sz; + + assert(elfclass == ELFCLASS32 || elfclass == ELFCLASS64); + assert((signed) t >= ELF_T_FIRST && t <= ELF_T_LAST); + + if (version != EV_CURRENT) { + LIBELF_SET_ERROR(VERSION, 0); + return (0); + } + + sz = (elfclass == ELFCLASS32) ? msize[t].msz32 : msize[t].msz64; + + return (sz); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_phdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_phdr.c new file mode 100644 index 0000000000..67ff4f9eb5 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_phdr.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include +#include +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_phdr.c 1677 2011-07-28 04:35:53Z jkoshy $"); + +void * +_libelf_getphdr(Elf *e, int ec) +{ + size_t phnum, phentsize; + size_t fsz, msz; + uint64_t phoff; + Elf32_Ehdr *eh32; + Elf64_Ehdr *eh64; + void *ehdr, *phdr; + int (*xlator)(char *_d, size_t _dsz, char *_s, size_t _c, int _swap); + + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((phdr = (ec == ELFCLASS32 ? + (void *) e->e_u.e_elf.e_phdr.e_phdr32 : + (void *) e->e_u.e_elf.e_phdr.e_phdr64)) != NULL) + return (phdr); + + /* + * Check the PHDR related fields in the EHDR for sanity. 
+ */ + + if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL) + return (NULL); + + phnum = e->e_u.e_elf.e_nphdr; + + if (ec == ELFCLASS32) { + eh32 = (Elf32_Ehdr *) ehdr; + phentsize = eh32->e_phentsize; + phoff = (uint64_t) eh32->e_phoff; + } else { + eh64 = (Elf64_Ehdr *) ehdr; + phentsize = eh64->e_phentsize; + phoff = (uint64_t) eh64->e_phoff; + } + + fsz = gelf_fsize(e, ELF_T_PHDR, phnum, e->e_version); + + assert(fsz > 0); + + if ((uint64_t) e->e_rawsize < (phoff + fsz)) { + LIBELF_SET_ERROR(HEADER, 0); + return (NULL); + } + + msz = _libelf_msize(ELF_T_PHDR, ec, EV_CURRENT); + + assert(msz > 0); + + if ((phdr = e->e_mem.alloc(phnum * msz)) == NULL) { + LIBELF_SET_ERROR(RESOURCE, 0); + return (NULL); + } + memset(phdr, 0, msz); + + if (ec == ELFCLASS32) + e->e_u.e_elf.e_phdr.e_phdr32 = phdr; + else + e->e_u.e_elf.e_phdr.e_phdr64 = phdr; + + xlator = _libelf_get_translator(ELF_T_PHDR, ELF_TOMEMORY, ec); + (*xlator)(phdr, phnum * msz, e->e_rawfile + phoff, phnum, + e->e_byteorder != LIBELF_PRIVATE(byteorder)); + + return (phdr); +} + +void * +_libelf_newphdr(Elf *e, int ec, size_t count) +{ + void *ehdr, *newphdr, *oldphdr; + size_t msz; + + if (e == NULL) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if ((ehdr = _libelf_ehdr(e, ec, 0)) == NULL) { + LIBELF_SET_ERROR(SEQUENCE, 0); + return (NULL); + } + + assert(e->e_class == ec); + assert(ec == ELFCLASS32 || ec == ELFCLASS64); + assert(e->e_version == EV_CURRENT); + + msz = _libelf_msize(ELF_T_PHDR, ec, e->e_version); + + assert(msz > 0); + + newphdr = NULL; + if (count > 0 && (newphdr = e->e_mem.alloc(count * msz)) == NULL) { + LIBELF_SET_ERROR(RESOURCE, 0); + return (NULL); + } + if (count > 0) { + memset(newphdr, 0, count * msz); + } + + if (ec == ELFCLASS32) { + if ((oldphdr = (void *) e->e_u.e_elf.e_phdr.e_phdr32) != NULL) + e->e_mem.dealloc(oldphdr); + e->e_u.e_elf.e_phdr.e_phdr32 = (Elf32_Phdr *) newphdr; + } else { + if ((oldphdr = (void *) e->e_u.e_elf.e_phdr.e_phdr64) != NULL) + 
e->e_mem.dealloc(oldphdr); + e->e_u.e_elf.e_phdr.e_phdr64 = (Elf64_Phdr *) newphdr; + } + + e->e_u.e_elf.e_nphdr = count; + + elf_flagphdr(e, ELF_C_SET, ELF_F_DIRTY); + + return (newphdr); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_shdr.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_shdr.c new file mode 100644 index 0000000000..a696cefc76 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_shdr.c @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_shdr.c 189 2008-07-20 10:38:08Z jkoshy $"); + +void * +_libelf_getshdr(Elf_Scn *s, int ec) +{ + Elf *e; + + if (s == NULL || (e = s->s_elf) == NULL || + e->e_kind != ELF_K_ELF) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + if (ec == ELFCLASSNONE) + ec = e->e_class; + + if (ec != e->e_class) { + LIBELF_SET_ERROR(CLASS, 0); + return (NULL); + } + + return ((void *) &s->s_shdr); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_xlate.c b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_xlate.c new file mode 100644 index 0000000000..ace4e09363 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/libelf_xlate.c @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 2006,2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +#include +#include + +#include "_libelf.h" + +LIBELF_VCSID("$Id: libelf_xlate.c 316 2009-02-28 16:08:44Z jkoshy $"); + +/* + * Translate to/from the file representation of ELF objects. + * + * Translation could potentially involve the following + * transformations: + * + * - an endianness conversion, + * - a change of layout, as the file representation of ELF objects + * can differ from their in-memory representation. + * - a change in representation due to a layout version change. + */ + +Elf_Data * +_libelf_xlate(Elf_Data *dst, const Elf_Data *src, unsigned int encoding, + int elfclass, int direction) +{ + int byteswap; + size_t cnt, dsz, fsz, msz; + uintptr_t sb, se, db, de; + + if (encoding == ELFDATANONE) + encoding = LIBELF_PRIVATE(byteorder); + + if ((encoding != ELFDATA2LSB && encoding != ELFDATA2MSB) || + dst == NULL || src == NULL || dst == src) { + LIBELF_SET_ERROR(ARGUMENT, 0); + return (NULL); + } + + assert(elfclass == ELFCLASS32 || elfclass == ELFCLASS64); + assert(direction == ELF_TOFILE || direction == ELF_TOMEMORY); + + if (dst->d_version != src->d_version) { + LIBELF_SET_ERROR(UNIMPL, 0); + return (NULL); + } + + if (src->d_buf == NULL || dst->d_buf == NULL) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + if ((int) src->d_type < 0 || src->d_type >= ELF_T_NUM) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + if ((fsz = (elfclass == ELFCLASS32 ? 
elf32_fsize : elf64_fsize) + (src->d_type, (size_t) 1, src->d_version)) == 0) + return (NULL); + + msz = _libelf_msize(src->d_type, elfclass, src->d_version); + + assert(msz > 0); + + if (src->d_size % (direction == ELF_TOMEMORY ? fsz : msz)) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + /* + * Determine the number of objects that need to be converted, and + * the space required for the converted objects in the destination + * buffer. + */ + if (direction == ELF_TOMEMORY) { + cnt = src->d_size / fsz; + dsz = cnt * msz; + } else { + cnt = src->d_size / msz; + dsz = cnt * fsz; + } + + if (dst->d_size < dsz) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + sb = (uintptr_t) src->d_buf; + se = sb + src->d_size; + db = (uintptr_t) dst->d_buf; + de = db + dst->d_size; + + /* + * Check for overlapping buffers. Note that db == sb is + * allowed. + */ + if (db != sb && de > sb && se > db) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + if ((direction == ELF_TOMEMORY ? db : sb) % + _libelf_malign(src->d_type, elfclass)) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + dst->d_type = src->d_type; + dst->d_size = dsz; + + byteswap = encoding != LIBELF_PRIVATE(byteorder); + + if (src->d_size == 0 || + (db == sb && !byteswap && fsz == msz)) + return (dst); /* nothing more to do */ + + if (!(_libelf_get_translator(src->d_type, direction, elfclass)) + (dst->d_buf, dsz, src->d_buf, cnt, byteswap)) { + LIBELF_SET_ERROR(DATA, 0); + return (NULL); + } + + return (dst); +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.cpp b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.cpp new file mode 100644 index 0000000000..0639f523cc --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.cpp @@ -0,0 +1,403 @@ +/* +************************************************************************************************************************ +* +* 
Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +// +// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. +// + +#include "memfile.h" +#include +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) +#include +#include +#include +#else +#include +#if !defined(PROT_READ) +#define PROT_READ 0x0004 // FILE_MAP_READ +#endif +#if !defined(MAP_PRIVATE) +#define MAP_PRIVATE 0x0001 // FILE_MAP_COPY +#endif +#endif + +// Allocation granularity +#define ALLOC_G 512 +#define is_file(fd) ((fd) >= 0) + +#if defined(_WIN32) + +#define OPEN ::_open +#define READ(f, b, l) ::_read((f), (b), (unsigned int)(l)) +#define WRITE(f, b, l) ::_write((f), (b), (unsigned int)(l)) +#define CLOSE ::_close +#define LSEEK ::_lseek +#define FSTAT ::fstat +#define FTRUNC(f, l) ::_chsize((f), (long)(l)) +#define MMAP ::w32_mmap +#define MUNMAP ::w32_munmap + +#else + +#define OPEN ::open +#define READ(f, b, l) ::read((f), (b), (size_t)(l)) +#define WRITE ::write +#define CLOSE ::close +#define LSEEK ::lseek +#define FSTAT ::fstat +#define FTRUNC(f, l) ::ftruncate((f), (off_t)(l)) +#define MMAP ::mmap +#define MUNMAP ::munmap + +#endif + +#if defined(_WIN32) +extern "C" void* w32_mmap(void* start, size_t length, int prot, int flags, int fd, unsigned offset); +extern "C" int w32_munmap(void* start, size_t length); +#endif + +namespace amd { + +// A structure which either maintains in memory file or uses a real file. 
+class memfile_t {
+public:
+    memfile_t() : buf(nullptr), curp(nullptr), size(0) {}
+
+    // Resize the backing allocation to hold at least new_size bytes,
+    // rounded up to a multiple of ALLOC_G (a request of 0 is treated as 1).
+    // The current position is preserved.  Returns false, leaving the
+    // existing buffer untouched, when realloc() fails.
+    bool reserve(size_t new_size) {
+        if (!new_size)
+            new_size = 1;
+        new_size = (new_size + ALLOC_G - 1) & ~(ALLOC_G - 1);
+        size_t pos = tell();
+        void *p = realloc(buf, new_size);
+        if (!p)
+            return false;
+        buf = p;
+        setpos(pos);
+        return true;
+    }
+
+    // Start a new, empty in-memory file.  oflag and pmode are accepted for
+    // symmetry with open() but are not used.
+    bool open(int oflag, int pmode)
+    {
+        size = 0;
+        buf = curp = nullptr;
+        return reserve(1);
+    }
+
+    // Copy up to count bytes from the current position into buffer and
+    // advance the position.  Returns the number of bytes copied (0 at or
+    // past the end), or -1 with errno = EINVAL when buffer is null.
+    off_t read(void *buffer, size_t count)
+    {
+        if (!buffer) {
+            errno = EINVAL;
+            return -1;
+        }
+
+        size_t pos = tell();
+        if (pos >= size)
+            return 0;
+
+        size_t ret = size - pos;
+        ret = std::min(ret, count);
+        memcpy(buffer, curp, ret);
+        advance(ret);
+        return (off_t)ret;
+    }
+
+    // Copy count bytes from buffer to the current position, growing the
+    // file when needed, and advance the position.  A gap left by seeking
+    // past the end is zero-filled.  Returns count, or -1 on a null buffer
+    // (errno = EINVAL) or when the buffer cannot be grown.
+    off_t write(const void *buffer, size_t count)
+    {
+        if (!buffer) {
+            errno = EINVAL;
+            return -1;
+        }
+
+        size_t pos = tell();
+        size_t new_size = std::max(pos + count, size);
+        if (new_size > size) {
+            if (!reserve(new_size))
+                return -1;
+            if (pos > size)
+                memset((char*)buf + size, 0, pos - size);
+            size = new_size;
+        }
+
+        memcpy(curp, buffer, count);
+        advance(count);
+        return (off_t)count;
+    }
+
+    // Release the buffer.  Returns 0, or -1 with errno = EBADF when the
+    // file is not open.
+    int close() {
+        if (is_open()) {
+            free(buf);
+            buf = nullptr;
+            curp = nullptr;   // do not keep a pointer into freed memory
+            size = 0;
+            return 0;
+        }
+        errno = EBADF;
+        return -1;
+    }
+
+    // Move the current position relative to the start (SEEK_SET), the
+    // current position (SEEK_CUR) or the end (SEEK_END).  Returns the new
+    // position, or -1 with errno set for an unknown origin (EINVAL) or a
+    // negative resulting offset (EOVERFLOW).
+    off_t lseek(off_t offset, int origin)
+    {
+        switch (origin) {
+        default:
+            errno = EINVAL;
+            return -1;
+        case SEEK_SET:
+            break;
+        case SEEK_CUR:
+            offset += (off_t)tell();
+            break;
+        case SEEK_END:
+            offset += (off_t)size;
+            break;
+        }
+
+        if (offset < 0) {
+            errno = EOVERFLOW;
+            return -1;
+        }
+
+        setpos((size_t)offset);
+        return offset;
+    }
+
+    // Fill *buf with zeros except for st_mode (S_IFREG) and st_size.
+    // Returns false with errno = EBADF when the file is not open.
+    bool fstat(struct stat *buf) const
+    {
+        if (!is_open()) {
+            errno = EBADF;
+            return false;
+        }
+
+        memset(buf, 0, sizeof(struct stat));
+        buf->st_mode = S_IFREG;
+        buf->st_size = (off_t)size;
+        return true;
+    }
+
+    // Set the file length to len.  Growing zero-fills the new tail; the
+    // current position is left unchanged in both directions.  Returns
+    // false only when the buffer cannot be grown.
+    bool ftruncate(size_t len)
+    {
+        if (len > size) {
+            // reserve() keeps the current position across the realloc.
+            if (!reserve(len))
+                return false;
+            memset((char*)buf + size, 0, len - size);
+            size = len;
+        } else {
+            // A failed shrink leaves the larger buffer in place, which
+            // is still valid for the smaller logical size.
+            reserve(len);
+            size = len;
+        }
+        return true;
+    }
+
+    bool is_open() const { return buf != nullptr; }
+    size_t tell() const { return size_t((char*)curp - (char*)buf); }
+    void* get() const { return buf; }
+
+protected:
+
+    void setpos(size_t new_pos) { curp = (char*)buf + new_pos; }
+    void advance(off_t offset) { curp = (char*)curp + offset; }
+    void advance(size_t offset) { curp = (char*)curp + offset; }
+
+private:
+    void* buf;      // start of the allocation, nullptr when closed
+    void* curp;     // current read/write position inside buf
+    size_t size;    // logical file size in bytes
+};
+
+} // namespace amd
+
+using namespace amd;
+
+// Table of in-memory files; slots of closed files are reused by mem_open().
+static std::vector<memfile_t> Files;
+
+// In-memory files use descriptors -2, -3, ...; -1 stays an error value
+// and non-negative descriptors are real files.
+static size_t fd2idx(int fd)
+{
+    return (unsigned)-fd - 2;
+}
+
+static int idx2fd(size_t idx)
+{
+    return -(int)idx - 2;
+}
+
+// Map an in-memory descriptor to its open memfile_t.  Returns nullptr with
+// errno = EBADF when fd is not an open in-memory descriptor.
+static memfile_t* get_memfile(int fd)
+{
+    if (fd >= -1) {
+        errno = EBADF;
+        return nullptr;
+    }
+
+    size_t fno = fd2idx(fd);
+
+    if (fno >= Files.size()) {
+        errno = EBADF;
+        return nullptr;
+    }
+
+    memfile_t &m = Files[fno];
+    if (!m.is_open()) {
+        errno = EBADF;
+        return nullptr;
+    }
+
+    return &m;
+}
+
+// Acts the same as open(), but path can be NULL, which is a request for in memory file
+int mem_open(const char *path, int oflag, int pmode)
+{
+    if (path && path[0]) // Filename provided, real file requested
+        return OPEN(path, oflag, pmode);
+
+    memfile_t m;
+    if (!m.open(oflag, pmode))
+        return -1;
+
+    // Reuse the slot of a previously closed file if there is one.
+    for (size_t i = 0; i < Files.size(); ++i) {
+        if (!Files[i].is_open()) {
+            Files[i] = m;
+            return idx2fd(i);
+        }
+    }
+
+    Files.push_back(m);
+    return idx2fd(Files.size() - 1);
+}
+
+// read() for both kinds of descriptor: real files go to the OS, in-memory
+// descriptors to their memfile_t.
+off_t mem_read(int fd, void *buffer, size_t count)
+{
+    if (is_file(fd))
+        return READ(fd, buffer, count);
+
+    memfile_t *m = get_memfile(fd);
+    if (!m)
+        return -1;
+
+    return m->read(buffer, count);
+}
+
+// write() for both kinds of descriptor, dispatched like mem_read().
+off_t mem_write(int fd, const void *buffer, size_t count)
+{
+    if (is_file(fd))
+        return WRITE(fd, buffer, count);
+
+    memfile_t *m = get_memfile(fd);
+    if (!m)
+        return -1;
+
+    return m->write(buffer, count);
+}
+
+int mem_close(int fd)
+{
+    if (is_file(fd))
+        return CLOSE(fd);
+
+    memfile_t *m = get_memfile(fd);
if (!m) + return -1; + + int ret = m->close(); + + if ((size_t)fd == (Files.size() - 1)) + Files.pop_back(); + + return ret; +} + +off_t mem_lseek(int fd, off_t offset, int origin) +{ + if (is_file(fd)) + return LSEEK(fd, offset, origin); + + memfile_t *m = get_memfile(fd); + if (!m) + return -1; + + return m->lseek(offset, origin); +} + +int mem_fstat(int fd, struct stat *buf) +{ + if (is_file(fd)) + return FSTAT(fd, buf); + + memfile_t *m = get_memfile(fd); + if (!m) + return -1; + + return m->fstat(buf) ? 0 : -1; +} + +int mem_ftruncate(int fd, size_t len) +{ + if (is_file(fd)) + return FTRUNC(fd, len); + + memfile_t *m = get_memfile(fd); + if (!m) + return -1; + + return m->ftruncate(len) ? 0 : -1; +} + +off_t mem_sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ +#if defined(__GNUC__) + if (is_file(in_fd) && is_file(out_fd)) + return sendfile(out_fd, in_fd, offset, count); +#endif + + off_t start = offset ? *offset : mem_lseek(in_fd, 0, SEEK_CUR); + struct stat sb; + if (mem_fstat(in_fd, &sb) == -1) + return -1; + if (start < 0 || sb.st_size <= start) + return 0; + count = std::min(count, (size_t)(sb.st_size - start)); + void *in = mem_mmap(NULL, count, PROT_READ, MAP_PRIVATE, in_fd, start); + if ((void*)-1 == in) + return -1; + + off_t written = mem_write(out_fd, in, count); + mem_munmap(in, count); + if (written < 0) + return -1; + + if (offset) { + *offset += written; + } else { + if (mem_lseek(in_fd, written, SEEK_CUR) < 0) + return -1; + } + + return written; +} + +void* mem_mmap(void* start, size_t length, int prot, int flags, int fd, unsigned offset) +{ + if (is_file(fd)) + return MMAP(start, length, prot, flags, fd, offset); + + memfile_t *m = get_memfile(fd); + if (!m) + return (void*)-1; + + return (char*)m->get() + offset; +} + +int mem_munmap(void* start, size_t length) +{ + MUNMAP(start, length); + return 0; +} diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.h 
b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.h new file mode 100644 index 0000000000..1e906418ee --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/memfile.h @@ -0,0 +1,40 @@ +/* +************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +// +// Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. +// + +#ifndef _MEMFILE_H +#define _MEMFILE_H + +#include +#if !defined(_MSC_VER) +#include +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +// Acts the same as open(), but path can be NULL, which is a request for in memory file +extern int mem_open(const char *path, int oflag, int pmode); +extern off_t mem_read(int fd, void *buffer, size_t count); +extern off_t mem_write(int fd, const void *buffer, size_t count); +extern int mem_close(int fd); +extern off_t mem_lseek(int fd, off_t offset, int origin); +extern int mem_fstat(int fd, struct stat *buf); +extern int mem_ftruncate(int fd, size_t len); +extern off_t mem_sendfile(int out_fd, int in_fd, off_t *offset, size_t count); +extern void* mem_mmap(void* start, size_t length, int prot, int flags, int fd, unsigned offset); +extern int mem_munmap(void* start, size_t length); + +#if defined(__cplusplus) +} +#endif + +#endif // !_MEMFILE_H diff --git a/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/roundup.h b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/roundup.h new file mode 100644 index 0000000000..622c077110 --- /dev/null +++ b/shared/amdgpu-windows-interop/hsail-compiler/lib/loaders/elf/utils/libelf/roundup.h @@ -0,0 +1,15 @@ +/* 
+************************************************************************************************************************ +* +* Copyright (C) 2020 Advanced Micro Devices, Inc. All rights reserved. +* +***********************************************************************************************************************/ + +#ifdef __GNUC__ +# define roundup(x, y) (__builtin_constant_p (y) && powerof2 (y) \ + ? (((x) + (y) - 1) & ~((y) - 1)) \ + : ((((x) + ((y) - 1)) / (y)) * (y))) +#else +# define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif + diff --git a/shared/amdgpu-windows-interop/pal/CMakeLists.txt b/shared/amdgpu-windows-interop/pal/CMakeLists.txt new file mode 100644 index 0000000000..2be3e0d49e --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/CMakeLists.txt @@ -0,0 +1,92 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to deal + # in the Software without restriction, including without limitation the rights + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + # copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + # SOFTWARE. + # + ####################################################################################################################### +cmake_minimum_required(VERSION 3.21) +project(PAL LANGUAGES CXX) +add_library(pal INTERFACE) +set_target_properties(pal PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + POSITION_INDEPENDENT_CODE TRUE +) +target_compile_features(pal INTERFACE cxx_std_20) +if (NOT PAL_CLIENT_INTERFACE_MAJOR_VERSION EQUAL 932) + message(WARNING "PAL: PAL_CLIENT_INTERFACE_MAJOR_VERSION ${PAL_CLIENT_INTERFACE_MAJOR_VERSION} not supported !!!") +endif() +target_link_libraries(pal + INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palCompilerDeps.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palUtil.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_uuid.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/cwpack.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_lz4.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/addrlib.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/amdrdf.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/zstd.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/vam.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/UberTraceService.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/DriverUtilsService.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_settings.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/SettingsRpcService2.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcServer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddNet.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcShared.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddSocket.lib + 
${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/devdriver.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_common.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCommon.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCore.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_libyaml.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/mpack.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/metrohash.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/stb_sprintf.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcClient.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventStreamer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventClient.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventParser.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventServer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddYaml.lib + SetupAPI.Lib +) +target_compile_definitions(pal + INTERFACE + PAL_CLIENT_INTERFACE_MAJOR_VERSION=932 + GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION=42 + PAL_BUILD_RDF=1 + PAL_DEVELOPER_BUILD=0 + PAL_KMT_BUILD=1 +) +target_include_directories(pal + INTERFACE + inc + inc/core + inc/gpuUtil + inc/util + shared/inc + shared/devdriver/shared/legacy/inc + shared/devdriver/third_party/dd_crc32/inc + shared/metrohash/src +) diff --git a/shared/amdgpu-windows-interop/pal/LICENSE b/shared/amdgpu-windows-interop/pal/LICENSE new file mode 100644 index 0000000000..45b7b6b50c --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/shared/amdgpu-windows-interop/pal/inc/core/pal.h b/shared/amdgpu-windows-interop/pal/inc/core/pal.h new file mode 100644 index 0000000000..544a556f1d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/pal.h @@ -0,0 +1,833 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pal.h + * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines common types, enums, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palFormat.h" +#include "palSysUtil.h" + +// Forward declarations of global types (must be done outside of Pal namespace). +#if (PAL_KMT_BUILD) && !defined(__unix__) +struct HMONITOR__; +struct HWND__; +#endif + +#if PAL_KMT_BUILD +struct _SECURITY_ATTRIBUTES; +#endif + +/// Library-wide namespace encapsulating all PAL entities. 
+namespace Pal +{ + +typedef Util::int8 int8; ///< 8-bit integer. +typedef Util::int16 int16; ///< 16-bit integer. +typedef Util::int32 int32; ///< 32-bit integer. +typedef Util::int64 int64; ///< 64-bit integer. +typedef Util::uint8 uint8; ///< Unsigned 8-bit integer. +typedef Util::uint16 uint16; ///< Unsigned 16-bit integer. +typedef Util::uint32 uint32; ///< Unsigned 32-bit integer. +typedef Util::uint64 uint64; ///< Unsigned 64-bit integer. +typedef Util::gpusize gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from + /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. +typedef Util::Result Result; ///< The PAL core and utility companion share the same result codes for convenience. + +typedef Util::Rational Rational; ///< A ratio of two unsigned integers. + +#if defined(_WIN32) +typedef HMONITOR__* OsDisplayHandle; ///< OsDisplayHandle corresponds to an HMONITOR on Windows. +typedef HWND__* OsWindowHandle; ///< OsWindowHandle corresponds to an HWND on Windows. +typedef void* OsExternalHandle; ///< OsExternalHandle corresponds to a generic HANDLE on Windows + +typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on Vulkan. + +constexpr OsWindowHandle NullWindowHandle = nullptr; ///< Value representing a null or invalid window handle. +#elif defined(__unix__) + +typedef void* OsDisplayHandle; ///< The Display Handle for Linux except X11 platform +typedef uint32 OsExternalHandle; ///< OsExternalHandle corresponds to a generic handle on linux +typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on linux. + +/// OsWindowHandle corresponds to a window on X-Windows or surface on Wayland. +union OsWindowHandle +{ + void* pSurface; ///< Native surface handle in wayland is a pointer. + uint64 win; ///< Native window handle in X is a 32-bit integer (but stored here as 64 bit). 
+}; +constexpr OsWindowHandle NullWindowHandle = {nullptr}; ///< Value representing a null or invalid window handle. + +// don't check for the Linux Platform type; just compare the larger member of the union +inline bool operator==(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) + { return (lhs.pSurface == rhs.pSurface); } +inline bool operator!=(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) + { return (lhs.pSurface != rhs.pSurface); } +#else +#error "Unsupported OS platform detected!" +#endif + +#if PAL_CLIENT_EXAMPLE +typedef void* AddrHandle; ///< Corresponds to an ADDR_HANDLE. +#endif + +constexpr uint32 InvalidVidPnSourceId = ~0u; ///< In cases where PAL cannot abstract a Windows VidPnSourceId, this + /// represents an invalid value. (Note: zero is a valid value.) + +constexpr uint32 MaxVertexBuffers = 32; ///< Maximum number of vertex buffers per pipeline. +constexpr uint32 MaxColorTargets = 8; ///< Maximum number of color targets. +constexpr uint32 MaxStreamOutTargets = 4; ///< Maximum number of stream output target buffers. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 +constexpr uint32 MaxDescriptorSets = 2; ///< Maximum number of descriptor sets. +#endif +constexpr uint32 MaxMsaaRasterizerSamples = 16; ///< Maximum number of MSAA samples supported by the rasterizer. +constexpr uint32 MaxAvailableEngines = 12; ///< Maximum number of engines for a particular engine type. +constexpr uint32 MaxNumPlanes = 3; ///< Maximum number of format planes. + +constexpr uint64 InternalApiPsoHash = UINT64_MAX; ///< Default Hash for PAL internal pipelines. + +/// Specifies a category of GPU engine. Each category corresponds directly to a hardware engine. There may be multiple +/// engines available for a given type; the available engines on a particular GPU can be queried via +/// Device::GetProperties, returned in DeviceProperties.engineProperties[]. +enum EngineType : uint32 +{ + /// Corresponds to the graphics hardware engine (a.k.a. 
graphics ring a.k.a. 3D). + EngineTypeUniversal, + + /// Corresponds to asynchronous compute engines (ACE). + EngineTypeCompute, + + /// Corresponds to SDMA engines. + EngineTypeDma, + + /// Virtual engine that only supports inserting sleeps, used for implementing frame-pacing. + EngineTypeTimer, + + /// Number of engine types. + EngineTypeCount, +}; + +/// Specifies a category of GPU work. Each queue type only supports specific types of work. Determining which +/// QueueTypes are supported on which engines can be queried via IDevice::GetProperties, returned in +/// DeviceProperties.engineProperties[]. +enum QueueType : uint32 +{ + /// Supports graphics commands (draws), compute commands (dispatches), and copy commands. + QueueTypeUniversal, + + /// Supports compute commands (dispatches), and copy commands. + QueueTypeCompute, + + /// Supports copy commands. + QueueTypeDma, + + /// Virtual engine that only supports inserting sleeps, used for implementing frame pacing. + /// This is a software-only queue. + QueueTypeTimer, + + /// Number of queue types. + QueueTypeCount, +}; + +/// Defines flags for describing which queues are supported. +enum QueueTypeSupport : uint32 +{ + SupportQueueTypeUniversal = (1 << static_cast(QueueTypeUniversal)), + SupportQueueTypeCompute = (1 << static_cast(QueueTypeCompute)), + SupportQueueTypeDma = (1 << static_cast(QueueTypeDma)), + SupportQueueTypeTimer = (1 << static_cast(QueueTypeTimer)), + +}; + +// Many command buffers break down into multiple command streams targeting internal sub-engines. For example, Universal +// command buffers build a primary stream (DE) but may also build a second stream for async compute engine (ACE). +enum class SubEngineType : uint32 +{ + Primary = 0, // Subqueue that is the queue itself, rather than an ancillary queue. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 + AsyncCompute = 1, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. 
+ ConstantEngine = 2, // CP constant update engine that runs in parallel with draw engine. + // Internal usage only. +#else + ConstantEngine = 1, // CP constant update engine that runs in parallel with draw engine. + AsyncCompute = 2, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. + Pup = 3, // Subqueue that is the queue itself but for PUP-style packets, rather than an + // ancillary queue +#endif + Count, +}; + +/// Defines the execution priority for a queue, specified either at queue creation or via IQueue::SetExecutionPriority() +/// on platforms that support it. QueuePriority::Normal corresponds to the default priority. +enum class QueuePriority : uint32 +{ + Normal = 0, ///< Normal priority (default). + Idle = 1, ///< Idle, or low priority (lower than Normal). + Medium = 2, ///< Medium priority (higher than Normal). + High = 3, ///< High priority (higher than Normal). + Realtime = 4, ///< Real time priority (higher than Normal). + Count +}; + +/// Defines flags for describing which queue priority levels are supported. +enum QueuePrioritySupport : uint32 +{ + SupportQueuePriorityNormal = (1 << static_cast(QueuePriority::Normal)), + SupportQueuePriorityIdle = (1 << static_cast(QueuePriority::Idle)), + SupportQueuePriorityMedium = (1 << static_cast(QueuePriority::Medium)), + SupportQueuePriorityHigh = (1 << static_cast(QueuePriority::High)), + SupportQueuePriorityRealtime = (1 << static_cast(QueuePriority::Realtime)), +}; + +/// Selects one of a few possible memory heaps accessible by a GPU. +enum GpuHeap : uint32 +{ + GpuHeapLocal = 0x0, ///< Local heap visible to the CPU. + GpuHeapInvisible = 0x1, ///< Local heap not visible to the CPU. + GpuHeapGartUswc = 0x2, ///< GPU-accessible uncached system memory. + GpuHeapGartCacheable = 0x3, ///< GPU-accessible cached system memory. + GpuHeapCount +}; + +/// Describes the desired access for a memory allocation. 
+enum GpuHeapAccess : uint32 +{ + GpuHeapAccessExplicit = 0x0, ///< Memory access is not known. Heaps will be explicitly defined. + GpuHeapAccessCpuNoAccess = 0x1, ///< Memory access from CPU not required. + GpuHeapAccessGpuMostly = 0x2, ///< Memory optimized for reads/writes from GPU and accessible from CPU. + GpuHeapAccessCpuReadMostly = 0x3, ///< Memory optimized for reads from CPU. + GpuHeapAccessCpuWriteMostly = 0x4, ///< Memory optimized for writes from CPU. + GpuHeapAccessCpuMostly = 0x5, ///< Memory optimized for read/writes from CPU. + GpuHeapAccessCount +}; + +#if defined(__unix__) +/// Describes possible handle types. +enum class HandleType : uint32 +{ + GemFlinkName = 0x0, ///< GEM flink name (needs DRM authentication, used by DRI2) + Kms = 0x1, ///< KMS handle which is used by all driver ioctls + DmaBufFd = 0x2, ///< DMA-buf fd handle + KmsNoImport = 0x3, ///< Deprecated in favour of and same behaviour as HandleType::DmaBufFd, use that instead of this +}; +#endif + +/// Comparison function determines how a pass/fail condition is determined between two values. For depth/stencil +/// comparison, the first value comes from source data and the second value comes from destination data. +enum class CompareFunc : uint8 +{ + Never = 0x0, + Less = 0x1, + Equal = 0x2, + LessEqual = 0x3, + Greater = 0x4, + NotEqual = 0x5, + GreaterEqual = 0x6, + _Always = 0x7, + + // Unfortunately for Linux clients, X.h includes a "#define Always 2" macro. Clients have their choice of either + // undefining Always before including this header or using _Always when dealing with PAL. +#ifndef Always + Always = _Always, +#endif + + Count +}; + +/// Defines an offset into a 2D pixel region. +struct Offset2d +{ + int32 x; ///< X offset. + int32 y; ///< Y offset. +}; + +/// Defines an offset into a 3D pixel region. +struct Offset3d +{ + int32 x; ///< X offset. + int32 y; ///< Y offset. + int32 z; ///< Z offset. +}; + +/// Defines a floating-point offset into a 3D pixel region. 
+struct Offset3dFloat +{ + float x; ///< X offset. + float y; ///< Y offset. + float z; ///< Z offset. +}; + +/// Defines a width and height for a 2D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent2d +{ + uint32 width; ///< Width of region. + uint32 height; ///< Height of region. +}; + +/// Defines a signed width and height, for a 2D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct SignedExtent2d +{ + int32 width; ///< Width of region. + int32 height; ///< Height of region. +}; + +/// Defines a width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent3d +{ + uint32 width; ///< Width of region. + uint32 height; ///< Height of region. + uint32 depth; ///< Depth of region. +}; + +constexpr bool operator==(const Extent3d& x, const Extent3d& y) +{ + return (x.width == y.width) && (x.height == y.height) && (x.depth == y.depth); +} + +constexpr bool operator!=(const Extent3d& x, const Extent3d& y) { return (x == y) == false; } + +/// Defines a signed width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct SignedExtent3d +{ + int32 width; ///< Width of region. + int32 height; ///< Height of region. + int32 depth; ///< Depth of region. +}; + +/// Defines a floating-point width, height, and depth for a 3D image region. 
The dimensions could be pixels, blocks, or +/// bytes depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent3dFloat +{ + float width; ///< Width of region. + float height; ///< Height of region. + float depth; ///< Depth of region. +}; + +/// Defines a region in 1D space. +struct Range +{ + int32 offset; ///< Starting position. + uint32 extent; ///< Region size. +}; + +/// Defines a rectangular region in 2D space. +struct Rect +{ + Offset2d offset; ///< Top left corner. + Extent2d extent; ///< Rectangle width and height. +}; + +/// Defines a cubic region in 3D space. +struct Box +{ + Offset3d offset; ///< Top left front corner. + Extent3d extent; ///< Box width, height and depth. +}; + +/// ShaderHash represents a 128-bit shader hash. +struct ShaderHash +{ + uint64 lower; ///< Lower 64-bits of hash + uint64 upper; ///< Upper 64-bits of hash +}; + +/// PipelineHash represents a concatenated pair of 64-bit hashes. +struct PipelineHash +{ + uint64 stable; ///< Lower 64-bits of hash. "Stable" portion, suitable for e.g. shader replacement use cases. + uint64 unique; ///< Upper 64-bits of hash. "Unique" portion, suitable for e.g. pipeline cache use cases. +}; + +/// Common shader pre and post compilation stats. +struct CommonShaderStats +{ + uint32 numUsedVgprs; ///< Number of VGPRs used by this shader + uint32 numUsedSgprs; ///< Number of SGPRs used by this shader + + uint32 ldsSizePerThreadGroup; ///< LDS size per thread group in bytes. + size_t ldsUsageSizeInBytes; ///< LDS usage by this shader. + + size_t scratchMemUsageInBytes; ///< Amount of scratch mem used by this shader. + gpusize gpuVirtAddress; ///< Gpu mem address of shader ISA code. + + union + { + struct + { + uint32 isWave32 : 1; ///< If set, specifies that the shader is compiled in wave32 mode. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit uint. 
+ } flags; ///< Shader compilation stat flags. +}; + +/// Per-thread stack sizes +struct CompilerStackSizes +{ + uint32 backendSize; ///< Managed by compiler backend + uint32 frontendSize; ///< Managed by compiler frontend +}; + +///@{ +/// Determines whether two ShaderHashes or PipelineHashes are equal. +/// +/// @param [in] hash1 The first 128-bit shader hash or pipeline hash +/// @param [in] hash2 The second 128-bit shader hash or pipeline hash +/// +/// @returns True if the hashes are equal. +constexpr bool ShaderHashesEqual(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } +constexpr bool operator==(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } +constexpr bool operator!=(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower != hash2.lower) || (hash1.upper != hash2.upper)); } +constexpr bool PipelineHashesEqual(const PipelineHash hash1, const PipelineHash hash2) + { return ((hash1.stable == hash2.stable) && (hash1.unique == hash2.unique)); } +///@} + +///@{ +/// Determines whether the given ShaderHash or PipelineHash is non-zero. +/// +/// @param [in] hash A 128-bit shader hash or pipeline hash +/// +/// @returns True if the hash is non-zero. +constexpr bool ShaderHashIsNonzero(const ShaderHash hash) { return ((hash.upper | hash.lower) != 0); } +constexpr bool PipelineHashIsNonzero(const PipelineHash hash) { return ((hash.stable | hash.unique) != 0); } +///@} + +/// Specifies the Display Output Post-Processing (DOPP) desktop texture information, which are provided by OpenGL via +/// interop. The DOPP is an OpenGL extension to allow its client to access the desktop texture directly without the +/// need of copying to system memory. This is only supported on Windows. +struct DoppDesktopInfo +{ + gpusize gpuVirtAddr; ///< The VA of the dopp desktop texture. Set to 0 for the non-dopp resource. 
+ uint32 vidPnSourceId; ///< Display source id of the dopp desktop texture. +}; + +/// Specifies the Direct Capture resource information. Direct Capture is an extension that allows accessing the on-screen +/// primary, motion vectors, depth, and camera matrix directly. This is only supported on Windows. +struct DirectCaptureInfo +{ + uint32 vidPnSourceId; ///< VidPnSource ID of the on-screen primary. + union + { + struct + { + uint32 preflip : 1; ///< Requires pre-flip primary access + uint32 postflip : 1; ///< Requires post-flip primary access. A DirectCapture resource cannot + /// have pre-flip and post-flip access at the same time + uint32 accessDesktop : 1; ///< Requires access to the desktop + uint32 shared : 1; ///< This resource will be shared between APIs + uint32 frameGenRatio : 4; ///< Frame generation ratio + uint32 paceGeneratedFrame : 1; ///< Requires pacing the generated frames + uint32 requiresDisplayDcc : 1; ///< Requires display dcc support + uint32 requestMotionVectors : 1; ///< Request DirectCapture access to motion vector data if available + uint32 requestDepth : 1; ///< Request DirectCapture access to depth data if available + uint32 requestCamera : 1; ///< Request DirectCapture access to camera matrix data if available + uint32 initMotionVectors : 1; ///< Initialize the DirectCapture resource to access motion vector data + uint32 initDepth : 1; ///< Initialize the DirectCapture resource to access depth data + uint32 initCamera : 1; ///< Initialize the DirectCapture resource to access camera matrix + uint32 requestHudLessImage : 1; ///< Request DirectCapture access to HUD less image if available + uint32 initHudLessImage : 1; ///< Initialize the DirectCapture resource to access HUD less image + uint32 reserved : 14; + }; + uint32 u32All; + } usageFlags; + + OsExternalHandle hNewFrameEvent; ///< Event to notify of a new frame available for pre-flip or post-flip access + OsExternalHandle hFatalErrorEvent; ///< Event to notify of a fatal error +}; + 
+/// Specifies parameters for opening a shared GPU resource from a non-PAL device or non-local process. +struct ExternalResourceOpenInfo +{ + OsExternalHandle hExternalResource; ///< External GPU resource from another non-PAL device to open. +#if defined(__unix__) + HandleType handleType; ///< Type of the external GPU resource to be opened. +#endif + + union + { + struct + { + uint32 ntHandle : 1; ///< The provided hExternalResource is an NT handle instead of a default + /// KMT handle. + uint32 androidHwBufHandle : 1; ///< The provided hExternalResource is android hardware buffer handle + /// instead of fd. + uint32 isDopp : 1; ///< This is a Dopp texture, doppDesktopInfo is in use. + uint32 isDirectCapture : 1; ///< This is a Direct Capture resource, directCaptureInfo is in use. + uint32 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External resource open flags. + + union + { + DoppDesktopInfo doppDesktopInfo; ///< The information of dopp desktop texture. + DirectCaptureInfo directCaptureInfo; ///< The information of direct capture resource. + }; +}; + +/// Packed pixel display enumeration. +/// +/// In the medical imaging market space, there are several 10-bit per component color and grayscale displays +/// available. In addition to being high precision, these displays tend to be very high resolution. For grayscale +/// displays, one method of getting high pixel resolution in 10b precision is a proprietary method called +/// "packed pixel". Each of these packed pixel formats packs two/three 10-bit luminance values into a single +/// R8G8B8 pixel. +/// +/// Example Displays: +/// +/// EIZO GS510 +/// NEC MD21GS +/// TOTOKU ME55Xi2 +/// FIMI 3/5MP +/// +/// +/// The enumerations are named in a way to describe the format of the packed pixels.
Names for +/// formats with two or three pixels packed into a single word (corresponding to a simple RGB pixel) +/// follow this convention: +/// +/// LLLLLL_RRRRRR (L=left pixel, R=right pixel) or +/// LLL_MMM_RRR (L=left pixel, M=middle pixel, R=right pixel) +/// +/// The bit order for a pixel follows this convention: +/// +/// (ColorBand)MSB(ColorBand)LSB +/// +/// For example: G70B54 means that the MSBs are in bits 7-0 of the green channel, and the LSBs +/// are stored in bits 5-4 of the blue channel. +/// +enum class PackedPixelType : uint32 +{ + NotPacked = 0, ///< Pixels not packed, for standard color RGB8 monitor + SplitG70B54_R70B10, ///< 10-bit mono, split screen + SplitB70G10_R70G76, ///< 10-bit mono, split screen + G70B54_R70B10, ///< 10-bit mono, 2 adjacent pixels + B70R32_G70R76, ///< 10-bit mono, 2 adjacent pixels + B70R30_G70R74, ///< 12-bit mono, 2 adjacent pixels + B70_G70_R70, ///< 8-bit mono, 3 adjacent pixels + R70G76, ///< 10-bit mono, single pixel + G70B54, ///< 10-bit mono, single pixel + Native, ///< 10-bit color, without packing +}; + +/// Enumerates the logging priority levels supported by PAL. +enum class LogLevel : uint32 +{ + Debug = 0, ///< Debug messages + Verbose, ///< High frequency messages + Info, ///< Low frequency messages + Alert, ///< Warnings + Error, ///< Critical issues + Always ///< All messages +}; + +/// Enumerates all log categories explicitly defined by PAL +enum class LogCategory : uint64 +{ + Correctness = 0, ///< Application correctness + Performance, ///< Application performance + Internal, ///< Internal logging + Display, ///< Display Info + Count +}; + +/// String table used to register log categories; entries are listed in LogCategory enumerator order +constexpr const char* LogCategoryTable[] = +{ + "Correctness", + "Performance", + "Internal", + "Display" +}; + +/// Typedef for log category masks.
+typedef uint64 LogCategoryMask; + +/// Log category mask for messages related to application correctness +constexpr LogCategoryMask LogCategoryMaskCorrectness = (1 << static_cast(LogCategory::Correctness)); + +/// Log category mask for messages related to application performance +constexpr LogCategoryMask LogCategoryMaskPerformance = (1 << static_cast(LogCategory::Performance)); + +/// Log category mask for messages related to internal messages +constexpr LogCategoryMask LogCategoryMaskInternal = (1 << static_cast(LogCategory::Internal)); + +/// Log category mask for messages related to display information (e.g. HDR format) +constexpr LogCategoryMask LogCategoryMaskDisplay = (1 << static_cast(LogCategory::Display)); + +/// Defines the modes that the GPU Profiling layer can be enabled with. If the GpuProfilerMode is +/// GpuProfilerTraceEnabledTtv or GpuProfilerTraceEnabledRgp, then the GpuProfilerConfig_TraceModeMask is examined to +/// configure the trace type (spm, sqtt or both) requested. +enum GpuProfilerMode : uint32 +{ + GpuProfilerDisabled = 0, ///< Gpu Profiler is disabled. + GpuProfilerCounterAndTimingOnly = 1, ///< Traces are disabled but perf counter and timing operations are enabled. + GpuProfilerTraceEnabledTtv = 2, ///< Traces are output in format (.csv, .out) for Thread trace viewer. + GpuProfilerTraceEnabledRgp = 3, ///< Trace data is output as .rgp file for Radeon Gpu Profiler. +}; + +// Defines the trigger keys for capturing the GPU profiler. +typedef Util::KeyCode GpuProfilerCaptureTriggerKey; + +#define PAL_EVENT_LOGGING_VERSION 528 + +/// This enumeration identifies the source/owner of a resource object, used for event logging. 
+enum ResourceOwner : uint32 +{ + ResourceOwnerApplication = 0, ///< The resource is owned by the application + ResourceOwnerPalClient = 1, ///< The resource is owned by the PAL client + ResourceOwnerPal = 2, ///< The resource is owned by PAL + ResourceOwnerUnknown = 3, ///< The resource owner is unknown +}; + +/// This enumeration lists the usage/category of a resource object to give context in event logging. +enum ResourceCategory : uint32 +{ + ResourceCategoryApplication = 0, ///< The resource is used by the application. + ResourceCategoryRpm = 1, ///< The resource is used by RPM + ResourceCategoryProfiling = 2, ///< The resource is used for profiling (e.g. SQTT, SPM, etc) + ResourceCategoryDebug = 3, ///< The resource is used for debug purposes + ResourceCategoryRayTracing = 4, ///< The resource is used for ray tracing + ResourceCategoryVideo = 5, ///< The resource is used for video encode/decode + ResourceCategoryMisc = 6, ///< Miscellaneous, resource doesn't fit in any of the above categories + ResourceCategoryUnknown = 7, ///< The resource category is unknown +}; + +/// Set of information about resource ownership and usage, used for event logging. +struct ResourceEventInfo +{ + ResourceOwner owner; ///< Resource owner + ResourceCategory category; ///< Resource category +}; + +/// General purpose on/off/default tri-state enum. +enum class TriState : uint8 +{ + Default = 0, ///< Let implementation decide whether to enable or disable + Enable = 1, ///< Force enable + Disable = 2, ///< Force disable + Count +}; + +/// Defines the modes that the GPU Profiling layer can be enabled with. +/** + *********************************************************************************************************************** + * @mainpage + * + * Introduction + * ------------ + * The Platform Abstraction Library (PAL) provides hardware and OS abstractions for Radeon (GCN+) user-mode 3D graphics + * drivers. 
The level of abstraction is chosen to support performant driver implementations of several APIs while + * hiding the client from hardware and operating system details. + * + * PAL client drivers will have no HW-specific code; their responsibility is to translate API/DDI commands into PAL + * commands as efficiently as possible. This means that the client should be unaware of hardware registers, PM4 + * commands, SP3 shaders, etc. However, PAL is an abstraction of AMD hardware only, so many things in the PAL interface + * have an obvious correlation to hardware features. + * + * PAL client drivers should have little OS-specific code. PAL and its companion utility collection provide + * OS abstractions for almost everything a client might need, but there are some cases where this is unavoidable: + * + * + Handling dynamic library infrastructure. I.e., the client has to implement DllMain() on Windows, etc. + * + OS-specific APIs or extensions. DX may have Windows-specific functionality in the core API, and Vulkan may + * export certain OS-specific features as extensions (like for presenting contents to the screen). + * + Single OS clients (e.g., DX) may choose to make OS-specific calls directly simply out of convenience with no down + * side. + * + * + * The following diagram illustrates the software stack when running a 3D application with a PAL-based UMD. Non-AMD + * components are in gray, UMD client code is blue, AMD static libs linked into the UMD are green, and the AMD KMD + * is in red. + * + * @image html swStack.png + * + * PAL is a relatively _thick_ abstraction layer, typically accounting for the majority of code (excluding SC) in any + * particular UMD built on PAL. The level of abstraction tends to be higher in areas where client APIs are similar, + * and lower (closer to hardware) in areas where client APIs diverge significantly. The overall philosophy is to share + * as much code as possible without impacting client driver performance. 
Our committed goal is that CPU-limited + * performance should be within 5% of what a native solution could achieve, and GPU-limited performance should be within + * 2%. + * + * PAL uses a C++ interface. The public interface is defined in .../pal/inc, and client must _only_ include headers + * from that directory. The interface is spread over many header files - typically one per class - in order to clarify + * dependencies and reduce build times. There are two sub-directories in .../pal/inc: + * + * + .../pal/inc/core - Defines the PAL Core (see @ref Overview). + * + .../pal/inc/gpuUtil - Defines the PAL GPU Utility Collection (see @ref GpuUtilOverview). + * + .../pal/inc/util - Defines the PAL Utility Collection (see @ref UtilOverview). + * + * + * @copydoc VersionHistory + * + * Next: @ref Build + *********************************************************************************************************************** + */ + +/** + *********************************************************************************************************************** + * @page Overview PAL Core Overview + * + * ### Introduction + * PAL's core interface is defined in the @ref Pal namespace, and defines an object-oriented model for interacting with + * the GPU and OS. The interface closely resembles the Vulkan and DX12 APIs. Some common features of these + * APIs that are central to the PAL interface: + * + * - All shader stages, and some additional "shader adjacent" state, are glommed together into a monolithic pipeline + * object. + * - Explicit, free-threaded command buffer generation. + * - Support for multiple, asynchronous engines for executing GPU work (graphics, compute, DMA). + * - Explicit system and GPU memory management. + * - Flexible shader resource binding model. + * - Explicit management of stalls, cache flushes, and compression state changes. 
+ * + * However, as a common component supporting multiple APIs, the PAL interface tends to be lower level in places where + * client APIs diverge. + * + * ### Settings + * The PAL library has a number of configuration settings available for the client to modify either programmatically + * or via external settings. PAL also includes infrastructure for building/loading client-specific settings. + * See @ref Settings for a detailed description of this support. + * + * ### Initialization + * The first step to interacting with the PAL core is creating an IPlatform object and enumerating IDevice objects + * representing GPUs attached to the system and, optionally, IScreen objects representing displays attached to the + * system. See @ref LibInit for a detailed description. + * + * ### System Memory Allocation + * Clients have a lot of control over PAL's system memory allocations. Most PAL objects require the client to provide + * system memory; the client first calls a GetSize() method and then passes a pointer to PAL on the actual create call. + * Further, when PAL needs to make an internal allocation, it will optionally call a client callback, which can be + * specified on platform creation. This callback will specify a category for the allocation, which may imply an + * expected lifetime. + * + * ### Interface Classes + * The following diagram illustrates the relationship of some key PAL interfaces and how they interact to render a + * typical frame in a modern game. Below that is a listing of all of PAL's interface classes, and a very brief + * description of their purpose. Follow the link for each interface to see detailed reference documentation. + * + * @image html scheduling.png + * + * - __OS Abstractions__ + * + _IPlatform_: Root-level object created by clients that interact with PAL. Mostly responsible for enumerating + * devices and screens attached to the system and returning any system-wide properties.

+ * + _IDevice_: Configurable context for querying properties of a particular GPU and interacting with it. Acts as a + * factory for almost all other PAL objects.

+ * + _IQueue_: A device has one or more _engines_ which are able to issue certain types of work. Tahiti, for example, + * has 1 universal engine (supports graphics, compute, or copy commands), 2 compute engines (support + * compute or copy commands), and 2 DMA engines (support only copy commands). An IQueue object is a + * context for submitting work on a particular engine. This mainly takes the form of submitting command + * buffers and presenting images to the screen. Work performed in a queue will be started in order, but + * work executed on different queues (even if the queues reference the same engine) is not guaranteed + * to be ordered without explicit synchronization.

+ * + _IQueueSemaphore_: Queue semaphores can be signaled and waited on from an IQueue in order to control execution + * order between queues.

+ * + _IFence_: Used for coarse-grain CPU/GPU synchronization. Fences can be signalled from the GPU as part of a + * command buffer submission on a queue, then waited on from the CPU.

+ * + _IGpuMemory_: Represents a GPU-accessible memory allocation. Can either be virtual (only VA allocation which + * must be explicitly mapped via an IQueue operation) or physical. Residency of physical allocations + * must be managed by the client either globally for a device (IDevice::AddGpuMemoryReferences) or by + * specifying allocations referenced by command buffers at submit.

+ * + _ICmdAllocator_: GPU memory allocation pool used for backing an ICmdBuffer. The client is free to create one + * allocator per device, or one per thread to remove thread contention.

+ * + _IScreen_: Represents a display attached to the system. Mostly used for managing full-screen flip + * presents.

+ * + _IPrivateScreen_: Represents a display that is not otherwise visible to the OS, typically a VR head mounted + * display.

+ * - __Hardware IP Abstractions__ + * + __All IP__ + * - _ICmdBuffer_: Clients build command buffers to execute the desired work on the GPU, and submit them on a + * corresponding queue. Different types of work can be executed depending on the _queueType_ of + * the command buffer (graphics work, compute work, DMA work).

+ * - _IImage_: Images are a 1D, 2D, or 3D collection of pixels (i.e., _texture_) that can be accessed by the + * GPU in various ways: texture sampling, BLT source/destination, UAV, etc.

+ * + __GFXIP-only__ + * - _IShader_: Container for shader byte code used as an input to pipeline creation. No compilation occurs + * until an IPipeline is created. Currently, AMDIL is the only supported input language.

+ * - _IPipeline_: Comprised of all shader stages (CS for compute, VS/HS/DS/GS/PS for graphics), resource mappings + * describing how user data entries are to be used by the shaders, and some other fixed-function + * state like depth/color formats, blend enable, MSAA enable, etc.

+ * - _IColorTargetView_: IImage view allowing the image to be bound as a color target (i.e., RTV).

+ * - _IDepthStencilView_: IImage view allowing the image to be bound as a depth/stencil target (i.e., DSV).

+ * - _IGpuEvent_: Used for fine-grained (intra-command buffer) synchronization between the CPU and GPU. GPU + * events can be set/reset from either the CPU or GPU and waited on from either.

+ * - _IQueryPool_: Collection of query slots for tracking occlusion or pipeline stats query results.

+ * - __Dynamic State Objects__: _IColorBlendState_, _IDepthStencilState_, _IMsaaState_, _IScissorState_, + * and _IViewportState_ define logical collections of related fixed function graphics + * state, similar to DX11.

+ * - _IPerfExperiment_: Used for gathering performance counter and thread trace data.

+ * - _IBorderColorPalette_: Provides a collection of indexable colors for use by samplers that clamp to an + * arbitrary border color.

+ * - __Common Base Classes__ + * + _IDestroyable_: Defines a _Destroy()_ method for the PAL interface. Calling _Destroy()_ will release any + * internally allocated resources for the object, but the client is still responsible for freeing + * the system memory provided for the object.

+ * + _IGpuMemoryBindable_: Defines a set of methods for binding GPU memory to the object. Interfaces that inherit + * _IGpuMemoryBindable_ require GPU memory in order to be used by the GPU. The client + * must query the requirements (e.g., alignment, size, heaps) and allocate/bind GPU memory + * for the object. _IGpuMemoryBindable_ inherits from _IDestroyable_.

+ * + * ### %Format Info + * Several helper methods are available for dealing with image formats in the @ref Formats namespace. + * + * ### Graphics/Compute Execution Model + * Most graphics/compute work is defined by first binding a set of states then issuing a draw or dispatch command to + * kick off the work. The complete set of graphics states available in PAL is illustrated below; compute is a subset + * of this that only includes the pipeline, user data entries, and border color palette. + * + * @image html stateBreakdown.jpg + * + * Most of these correspond directly to a PAL interface object above, and these items are bound by calling a + * corresponding _CmdBind...()_ method in the ICmdBuffer interface. The states marked in yellow and orange, however, + * are _immediate_ states for which there is no object, you just specify the required state values in the corresponding + * _CmdSet...()_ method in the ICmdBuffer interface. + * + * User data entries are the way that input resources are specified for the pipeline on an upcoming draw/dispatch. This + * mapping is complicated, and is described fully in @ref ResourceBinding. + * + * A final complication worth noting is that PAL provides no implicit surface synchronization. The client is + * responsible for explicitly inserting barriers to resolve data hazards, flush/invalidate caches, and ensure images + * are in the proper compression state. For more detail, see ICmdBuffer::CmdReleaseThenAcquire, CmdRelease, CmdAcquire, + * CmdReleaseEvent, CmdAcquireEvent and AcquireReleaseInfo. 
+ * + *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h new file mode 100644 index 0000000000..542d470247 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h @@ -0,0 +1,204 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdAllocator.h + * @brief Defines the Platform Abstraction Library (PAL) ICmdAllocator interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +// Forward declarations. +struct GpuMemSubAllocInfo; +class IGpuMemory; + +/// Flags controlling the creation of ICmdAllocator objects. +union CmdAllocatorCreateFlags +{ + struct + { + uint32 threadSafe : 1; ///< If set, the allocator will acquire a lock each time it is accessed; + /// otherwise it will not attempt to protect itself from multithreaded + /// access. + uint32 autoMemoryReuse : 1; ///< If set, the allocator will track when the GPU finishes accessing + /// each piece of command memory and attempt to reuse memory which the + /// GPU is done with before allocating more memory from the OS. If not + /// set, memory will only be recycled after a call to + /// @ref ICmdAllocator::Reset(). + uint32 disableBusyChunkTracking : 1; ///< If set, the allocator will not do any GPU-side tracking of which + /// command chunks are still in use. It will be the client's (or the + /// application's) responsibility to guarantee that command chunks are + /// not returned to the allocator before the GPU has finished processing + /// them. Failure to guarantee this will result in undefined behavior. + /// This flag has no effect if @ref autoMemoryReuse is not set. + uint32 autoTrimMemory : 1; ///< If set the allocator will automatically trim down the allocations + /// (where all chunks are idle on the freeList). A minimum of + /// allocFreeThreshold allocations are kept for fast reuse. 
+ uint32 reserved : 28; ///< Reserved for future use. + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Different type of allocation data that an ICmdAllocator allocates and distributes to command buffers. +enum CmdAllocType : uint32 +{ + CommandDataAlloc = 0, ///< Data allocated is for executable commands. + EmbeddedDataAlloc, ///< Data allocated is for embedded data. + LargeEmbeddedDataAlloc, ///< Data allocated is for embedded data, allocation is >32kb + GpuScratchMemAlloc, ///< Data allocated is GPU-only accessible at command buffer execution-time. Possible + /// uses like GPU events. + CmdAllocatorTypeCount ///< Number of allocation types for ICmdAllocator's. +}; + +/// Specifies properties for creation of an ICmdAllocator object. Input structure to IDevice::CreateCmdAllocator(). +struct CmdAllocatorCreateInfo +{ + CmdAllocatorCreateFlags flags; ///< Flags controlling command allocator creation. + + struct + { + GpuHeap allocHeap; ///< Preferred allocation heap. For @ref GpuScratchMemAlloc, this field is + /// ignored and the allocation will always be in GPU-invisible memory. For + /// all other allocation types, this must be CPU-mappable. + /// For best performance, command allocators that will be used by the + /// UVD engine should prefer the Local heap + gpusize allocSize; ///< Size, in bytes, of the GPU memory allocations this allocator will create. + /// It must be an integer multiple of suballocSize. + gpusize suballocSize; ///< Size, in bytes, of the chunks of GPU memory this allocator will give to + /// command buffers. It must be an integer multiple of 4096. + /// Must be greater than zero even if the client doesn't plan on using this + /// allocation type. + uint32 allocFreeThreshold; ///< Minimum count of free allocations that the allocator should keep around + /// for fast reuse. It is used when the autoTrimMemory flag is set. + } allocInfo[CmdAllocatorTypeCount]; ///< Information for each allocation type. 
+}; + +/// Output structure for QueryUtilizationInfo(). +/// The CmdAllocator utilization data can be queried by PAL clients in order to decide whether to trim the allocations. +struct CmdAllocatorUtilizationInfo +{ + uint32 numAllocations; ///< Number of allocations owned by the allocator. + uint32 numFreeChunks; ///< Number of chunks that are reset and not in use. + uint32 numBusyChunks; ///< Number of chunks that are in use by the GPU. + uint32 numReuseChunks; ///< Number of chunks that have been 'returned' to the allocator for reuse. +}; + +/** + *********************************************************************************************************************** + * @interface ICmdAllocator + * @brief Allocates and distributes GPU memory to command buffers on the client's behalf. + * + * All ICmdBuffer objects must be associated with an ICmdAllocator at creation. Command buffers may switch command + * allocators when ICmdBuffer::Reset() is called. The set of command buffers associated with a given command allocator + * will query that allocator for additional GPU memory as they are building commands. + * + * To protect against race conditions the client must ask for a thread safe command allocator unless it can guarantee + * that all command buffers associated with a given command allocator will be built, reset, and destroyed in a thread- + * safe manner. It is illegal to destroy a command allocator while it still has command buffers associated with it. + * + * @see IDevice::CreateCmdAllocator() + *********************************************************************************************************************** + */ +class ICmdAllocator : public IDestroyable +{ +public: + /// Explicitly resets a command allocator, marking all internal GPU memory allocations as unused.
+ /// + /// The client is responsible for guaranteeing that all command buffers associated with this allocator have finished + /// GPU execution and have been explicitly reset before calling this function. + /// + /// @param [in] freeMemory Whether all GPU and CPU memory allocations should be returned to the OS. + /// + /// @returns Success if the command allocator was successfully reset. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result Reset(bool freeMemory) = 0; + + /// Explicitly trims a command allocator, deleting as many unused internal GPU memory allocations as possible. + /// + /// @returns Success if the command allocator was successfully trimmed. + /// + /// @param [in] allocTypeMask Gives control whether trimming will be applied for each CmdAllocType. + /// Use (1 << CmdAllocatorTypeCount) - 1 to apply trimming to all types. + /// When trimming only the embedded data use (1 << EmbeddedDataAlloc). + /// @param [in] dynamicThreshold Minimum count of free allocations that the allocator should keep around + + virtual Result Trim(uint32 allocTypeMask, uint32 dynamicThreshold) = 0; + + /// Query the numbers of allocations and chunks of the given CmdAllocator type. + /// This may help clients to decide whether they may apply trimming or not. + /// + /// @returns Success if valid values can be reported. + /// + /// @param [in] type CmdAllocType that is being queried + /// @param [out] pUtilizationInfo The allocation and chunk counts will be stored here. + + virtual Result QueryUtilizationInfo(CmdAllocType type, CmdAllocatorUtilizationInfo* pUtilizationInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data.
+ void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + ICmdAllocator() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~ICmdAllocator() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h new file mode 100644 index 0000000000..ca38a765c1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h @@ -0,0 +1,5118 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdBuffer.h + * @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDevice.h" +#include "palGpuMemory.h" +#include "palImage.h" +#include "palMsaaState.h" +#include "palPipeline.h" +#include "palQueryPool.h" +#include "palCmdTracking.h" + +/// HSA kernel dispatch packet typedef +typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t; +/// AMD kernel code typedef +typedef struct amd_kernel_code_s amd_kernel_code_t; +/// AMD kernel descriptor +namespace llvm {namespace amdhsa {struct kernel_descriptor_t;} } + +namespace Util +{ +class VirtualLinearAllocator; +class Event; +} + +namespace Pal +{ + +// Forward declarations. +class IBorderColorPalette; +class ICmdAllocator; +class ICmdBuffer; +class IColorBlendState; +class IColorTargetView; +class IDepthStencilState; +class IDepthStencilView; +class IGpuEvent; +class IGpuMemory; +class IIndirectCmdGenerator; +class IMsaaState; +class IPerfExperiment; +class IQueue; +class IQueryPool; +enum class PerfTraceMarkerType : uint32; +enum class PointOrigin : uint32; + +struct VideoCodecInfo; +struct VideoCodecAuxInfo; + +/// Specifies a pipeline bind point (i.e., compute or graphics). +enum class PipelineBindPoint : uint32 +{ + Compute = 0x0, + Graphics = 0x1, + Count ///< Number of pipeline bind points. +}; + +/// Fully specifies a type of graphics primitive and vertex ordering for geometry. +enum class PrimitiveTopology : uint8 +{ + PointList = 0x0, + LineList = 0x1, + LineStrip = 0x2, + TriangleList = 0x3, + TriangleStrip = 0x4, + RectList = 0x5, ///< Each rect is three 2D axis-aligned rectangle vertices. + QuadList = 0x6, + QuadStrip = 0x7, + LineListAdj = 0x8, + LineStripAdj = 0x9, + TriangleListAdj = 0xA, + TriangleStripAdj = 0xB, + Patch = 0xC, + TriangleFan = 0xD, + LineLoop = 0xE, + Polygon = 0xF, + TwoDRectList = 0x10, ///< Each rect is the bounding box of an arbitrary 2D triangle.
+ /// Support is optional, see support2DRectList in DeviceProperties. + Count +}; + +/// Specifies how triangle primitives should be rasterized. +enum class FillMode : uint8 +{ + Points = 0x0, + Wireframe = 0x1, + Solid = 0x2, + Count +}; + +/// Specifies the triangle face direction that should result in culled primitives. +enum class CullMode : uint8 +{ + _None = 0x0, ///< All triangles are rasterized. + Front = 0x1, ///< Front facing triangles are culled. + Back = 0x2, ///< Back facing triangles are culled. + FrontAndBack = 0x3, ///< All triangles are culled. + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefining None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< All triangles are rasterized. +#endif +}; + +/// Specifies vertex winding order corresponding to a front facing triangle. @see CullMode. +enum class FaceOrientation : uint8 +{ + Ccw = 0x0, ///< Counter-clockwise vertex winding primitives are front facing. + Cw = 0x1 ///< Clockwise vertex winding primitives are front facing. +}; + +/// Specifies which vertex of a primitive is the _provoking vertex_. This impacts which vertex's "flat" VS outputs +/// are passed to the PS (i.e., flat shading). +enum class ProvokingVertex : uint8 +{ + First = 0x0, + Last = 0x1 +}; + +/// Specifies the bit size of each element in an index buffer. +enum class IndexType : uint32 +{ + Idx8 = 0x0, ///< 8-bit index elements. + Idx16 = 0x1, ///< 16-bit index elements. + Idx32 = 0x2, ///< 32-bit index elements. + Count ///< Number of index types. +}; + +/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic().
+enum class AtomicOp : uint32 +{ + AddInt32 = 0x00, + SubInt32 = 0x01, + MinUint32 = 0x02, + MaxUint32 = 0x03, + MinSint32 = 0x04, + MaxSint32 = 0x05, + AndInt32 = 0x06, + OrInt32 = 0x07, + XorInt32 = 0x08, + IncUint32 = 0x09, + DecUint32 = 0x0A, + AddInt64 = 0x0B, + SubInt64 = 0x0C, + MinUint64 = 0x0D, + MaxUint64 = 0x0E, + MinSint64 = 0x0F, + MaxSint64 = 0x10, + AndInt64 = 0x11, + OrInt64 = 0x12, + XorInt64 = 0x13, + IncUint64 = 0x14, + DecUint64 = 0x15, + Count +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 +/// Specifies the point in the GPU pipeline where an action should take place. +/// +/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps. +/// +/// @note The numeric values of these enums are ordered such that a "newState < oldState" comparison will generally yield +/// true if a stall is necessary to resolve a hazard between those two pipe points. This guideline does not +/// hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with +/// graphics shader work. +/// +/// @see ICmdBuffer::CmdSetEvent() +/// @see ICmdBuffer::CmdResetEvent() +/// @see ICmdBuffer::CmdPredicateEvent() +/// @see ICmdBuffer::CmdBarrier() +/// @see ICmdBuffer::CmdWriteTimestamp() +/// @see ICmdBuffer::CmdWriteImmediate() +enum HwPipePoint : uint32 +{ + HwPipeTop = 0x0, ///< Earliest possible point in the GPU pipeline (CP PFP), can be + /// used as wait point for indirect args and index buffer fetch. + HwPipePostPrefetch = 0x1, ///< Indirect arguments have been fetched for all prior + /// draws/dispatches (CP ME). + HwPipePreRasterization = 0x2, ///< All prior generated VS/HS/DS/GS waves have completed, can be + /// used as release point for VB/IB fetch and streamout target. + HwPipePostPs = 0x3, ///< All prior generated PS waves have completed. + /// Only valid as a pipe point to wait on (release point).
+ HwPipePreColorTarget = 0x4, ///< Represents the same point in the pipe as HwPipePostPs, but provides + /// clients with a better option to accurately specify the pipeline + /// sync request. PAL also uses it as an entry point to add partial + /// flushes that prevent write-after-read hazards in corner cases. + /// Only valid as a wait point (acquire point). + HwPipePreIndexBuffer = HwPipeTop, ///< As late as possible before index buffer fetches (CP PFP). + HwPipePostIndexBuffer = HwPipePreRasterization,///< All prior index buffer fetches have completed. + + // The following points apply to compute-specific work: + HwPipePreCs = HwPipePostPrefetch, ///< As late as possible before CS waves are launched (CP ME). + HwPipePostCs = 0x5, ///< All prior generated CS waves have completed. + + // The following points apply to BLT-specific work: + HwPipePreBlt = HwPipePostPrefetch, ///< As late as possible before BLT operations are launched. + HwPipePostBlt = 0x6, ///< All prior requested BLTs have completed. + + HwPipeBottom = 0x7, ///< All prior GPU work (graphics, compute, or BLT) has completed. + HwPipePointCount ///< Number of distinct pipe point values (the aliases above share values). +}; +#endif + +/// Bitmask values that can be OR'ed together to specify a synchronization scope. See srcStageMask and dstStageMask in +/// @ref AcquireReleaseInfo. +/// +/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before* +/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior +/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs. +/// +/// Note that flag numerical order does not indicate any happens-before or happens-after relationships. Clients should +/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering.
+enum PipelineStageFlag : uint32 +{ + PipelineStageTopOfPipe = 0x00000001, + PipelineStageFetchIndirectArgs = 0x00000002, + PipelineStagePostPrefetch = 0x00000004, + PipelineStageFetchIndices = 0x00000008, + PipelineStageStreamOut = 0x00000010, + PipelineStageVs = 0x00000020, + PipelineStageHs = 0x00000040, + PipelineStageDs = 0x00000080, + PipelineStageGs = 0x00000100, + PipelineStagePs = 0x00000200, + PipelineStageSampleRate = 0x00000400, + PipelineStageEarlyDsTarget = 0x00000800, + PipelineStageLateDsTarget = 0x00001000, + PipelineStageColorTarget = 0x00002000, + PipelineStageCs = 0x00004000, + PipelineStageBlt = 0x00008000, + PipelineStageBottomOfPipe = 0x00010000, + PipelineStageDsTarget = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget, + PipelineStageAllStages = 0x0001FFFF +}; + +/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time. Such a +/// mask should be specified in the usages field of ImageLayout. These combined usages can be examined by PAL to infer +/// the layout (i.e., compression state) of the image. +/// +/// @note There is no layout corresponding to CmdClear*(). The layout flags passed to those functions will determine +/// the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the +/// layout the same. +enum ImageLayoutUsageFlags : uint32 +{ + LayoutUninitializedTarget = 0x00000001, ///< Initial state of any image that can be used as a color or + /// depth/stencil target. A layout transition out of this state will + /// likely result in a mask RAM initialization BLT. If this bit is + /// set, no other bits may be set. + LayoutColorTarget = 0x00000002, ///< Color target bound via CmdBindTargets(). This bit is exclusive + /// with LayoutDepthStencilTarget. + LayoutDepthStencilTarget = 0x00000004, ///< Depth/stencil target bound via CmdBindTargets(). This bit is + /// exclusive with LayoutColorTarget.
+ LayoutShaderRead = 0x00000008, ///< Any shader read state including texture, UAV, constant buffer, + /// vertex buffer. + LayoutShaderFmaskBasedRead = 0x00000010, ///< Images in this state support the load_fptr AMD IL instruction, + /// which will read decompressed fmask in order to access compressed + /// MSAA color data from a shader. + LayoutShaderWrite = 0x00000020, ///< Writeable UAV. + LayoutCopySrc = 0x00000040, ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage() or + /// CmdCopyTiledImageToMemory() source image. + LayoutCopyDst = 0x00000080, ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage() or + /// CmdCopyMemoryToTiledImage() destination image. + LayoutResolveSrc = 0x00000100, ///< CmdResolveImage() source. + LayoutResolveDst = 0x00000200, ///< CmdResolveImage() destination. + LayoutPresentWindowed = 0x00000400, ///< Windowed-mode IQueue::Present(). + LayoutPresentFullscreen = 0x00000800, ///< Fullscreen (flip) present. Layout must be supported by the + /// display engine. + LayoutUncompressed = 0x00001000, ///< Metadata fully decompressed/expanded layout. + LayoutSampleRate = 0x00002000, ///< CmdBindSampleRateImage() source. + LayoutAllUsages = 0x00003FFF ///< Mask with every usage bit above set (0x1 through 0x2000). +}; + +/// Bitmask values that can be ORed together to specify all potential engines an image might be used on. Such a +/// mask should be specified in the engines field of ImageLayout. +/// +/// If the client API is unable to determine which engines might be used, it should specify all possible engines +/// corresponding to the usage flags.
+enum ImageLayoutEngineFlags : uint32 +{ + LayoutUniversalEngine = 0x1, + LayoutComputeEngine = 0x2, + LayoutDmaEngine = 0x4, + LayoutVideoEncodeEngine = 0x8, + LayoutVideoDecodeEngine = 0x10, + LayoutVideoJpegDecodeEngine = 0x20, + LayoutAllEngines = 0x3F +}; + +/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or +/// GPU memory in an ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages. +enum CacheCoherencyUsageFlags : uint32 +{ + CoherCpu = 0x00000001, ///< Data read or written by CPU. + CoherShaderRead = 0x00000002, ///< Data read by a GPU shader. + CoherShaderWrite = 0x00000004, ///< Data written by a GPU shader. + CoherCopySrc = 0x00000008, ///< Source of an ICmdBuffer::CmdCopy*() call. + CoherCopyDst = 0x00000010, ///< Destination of an ICmdBuffer::CmdCopy*() call. + CoherColorTarget = 0x00000020, ///< Color target. + CoherDepthStencilTarget = 0x00000040, ///< Depth stencil target. + CoherResolveSrc = 0x00000080, ///< Source of a CmdResolveImage() call. + CoherResolveDst = 0x00000100, ///< Destination of a CmdResolveImage() call. + CoherClear = 0x00000200, ///< Destination of a CmdClear() call. + CoherIndirectArgs = 0x00000400, ///< Source argument data read by CmdDrawIndirect() and similar functions. + CoherIndexData = 0x00000800, ///< Index buffer data. + CoherQueueAtomic = 0x00001000, ///< Destination of a CmdMemoryAtomic() call. + CoherTimestamp = 0x00002000, ///< Destination of a CmdWriteTimestamp() call. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 + CoherStreamOut = 0x00004000, ///< Data written as stream output. + CoherMemory = 0x00008000, ///< Data read or written directly from/to memory. + CoherSampleRate = 0x00010000, ///< CmdBindSampleRateImage() source. + CoherPresent = 0x00020000, ///< Source of present.
+ CoherCp = 0x00080000, ///< HW Command Processor (CP) encompassing the front-end command processing of any queue, including SDMA. + CoherAllUsages = 0x000FFFFF, ///< Mask spanning bits 0-19, which covers every usage flag defined in this branch. +#else + CoherCeLoad = 0x00004000, ///< Source of a CmdLoadCeRam() call. + CoherCeDump = 0x00008000, ///< Destination of a CmdDumpCeRam() call. + CoherStreamOut = 0x00010000, ///< Data written as stream output. + CoherMemory = 0x00020000, ///< Data read or written directly from/to memory. + CoherSampleRate = 0x00040000, ///< CmdBindSampleRateImage() source. + CoherPresent = 0x00080000, ///< Source of present. + CoherCp = 0x00200000, ///< HW Command Processor (CP) encompassing the front-end command processing of any queue, including SDMA. + CoherAllUsages = 0x003FFFFF, ///< Mask spanning bits 0-21, which covers every usage flag defined in this branch. +#endif + + CoherShader = CoherShaderRead | CoherShaderWrite, ///< Shader read or shader write. + CoherCopy = CoherCopySrc | CoherCopyDst, ///< Copy source or copy destination. + CoherResolve = CoherResolveSrc | CoherResolveDst, ///< Resolve source or resolve destination. +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage(). +enum ClearColorImageFlags : uint32 +{ + ColorClearAutoSync = 0x00000001, ///< PAL will automatically insert required barrier synchronization before + /// and after the clear assuming all subresources to be cleared are currently + /// ready for rendering as a color target (as is required by API convention in + /// DX12). Allows reduced sync costs in some situations since PAL knows + /// the details of how the clear will be performed. + ColorClearForceSlow = 0x00000002, ///< Force these to use slow clears. + ColorClearSkipIfSlow = 0x00000004, ///< Only issue the clear if it is a fast clear. + ColorClearAllFlags = 0x00000007 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil().
+enum ClearDepthStencilFlags : uint32 +{ + DsClearAutoSync = 0x00000001, ///< PAL will automatically insert required barrier synchronization before + /// and after the clear assuming all subresources to be cleared are currently + /// ready for rendering as a depth/stencil target (as is required by API convention + /// in DX12). Allows reduced sync costs in some situations since PAL knows the + /// details of how the clear will be performed. + DsClearAllFlags = 0x00000001 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage(). +enum ResolveImageFlags : uint32 +{ + ImageResolveInvertY = 0x00000001, ///< PAL will invert the y-axis (flip upside down) of the resolved region to + /// the destination image. + ImageResolveDstAsSrgb = 0x00000002, ///< If set, a non-srgb destination image will be treated as srgb format. + /// The flag cannot be set when @ref ImageResolveDstAsNorm is set. + ImageResolveDstAsNorm = 0x00000004, ///< If set, a srgb destination image will be treated as non-srgb format. + /// The flag cannot be set when @ref ImageResolveDstAsSrgb is set. + ImageResolveSrcAsNorm = 0x00000008, ///< If set, a srgb source image will be treated as non-srgb format. + ImageResolveAllFlags = 0x0000000F ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Specifies properties for creation of an ICmdBuffer object. Input structure to IDevice::CreateCmdBuffer(). +struct CmdBufferCreateInfo +{ + ICmdAllocator* pCmdAllocator; ///< The command buffer will use this command allocator to allocate all GPU memory. + /// If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset + /// with a non-null pCmdAllocator before calling ICmdBuffer::Begin. + QueueType queueType; ///< Type of queue commands in this command buffer will target. + /// This defines the set of allowed actions in the command buffer.
+ QueuePriority queuePriority; ///< Priority level of the queue this command buffer will target. + EngineType engineType; ///< Type of engine the queue commands will run on. + + union + { + struct + { + /// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root" + /// command buffer. Nested command buffers differ from root command buffers in how they are sent to the + /// GPU for execution: root command buffers must be submitted to the hardware by calling + /// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root + /// command buffer. + /// + /// Currently, only Universal and Compute command buffers can be nested. Nesting DMA command buffers is + /// meaningless and unsupported. It is an error to attempt to create a nested DMA command buffer. + /// + /// @see ICmdBuffer::CmdExecuteNestedCmdBuffers. + uint32 nested : 1; + + /// Dedicated CUs are reserved for this queue. Thus we have to skip CU mask programming. + uint32 realtimeComputeUnits : 1; + + /// Target queue uses dispatch tunneling. + uint32 dispatchTunneling : 1; + + /// Indicates that each subsequent Dispatch command is desired to be executed in alternating + /// order of forward and reverse workgroup walk order. This can improve cache locality when + /// subsequent Dispatches consume data from the previous Dispatch and the overall footprint + /// does not fit in cache. + /// This is a best effort as not all implementations or Queues may support this. + uint32 dispatchPingPongWalk : 1; + + /// Reserved for future use. + uint32 reserved : 28; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; + + } flags; ///< Command buffer creation flags. +}; + +/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling +/// root-level command buffer.
+union InheritedStateFlags +{ + struct + { + /// Color and depth target views are inherited from the root-level command buffer. The nested command buffer + /// should not modify this state. + uint32 targetViewState : 1; + + /// Occlusion query is inherited from the root-level command buffer. The nested command buffer + /// should not modify this state. + uint32 occlusionQuery : 1; + + /// Predication is inherited from the root-level command buffer. The nested command buffer should not modify + /// this state. + uint32 predication : 1; + + /// Reserved for future usage. + uint32 reserved : 29; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies parameters inherited from primary command buffer into nested command buffer. +struct InheritedStateParams +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891 + uint32 colorTargetCount; ///< Number of color targets bound in the + /// root-level command buffer. + SwizzledFormat colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color + /// target. + uint32 sampleCount[MaxColorTargets]; ///< Sample count for each color target. +#endif + InheritedStateFlags stateFlags; ///< States that are inherited from the + /// calling root-level command buffer. +}; + +/// Specifies optional hints to control command buffer building optimizations. +union CmdBufferBuildFlags +{ + struct + { + /// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end + /// limited. These optimizations include removing redundant PM4 commands and reducing the VGT prim group size. + /// This flag might increase the CPU overhead of building command buffers. + uint32 optimizeGpuSmallBatch : 1; + + /// Optimize command buffer building for exclusive command buffer submission. Command buffers built with this + /// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that + /// they are no longer in use.
This flag allows PAL to modify the contents of command buffers during + /// submission. + uint32 optimizeExclusiveSubmit : 1; + + /// Optimize command buffer building for single command buffer submission. Command buffers built with this flag + /// cannot be submitted more than once. This flag allows PAL to modify the contents of command buffers during + /// submission. This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set + /// optimizeExclusiveSubmit if this flag is set. + uint32 optimizeOneTimeSubmit : 1; + + /// Indicates that the client is providing custom tessellation distribution settings. If set, it is the client's + /// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided. + uint32 optimizeTessDistributionFactors : 1; + + /// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound + /// pipeline object. This optimization might increase the CPU overhead of building command buffers and/or + /// introduce additional front-end GPU bottlenecks. + uint32 prefetchShaders : 1; + + /// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end. + /// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes. + uint32 prefetchCommands : 1; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(), + /// or CmdWriteCeRam(). + uint32 usesCeRamCmds : 1; +#else + uint32 placeholder914 : 1; +#endif + + /// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet. + /// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if + /// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers.
+ uint32 disallowNestedLaunchViaIb2 : 1; + + /// placeholder + uint32 placeholder1 : 2; + + /// Enable TMZ mode to allow reading TMZ protected allocations. If this command buffer attempts to write + /// non-TMZ memory, the results are undefined. Only valid for graphics and compute. + uint32 enableTmz : 1; + + uint32 placeholder3 : 1; + + /// If set, internal operations such as blits, copies, etc. will not affect active Query results. + /// Otherwise they may affect the results. + uint32 disableQueryInternalOps : 1; + + uint32 optimizeContextStatesPerBin : 1; ///< If set, CmdBufferBuildInfo::contextStatesPerBin takes effect. + uint32 optimizePersistentStatesPerBin : 1; ///< If set, CmdBufferBuildInfo::persistentStatesPerBin takes effect. + + /// Reserved for future use. NOTE(review): the named fields of this struct total 31 bits, so one bit of u32All is unnamed. + uint32 reserved : 16; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies tessellation accum factors. +union TessDistributionFactors +{ + struct + { + /// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for + /// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a + /// different VGT. + uint32 isoDistributionFactor : 8; + uint32 triDistributionFactor : 8; ///< Recommended to be higher than quad factor. + uint32 quadDistributionFactor : 8; + /// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is + /// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still + /// increments the accumulator for the Patch distribution factor. + uint32 donutDistributionFactor : 5; + /// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch + /// is compared against this value to determine whether this donut gets split up into trapezoids (needs the patch to + /// be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken + /// into trapezoids.
+ uint32 trapDistributionFactor : 3; + }; + + /// Values packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies options that direct command buffer building. +struct CmdBufferBuildInfo +{ + /// Command buffer build flags, specifies optional hints to control command buffer build optimizations. + CmdBufferBuildFlags flags; + + /// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested + /// command buffer should not modify the software states. Any software params that may be needed within nested + /// command buffer need to be provided here. + const InheritedStateParams* pInheritedState; + + /// If non-null, the command buffer will begin with all states set as they are in this previously built command + /// buffer. Any state specified in pInheritedState is excluded if it is also provided. + const ICmdBuffer* pStateInheritCmdBuffer; + + /// Optional allocator for PAL to use when allocating temporary memory during command buffer building. PAL will + /// stop using this allocator once command building ends. If no allocator is provided PAL will use an internally + /// managed allocator instead which may be less efficient. PAL will use this allocator in two ways: + /// + Temporary storage within a single command building call. PAL will rewind the allocator before returning to + /// free all memory allocated within the call. + /// + Temporary storage for the entire command building period. When Begin() is called, PAL will save the current + /// position of the allocator and rewind the allocator to that point when End() is called. If the client also + /// wishes to allocate temporary storage that lasts between command building function calls they must allocate it + /// before calling Begin() or PAL will accidentally free it. + Util::VirtualLinearAllocator* pMemAllocator; + + /// Optional tessellation distribution factors that will overwrite PAL set defaults.
Clients must also set the + /// optimizeTessDistributionFactors flag for these custom factors to take effect. + /// Nested command buffers inherit this value from the primary. + TessDistributionFactors clientTessDistributionFactors; + + /// Number of context states per PBB bin. + /// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect. + uint8 contextStatesPerBin; + + /// Number of persistent states per PBB bin. + /// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect. + uint8 persistentStatesPerBin; + + /// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer. + uint64 execMarkerClientHandle; +}; + +/// Specifies info on how a compute shader should use resources. +struct DynamicComputeShaderInfo +{ + float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively + /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a + /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] + /// and specify the maximum number of waves per compute unit. If the hardware has one wave + /// limit control for multiple shader stages PAL will select the most strict limit. + /// This option is converted internally to set the HW WavesPerSh setting and the non-integer + /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for + /// example specify less number of waves than number of CUs per shader array. + + uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on, + /// throttling it, to enable more graphics work to complete. 0 disables the limit. + + uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before + /// moving to the next compute unit. 0 selects optimal default.
+ + uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes. + /// Zero indicates that the LDS size determined at pipeline-compilation time will be used. +}; + +/// Specifies info on how a graphics shader should use resources. +struct DynamicGraphicsShaderInfo +{ + float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively + /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a + /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] + /// and specify the maximum number of waves per compute unit. If the hardware has one wave + /// limit control for multiple shader stages PAL will select the most strict limit. + /// This option is converted internally to set the HW WavesPerSh setting and the non-integer + /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for + /// example specify less number of waves than number of CUs per shader array. +}; + +/// Specifies dynamic states of a graphics pipeline. +struct DynamicGraphicsState +{ + uint32 colorWriteMask; ///< Color target write mask. 4b / RT (8 count) + struct + { + uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation. + uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate. + uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate. + uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage. + uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to + /// axis-aligned line end caps during line rasterization. + uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels.
+ uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend. + uint32 vertexBufferCount : 6; ///< Number of vertex buffer slots accessed by this pipeline. + LogicOp logicOp : 4; ///< Logic operation to perform. + DepthRange depthRange : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport + /// transform: 0 to 1 or -1 to 1). + DepthClampMode depthClampMode : 2; ///< Depth clamping behavior. + uint32 reserved1 : 7; ///< Reserved. + uint32 reserved : 5; ///< Reserved for future use. + }; + + union + { + struct + { + uint32 depthClampMode : 1; ///< Whether to enable dynamic state depthClampMode. + uint32 depthRange : 1; ///< Whether to enable dynamic state depthRange. + uint32 logicOp : 1; ///< Whether to enable dynamic state logicOp. + uint32 colorWriteMask : 1; ///< Whether to enable dynamic state colorWriteMask. + uint32 switchWinding : 1; ///< Whether to enable dynamic state switchWinding. + uint32 depthClipMode : 1; ///< Whether to enable dynamic state depthClipNear/FarEnable. + uint32 alphaToCoverageEnable : 1; ///< Whether to enable dynamic state alphaToCoverageEnable. + uint32 perpLineEndCapsEnable : 1; ///< Whether to enable dynamic state perpLineEndCapsEnable. + uint32 rasterizerDiscardEnable : 1; ///< Whether to enable dynamic state rasterizerDiscardEnable. + uint32 dualSourceBlendEnable : 1; ///< Whether to enable dynamic state dualSourceBlendEnable. + uint32 vertexBufferCount : 1; ///< Whether to enable dynamic state vertexBufferCount. + uint32 reserved1 : 1; ///< Reserved. + uint32 reserved : 20; ///< Reserved for future use. + }; + uint32 u32All; ///< Enable flags packed as 32-bit uint. + } enable; ///< Per-state enable bits for the dynamic states declared above. +}; + +/// Specifies info on how graphics shaders should use resources. +struct DynamicGraphicsShaderInfos +{ + union + { + // VS/HS/DS/GS or TS/MS are active + struct + { + DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information. + DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information. + DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information.
+ DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information. + }; + struct + { + DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information. + DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information. + }; + }; + + DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information. + + union + { + struct + { + uint8 vs : 1; ///< If set, there is dynamic VS shader info. + uint8 hs : 1; ///< If set, there is dynamic HS shader info. + uint8 ds : 1; ///< If set, there is dynamic DS shader info. + uint8 gs : 1; ///< If set, there is dynamic GS shader info. + uint8 ps : 1; ///< If set, there is dynamic PS shader info. + uint8 ts : 1; ///< If set, there is dynamic TS shader info. + uint8 ms : 1; ///< If set, there is dynamic MS shader info. + uint8 reserved : 1; ///< Reserved. + }; + uint8 u8All; ///< Flags packed as 8-bit uint. + } enable; ///< Indicates which of the per-stage entries above carry dynamic shader info. +}; + +/// Specifies parameters for binding a pipeline. +/// @see ICmdBuffer::CmdBindPipeline +struct PipelineBindParams +{ + PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics). + const IPipeline* pPipeline; ///< New pipeline to be bound. Can be null in order to unbind a previously + /// bound pipeline without binding a new one. + uint64 apiPsoHash; ///< 64-bit identifier provided by client driver based on the Pipeline State + /// Object. There exists a many-to-one correlation for ApiPsoHash to + /// internalPipelineHash to map the two. + union + { + DynamicComputeShaderInfo cs; ///< Dynamic Compute shader information. + + struct + { + DynamicGraphicsShaderInfos gfxShaderInfo; ///< Dynamic graphics shader information. + DynamicGraphicsState gfxDynState; ///< Dynamic graphics pipeline state. + }; + }; +}; + +/// Specifies per-MRT color target view and current image state. Used as input to ICmdBuffer::CmdBindTargets(). +struct ColorTargetBindInfo +{ + const IColorTargetView* pColorTargetView; ///< Color target view to bind.
+ ImageLayout imageLayout; ///< Specifies the current image layout based on bitmasks of currently + /// allowed operations and engines that may perform those operations. + /// At minimum, the LayoutColorTarget usage flag and + /// LayoutUniversalEngine engine flag must be set. +}; + +/// Specifies depth/stencil view and current image state of the depth and stencil planes. Used as input to +/// ICmdBuffer::CmdBindTargets(). +struct DepthStencilBindInfo +{ + const IDepthStencilView* pDepthStencilView; ///< Depth/stencil target view to bind. + ImageLayout depthLayout; ///< Specifies the current image layout of the depth plane based on + /// bitmasks of currently allowed operations and engines that may + /// perform those operations. At minimum, the + /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine + /// engine flag must be set. Ignored if the specified view does not + /// have a depth plane. + ImageLayout stencilLayout; ///< Specifies the current image layout of the stencil plane based on + /// bitmasks of currently allowed operations and engines that may + /// perform those operations. At minimum, the + /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine + /// engine flag must be set. Ignored if the specified view does not + /// have a stencil plane. +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 +/// Represents a GPU memory or image transition as part of a barrier. +/// +/// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the +/// specified set of destination caches. The source and destination designation is relative to the barrier itself +/// and does not indicate whether a particular cache is a read or write cache. +/// +/// Typically a transition flushes written data from the source caches into the destination caches and thus the source +/// cache mask typically only contains write caches.
However, the client is encouraged to include flags for any prior +/// read-only caches accesses as PAL may be able to optimize its cache operations. +/// +/// If the both cache masks are zero the client is indicating that no cache coherency operations are required but PAL +/// may still issue cache operations for internal reasons. +/// +/// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt. +/// +/// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches. +struct BarrierTransition +{ + + uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose + /// results need to be visible for subsequent operations. Flags for prior read operations + /// may be included as well and may be used for internal optimizations. + uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read + /// and/or write data flushed from the caches indicated by the srcCacheMask. + + struct + { + const IImage* pImage; ///< If non-null, indicates this transition only applies to the specified image. + /// The remaining members of this structure are ignored if this member is null. + SubresRange subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref + /// LayoutUninitializedTarget this range must cover all subresources of pImage + /// unless the perSubresInit image create flag was specified. + ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and + /// engines up to this point. These masks imply the previous compression state. No + /// usage flags should ever be set in oldLayout.usages that correspond to usages + /// that are not supported by the engine that is performing the transition. The + /// queue type performing the transition must be set in oldLayout.engines. 
+ ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and + /// engines after this point. These masks imply the upcoming compression state. + /// point. This usage mask implies the upcoming compressions state. A difference + /// between oldLayoutUsageMask and newLayoutUsageMask may result in a + /// decompression. + + /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a + /// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum + /// valid position (not quite to the bottom/right border of the pixel). + /// Specifies a custom sample pattern over a 2x2 pixel quad. Can be left null for non-MSAA images or when + /// a valid MsaaQuadSamplePattern is bound prior to the CmdBarrier call. + const MsaaQuadSamplePattern* pQuadSamplePattern; + + } imageInfo; ///< Image-specific transition information. +}; + +/// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier(). +/// +/// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate +/// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how +/// the GPU will use the image. +/// +/// This structure directly specifies how #1 is performed. #2 and #3 are managed by the list of @ref BarrierTransition +/// structures passed in pTransitions. +struct BarrierInfo +{ + /// Determine at what point the GPU should stall until all specified waits and transitions have completed. If the + /// specified wait point is unavailable, PAL will wait at the closest available earlier point. + HwPipePoint waitPoint; + + uint32 pipePointWaitCount; ///< Number of entries in pPipePoints. + const HwPipePoint* pPipePoints; ///< The barrier will stall until the hardware pipeline has cleared + /// up to each point specified in this array. 
One entry in this + /// array is typically enough, but CS and GFX operate in parallel + /// at certain stages. + + uint32 gpuEventWaitCount; ///< Number of entries in ppGpuEvents. + const IGpuEvent** ppGpuEvents; ///< The barrier will stall until each GPU event in this array is + /// in the set state. + + uint32 rangeCheckedTargetWaitCount; ///< Number of entries in ppTargets. + const IImage** ppTargets; ///< The barrier will stall until all previous rendering with any + /// color or depth/stencil image in this list bound as a target + /// has completed. If one of the targets is a nullptr it will + /// perform a full range sync. + + uint32 transitionCount; ///< Number of entries in pTransitions. + const BarrierTransition* pTransitions; ///< List of image/memory transitions to process. See + /// @ref BarrierTransition. The same subresource should never + /// be specified more than once in the list of transitions. + /// PAL assumes that all specified subresources are unique. + + uint32 globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined + /// (bitwise logical union) with the @ref srcCacheMask field belonging to every + /// element in @ref pTransitions. If this is zero or if there are no transitions, + /// then no global cache flags are applied during every transition. + + uint32 globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined + /// (bitwise logical union) with the @ref dstCacheMask field belonging to every + /// element in @ref pTransitions. If this is zero or if there are no transitions, + /// then no global cache flags are applied during every transition. + + uint32 reason; ///< The reason that the barrier was invoked. +}; +#endif + +/// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory +/// object that does not contain valid IImage data. 
PAL may assume image data is not present and skip certain +/// cache operations. +/// +/// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of +/// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier. +/// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution +/// dependencies. +/// +/// PAL specifies these operations using pairs of access scope bitmasks of @ref CacheCoherencyUsageFlags values. +/// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made +/// available (i.e., written back from local caches to the LLC). The destination mask (named dstAccessMask or +/// dstGlobalAccessMask) describes which upcoming read/write operations that need visibility (i.e., invalidate +/// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed. +/// +/// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions +/// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero. +/// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations. +/// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations. +/// +/// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations +/// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional +/// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations. +/// If they are provided PAL may detect cases where future read operations use the same caches as the prior read +/// operations and thus can skip the usual visibility operations. 
+/// +/// Note that, +/// 1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags +/// were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise. +/// 2. One @ref MemBarrier or @ref ImgBarrier object can only be applied to a single resource otherwise PAL's internal +/// optimization may be incorrect. Don't OR multiple resource transitions' stage or access mask into one +/// @ref MemBarrier or @ref ImgBarrier when making PAL barrier call. However, you are allowed to OR multiple resource +/// transitions' stage or access mask into the global transition mask. +/// +/// This struct is used by @ref AcquireReleaseInfo. +struct MemBarrier +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + union + { + struct + { + uint32 globallyAvailable : 1; ///< Normally, data made available is in the GPU LLC. When this bit is + /// set, available means in memory, available to all clients in the + /// system. This is useful for rare cases like mid command buffer + /// synchronization with the CPU or another external device. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit uint. + } flags; ///< Flags controlling the memory barrier. +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 + GpuMemSubAllocInfo memory; ///< Specifies a portion of an IGpuMemory object this memory barrier affects. + /// Zero values of memory structure indicate full range barrier operations. +#endif + + uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope that must be confirmed complete as part of a release. Must be + /// 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope of operations to be performed after the acquire. Must be + /// 0 when passed in to CmdRelease or CmdReleaseEvent.
+ + uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// availability operation, as defined in the struct comment header. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// visibility operation, as defined in the struct comment header. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. +}; + +/// Specifies required layout transition, execution dependencies, *availability*, and/or *visibility* operations on a +/// subresource of an IImage object. +/// +/// See the header comment on @ref MemBarrier for a full description of the execution dependencies, availability and +/// visibility operations, including what rules the clients must follow when filling out srcAccessMask and +/// dstAccessMask. +/// +/// This struct is used by @ref AcquireReleaseInfo. +struct ImgBarrier +{ + const IImage* pImage; ///< Relevant image resource for this barrier. + SubresRange subresRange; ///< Selects a range of planes/slices/mips the barrier affects. If newLayout + /// includes @ref LayoutUninitializedTarget this range must cover all subresources of + /// pImage unless the perSubresInit image create flag was specified. + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 + Box box; ///< Restricts the barrier to a sub-section of each subresource. The Z offset/extent + /// must be 0 for 1D/2D images, and the Y offset/extent must be 0 for 1D images. A + /// box with zero extents will be ignored, and the barrier will affect the entire + /// subresource range. This box may be used to restrict ranges of cache flushes or + /// invalidations, or may restrict what data is decompressed. However, the + /// implementation may not be able to optimize particular cases and may expand the + /// barrier to cover the entire subresource range. 
Specifying a subregion with a box + /// when newLayout includes @ref LayoutUninitializedTarget is not supported. +#endif + + uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope that must be confirmed complete as part of a release. Must be + /// 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope of operations to be performed after the acquire. Must be + /// 0 when passed in to CmdRelease or CmdReleaseEvent. + + uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// availability operation, as defined in the struct comment header. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// visibility operation, as defined in the struct comment header. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. + + ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and + /// engines up to this point. These masks imply the previous compression state. No + /// usage flags should ever be set in oldLayout.usages that correspond to usages + /// that are not supported by the engine that is performing the transition. The + /// engine type performing the transition must be set in oldLayout.engines. Can set + /// both oldLayout and newLayout to zero value for no layout transition case. + ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and + /// engines after this point. These masks imply the upcoming compression state. + /// point. A difference between oldLayoutUsageMask and newLayoutUsageMask may result + /// in a decompression. 
PAL's implementation will ensure the results of any layout + /// operations are consistent with the requested availability and visibility + /// operations. Can set both oldLayout and newLayout to zero value for no layout + /// transition case. + + /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a grid + /// where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum valid + /// position (not quite to the bottom/right border of the pixel). + /// Can be left null for non-MSAA images or when a valid IMsaaState is bound prior to the barrier + /// call. + const MsaaQuadSamplePattern* pQuadSamplePattern; +}; + +/// Input structure to CmdRelease(), CmdReleaseEvent(), CmdAcquire(), CmdAcquireEvent(), and CmdReleaseThenAcquire(). +/// It describes the execution dependencies, memory dependencies, and image layout transitions that must be resolved. +/// +/// Global transition doesn't have buffer or image info so it will assume the worst case and the barrier operations may +/// not be optimal (e.g. metadata may be misaligned and need to issue LLC flush/invalidation). It's suggested that if +/// clients know the buffer or image info, try setting up the barrier call with the full buffer or image transition +/// info (including stageMask and accessMask) instead of global transition for optimal performance. +/// +/// Clients may OR multiple MemBarrier into a single MemBarrier on full range barrier cases for simple and saving CPU +/// overhead. To allow more optimization chances (e.g. skip unnecessary stalls for read only transitions) in PAL, +/// it's suggested to split the single grouped MemBarrier into two separate grouped MemBarriers: one is read only +/// MemBarrier and the other is writeable MemBarrier; both are then passed together to the barrier call.
+struct AcquireReleaseInfo +{ + uint32 srcGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global + /// synchronization scope that must be confirmed complete as part of a + /// release. Must be 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global + /// synchronization scope of operations to be performed after the + /// acquire. Must be 0 when passed in to CmdRelease or CmdReleaseEvent. + + uint32 srcGlobalAccessMask; ///< *Access scope* for the global availability operation. Serves the + /// same purpose as srcAccessMask in @ref MemBarrier, but will cause + /// all relevant caches to be flushed without range checking. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstGlobalAccessMask; ///< *Access scope* for the global visibility operation. Serves the + /// same purpose as dstAccessMask in @ref MemBarrier, but will cause + /// all relevant caches to be invalidated without range checking. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. + + uint32 memoryBarrierCount; ///< Number of entries in pMemoryBarriers. + const MemBarrier* pMemoryBarriers; ///< Describes memory dependencies specific to a range of a particular + /// IGpuMemory object. + + uint32 imageBarrierCount; ///< Number of entries in pImageBarriers. + const ImgBarrier* pImageBarriers; ///< Describes memory dependencies and image layout transitions required + /// for a subresource range of a particular IImage object. + uint32 reason; ///< The reason that the barrier was invoked. + /// See @ref Developer::BarrierReason for internal reason codes, though + /// clients may define their own as well. +}; + +/// Specifies barrier type, global (potentially mixed cases of buffer and image), buffer or image. +enum class BarrierType : uint32 +{ + Global, + Buffer, + Image +}; + +/// Number of all HW opaque release token types.
+constexpr uint32 NumReleaseTokenTypes = 4; + +/// Synchronization token structure for CmdRelease() and CmdAcquire(). +/// +/// Clients should pass the ReleaseToken returned by CmdRelease() to CmdAcquire() directly without changing the value. +/// If a resource with given subresource range has multiple ReleaseToken, all related ReleaseToken should be passed to +/// CmdAcquire(). +/// +/// Passing ReleaseToken { .fenceValue = N; .type = T } into CmdAcquire() will wait for all prior releases with +/// .fenceValue <= N for .type == T. Resource with a large number of subresources may introduce lots of ReleaseToken +/// potentially (e.g. released per subresource). No need to track all ReleaseToken for each resource since clients +/// can optimize this based on the fact that release type and fenceValue are exposed for each ReleaseToken: define a +/// ReleaseToken array with size @ref NumReleaseTokenTypes, only track ReleaseToken with the largest fenceValue per +/// each release type; and then passing the tracked array ReleaseToken values to CmdAcquire() is enough. +union ReleaseToken +{ + struct + { + uint32 fenceValue : 24; ///< Release fence value per token type. + uint32 type : 8; ///< Release token type (HW opaque). Note that please increase the number of bits if + /// it can't hold all types, see @ref NumReleaseTokenTypes for details. + }; + + uint32 u32All; +}; + +/// Specifies parameters for a copy from one range of a source GPU memory allocation to a range of the same size in a +/// destination GPU memory allocation. Used as an input to ICmdBuffer::CmdCopyMemory(). +struct MemoryCopyRegion +{ + gpusize srcOffset; ///< Offset in bytes into the source GPU memory allocation to copy data from. + gpusize dstOffset; ///< Offset in bytes into the destination GPU memory allocation to copy data to. + gpusize copySize; ///< Amount of data to copy in bytes. 
+}; + +/// Specifies parameters for an image copy from one region in a source image subresource to a region of the same size in +/// a destination image subresource. Used as input to ICmdBuffer::CmdCopyImage(). +/// If the region describes a copy between a 2D and a 3D image, extent.depth and numSlices must be equal and may be +/// larger than 1. +struct ImageCopyRegion +{ + SubresId srcSubres; ///< Selects the source subresource. + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + SubresId dstSubres; ///< Selects the destination subresource. + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination + /// subresource. + Extent3d extent; ///< Size of the copy region in pixels. + uint32 numSlices; ///< Number of slices the copy will span. +}; + +/// Specifies parameters for a copy between an image and a GPU memory allocation. The same structure is used regardless +/// of direction, an input for both ICmdBuffer::CmdCopyImageToMemory() and ICmdBuffer::CmdCopyMemoryToImage(). +struct MemoryImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in pixels. + uint32 numSlices; ///< Number of slices the copy will span. + gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + SwizzledFormat swizzledFormat;///< If not Undefined, reinterpret both subresources using this format and swizzle. +}; + +/// Specifies parameters for a copy between a PRT and a GPU memory allocation. 
The same structure is used regardless +/// of direction, an input for both ICmdBuffer::CmdCopyTiledImageToMemory() and ICmdBuffer::CmdCopyMemoryToTiledImage(). +struct MemoryTiledImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource; must not be a part of the packed mip tail. + Offset3d imageOffset; ///< Tile offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in tiles. + uint32 numSlices; ///< Number of slices the copy will span. + gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// Used by copy operations to temporarily interpret a range of GPU memory as a "typed buffer". A typed buffer is +/// essentially a linear image with a caller-defined row pitch and depth pitch. Typed buffer copies do not require +/// the GPU memory objects to be created with the "typedBuffer" flag. +struct TypedBufferInfo +{ + SwizzledFormat swizzledFormat; ///< The pixels in this buffer have this format. + gpusize offset; ///< Offset in bytes to the start of the copy region in the buffer's GPU memory + /// allocation. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// Specifies parameters for a copy from one region of a typed buffer to a region of the same size in a destination +/// typed buffer. Used as an input to ICmdBuffer::CmdCopyTypedBuffer(). +struct TypedBufferCopyRegion +{ + TypedBufferInfo srcBuffer; ///< How to interpret the source GPU memory allocation as a typed buffer. 
+ TypedBufferInfo dstBuffer; ///< How to interpret the destination GPU memory allocation as a typed buffer. + Extent3d extent; ///< Size of the copy region in pixels. +}; + +/// Specifies parameters for a scaled copy between an image and a typed buffer. The same structure is used regardless +/// of direction, an input for ICmdBuffer::CmdScaledCopyTypedBufferToImage(). +struct TypedBufferImageScaledCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset2d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent2d imageExtent; ///< Size of the image region in pixels. + TypedBufferInfo bufferInfo; ///< How to interpret the GPU memory allocation as a typed buffer. + Extent2d bufferExtent; ///< Size of the typed buffer region in pixels. + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. +}; + +/// Specifies parameters for a scaled image copy from one region in a source image subresource to a region in the +/// destination image subresource. Used as an input to ICmdBuffer::CmdScaledCopyImage. +struct ImageScaledCopyRegion +{ + SubresId srcSubres; ///< Selects the source subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 + uint32 srcSlices; ///< Number of source image slices to read across. +#endif + union + { + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + Offset3dFloat srcOffsetFloat; ///< Alternative representation in floating point. + }; + union + { + SignedExtent3d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates + /// a copy in the reverse direction. + Extent3dFloat srcExtentFloat; ///< Alternative representation in floating point. + }; + + SubresId dstSubres; ///< Selects the destination subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 + uint32 dstSlices; ///< Number of destination image slices to write. 
+#endif + union + { + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. + Offset3dFloat dstOffsetFloat; ///< Alternative representation in floating point. + }; + union + { + SignedExtent3d dstExtent; ///< Signed size of the destination region in pixels. A negative size + /// indicates a copy in the reverse direction. + Extent3dFloat dstExtentFloat; ///< Alternative representation in floating point. + }; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 887 + uint32 numSlices; ///< Number of slices the copy will span. +#endif + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. + /// The specified format needs to have been included in the "pViewFormats" list + /// specified at image-creation time, otherwise the result might be incorrect. +}; + +/// Specifies parameters for a color-space-conversion copy from one region in a source image subresource to a region in +/// a destination image subresource. Used as an input to ICmdBuffer::CmdColorSpaceConversionCopy. +struct ColorSpaceConversionRegion +{ + Offset2d srcOffset; ///< Offset to the start of the chosen region in the source subresource(s). + SignedExtent2d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates a copy + /// in the reverse direction. + Offset2d dstOffset; ///< Offset to the start of the chosen region in the destination subresource(s). + SignedExtent2d dstExtent; ///< Signed size of the destination region in pixels. A negative size indicates a + /// copy in the reverse direction. + SubresId rgbSubres; ///< Selects the first subresource of the RGB image where the copy will begin. This + /// can either be the source or destination of the copy, depending on whether the + /// copy is performing an RGB->YUV or YUV->RGB conversion. + uint32 yuvStartSlice; ///< Array slice of the YUV image where the copy will begin. 
All planes of planar + /// YUV images will be implicitly involved in the copy. This can either be the + /// source or destination of the copy, depending on whether the copy is performing + /// an RGB->YUV or YUV->RGB conversion. + uint32 sliceCount; ///< Number of slices the copy will span. +}; + +/// Specifies the color-space-conversion table used when converting between YUV and RGB Image formats. Used as an input +/// to ICmdBuffer::CmdColorSpaceConversionCopy. +struct ColorSpaceConversionTable +{ + float table[3][4]; ///< Values forming the conversion table matrix, which has three rows and four columns. For RGB + /// to YUV conversions, the conversion shader uses the following expressions to evaluate the + /// YUV color: + /// Y = dot( [R G B 1], [row #0] ) + /// U = dot( [R G B 1], [row #1] ) + /// V = dot( [R G B 1], [row #2] ) + /// For YUV to RGB conversions, the conversion shader uses the following expressions to + /// evaluate the RGB color: + /// R = dot( [Y U V 1], [row #0] ) + /// G = dot( [Y U V 1], [row #1] ) + /// B = dot( [Y U V 1], [row #2] ) + /// A fourth row is not needed because alpha is copied directly between the RGB and YUV colors. +}; + +/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy +/// to perform a YUV to RGB color space conversion. Represents the BT.601 standard (standard-definition TV). +extern const ColorSpaceConversionTable DefaultCscTableYuvToRgb; + +/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy +/// to perform an RGB to YUV color space conversion. Represents the BT.601 standard (standard-definition TV). +extern const ColorSpaceConversionTable DefaultCscTableRgbToYuv; + +/// Specifies flags controlling GPU copy behavior. Format related flags are ignored by DMA queues. +enum CopyControlFlags : uint32 +{ + CopyFormatConversion = 0x1, ///< Requests that the copy convert between two compatible formats.
This is ignored + /// unless both formats support @ref FormatFeatureFormatConversion. + CopyRawSwizzle = 0x2, ///< If possible, raw copies will swizzle from the source channel format into the + /// destination channel format (e.g., RGBA to BGRA). + CopyEnableScissorTest = 0x4, ///< If set, do scissor test using the specified scissor rectangle. + CopyControlAllFlags = 0x7 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single +/// sample destination image. Used as an input to ICmdBuffer::CmdResolveImage(). +struct ImageResolveRegion +{ + uint32 srcPlane; ///< The source color, depth, or stencil plane. + uint32 srcSlice; ///< Selects the source starting slice + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + uint32 dstPlane; ///< The destination color, depth, or stencil plane. + uint32 dstMipLevel; ///< Selects destination mip level. + uint32 dstSlice; ///< Selects the destination starting slice + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. + /// The format must match both subresource's native formats. + + const MsaaQuadSamplePattern* pQuadSamplePattern; ///< Specifies sample pattern for MSAA depth image. It must be a + /// valid pointer if image was created with sampleLocsAlwaysKnown + /// flag set. +}; + +/// A list of the types of PRT+ resolves that can be performed. +enum class PrtPlusResolveType : uint32 +{ + Decode = 0x0, ///< Translate from AMD HW format to format of destination image. 
+ Encode = 0x1, ///< Translate from source image to AMD HW format + Count = 0x2, +}; + +/// Input structure to the CmdResolvePrtPlusImage function +struct PrtPlusImageResolveRegion +{ + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 + SubresId srcSubresId; ///< Selects the source subresource +#else + uint32 srcMipLevel; ///< Selects source mip level + uint32 srcSlice; ///< Selects the source starting slice +#endif + + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 + SubresId dstSubresId; ///< Selects the destination subresource +#else + uint32 dstMipLevel; ///< Selects destination mip level + uint32 dstSlice; ///< Selects the destination starting slice +#endif + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Input structure to ICmdBuffer::CmdResolvePrtPlusImageToBuffer() +struct PrtPlusImageToBufferResolveRegion +{ + SubresId srcSubresId; ///< Selects the source subresource + + gpusize dstOffset; ///< Offset into destination subresource + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Input structure to ICmdBuffer::CmdResolvePrtPlusBufferToImage() +struct PrtPlusBufferToImageResolveRegion +{ + gpusize srcOffset; ///< Offset into source subresource + + SubresId dstSubresId; ///< Selects the destination subresource + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Specifies how a resolve combines the individual samples into its result: by averaging them or by taking +/// their minimum or maximum value.
+enum class ResolveMode : uint32 +{ + Average = 0x0, ///< Resolve result is an average of all the individual samples + Minimum = 0x1, ///< Resolve result is the minimum value of all individual samples + Maximum = 0x2, ///< Resolve result is the maximum value of all individual samples + Count = 0x4, ///< NOTE(review): no mode is assigned 0x3 in this enum; confirm that Count = 0x4 is intentional. +}; + +/// Specifies width of immediate data to be written out. +enum class ImmediateDataWidth : uint32 +{ + ImmediateData32Bit = 0x0, + ImmediateData64Bit = 0x1, + + Count = 0x2, +}; + +/// Specifies flags controlling GPU query behavior. +union QueryControlFlags +{ + struct + { + /// Controls accuracy of query data collection. Available only for occlusion queries. If set, occlusion query + /// is guaranteed to return imprecise non-zero value if any samples pass the depth and stencil test. Using + /// imprecise occlusion query results could improve rendering performance while an occlusion query is active. + uint32 impreciseData : 1; + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies layout of GPU memory used as an input to CmdDrawIndirectMulti. +struct DrawIndirectArgs +{ + uint32 vertexCount; ///< Number of vertices to draw. + uint32 instanceCount; ///< Number of instances to draw. + uint32 firstVertex; ///< Starting index value for the draw. Indices passed to the vertex shader will range from + /// firstVertex to firstVertex + vertexCount - 1. + uint32 firstInstance; ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from + /// firstInstance to firstInstance + instanceCount - 1. +}; + +/// Specifies layout of GPU memory used as an input to CmdDrawIndexedIndirectMulti. +/// +/// Indices passed to the vertex shader will be: +/// +/// + IndexBuffer[firstIndex] + vertexOffset +/// + IndexBuffer[firstIndex + 1] + vertexOffset, +/// + ...
+/// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset +struct DrawIndexedIndirectArgs +{ + uint32 indexCount; ///< Number of indices to fetch from the index buffer, i.e. vertices to draw. + uint32 instanceCount; ///< Number of instances to draw. + uint32 firstIndex; ///< Starting index buffer slot for the draw. + int32 vertexOffset; ///< Offset added to the index fetched from the index buffer before it is passed to the + /// vertex shader. + uint32 firstInstance; ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from + /// firstInstance to firstInstance + instanceCount - 1. +}; + +/// Specifies layout of GPU memory used as an input to CmdDispatchIndirect. +struct DispatchIndirectArgs +{ + uint32 x; ///< Threadgroups to dispatch in the X dimension. + uint32 y; ///< Threadgroups to dispatch in the Y dimension. + uint32 z; ///< Threadgroups to dispatch in the Z dimension. +}; + +/// Specifies layout of GPU memory used as an input to CmdDispatchMeshIndirect. +using DispatchMeshIndirectArgs = DispatchIndirectArgs; + +/// Specifies the CPU virtual address of an array and the stride in bytes between array elements. +struct CpuVirtAddrAndStride +{ + const void* pCpuVirtAddr; ///< CPU virtual address of the 0th array element. + struct + { + uint64 stride : 32; ///< Distance between array elements in bytes. + uint64 _pad : 32; ///< Padding for structure alignment. + }; +}; + +/// Specifies the GPU virtual address of an array and the stride in bytes between array elements. +struct GpuVirtAddrAndStride +{ + gpusize gpuVirtAddr; ///< GPU virtual address of the 0th array element. + struct + { + uint64 stride : 32; ///< Distance between array elements in bytes. + uint64 _pad : 32; ///< Padding for structure alignment. + }; +}; + +/// Flags to describe a dispatch +union DispatchInfoFlags +{ + struct + { + uint32 devDriverOverlay : 1; ///< Flag indicates this dispatch draws the DevDriver overlay + uint32 reserved : 31; ///< Reserved for future use.
+ }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies the different stages at which a combiner can choose between different shading rates. +enum class VrsCombinerStage : uint32 +{ + ProvokingVertex, ///< Chooses between the shading rate specified by the VrsRateParams struct and the shader + /// rate provided by the provoking vertex. + Primitive, ///< Chooses between previous combiner stage and the shader rate associated with the primitive + Image, ///< Chooses between previous combiner stage and the shader rate associated with an image + PsIterSamples, ///< Chooses between previous combiner stage and the PS_ITER_SAMPLES rate. + Max +}; + +/// Specifies the different possible shading rates. Not all are supported on all HW; see the supportedVrsRates +/// entry in the gfxipProperties structure. +enum class VrsShadingRate : uint32 +{ + _16xSsaa = 0x0, + _8xSsaa = 0x1, + _4xSsaa = 0x2, + _2xSsaa = 0x3, + _1x1 = 0x4, + _1x2 = 0x5, + _2x1 = 0x6, + _2x2 = 0x7, + Count +}; + +/// Indices into the centerOffset array member of the VrsCenterState structure. +enum class VrsCenterRates : uint32 +{ + _1x1 = 0x0, + _1x2 = 0x1, + _2x1 = 0x2, + _2x2 = 0x3, + Max = 0x4, +}; + +/// Specifies the different ways in which a combiner can choose between two different shading rate inputs. +enum class VrsCombiner : uint32 +{ + Passthrough = 0, ///< Keep previous shading rate. + Override = 1, ///< C.xy = B.xy + Min = 2, ///< min(A.xy, B.xy) + Max = 3, ///< max(A.xy, B.xy) + Sum = 4, ///< min(maxRate, A.xy + B.xy) + Count +}; + +/// Structure for defining parameters to the CmdSetPerDrawVrsRate function. +struct VrsRateParams +{ + /// The shading rate to be bound to the render state. + VrsShadingRate shadingRate; + + /// The state of all the combiners. + VrsCombiner combinerState[static_cast(VrsCombinerStage::Max)]; + + union + { + struct + { + uint32 exposeVrsPixelsMask : 1; ///< Controls how the shader input mask of a coarse pixel is generated.
+ /// 0 : Bitwise OR of all fine pixels' masks + /// 1 : Pack fine pixels' coverage masks into iMask. Layout based + /// on VRS rate + uint32 reserved : 31; + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags controlling VRS rate parameters +}; + +/// Structure for defining parameters to the CmdSetVrsCenterState function. +struct VrsCenterState +{ + /// Per-shading-rate offsets applied to the center location; indexed by VrsCenterRates. + /// Center offsets are specified as two 4-bit signed integer values representing a location on a 16x16 grid. + /// The offset is scaled by the coarse pixel size and then added to the center location. + /// 1x1, 1x2, 2x1 and 2x2 shading rates can all have their own unique offsets. + Offset2d centerOffset[static_cast(VrsCenterRates::Max)]; + + union + { + struct + { + uint32 overrideCenterSsaa : 1; ///< Override center interpolants to be evaluated at the sample + /// position. + uint32 overrideCentroidSsaa : 1; ///< Override centroid interpolants to be evaluated at the centroid + /// of each sample group being iterated (simply the sample position + /// in the typical case of 1-sample groups). + uint32 alwaysComputeCentroid : 1; ///< Don't assume the centroid of a fully covered shading region is + /// the center. It is possible all samples could be lit but the + /// center is not lit for certain combinations of centerOffset[] + /// values and programmable sample positions + uint32 reserved : 29; ///< Reserved for future HW + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags controlling VRS center state +}; + +/// @internal +/// Function pointer type definition for setting pipeline-accessible user data entries to the specified values. Each +/// command buffer object has one such callback per pipeline bind point, so the bind point is implicit. +/// +/// @see ICmdBuffer::CmdSetUserData().
+typedef void (PAL_STDCALL *CmdSetUserDataFunc)( + ICmdBuffer* pCmdBuffer, + uint32 firstEntry, + uint32 entryCount, + const uint32* pEntryValues); + +/// @internal Function pointer type definition for issuing non-indexed draws. +/// +/// @see ICmdBuffer::CmdDraw(). +typedef void (PAL_STDCALL *CmdDrawFunc)( + ICmdBuffer* pCmdBuffer, + uint32 firstVertex, + uint32 vertexCount, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId); + +/// @internal Function pointer type definition for issuing draws auto. +/// +/// @see ICmdBuffer::CmdDrawOpaque(). +typedef void (PAL_STDCALL *CmdDrawOpaqueFunc)( + ICmdBuffer* pCmdBuffer, + gpusize streamOutFilledSizeVa, + uint32 streamOutOffset, + uint32 stride, + uint32 firstInstance, + uint32 instanceCount); + +/// @internal Function pointer type definition for issuing indexed draws. +/// +/// @see ICmdBuffer::CmdDrawIndexed(). +typedef void (PAL_STDCALL *CmdDrawIndexedFunc)( + ICmdBuffer* pCmdBuffer, + uint32 firstIndex, + uint32 indexCount, + int32 vertexOffset, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId); + +/// @internal Function pointer type definition for issuing indirect draws. +/// +/// @see ICmdBuffer::CmdDrawIndirectMulti(). +typedef void (PAL_STDCALL *CmdDrawIndirectMultiFunc)( + ICmdBuffer* pCmdBuffer, + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr); + +/// @internal Function pointer type definition for issuing indexed, indirect draws. +/// +/// @see ICmdBuffer::CmdDrawIndexedIndirectMulti(). +typedef void (PAL_STDCALL *CmdDrawIndexedIndirectMultiFunc)( + ICmdBuffer* pCmdBuffer, + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr); + +/// @internal Function pointer type definition for issuing direct dispatches. +/// +/// @see ICmdBuffer::CmdDispatch(). 
+typedef void (PAL_STDCALL *CmdDispatchFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims size, + DispatchInfoFlags infoFlags); + +/// @internal Function pointer type definition for issuing indirect dispatches. +/// +/// @see ICmdBuffer::CmdDispatchIndirect(). +typedef void (PAL_STDCALL *CmdDispatchIndirectFunc)( + ICmdBuffer* pCmdBuffer, + gpusize gpuVirtAddr); +/// @internal Function pointer type definition for issuing direct dispatches with threadgroup offsets. +/// +/// @see ICmdBuffer::CmdDispatchOffset(). +typedef void (PAL_STDCALL *CmdDispatchOffsetFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims offset, + DispatchDims launchSize, + DispatchDims logicalSize); + +/// @internal Function pointer type definition for issuing direct mesh dispatches. +/// +/// @see ICmdBuffer::CmdDispatchMesh(). +typedef void (PAL_STDCALL *CmdDispatchMeshFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims size); + +/// @internal Function pointer type definition for issuing indirect mesh dispatches. +/// +/// @see ICmdBuffer::CmdDispatchMeshIndirectMulti(). +typedef void (PAL_STDCALL *CmdDispatchMeshIndirectMultiFunc)( + ICmdBuffer* pCmdBuffer, + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr); + +/// This struct provides the parameters of all the supported features for kernel dispatch +struct DispatchAqlParams +{ + const hsa_kernel_dispatch_packet_t* pAqlPacket; ///< Pointer to AQL packet contains the essential + /// information (size of workgroup, grid, data + /// segments, handle of kernel code object, kernel + /// arguments) of the kernel to be dispatched. 
+ gpusize scratchAddr; ///< GPU VM scratch buffer address + uint32 scratchSize; ///< Scratch buffer size + uint32 scratchOffset; ///< Scratch buffer offset from the base for generic + /// address space +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 920 + const llvm::amdhsa::kernel_descriptor_t* pCpuAqlCode; ///< AMD kernel descriptor on CPU for PM4 emulation +#else + const amd_kernel_code_t* pCpuAqlCode; ///< AMD kernel code object on CPU for PM4 emulation +#endif + gpusize hsaQueueVa; ///< GPU VM address where amd_queue_t is allocated + uint32 wavesPerSh; ///< Waves per shader array + bool useAtc; ///< Indicates whether ATC bit in registers should be set + /// ATC bit is used for indicating if the address + /// is GPUVM(=0) or SVM(=1). Valid only in gfx6 and older + uint64 kernargSegmentSize; ///< The size of the kernarg segment that holds the + /// values of the arguments to the kernels + uint32 workitemPrivateSegmentSize; ///< The amount of fixed private address + /// space memory required for a workitem. + uint32 aqlPacketIndex; ///< AQL ID in QueueCreateInfo.aqlPacketList for debugger. + /// Note: debugger support enabled for Navi3x+ + +}; + +/// @internal Function pointer type definition for issuing AQL dispatches. +/// +/// @see ICmdBuffer::CmdDispatchAql(). +typedef void (PAL_STDCALL *CmdDispatchAqlFunc)( + ICmdBuffer* pCmdBuffer, + const DispatchAqlParams& dispatchInfo); + +/// Specifies input assembler state for draws. +/// @see ICmdBuffer::CmdSetInputAssemblyState +struct InputAssemblyStateParams +{ + PrimitiveTopology topology; ///< Defines how vertices should be interpreted and rendered by + /// the graphics pipeline. + uint8 patchControlPoints; ///< # of control points per patch. [0-32] valid. Should be set to + /// 0 by clients if topology is not PrimitiveTopology::Patch.
+ bool primitiveRestartEnable; ///< Enables the index specified by primitiveRestartIndex to _cut_ + /// a primitive (i.e., triangle strip) and begin a new primitive + /// with the next index. + bool primitiveRestartMatchAllBits; ///< Specifies which bits from primitiveRestartIndex to use. + /// false - only check relevant bits based on index type + /// true - check all 32 bits irrespective of index type + uint32 primitiveRestartIndex; ///< When primitiveRestartEnable is true, this is the index value + /// that will restart a primitive. When using a 16-bit index + /// buffer, the upper 16 bits of this value will be ignored. +}; + +/// Specifies parameters for controlling triangle rasterization. +/// @see ICmdBuffer::CmdSetTriangleRasterState +struct TriangleRasterStateParams +{ + struct + { + FillMode frontFillMode : 2; ///< Whether front-facing triangles should be rendered solid or wireframe. + FillMode backFillMode : 2; ///< Whether back-facing triangles should be rendered solid or wireframe. + CullMode cullMode : 2; ///< Specifies which, if any, triangles should be culled based on whether + /// they are front or back facing. + FaceOrientation frontFace : 1; ///< Specifies the vertex winding that results in a front-facing triangle. + ProvokingVertex provokingVertex : 1; ///< Specifies whether the first or last vertex of a primitive is the + }; /// provoking vertex as it affects flat shading. + union + { + struct + { + uint8 frontDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for front-facing + /// triangle-based primitives + uint8 backDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for back-facing + /// triangle-based primitives + uint8 reserved : 6; ///< Reserved for future use. + }; + uint8 u8All; ///< Flags packed as 8-bit uint. + } flags; ///< Triangle raster state flags. +}; + +/// Specifies parameters for controlling point and line rasterization. 
+/// @see ICmdBuffer::CmdSetPointLineRasterState +struct PointLineRasterStateParams +{ + float pointSize; ///< Width of a point primitive in pixels. + float lineWidth; ///< Width of a line primitive in pixels. + float pointSizeMin; ///< Minimum width of a point primitive in pixels. + float pointSizeMax; ///< Maximum width of a point primitive in pixels. +}; + +/// Specifies parameters for controlling line stippling. +/// @see ICmdBuffer::CmdSetLineStippleState +struct LineStippleStateParams +{ + uint16 lineStippleValue; ///< Line stipple bit pattern. + uint32 lineStippleScale; ///< Line stipple repeat factor. +}; + +/// Specifies parameters for setting up depth bias. Depth Bias is used to ensure a primitive can properly be displayed +/// (without Z fighting) in front (or behind) of the previously rendered co-planar primitive. This is useful for decal +/// or shadow rendering. +/// @see ICmdBuffer::CmdSetDepthBiasState +struct DepthBiasParams +{ + float depthBias; ///< Base depth bias to be added to each fragment's Z value. In units of the + /// minimum delta representable in the bound depth buffer. + float depthBiasClamp; ///< Maximum allowed depth bias result. Prevents polygons viewed at a sharp value + /// from generating very large biases. + float slopeScaledDepthBias; ///< Factor multiplied by the depth slope (change in Z coord per x/y pixel) to + /// create more bias for "steep" polygons. This result is applied to the final + /// Z value in addition to the base depthBias parameter. +}; + +/// Specifies parameters for setting the value range to be used for depth bounds testing. +/// @see ICmdBuffer::CmdSetDepthBounds +struct DepthBoundsParams +{ + float min; ///< Minimum depth value in passing range (closest). + float max; ///< Maximum depth value in passing range (farthest). +}; + +/// Specifies parameters for setting bit-masks applied to stencil buffer reads and writes.
+/// @see ICmdBuffer::CmdSetStencilRefMasks +struct StencilRefMaskParams +{ + + uint8 frontRef; ///< Stencil reference value for front-facing polygons. + uint8 frontReadMask; ///< Bitmask to restrict stencil buffer reads for front-facing polygons. + uint8 frontWriteMask; ///< Bitmask to restrict stencil buffer writes for front-facing polygons. + uint8 frontOpValue; ///< Stencil operation value for front-facing polygons. + /// This is the value used as a parameter for a given stencil operation. + /// For example: StencilOp::IncWrap will use this value when incrementing the current + /// stencil contents. Typically, this would be set to one, but on AMD hardware, + /// this register is 8 bits so there is a greater flexibility. + + uint8 backRef; ///< Stencil reference value for back-facing polygons. + uint8 backReadMask; ///< Bitmask to restrict stencil buffer reads for back-facing polygons. + uint8 backWriteMask; ///< Bitmask to restrict stencil buffer writes for back-facing polygons. + uint8 backOpValue; ///< Stencil operation value for back-facing polygons - See description of frontOpValue + /// for further details. + union + { + uint8 u8All; ///< Flags packed as a 8-bit uint. + struct + { + uint8 updateFrontRef : 1; ///< Updating reference value for front-facing polygons. + uint8 updateFrontReadMask : 1; ///< Updating read mask value for front-facing polygons. + uint8 updateFrontWriteMask : 1; ///< Updating write mask value for front-facing polygons. + uint8 updateFrontOpValue : 1; ///< Updating stencil op value for front-facing polygons. + uint8 updateBackRef : 1; ///< Updating reference value for back-facing polygons. + uint8 updateBackReadMask : 1; ///< Updating read mask value for back-facing polygons. + uint8 updateBackWriteMask : 1; ///< Updating write mask value for back-facing polygons. + uint8 updateBackOpValue : 1; ///< Updating stencil op value for back-facing polygons. 
+ }; + } flags; ///< Flags to indicate which of the stencil state values are being updated. +}; + +/// HiS always exposes two pretests. +constexpr uint32 NumHiSPretests = 2; + +/// Hierarchical stencil (HiS) allows work to be discarded by the stencil test at tile rate in certain cases. +/// In order to use HiS, the client will define a set of pretests that will be performed whenever a particular stencil +/// buffer is written. The stencil image will track the results of the pretest for each 8x8 tile, keeping a record of +/// whether any pixel in the tile "may-pass" or "may-fail" the specified pretest. When stencil testing is enabled, +/// the hardware may be able to discard whole tiles early based on what it can glean from the HiS pretest states. +/// +/// Each stencil image has two pretest slots per mip level. Pretest slots are reset when an initialization barrier +/// targets their mip level on the stencil plane. The client can then pass this struct to @ref CmdUpdateHiSPretests +/// to bind one or more valid pretests. It is legal to bind a pretest over a reset slot at any point. +/// +/// @warning Except in special cases, it is illegal to bind a pretest on top of an existing pretest. +/// +/// It is only legal to bind a new pretest on top of an existing pretest if: +/// 1. All array slices within the given mip have been reset using an initialization barrier. +/// 2. The client guarantees that they will rewrite all stencil values in all array slices within the given mip +/// before the next draw with stencil testing enabled by doing either: +/// a. One or more calls to @ref CmdClearDepthStencil. +/// b. One or more draws with the stencil test disabled and stencil writes enabled. +/// +/// Once pretests are selected via @ref CmdUpdateHiSPretests the client should keep track of which tests were enabled +/// on each stencil image and provide them to every call to @ref CmdClearDepthStencil. 
This is optional but PAL will +/// not be able to generate HiS optimized clears unless it is given the current pretests. +/// +/// @warning The pretests provided to @ref CmdUpdateHiSPretests are applied to all mips of all subresource ranges. +/// If the client varies pretests between mips they must guarantee that the given pretests were bound to all +/// mips in the given subresource ranges. +/// +/// This feature works best if the future stencil test behavior is known, either directly told via an API extension +/// or via an app profile in the client layer. For example, if the application 1) clears stencil, 2) does a pass to +/// write stencil, 3) then does a final pass that masks rendering based on the stencil value being > 0, ideally we +/// would choose a pretest of func=Greater, mask=0xFF, and value=0 so that #2 would update the stencil image with +/// per-tile data that lets #3 be accelerated at maximum efficiency. +/// +/// In absence of app-specific knowledge, the following algorithm may be a good generic approach: +/// 1. When the stencil image is cleared, set pretest #0 to func=Equal, mask=0xFF, and value set to the clear value. +/// 2. On the first draw with stencil writes enabled, set pretest #1 with the mask set to the app's current stencil +/// mask, and +/// a. If the stencil op is INC or DEC, set func=GreaterEqual and value the same as in #1. +/// b. If the stencil op is REPLACE, set func=Equal and set value to the app's current stencil ref value. +/// +/// Note that HiS can only be beneficial for GPU performance so clients that do not want to implement app profiles or +/// generic heuristics should at least hard-code both tests to something simple. +struct HiSPretests +{ + struct + { + CompareFunc func; ///< This function is used to compare the pretest value with the image's stencil value. + /// The expression is evaluated with the pretest value as the left-hand operand and the + /// image's stencil value as the right-hand operand.
+ uint8 mask; ///< This value is ANDed with both stencil values before evaluating the comparison. + uint8 value; ///< The pretest value, used as the left-hand operand in the comparison. + bool isValid; ///< True if this pretest contains valid information. Set to false to skip this test. + } test[NumHiSPretests]; ///< The set of pretest slots. +}; + +/// Specifies coordinates for setting up single user clip plane. +/// @see ICmdBuffer::CmdSetUserClipPlanes +struct UserClipPlane +{ + float x; ///< Plane coordinate x + float y; ///< Plane coordinate y + float z; ///< Plane coordinate z + float w; ///< Plane coordinate w +}; + +/// Specifies parameters for setting the constant factor to be used by the blend hardware when programmed with the +/// Blend::ConstantColor, Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend +/// coefficients. +/// @see ICmdBuffer::CmdSetBlendConst +struct BlendConstParams +{ + float blendConst[4]; ///< 4-component RGBA float specifying the new blend constant. +}; + +/// Specifies the parameters for a single viewport +struct Viewport +{ + float originX; ///< X coordinate for the viewport's origin. + float originY; ///< Y coordinate for the viewport's origin. + float width; ///< Width of the viewport. + float height; ///< Height of the viewport. + float minDepth; ///< Minimum depth value of the viewport. Must be in the [0..1] range. + float maxDepth; ///< Maximum depth value of the viewport. Must be in the [0..1] range. + PointOrigin origin; ///< Origin of the viewport relative to NDC. UpperLeft or LowerLeft. +}; + +/// Specifies the viewport transform parameters for setting a single viewport. +/// @see ICmdBuffer::CmdSetViewport +struct ViewportParams +{ + uint32 count; ///< Number of viewports. + float horzDiscardRatio; ///< The ratio between guardband discard rect width and viewport width. + /// For all guard band ratio settings, values less than 1.0f are illegal. 
+ /// Value FLT_MAX opens the guardband as wide as the HW supports. + /// Value 1.0f disables the guardband. + float vertDiscardRatio; ///< The ratio between guardband discard rect height and viewport height. + float horzClipRatio; ///< The ratio between guardband clip rect width and viewport width. + float vertClipRatio; ///< The ratio between guardband clip rect height and viewport height. + DepthRange depthRange; ///< Specifies the target range of Z values + // Define viewports array at the end of the structure as it is common to only access the first N from the CPU. + Viewport viewports[MaxViewports]; ///< Array of descriptors for each viewport. +}; + +/// Specifies the parameters for specifying the scissor rectangles. +struct ScissorRectParams +{ + uint32 count; ///< Number of scissor rectangles. + Rect scissors[MaxViewports]; ///< Array of scissor regions corresponding to each viewport. +}; + +/// Specifies parameters for setting the global scissor rectangle. +/// @see ICmdBuffer::CmdSetGlobalScissor +struct GlobalScissorParams +{ + Rect scissorRegion; ///< Rectangle of the global scissor window. +}; + +/// Specifies parameters for binding the color targets and depth target. +/// @see ICmdBuffer::CmdBindTargets +struct BindTargetParams +{ + uint32 colorTargetCount; ///< Number of color targets to bind. + ColorTargetBindInfo colorTargets[MaxColorTargets]; ///< Array of color target descriptors. + DepthStencilBindInfo depthTarget; ///< Describes the depth target bind info. +}; + +/// Specifies parameters for binding the stream-output targets. +/// @see ICmdBuffer::CmdBindStreamOutTargets +struct BindStreamOutTargetParams +{ + struct + { + gpusize gpuVirtAddr; ///< GPU virtual address of this stream-output target. Must be DWORD-aligned. If + /// this is zero, 'size' is ignored and the target is considered un-bound. + gpusize size; ///< Size of this stream-output target, in bytes. Must be DWORD-aligned.
+ } target[MaxStreamOutTargets]; ///< Describes the stream-output target for each buffer slot. +}; + +/// Specifies the different types of predication ops available. +enum class PredicateType : uint32 +{ + Zpass = 1, ///< Enable occlusion predicate + PrimCount = 2, ///< Enable streamout predicate + Boolean64 = 3, ///< CP PFP treats memory as a 64bit integer which is either false (0) or true, DX12 style. + Boolean32 = 4, ///< CP PFP treats memory as a 32bit integer which is either false (0) or true, Vulkan style. + Count +}; + +/// Bitfield structure used to specify masks for functions that operate on depth and/or stencil planes of an image. +union DepthStencilSelectFlags +{ + struct + { + /// Select Depth. + uint32 depth : 1; + + /// Select Stencil. + uint32 stencil : 1; + + /// Reserved for future usage. + uint32 reserved : 30; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies information related to clearing a bound color target. Input structure to CmdClearBoundColorTargets(). +struct BoundColorTarget +{ + uint32 targetIndex; ///< Render target index where the target image is currently bound. + SwizzledFormat swizzledFormat; ///< Format and swizzle of the target image. + uint32 samples; ///< Sample count for the target. + uint32 fragments; ///< Fragment count for the target. + ClearColor clearValue; ///< clear color value. +}; + +/// Specifies clear region to clear a bound target. Input structure to CmdClearBoundColorTargets() and +/// CmdClearBoundDepthStencilTargets() +struct ClearBoundTargetRegion +{ + Rect rect; ///< The 2D region to clear. + uint32 startSlice; ///< The starting slice to clear. + uint32 numSlices; ///< The number of slices to clear. +}; + +/// Specifies flags controlling CmdSaveComputeState and CmdRestoreComputeState. PAL clients must be aware that saving +/// and restoring specific state in a nested command buffer may not be supported. 
The rule is simple: if the client +/// requires that the caller leak the given state to the callee, PAL will not support saving and restoring that state. +enum ComputeStateFlags : uint32 +{ + ComputeStatePipelineAndUserData = 0x1, ///< Selects the bound compute pipeline, all non-indirect user data, and all + /// kernel arguments (if applicable). Note that the current user data will + /// be invalidated on CmdSaveComputeState. + ComputeStateBorderColorPalette = 0x2, ///< Selects the bound border color palette that affects compute pipelines. + ComputeStateAll = 0x3, ///< Selects all state +}; + +/// Provides dynamic command buffer flags during submission +/// The following flags are used for Frame Pacing when delay time is configured to be calculated by KMD. +/// (Currently DX clients require this). +/// For clients that do not need Frame Pacing with KMD calculated delay time, they can ignore these flags: +/// +/// - frameBegin and frameEnd : Client's presenting queue should track its present state, +/// and set frameBegin flag on the first command buffer after present, +/// set frameEnd flag on the last command buffer before present. (Could be the Present command buffer itself.) +/// We don't need to set them on queues other than the presenting queue. +/// - P2PCmd : Mark a P2P copy command. KMD could use this flag for adjustments for its frame time calculation. +/// For the current frame time algorithm, clients should only set this flag on SW compositing copy command. +/// But KMD may adjust their algorithm, and clients should update the flag depending on KMD needs. +/// +/// The following flags are used for Direct Capture. +/// +/// - captureBegin and captureEnd : Direct capture info should be filled if any of these is set. And captureEnd flag +/// also notifies KMD that the on-screen primary is safe to release.
+struct CmdBufInfo +{ + union + { + struct + { + uint32 isValid : 1; ///< Indicate if this CmdBufInfo is valid and should be submitted + uint32 frameBegin : 1; ///< First command buffer after Queue creation or Present. + uint32 dfSpmTraceBegin : 1; ///< This command buffer begins a DF SPM trace. + uint32 dfSpmTraceEnd : 1; ///< This command buffer ends a DF SPM trace. + uint32 frameEnd : 1; ///< Last command buffer before Present. + uint32 p2pCmd : 1; ///< Is P2P copy command. See CmdBufInfo comments for details. + uint32 captureBegin : 1; ///< This command buffer begins a Direct Capture frame capture. + uint32 captureEnd : 1; ///< This command buffer ends a Direct Capture frame capture. + uint32 rayTracingExecuted : 1; ///< This command buffer contains ray tracing work. + uint32 preflip : 1; ///< This command buffer has pre-flip access to DirectCapture resource + uint32 postflip : 1; ///< This command buffer has post-flip access to DirectCapture resource + uint32 privateFlip : 1; ///< Need to flip to a private primary surface for DirectCapture feature + uint32 vpBltExecuted : 1; ///< This command buffer contains VP Blt work. + uint32 disableDccRejected : 1; ///< Reject KMD's DisableDcc request to avoid writing to front buffer.
+ uint32 noFlip : 1; ///< No flip when DirectCapture access submission completes + uint32 frameGenIndex : 4; ///< Index of the DirectCapture feature generated frames + uint32 noRenderPresent : 1; ///< Last command buffer before present which is no render present or not + uint32 motionVectorPropChanged : 1; ///< Indicates whether motion vector properties changed + uint32 depthPropChanged : 1; ///< Indicates whether depth properties changed + uint32 cameraPropChanged : 1; ///< Indicates whether camera matrix properties changed + uint32 capturePrimary : 1; ///< Has Direct Capture primary surface capture + uint32 captureMotionVector : 1; ///< Has Direct Capture motion vector capture + uint32 captureDepth : 1; ///< Has Direct Capture depth capture + uint32 captureCamera : 1; ///< Has Direct Capture camera matrix capture + uint32 hudLessImagePropChanged : 1; ///< Indicates whether HUD less image properties changed + uint32 captureHudLessImage : 1; ///< Has Direct Capture HUD less image capture + uint32 reserved : 3; ///< Reserved for future usage. + }; + uint32 u32All; ///< Flags packed as uint32. + }; + + const IGpuMemory* pPrimaryMemory; ///< The primary's gpu memory object used for passing its allocation handle + /// to KMD for pre-flip primary access (PFPA). If frame metadata flags + /// specifies that primaryHandle should be sent, clients should set this to + /// current frame pending primary's IGpuMemory object on the creating GPU + /// for the frameEnd command. Otherwise set this to nullptr. + const IGpuMemory* pDirectCapMemory; ///< The Direct Capture gpu memory object. It should be set if flag + /// captureBegin or captureEnd is set. Otherwise set this to nullptr. + const IGpuMemory* pPrivFlipMemory; ///< The gpu memory object of the private flip primary surface for the + /// DirectCapture feature. + const Util::Event* pEarlyPresentEvent; ///< The 'early present' event object. This variable can be nullptr. 
+ uint64 frameIndex; ///< The frame index of this command buffer. It is only required for the + /// DirectCapture feature + uint32 vidPnSourceId; ///< The display source id for the DirectCapture feature. Clients must set + /// a valid vidPnSourceId when privateFlip flag is set and pDirectCapMemory + /// is nullptr. + uint64 frameId; ///< Present frame index, incremented at each present + const IGpuMemory* pMotionVectorMemory; ///< The motion vector gpu memory object for the DirectCapture feature. + const IGpuMemory* pDepthMemory; ///< The depth gpu memory object for the DirectCapture feature. + const IGpuMemory* pCameraMemory; ///< The camera gpu memory object for the DirectCapture feature. + const IGpuMemory* pHudLessImageMemory; ///< The HUD less image gpu memory object for DirectCapture. +}; + +/// Specifies rotation angle between two images. Used as input to ICmdBuffer::CmdScaledCopyImage. +enum class ImageRotation : uint32 +{ + Ccw0 = 0x0, ///< Counter clockwise degree 0 + Ccw90 = 0x1, ///< Counter clockwise degree 90 + Ccw180 = 0x2, ///< Counter clockwise degree 180 + Ccw270 = 0x3, ///< Counter clockwise degree 270 + Count +}; + +/// Describes a color-key value which can control a pixel get copied or ignored during a CmdScaledCopyImage operation. +struct ColorKey +{ + uint32 u32Color[4]; ///< The color value for each channel +}; + +/// Uniquely identifies the target of the a Present operation (swap chain / destination window / etc.) so that PAL's debug +/// layers can track frames-per-second or other statistics correctly when applications render to multiple displays or +/// windows. Client drivers which don't care about this can always specify a key value of 0. +using UniquePresentKey = uint64; + +/// Convert an OS window handle to a unique present key. 
+inline UniquePresentKey PresentKeyFromOsWindowHandle(OsWindowHandle handle) +#if defined(_WIN32) + { return reinterpret_cast<UniquePresentKey>(handle); } +#else + { return handle.win; } +#endif +/// Convert any pointer to a unique present key (the pointer value itself is reinterpreted as the key). +template <typename T> +constexpr inline UniquePresentKey PresentKeyFromPointer(T* ptr) { return reinterpret_cast<UniquePresentKey>(ptr); } + +/// Specifies the input parameters for debug overlay's visual confirm. This struct is not functional. +/// The client is expected to default initialize this struct and then fill out any state that makes +/// sense under its presentation model. PAL will process any valid input and ignore fields that are +/// default initialized. +struct CmdPostProcessDebugOverlayInfo +{ + PresentMode presentMode; ///< The Presentation Mode of the application. + WsiPlatform wsiPlatform; ///< The WsiPlatform that Swap Chain works upon + UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. +}; + +/// Specifies the input parameters for ICmdBuffer::CmdPostProcessFrame. +struct CmdPostProcessFrameInfo +{ + union + { + struct + { + uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. + uint32 reserved : 31; ///< Reserved for future usage. + }; + uint32 u32All; ///< Flags packed as uint32. + } flags; + + union + { + const IImage* pSrcImage; ///< The image to postprocess (prior to presenting). + const IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to postprocess. + /// Must have been created as a typed buffer. + }; + + CmdPostProcessDebugOverlayInfo debugOverlay; + FullScreenFrameMetadataControlFlags fullScreenFrameMetadataControlFlags; + + Pal::ImageLayout srcImageLayout; +}; + +/// External flags for ScaledCopyImage. +union ScaledCopyFlags +{ + struct + { + uint32 srcColorKey : 1; ///< If set, enables source color-keying by using the value in the ColorKey member.
+ /// That is, any pixel in the source image that matches the color key should not be + /// copied to the destination image, and all of the source pixels that do not match + /// the color key should be copied. Mutually exclusive with dstColorKey. + uint32 dstColorKey : 1; ///< If set, enables destination color-keying by using the value in the ColorKey + /// member. That is, any pixel in the destination image that matches the color key + /// should be replaced with the corresponding pixel from the source image, and all of + /// the destination pixels that do not match the color key should not be replaced. + /// Mutually exclusive with srcColorKey. + uint32 srcAlpha : 1; ///< If set, use alpha channel in source surface as blend factor. + /// color = src alpha * src color + (1.0 - src alpha) * dst color. + uint32 dstAsSrgb : 1; ///< If set, a non-srgb destination image will be treated as srgb format. + /// Cannot be set if @ref dstAsNorm is set. + uint32 dstAsNorm : 1; ///< If set, a srgb destination image will be treated as non-srgb format. + /// Cannot be set if @ref dstAsSrgb is set. + uint32 scissorTest : 1; ///< If set, do scissor test using the specified scissor rectangle. + uint32 coordsInFloat : 1; ///< If set, copy regions are represented in floating point type. + uint32 srcAsNorm : 1; ///< If set, an srgb source image will be treated as non-srgb format. + /// Cannot be set if @ref srcAsSrgb is set. + uint32 srcAsSrgb : 1; ///< If set, a non-srgb source image will be treated as srgb format. + /// Cannot be set if @ref srcAsNorm is set. + uint32 reserved : 23; ///< reserved for future usage. + }; + uint32 u32All; ///< Flags packed as uint32. +}; + +/// Input structure to @ref ICmdBuffer::CmdScaledCopyImage. Specifies parameters needed to execute CmdScaledCopyImage. +struct ScaledCopyInfo +{ + const IImage* pSrcImage; ///< The source image to blt from. + ImageLayout srcImageLayout; ///< The source image layout. 
+ const IImage* pDstImage; ///< The dest image to blt to. + ImageLayout dstImageLayout; ///< The dest image layout. + uint32 regionCount; ///< Copy region array size. + const ImageScaledCopyRegion* pRegions; ///< Region array to copy. + TexFilter filter; ///< Controlling how a given texture is sampled. + ImageRotation rotation; ///< Rotation option between two images. + const ColorKey* pColorKey; ///< Color key value. + const Rect* pScissorRect; ///< Scissor test rectangle. + ScaledCopyFlags flags; ///< Copy flags, identifies the type of blt to perform. +}; + +/// Input structure to @ref ICmdBuffer::CmdGenerateMipmaps. Specifies parameters needed to execute CmdGenerateMipmaps. +struct GenMipmapsInfo +{ + const IImage* pImage; ///< Populate mips in this image by reading from existing higher-level mips. + ImageLayout baseMipLayout; ///< The layout of all slices in the read-only base mip; must include LayoutCopySrc. + ImageLayout genMipLayout; ///< The layout of all slices and mips that will be generated; must include + /// LayoutCopySrc and LayoutCopyDst. + SubresRange range; ///< Which subresources should be generated from earlier mips. The starting mipLevel + /// must never be zero because there would be no larger mip to read. + TexFilter filter; ///< Controls texture sampling during mip generation. Linear texture filtering is + /// only supported for images with non-integer formats. + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret all subresources using this format and swizzle. + /// The specified format needs to have been included in the "pViewFormats" list + /// specified at image-creation time, otherwise the result might be incorrect. +}; + +/// Defines a single memory range to prefetch using CmdPrimeGpuCaches. +struct PrimeGpuCacheRange +{ + gpusize gpuVirtAddr; ///< Base GPU virtual address to be prefetched. + gpusize size; ///< Number of bytes to prefetch.
Clients should keep range sizes small relative + /// to the GPU caches (e.g., tccSizeInBytes); the PAL implementation may clamp + /// prefetched ranges if they are too large for the cache being prefetched. + uint32 usageMask; ///< Bitmask of CacheCoherencyUsageFlags defining the usage to prefetch for. + /// E.g., if the mask includes CoherShader, then PAL will attempt to prefetch + /// into caches that are on the shader core's data path. This mask must be a + /// subset of the dstCacheMask specified in the last barrier operation executed + /// on this memory range. Performing the cache prefetch is considered a read + /// operation of the specified usage, and so must be properly accounted for + /// in future barrier memory dependencies for this range. + bool addrTranslationOnly; ///< If set, only the address translation caches (i.e., TLB) will be primed; + /// no data caches will be affected. If this is set, the prefetch operation + /// has no bearing on barrier execution or memory dependencies. +}; + +/// Magic number tag for payloads in command buffer dumps +constexpr uint32 CmdBufferPayloadSignature = 0x1337F77D; + +/// Maximum size, in DWORDs, of payload data in command buffer dumps. +constexpr uint32 MaxPayloadSize = 254; + +/// Payload types used in special embedded NOP packets. +enum class CmdBufferPayloadType : uint32 +{ + Integer = 0, ///< Payload consists of a single 32-bit signed integer. + UnsignedInteger = 1, ///< Payload consists of a single 32-bit unsigned integer. + Integer64 = 2, ///< Payload consists of a single 64-bit signed integer. + UnsignedInteger64 = 3, ///< Payload consists of a single 64-bit unsigned integer. + Float = 4, ///< Payload consists of a single 32-bit floating point number. + Double = 5, ///< Payload consists of a single 64-bit double precision floating point number. + Pointer = 6, ///< Payload consists of a single 64-bit pointer address. + String = 7, ///< Payload consists of a variable length string. 
Must contain null-terminator. + Binary = 8, ///< Payload consists of DWORD-aligned binary data. +}; + +/// Structure layout for embedded CmdBuffer payloads. This can be embedded into the command stream with the +/// @ref ICmdBuffer::CmdNop() function. +struct CmdBufferPayload +{ + uint32 signature; ///< Magic number tag indicating the structure to follow. + uint32 payloadSize; ///< Size of the NOP packet (one DWORD) plus the sizeof this structure and the + /// payload data to follow. + /// This value is in DWORDs. Payload size is expected to be under + /// MaxPayloadSize. + CmdBufferPayloadType type; ///< The type of payload. + uint32 payload[1]; ///< Initial DWORD of payload data with the other data to follow. +}; + +/// Flags controlling which sub-queue(s) of a command buffer should insert an RGP trace marker. Zeroing out this +/// union is invalid, because RGP markers must be sent to at least one sub-queue. +union RgpMarkerSubQueueFlags +{ + struct + { + uint32 includeMainSubQueue : 1; ///< If set, includes the main sub-queue in the RGP marker. + uint32 includeGangedSubQueues : 1; ///< If set, includes any ganged sub-queues in the RGP marker. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed into a uint32 +}; + +/** + *********************************************************************************************************************** + * @interface ICmdBuffer + * @brief Contains GPU rendering and other commands recorded by PAL on the client's behalf. + * + * A command buffer can be executed by the GPU multiple times and recycled, provided the command buffer is not pending + * execution on the GPU when it is recycled. + * + * Command buffers are fully independent and there is no persistence of GPU state between submitted command buffers. + * When a new command buffer is recorded, the state is undefined. 
All relevant state must be explicitly set by the + * client before state-dependent operations such as draws and dispatches. + * + * @see IDevice::CreateCmdBuffer() + *********************************************************************************************************************** + */ +class ICmdBuffer : public IDestroyable +{ +public: + /// Resets the command buffer's previous contents and state, then puts it in the _building_ _state_, allowing new + /// commands to be recorded. + /// + /// If this is a root command buffer, the state will be reset to a "clean slate" with nothing bound. If this is a + /// nested command buffer, the state is set to an "undefined" state so that all render state can be inherited from + /// any root command buffer which executes this one. + /// + /// @param [in] info Controls how PAL will generate commands for this command buffer. E.g., specifies whether the + /// command buffer may be submitted more than once, and controls options for optimizing PM4, etc. + /// + /// @returns Success if the command buffer was successfully reset and put into the _building_ _state_. Otherwise, + /// one of the following error codes may be returned: + /// + ErrorInvalidFlags if invalid flags are set in the flags parameter. + /// + ErrorIncompleteCommandBuffer if the command buffer is already in the _building_ _state_. + virtual Result Begin( + const CmdBufferBuildInfo& info) = 0; + + /// Completes recording of a command buffer in the _building_ _state_, making it _executable_. + /// + /// @returns Success if the command buffer was successfully made _executable_. Otherwise, one of the following + /// errors may be returned: + /// + ErrorIncompleteCommandBuffer if the command buffer is not in the _building_ _state_. + /// + ErrorBuildingCommandBuffer if some error occurred while building the command buffer, and it could not + /// be made _executable_. If this error is returned, the command buffer can not be submitted. 
+ virtual Result End() = 0; + + /// Explicitly resets a command buffer, releasing any internal resources associated with it. + /// + /// This call must be used to reset command buffers that have previously reported an ErrorIncompleteCommandBuffer + /// error. + /// + /// @note @ref Begin will implicitly cause a command buffer to be reset in addition to putting it in the + /// _building_ _state_. This method just gives a way to release resources between when the client knows + /// it is done with the command buffer and when it is ready to reuse this command buffer object for + /// recording new commands. + /// + /// @param [in] pCmdAllocator If non-null, all future GPU memory allocations will be done using this allocator. + /// Otherwise the command buffer will continue to use its current command allocator. + /// + /// @param [in] returnGpuMemory If true then all GPU memory associated with this command buffer will be returned + /// to the allocator upon reset. If false data chunks will be retained and reused. + /// Note: This flag must be true if changing command allocators. + /// + /// @warning If returnGpuMemory is false, the client must guarantee that this command buffer is not queued for + /// execution, is not currently being executed, and that all other command buffers that have referenced + /// this command buffer in a @ref CmdExecuteNestedCmdBuffers call have also been reset. + /// + /// @returns Success if the command buffer was successfully reset. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result Reset(ICmdAllocator* pCmdAllocator, bool returnGpuMemory) = 0; + + /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to CmdAllocateEmbeddedData. + /// + /// This is a property of the command buffer and its associated command allocator; it may change if the caller + /// specifies a different command allocator on Reset().
+ /// + /// @returns How many DWORDs of embedded data the command buffer can allocate at once. + virtual uint32 GetEmbeddedDataLimit() const = 0; + + /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to + /// CmdAllocateLargeEmbeddedData. + /// + /// @returns Number of DWORDs that can be allocated in one call to CmdAllocateLargeEmbeddedData + virtual uint32 GetLargeEmbeddedDataLimit() const = 0; + + /// Binds a graphics or compute pipeline to the current command buffer state. + /// + /// Graphics pipelines must be compiled for the PAL ABI. Compute pipelines must either be compiled for the PAL ABI + /// or the HSA ABI, if it's supported. HSA ABI support is indicated by supportHsaAbi in @ref DeviceProperties. + /// + /// PAL ABI pipelines and HSA ABI pipelines use different mechanisms to bind inputs and outputs. PAL ABI pipelines + /// use user data entries set by @ref CmdSetUserData. HSA ABI pipelines use kernel arguments set by @ref + /// CmdSetKernelArguments. Binding or unbinding a compute pipeline can implicitly modify the user data and kernel + /// argument state, please read the @ref CmdSetUserData and @ref CmdSetKernelArguments documentation for details. + /// + /// @param [in] params Parameters necessary to manage dynamic pipeline shader information. + virtual void CmdBindPipeline( + const PipelineBindParams& params) = 0; + + /// Binds the specified MSAA state object to the current command buffer state. + /// + /// @param [in] pMsaaState New MSAA state to be bound. Can be null in order to unbind a previously bound MSAA state + /// object without binding a new one. + virtual void CmdBindMsaaState( + const IMsaaState* pMsaaState) = 0; + + /// Saves a copy of all of the current command buffer state that is used by graphics workloads. This feature is + /// intended to give PAL clients a convenient way to issue their own internal graphics workloads without modifying + /// the application-facing state. 
+ /// + /// PAL cannot save multiple layers of state, each call to CmdSaveGraphicsState must be followed by a call to + /// CmdRestoreGraphicsState before the next call to CmdSaveGraphicsState. Any barriers, resolves, blits, etc are not + /// allowed while the state is pushed. + /// + /// This function can only be called on command buffers that support graphics workloads. All query counters will be + /// disabled until CmdRestoreGraphicsState is called. + virtual void CmdSaveGraphicsState() = 0; + + /// Restores all of the command buffer state that is used by graphics workloads. This feature is intended to + /// give PAL clients a convenient way to issue their own internal graphics workloads without modifying the + /// application-facing state. + /// + /// A call to this function must be preceded by a call to CmdSaveGraphicsState + /// + /// This function can only be called on command buffers that support graphics workloads. All previously disabled + /// query counters will be reactivated. + virtual void CmdRestoreGraphicsState() = 0; + + /// Sets the shading rate in the command buffer along with the state of the various combiners. + /// + /// @param [in] rateParams New VRS shading rate parameters to be bound. + virtual void CmdSetPerDrawVrsRate( + const VrsRateParams& rateParams) = 0; + + /// Setup parameters regarding how pixel center will be evaluated with VRS. + /// + /// @param [in] centerState New VRS parameters to be bound that control how pixel center is defined. + virtual void CmdSetVrsCenterState( + const VrsCenterState& centerState) = 0; + + /// Binds the shading rate data in the specified image into the pipeline for use with VRS. Only relevant if the + /// combiner stage for VrsCombinerStage is set to something other than Passthrough.
+ /// + /// This binding point requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageSampleRate + /// - CacheCoherency: @ref CoherSampleRate + /// - ImageLayout: @ref LayoutSampleRate + /// + /// @param [in] pImage Image that contains sample rate data. Pointer can be NULL to force 1x1 shading rate. + virtual void CmdBindSampleRateImage( + const IImage* pImage) = 0; + + /// Binds the specified color/blend state object to the current command buffer state. + /// + /// @param [in] pColorBlendState New color/blend state to be bound. Can be null in order to unbind a previously + /// bound color/blend state object without binding a new one. + virtual void CmdBindColorBlendState( + const IColorBlendState* pColorBlendState) = 0; + + /// Binds the specified depth/stencil state object to the current command buffer state. + /// + /// @param [in] pDepthStencilState New depth/stencil state to be bound. Can be null in order to unbind a previously + /// bound depth/stencil state object without binding a new one. + virtual void CmdBindDepthStencilState( + const IDepthStencilState* pDepthStencilState) = 0; + + /// Sets the value range to be used for depth bounds testing. + /// + /// The depth bounds test is enabled in the graphics pipeline. When enabled, an additional check will be done that + /// will reject a pixel if the pre-existing depth value stored at its destination location is outside of the + /// specified bounds. Applications would typically use this feature to optimize shadow volume rendering. + /// + /// @param [in] params Parameters necessary to set the depth bounds (such as min/max depth). + virtual void CmdSetDepthBounds( + const DepthBoundsParams& params) = 0; + + /// Sets pipeline-accessible user data to the specified values. + /// + /// The values set in user data entries will be interpreted based on the resource mapping specified for each shader + /// in the currently bound pipeline. 
For example, the client can write virtual addresses of tables containing + /// SRDs, immediate SRDs that can be loaded without an indirection, or even a small number of immediate ALU + /// constants. + /// + /// The user data values are only used by PAL ABI pipelines. Almost all pipelines used by PAL clients are compiled + /// for the PAL ABI, but PAL also supports HSA ABI compute pipelines which use @ref CmdSetKernelArguments instead. + /// When an HSA ABI pipeline is bound the current compute user data entries are saved and will be restored if the + /// client later binds a PAL ABI compute pipeline. + /// + /// @warning It's illegal to set compute user data if an HSA ABI pipeline is currently bound. + /// + /// If no compute pipeline is currently bound PAL assumes the client will bind a PAL ABI pipeline and thus accepts + /// user data bindings. Graphics user data are unaffected by all of this because graphics pipelines can only use + /// the PAL ABI. + /// + /// @see PipelineShaderInfo + /// @see ResourceMappingNode + /// @ingroup ResourceBinding + /// + /// @param [in] bindPoint Specifies which type of user data is to be set (i.e., compute or graphics). + /// @param [in] firstEntry First user data entry to be updated. + /// @param [in] entryCount Number of user data entries to update; size of the pEntryValues array. Must be greater + /// than zero, and (firstEntry + entryCount) must not extend beyond MaxUserDataEntries. + /// @param [in] pEntryValues Array of 32-bit values to be copied into user data. + void CmdSetUserData( + PipelineBindPoint bindPoint, + uint32 firstEntry, + uint32 entryCount, + const uint32* pEntryValues) + { (m_funcTable.pfnCmdSetUserData[static_cast<uint32>(bindPoint)])(this, firstEntry, entryCount, pEntryValues); } + + /// Copies all pipeline-accessible user-data from one bind point to another. It is invalid if the source and + /// dest parameters refer to the same bind point.
+ /// + /// @see CmdSetUserData for how the user data entries will be interpreted by the pipeline. + /// + /// @param [in] source Specifies which bind point to copy from. + /// @param [in] dest Specifies which bind point to copy into. + virtual void CmdDuplicateUserData( + PipelineBindPoint source, + PipelineBindPoint dest) = 0; + + /// Sets one or more HSA code object kernel argument values. + /// + /// If the currently bound compute pipeline was compiled using the HSA compute ABI this function must be used to + /// bind that pipeline's arguments. The argument position and value types are static properties of the pipeline + /// and must be known by the client. + /// + /// @note Calling @ref CmdBindPipeline invalidates all prior kernel argument bindings, even if the new pipeline + /// also uses the HSA ABI. Any kernel arguments that the client intends to share between pipelines must + /// be manually rebound. + /// + /// @warning It's illegal to call this function if no compute pipeline is bound or if the bound compute pipeline + /// uses a different ABI (e.g., the PAL compute ABI). + /// + /// @ingroup ResourceBinding + /// + /// @param [in] firstArg The zero-based position of the first kernel argument to bind. + /// @param [in] argCount Number of kernel arguments this call binds. + /// @param [in] ppValues Array of pointers to kernel argument values. + virtual void CmdSetKernelArguments( + uint32 firstArg, + uint32 argCount, + const void*const* ppValues) = 0; + + /// Changes one or more of the command buffer's active vertex buffers. + /// + /// @note If bufferViews.offsetMode is false, PAL will construct SRDs for each bound vertex buffer which are + /// equivalent to the client calling @ref IDevice::CreateUntypedBufferViewSrd on each element of the + /// pBuffers parameter.
+ /// + /// Note that vertex buffers require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageVs + /// - CacheCoherency: @ref CoherShaderRead + /// + /// @param [in] bufferViews Vertex buffer view descriptors.This parameter defines which vertex mode is used through + /// @ref VertexBufferViews::offsetMode. VertexBufferViews::pVertexBufferViews or + /// VertexBufferViews::pBufferViewInfos must not be nullptr. + virtual void CmdSetVertexBuffers( + const VertexBufferViews& bufferViews) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 877 + void CmdSetVertexBuffers( + uint32 firstBuffer, + uint32 bufferCount, + const BufferViewInfo* pBuffers) + { + const VertexBufferViews bufferViews = + { + .firstBuffer = firstBuffer, + .bufferCount = bufferCount, + .offsetMode = false, + .pBufferViewInfos = pBuffers + }; + CmdSetVertexBuffers(bufferViews); + } +#endif + + /// Binds a range of memory for use as index data (i.e., binds an index buffer). + /// + /// The GPU virtual address must be index element aligned: 2-byte aligned for 16-bit indices or 4-byte aligned for + /// 32-bit indices. + /// + /// The index buffer binding point requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageFetchIndices + /// - CacheCoherency: @ref CoherIndexData + /// + /// @param [in] gpuAddr GPU virtual address of the index data. Can be zero to unbind the previously bound data. + /// @param [in] indexCount Maximum number of indices in the index data; the GPU may read less indices. + /// @param [in] indexType Specifies whether to use 8-bit, 16-bit or 32-bit index data. + virtual void CmdBindIndexData( + gpusize gpuAddr, + uint32 indexCount, + IndexType indexType) = 0; + + /// Binds color and depth/stencil targets to the current command buffer state. + /// + /// The current layout of each target must also be specified. 
+ /// + /// The color target binding points require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// + /// The depth and stencil target binding points require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// + /// @param [in] params Parameters representing the color and depth/stencil targets to bind to the command buffer. + virtual void CmdBindTargets( + const BindTargetParams& params) = 0; + + /// Binds stream-output target buffers to the current command buffer state. + /// + /// At draw-time, the stream-output targets must be consistent with the soState parameters specified by the + /// currently bound graphics pipeline. + /// + /// The stream-output target buffers require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageStreamOut + /// - CacheCoherency: @ref CoherStreamOut + /// + /// @param [in] params Parameters representing the stream-output target buffers to bind to the command buffer. + virtual void CmdBindStreamOutTargets( + const BindStreamOutTargetParams& params) = 0; + + /// Sets the constant factor to be used by the blend hardware when programmed with the Blend::ConstantColor, + /// Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend coefficients. + /// + /// @param [in] params Parameters representing the blend constant factor. + virtual void CmdSetBlendConst( + const BlendConstParams& params) = 0; + + /// Sets input assembly state for upcoming draws in this command buffer. + /// + /// At draw-time, the topology specified with this method must be consistent with the _topologyInfo_ parameters + /// specified by the currently bound graphics pipeline. 
+ /// + /// @param [in] params Parameters representing the input assembly state for upcoming draws. + virtual void CmdSetInputAssemblyState( + const InputAssemblyStateParams& params) = 0; + + /// Sets parameters controlling triangle rasterization. + /// + /// @param [in] params Parameters to set the triangle raster state (such as fill/cull mode). + virtual void CmdSetTriangleRasterState( + const TriangleRasterStateParams& params) = 0; + + /// Sets parameters controlling point and line rasterization. + /// + /// @param [in] params Parameters to set the point and line rasterization state (such as pointSize and lineWidth). + virtual void CmdSetPointLineRasterState( + const PointLineRasterStateParams& params) = 0; + + /// Sets parameters controlling line stippling. + /// + /// @param [in] params Parameters to set the line stipple state. + virtual void CmdSetLineStippleState( + const LineStippleStateParams& params) = 0; + + /// Sets depth bias parameters. + /// + /// Depth bias is used to ensure a primitive can properly be displayed (without Z fighting) in front (or behind) + /// of the previously rendered co-planar primitive. This is useful for decal or shadow rendering. + /// + /// @param [in] params Parameters for setting the depth bias (such as depth bias, depth bias clamp, and slope + /// scaled depth bias). + virtual void CmdSetDepthBiasState( + const DepthBiasParams& params) = 0; + + /// Sets stencil reference values and mask buffer reads and writes in upcoming draws. Separate reference values + /// can be specified for front-facing and back-facing polygons. Update flags should be set for state which needs to + /// be updated. All other state will be preserved. + /// Setting all the values (reference, read/write masks and stencil op) in the StencilRefMaskParams together + /// takes the faster path. + /// Setting either the ref value, read/write masks or the stencil op value individually takes the slower + /// read-modify-write path. 
+ /// + /// @param [in] params Parameters for setting the stencil read and write masks. + virtual void CmdSetStencilRefMasks( + const StencilRefMaskParams& params) = 0; + + /// Sets user defined clip planes, should only be called on universal command buffers. + /// + /// @param [in] firstPlane The index of first plane in user define clip plane array. + /// @param [in] planeCount The count of planes in plane array. + /// @param [in] pPlanes Pointer to plane array. + virtual void CmdSetUserClipPlanes( + uint32 firstPlane, + uint32 planeCount, + const UserClipPlane* pPlanes) = 0; + + /// Sets clip rects, should only be called on universal command buffers. + /// + /// @param [in] clipRule 16-bit clip rule bits are used to determine if pixel shall be discarded or retained. + /// For each pixel, a 4-bit index is computed based on which clip rects the pixel is + /// inside (bitN represents rectN). Then uses this index to check the corresponding bit + /// in clip rule for this pixel - 0 for discarded, 1 for retained. + /// @param [in] rectCount The count of rectangles in rect list. This must be less than or equal to + /// MaxClipRects (4). + /// @param [in] pRectList Pointer to the rect list. + virtual void CmdSetClipRects( + uint16 clipRule, + uint32 rectCount, + const Rect* pRectList) = 0; + + /// Sets user defined MSAA quad-pixel sample pattern, should only be called on universal command buffers + /// This should be called before clearing, rendering, barriering and resolving of MSAA DepthStencil image. + /// + /// @param [in] numSamplesPerPixel Number of samples per pixel + /// @param [in] quadSamplePattern The input msaa sample pattern + virtual void CmdSetMsaaQuadSamplePattern( + uint32 numSamplesPerPixel, + const MsaaQuadSamplePattern& quadSamplePattern) = 0; + + /// Sets the specified viewports to the current command buffer state. + /// + /// @param [in] params Parameters for setting the specified number of viewports. 
+ virtual void CmdSetViewports( + const ViewportParams& params) = 0; + + /// Sets the scissor regions corresponding to each viewport to the current command buffer state. + /// + /// @param [in] params Parameters for setting the specified number of scissor regions. + virtual void CmdSetScissorRects( + const ScissorRectParams& params) = 0; + + /// Sets the global scissor rectangle. + /// + /// @param [in] params Parameters for setting the global scissor rectangle from the top left to bottom right + /// coordinate. + virtual void CmdSetGlobalScissor( + const GlobalScissorParams& params) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 + /// Inserts a barrier in the current command stream that can stall GPU execution, flush/invalidate caches, or + /// decompress images before further, dependent work can continue in this command buffer. + /// + /// This operation does not honor the command buffer's predication state, if active. + /// + /// @param [in] barrierInfo See @ref BarrierInfo for detailed information. + virtual void CmdBarrier( + const BarrierInfo& barrierInfo) = 0; +#endif + /// Performs source pipeline stage and cache access optimization based on the acquire/release interface. + /// + /// @param [in] barrierType Barrier transition type @ref BarrierType. + /// @param [in] pImage Image pointer for image transition, required when @ref BarrierType is + /// BarrierType::Image. + /// @param [in,out] pSrcStageMask A source mask of ORed @ref PipelineStageFlag to optimize, can't be null. + /// @param [in,out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null. + /// @param [in,out] pDstStageMask A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null. + /// @param [in,out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize. + /// + /// @returns Whether the GL2 cache needs to be flushed and invalidated.
+ /// + /// @note PipelineStageBlt will be converted to more accurate stage(s) based on the underlying implementation of + /// outstanding BLTs, but will be left as PipelineStageBlt if the internal outstanding BLTs can't be expressed + /// as a client-facing PipelineStage (e.g., if there are CP DMA BLTs in flight). + virtual bool OptimizeAcqRelReleaseInfo( + BarrierType barrierType, + const IImage* pImage, + uint32* pSrcStageMask, + uint32* pSrcAccessMask, + uint32* pDstStageMask, + uint32* pDstAccessMask) const = 0; + + /// Performs the release portion of an acquire/release-based barrier. This releases a set of resources from their + /// current usage, while CmdAcquire() is expected to be called to acquire access to the resources for future, + /// different usage. + /// + /// Conceptually, this method will: + /// - Ensure the specified source synchronization scope has completed. + /// - Ensure all specified resources are available in memory. The availability operation will flush all + /// write-back caches to the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// Once all of these operations are complete, the release issues a timestamp event that signals the operation + /// completion. The event type and timestamp value are returned to the caller in a packed uint32 token. A corresponding + /// CmdAcquire() call is expected to wait on one or a list of such synchronization tokens and perform any necessary + /// visibility operations and/or layout transitions that could not be predicted at release-time. + /// + /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout + /// transitions. + /// @returns Synchronization token for the release operation. Pass this token to CmdAcquire to confirm completion.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 + virtual uint32 CmdRelease( +#else + virtual ReleaseToken CmdRelease( +#endif + const AcquireReleaseInfo& releaseInfo) = 0; + + /// Performs the acquire portion of an acquire/release-based barrier. This acquires a set of resources for a new + /// set of usages, assuming CmdRelease() was called to release access for the resource's past usage. + /// + /// Conceptually, this method will: + /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all + /// relevant caches above the last-level-cache. + /// - Perform any requested layout transitions. + /// - Ensure the release(s) have completed by waiting on the synchronization token of the release operation. + /// + /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required + /// layout transitions. + /// @param [in] syncTokenCount Number of entries in pSyncTokens, can be zero if there is no valid release token. + /// @param [in] pSyncTokens Array of synchronization tokens, as returned from CmdRelease, to confirm completion. + /// The token value(s) must have been returned by a CmdRelease call in the same command + /// buffer. pSyncTokens can be null if syncTokenCount is 0. + virtual void CmdAcquire( + const AcquireReleaseInfo& acquireInfo, + uint32 syncTokenCount, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 + const uint32* pSyncTokens) = 0; +#else + const ReleaseToken* pSyncTokens) = 0; +#endif + + /// Performs the release portion of an acquire/release event-based barrier. This releases a set of resources from + /// their current usage, while CmdAcquireEvent() is expected to be called to acquire access to the resources for + /// future, different usage. + /// + /// Conceptually, this method will: + /// - Ensure the specified source synchronization scope has completed. + /// - Ensure all specified resources are available in memory.
The availability operation will flush all + /// write-back caches to the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// Once all of these operations are complete, the specified IGpuEvent object will be signaled. A corresponding + /// CmdAcquireEvent() call is expected to wait on this event and perform any necessary visibility operations and/or + /// layout transitions that could not be predicted at release-time. + /// + /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout + /// transitions. + /// @param [in] pGpuEvent Event to be signaled once the release has completed. Must be a valid (non-null) GPU + /// event pointer. Call the CmdRelease()/CmdAcquire() pair instead if you want to release/acquire + /// something but no GPU event is available. + virtual void CmdReleaseEvent( + const AcquireReleaseInfo& releaseInfo, + const IGpuEvent* pGpuEvent) = 0; + + /// Performs the acquire portion of an acquire/release event-based barrier. This acquires a set of resources for a + /// new set of usages, assuming CmdReleaseEvent() was called to release access for the resource's past usage. + /// + /// Conceptually, this method will: + /// - Ensure the release(s) have completed by waiting for the specified IGpuEvent early enough in the pipeline to + /// support the specified destination synchronization scope. + /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all + /// relevant caches above the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required + /// layout transitions. + /// @param [in] gpuEventCount Number of entries in ppGpuEvents. + /// @param [in] ppGpuEvents Array of one or more events to wait on.
Typically these will be set via + /// CmdReleaseEvent(), but it's valid to wait on an event set through a different means + /// like CmdSetEvent() from the CPU side. Must be a valid (non-null) pointer to an array of + /// gpuEventCount valid GPU event pointers. Call CmdReleaseThenAcquire() instead if you want + /// to acquire something but no GPU event is available. + virtual void CmdAcquireEvent( + const AcquireReleaseInfo& acquireInfo, + uint32 gpuEventCount, + const IGpuEvent* const* ppGpuEvents) = 0; + + /// Conceptually equivalent to calling CmdRelease() followed immediately by CmdAcquire(), but it potentially has + /// better performance than calling CmdRelease()/CmdAcquire() directly. Can be called in cases where the client/ + /// application cannot detect separate release and acquire points for a transition. + /// + /// @param [in] barrierInfo Describes the synchronization scopes, availability/visibility operations, and the + /// required layout transitions. + virtual void CmdReleaseThenAcquire( + const AcquireReleaseInfo& barrierInfo) = 0; + + /// Issues an instanced, non-indexed draw call using the command buffer's currently bound graphics state. Results + /// in instanceCount * vertexCount vertices being processed. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// @param [in] firstVertex Starting index value for the draw. Indices passed to the vertex shader will range + /// from firstVertex to firstVertex + vertexCount - 1. + /// @param [in] vertexCount Number of vertices to draw. If zero, the draw will be discarded. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range + /// from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + /// @param [in] drawId Draw index for the draw.
+ void CmdDraw( + uint32 firstVertex, + uint32 vertexCount, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId) + { + m_funcTable.pfnCmdDraw(this, firstVertex, vertexCount, firstInstance, instanceCount, drawId); + } + + /// Issues a draw opaque call using the command buffer's currently bound graphics state. + /// Uses the stream-out target of a previous draw as the input vertex data. + /// The number of vertices = (streamOutFilledSize (value of streamOutFilledSizeVa) - streamOutOffset) / stride + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// @param [in] streamOutFilledSizeVa GPU address of the streamOut filled size for the streamOut buffer. + /// @param [in] streamOutOffset The offset of the beginning of streamOut as vertex. + /// @param [in] stride Stride for stream data as vertex. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader + /// will range from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + void CmdDrawOpaque( + gpusize streamOutFilledSizeVa, + uint32 streamOutOffset, + uint32 stride, + uint32 firstInstance, + uint32 instanceCount) + { + m_funcTable.pfnCmdDrawOpaque(this, + streamOutFilledSizeVa, + streamOutOffset, + stride, + firstInstance, + instanceCount); + } + + /// Issues an instanced, indexed draw call using the command buffer's currently bound graphics state. Results in + /// instanceCount * indexCount vertices being processed. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// Indices passed to the vertex shader will be: + /// + /// + IndexBuffer[firstIndex] + vertexOffset + /// + IndexBuffer[firstIndex + 1] + vertexOffset, + /// + ... + /// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset + /// + /// @param [in] firstIndex Starting index buffer slot for the draw.
+ /// @param [in] indexCount Number of vertices to draw. If zero, the draw will be discarded. + /// @param [in] vertexOffset Offset added to the index fetched from the index buffer before it is passed to the + /// vertex shader. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range + /// from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + /// @param [in] drawId Draw index for the draw. + void CmdDrawIndexed( + uint32 firstIndex, + uint32 indexCount, + int32 vertexOffset, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId) + { + m_funcTable.pfnCmdDrawIndexed(this, firstIndex, indexCount, vertexOffset, firstInstance, instanceCount, drawId); + } + + /// Issues instanced, non-indexed draw calls using the command buffer's currently bound graphics state. The draw + /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find + /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its vertexCount or + /// instanceCount is zero. + /// + /// The layout of the argument data is defined in the @ref DrawIndirectArgs structure. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDraw + /// @see DrawIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. 
If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. + void CmdDrawIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDrawIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + + /// Issues instanced, indexed draw calls using the command buffer's currently bound graphics state. The draw + /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find + /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its indexCount or + /// instanceCount is zero. + /// + /// The layout of the argument data is defined in the @ref DrawIndexedIndirectArgs structure. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDrawIndexed + /// @see DrawIndexedIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. 
+ void CmdDrawIndexedIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDrawIndexedIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909 + /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + inline void CmdDispatch( + DispatchDims size) + { + m_funcTable.pfnCmdDispatch(this, size, {}); + } +#endif + + /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + /// @param [in] infoFlags Additional information about the dispatch. + void CmdDispatch( + DispatchDims size, + DispatchInfoFlags infoFlags) + { + m_funcTable.pfnCmdDispatch(this, size, infoFlags); + } + + /// Dispatches a compute workload using the command buffer's currently bound compute state. The dimensions of the + /// workload come from GPU memory. The dispatch will be discarded if any of its dimensions are zero. + /// + /// The layout of the argument data is defined in the @ref DispatchIndirectArgs structure. + /// + /// @warning Does not support HSA ABI pipelines. 
+ /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDispatch + /// @see DispatchIndirectArgs + /// + /// @param [in] gpuVirtAddr GPU virtual memory address where the indirect argument data is located. + /// The virtual address must be 4-byte aligned. + void CmdDispatchIndirect( + gpusize gpuVirtAddr) + { + m_funcTable.pfnCmdDispatchIndirect(this, gpuVirtAddr); + } + + /// Dispatches a compute workload of the given dimensions and offsets using the command buffer's currently bound + /// compute state. This command allows targeting regions of thread groups without adding the offset computations in + /// the shader. + /// + /// The caller may also provide a logical thread group count which is larger than the number of groups actually + /// launched. If the shader reads the dispatch's thread group count from PAL metadata it will see the logical size, + /// not the launch size. + /// + /// The combination of an offset, launch size, and logical size give the caller enough flexibility to take an + /// incoming dispatch, split it up into sub-dispatches, and execute those sub-dispatches using multiple + /// CmdDispatchOffset calls in whatever execution pattern they would like. Note that such an optimization + /// would not work if the shader has global logic that does make assumptions about thread group launch order. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] offset The thread groups offsets. Set them to zero if you don't want an offset. + /// @param [in] launchSize Thread groups to dispatch. If any components are zero the dispatch will be discarded. + /// @param [in] logicalSize The thread group dimensions reported to the shader via metadata. 
+ void CmdDispatchOffset( + DispatchDims offset, + DispatchDims launchSize, + DispatchDims logicalSize) + { + m_funcTable.pfnCmdDispatchOffset(this, offset, launchSize, logicalSize); + } + + /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if + /// the currently bound graphics pipeline does not contain a mesh and/or task shader. + /// + /// The thread group size is defined in the mesh shader or task shader. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + void CmdDispatchMesh( + DispatchDims size) + { + m_funcTable.pfnCmdDispatchMesh(this, size); + } + + /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if + /// the currently bound graphics pipeline does not contain a mesh shader. The dimensions of the workload come from + /// GPU memory. The dispatch will be discarded if any of its dimensions are zero. + /// + /// The layout of the argument data is defined in the @ref DispatchMeshIndirectArgs structure. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDispatchMesh + /// @see DispatchMeshIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. 
+ void CmdDispatchMeshIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDispatchMeshIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + + /// Copies multiple regions from one GPU memory allocation to another. + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping + /// will cause undefined results. + /// + /// This call should be used for buffer memory copy only; don't use it for image memory. + /// + /// For best performance, offsets and copy sizes should be 4-byte aligned. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuMemory GPU memory allocation where the source regions are located. + /// @param [in] dstGpuMemory GPU memory allocation where the destination regions are located. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, and + /// copy size. + virtual void CmdCopyMemory( + const IGpuMemory& srcGpuMemory, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryCopyRegion* pRegions) = 0; + + /// Copies multiple regions from one GPU memory virtual address to another. + /// + /// @note The CmdCopyMemory() path should be preferred because it contains more optimizations due to more + /// knowledge about the memory itself that is lost when only virtual addresses are passed in. 
+ /// + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory virtual addresses are the same. Any illegal + /// overlapping will cause undefined results. + /// + /// For best performance, addresses, offsets, and copy sizes should be 4-byte aligned. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuVirtAddr GPU memory virtual address where the source regions are located. + /// @param [in] dstGpuVirtAddr GPU memory virtual address where the destination regions are located. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, + /// and copy size. + virtual void CmdCopyMemoryByGpuVa( + gpusize srcGpuVirtAddr, + gpusize dstGpuVirtAddr, + uint32 regionCount, + const MemoryCopyRegion* pRegions) = 0; + + /// Copies multiple regions from one image to another. + /// + /// The source and destination subresource of a particular region are not allowed to be the same, and will produce + /// undefined results. Additionally, destination subresources cannot be present more than once per CmdCopyImage() + /// call. + /// + /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image + /// extents are specified in compression blocks. + /// + /// The source and destination images must be of the same type (1D, 2D or 3D), or optionally 2D and 3D with the + /// number of slices matching the depth. MSAA source and destination images must have the same number of samples. + /// + /// Each region must satisfy these restrictions.
+ /// - srcOffset >= 0 and dstOffset >= 0 + /// - srcOffset + extent <= srcSubres's extent + /// - dstOffset + extent <= dstSubres's extent + /// + /// Images copied via this function must have x/y/z offsets and width/height/depth extents aligned to the minimum + /// tiled copy alignment specified in @ref DeviceProperties for the engine this function is executed on. Note that + /// the DMA engine supports tiled copies regardless of the alignment; the reported minimum tiled copy alignments + /// are an indication of the minimum alignments for which the copy will be performant. + /// + /// When the per-engine capability flag supportsMismatchedTileTokenCopy (@see DeviceProperties) is false, + /// CmdCopyImage is only valid between two subresources that share the same tileToken (@see SubresLayout). + /// + /// Note that the copy can go through clone copy automatically if: + /// - Both source and destination images are created with @ref ImageCreateInfo::flags::cloneable = 1 + /// - Both source and destination images have the same @ref ImageCreateInfo + /// - Source image's layout is compatible with destination image's layout + /// - This is a full image copy + /// - Copy flags @ref CopyControlFlags are required to be 0. + /// + /// Basically clone copy clones all subresources' data of one image object into another while preserving the image + /// layout. It does a raw copy of image data and metadata, and tries to keep the metadata (like DCC/HiZ/HiS) + /// unchanged, but this may not hold due to different HW design. + /// E.g. client compression (fragment and ZPlane compression) will be missed during the compute based raw copy. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination.
+ /// + /// @param [in] srcImage Image where source regions reside. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Image where destination regions reside. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source subresource, destination + /// subresource, source x/y/z offset, destination x/y/z offset, and copy size in the + /// x/y/z dimensions. + /// @param [in] pScissorRect Rectangle for scissor test. + /// @param [in] flags A mask of ORed @ref CopyControlFlags that can be used to control copy behavior. + virtual void CmdCopyImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const ImageCopyRegion* pRegions, + const Rect* pScissorRect, + uint32 flags) = 0; + + /// Copies data directly (without format conversion) from a GPU memory object to an image. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The size of the data copied from memory is implicitly derived from the image extents. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdCopyMemoryToImage() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination + /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyMemoryToImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const MemoryImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from an image to a GPU memory object. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The size of the data copied to memory is implicitly derived from the image extents. + /// + /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A + /// destination region cannot be present more than once per CmdCopyImageToMemory() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. 
+ /// - ImageLayout: @ref LayoutCopySrc + /// + /// @param [in] srcImage Image where source data will be read from. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source + /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyImageToMemory( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a GPU memory object to a PRT. + /// + /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. + /// This function always copies entire tiles, even if parts of the tile are internal padding. + /// + /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that + /// operate in texels like the generic CmdCopyMemoryToImage() should be used instead. + /// + /// The size of the data copied from memory is implicitly derived from the image extents. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdCopyMemoryToTiledImage() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. Must have the "prt" flag set. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination + /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyMemoryToTiledImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const MemoryTiledImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a PRT to a GPU memory object. + /// + /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. + /// This function always copies entire tiles, even if parts of the tile are internal padding. + /// + /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that + /// operate in texels like the generic CmdCopyImageToMemory() should be used instead. + /// + /// The size of the data copied to memory is implicitly derived from the image extents. + /// + /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A + /// destination region cannot be present more than once per CmdCopyTiledImageToMemory() call. 
+ /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc + /// + /// @param [in] srcImage Image where source data will be read from. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source + /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyTiledImageToMemory( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryTiledImageCopyRegion* pRegions) = 0; + + /// Copies multiple regions directly (without format conversion) from one typed buffer to another. + /// + /// For compressed formats, the extents are specified in compression blocks. + /// + /// The buffer memory offsets have to be aligned to the smaller of their texel sizes or 4 bytes. + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping + /// will cause undefined results.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source offset, + /// and copy size in the x/y/z dimensions. + virtual void CmdCopyTypedBuffer( + const IGpuMemory& srcGpuMemory, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const TypedBufferCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a 2D typed buffer to a 2D image. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdScaledCopyTypedBufferToImage() call. + /// + /// MSAA resources are unsupported. The client must resolve both resources before calling this function. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on.
+ /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, copy size of source + /// region, a destination offset, destination subresource, and copy size of destination + /// region. + virtual void CmdScaledCopyTypedBufferToImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const TypedBufferImageScaledCopyRegion* pRegions) = 0; + + /// Copies the content of a GPU register to a GPU memory location. + /// + /// The destination memory offset has to be aligned to 4 bytes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherMemory + /// + /// @param [in] srcRegisterOffset Source register offset in bytes. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] dstOffset Destination memory offset in bytes. + virtual void CmdCopyRegisterToMemory( + uint32 srcRegisterOffset, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + + /// Copies multiple scaled regions from one image to another. + /// + /// The source and destination subresource of a particular region are not allowed to be the same, and will produce + /// undefined results. Additionally, destination subresources cannot be present more than once per + /// CmdScaledCopyImage() call. + /// + /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image + /// extents are specified in compression blocks. + /// + /// The source and destination images must be of the same type (1D, 2D or 3D). Both single sampled images and + /// MSAA images are supported. + /// + /// Linear texture filtering is only supported for images with non-integer formats.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. + /// + /// @param [in] copyInfo Specifies parameters needed to execute CmdScaledCopyImage. See + /// @ref ScaledCopyInfo for more information. + virtual void CmdScaledCopyImage( + const ScaledCopyInfo& copyInfo) = 0; + + /// Automatically generates texture data for a range of subresources such that they may be used as intermediate + /// images in a mipmap chain. The existing values in mip N are used to generate mip N+1. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the base mip, @ref CoherCopySrc and @ref CoherCopyDst for the others. + /// - ImageLayout: @ref LayoutCopySrc for the base mip, @ref LayoutCopySrc and @ref LayoutCopyDst for the others. + /// + /// @param [in] genInfo The parameters for CmdGenerateMipmaps. See @ref GenMipmapsInfo for more information. + virtual void CmdGenerateMipmaps( + const GenMipmapsInfo& genInfo) = 0; + + /// Copies multiple scaled regions from one image to another, converting between RGB and YUV color spaces during + /// the copy. The exact conversion between YUV and RGB is controlled by a caller-specified color-space-conversion + /// table. + /// + /// The source and destination images must both be of the 2D type. Only single-sampled images are supported. + /// One of the two images involved must have an RGB color format, and the other must have a YUV color format. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. 
+ /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. + /// + /// @param [in] srcImage Image where source regions reside. If this is a YUV image, the destination must be + /// RGB, and this copy will convert YUV to RGB. Otherwise, the destination must be YUV, + /// and the copy will convert RGB to YUV. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Image where destination regions reside. If this is a YUV image, the source must be + /// RGB, and this copy will convert RGB to YUV. Otherwise, the source must be YUV and + /// the copy will convert YUV to RGB. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of conversion-copy regions, each entry specifying a source x/y/z offset, source + /// x/y/z extent, destination x/y/z offset, destination x/y/z extent, RGB subresource and + /// YUV subresource(s). + /// @param [in] filter Texture filtering for shader sample instruction. + /// @param [in] cscTable Color-space-conversion table which controls how YUV data is converted to a specific + /// RGB representation and vice-versa. + virtual void CmdColorSpaceConversionCopy( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const ColorSpaceConversionRegion* pRegions, + TexFilter filter, + const ColorSpaceConversionTable& cscTable) = 0; + + /// Clones data of one image object in another while preserving the image layout.
+ /// + /// The source and destination images must be created with identical creation parameters and must specify the + /// cloneable flag. The clone operation clones all subresources. + /// + /// Both resources can be in any layout before the clone operation. After the clone, the source image state is left + /// intact and the destination image layout becomes the same as the source. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcImage Source image. + /// @param [in] dstImage Destination image. + virtual void CmdCloneImageData( + const IImage& srcImage, + const IImage& dstImage) = 0; + + /// Directly updates a range of GPU memory with a small amount of host data. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// The client is responsible for choosing the proper method for optimal performance. If the size of the data to + /// update is less than or equal to 8 bytes, CmdWriteImmediate() is preferred. + /// + /// @param [in] dstGpuMemory GPU memory object to be updated. + /// @param [in] dstOffset Byte offset into the GPU memory object to be updated. Must be a multiple of 4. + /// @param [in] dataSize Amount of data to write, in bytes. Must be a multiple of 4. + /// @param [in] pData Pointer to host data to be copied into the GPU memory. + virtual void CmdUpdateMemory( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize dataSize, + const uint32* pData) = 0; + + /// Updates marker surface with a DWORD value to indicate an event completion.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] dstGpuMemory GPU memory object to be updated. + /// @param [in] offset Byte offset into marker address. + /// @param [in] value Marker DWORD value to be copied to the bus addressable or external physical memory. + virtual void CmdUpdateBusAddressableMemoryMarker( + const IGpuMemory& dstGpuMemory, + gpusize offset, + uint32 value) = 0; + + /// Fills a range of GPU memory with the provided 32-bit data. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// @param [in] dstGpuMemory GPU memory object to be filled. + /// @param [in] dstOffset Byte offset into the GPU memory object to be filled. Must be a multiple of 4. + /// @param [in] fillSize Size to fill, in bytes. Must be a multiple of 4. + /// @param [in] data 32-bit value to be repeated in the filled range. + virtual void CmdFillMemory( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize fillSize, + uint32 data) = 0; + + /// Interprets a range of GPU memory as a color buffer and clears it to the specified clear color. + /// + /// The maximum clear range is determined by the buffer offset and buffer extent; if any Ranges are specified they + /// must be specified in texels with respect to the beginning of the buffer and must not exceed its extent. + /// With 96-bit formats, bufferOffset must be specified in bytes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs is expected but the more general @ref PipelineStageBlt is also OK. + /// - CacheCoherency: @ref CoherShader is expected but the more general @ref CoherClear is also OK. + /// + /// @param [in] gpuMemory GPU memory to be cleared.
+ /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] bufferFormat The format of the color data in the buffer. + /// @param [in] bufferOffset The offset to the beginning of the buffer, in units of texels + /// (or bytes for 96-bit texels). + /// @param [in] bufferExtent The extent of the buffer, in units of texels. + /// @param [in] rangeCount Number of ranges within the buffer to clear; size of the pRanges array. + /// If zero, the entire view will be cleared and pRanges will be ignored. + /// @param [in] pRanges Array of ranges within the GPU memory to clear. + virtual void CmdClearColorBuffer( + const IGpuMemory& gpuMemory, + const ClearColor& color, + SwizzledFormat bufferFormat, + uint32 bufferOffset, + uint32 bufferExtent, + uint32 rangeCount = 0, + const Range* pRanges = nullptr) = 0; + + /// Clears the currently bound color targets to the specified clear color. + /// + /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. + /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in + /// Vulkan where the color attachments are inherited. + /// + /// This requires regionCount being specified since resource size is for sure to be known. The bound color targets + /// shouldn't have UndefinedSwizzledFormat as their swizzle format. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// + /// @param [in] colorTargetCount Number of bound color targets that need to be cleared. + /// @param [in] pBoundColorTargets Color target information for the bound color targets. + /// @param [in] regionCount Number of volumes within the image to clear; size of the pClearRegions array. + /// This needs to be non-zero.
+ /// @param [in] pClearRegions Array of volumes within the subresources to clear. + virtual void CmdClearBoundColorTargets( + uint32 colorTargetCount, + const BoundColorTarget* pBoundColorTargets, + uint32 regionCount, + const ClearBoundTargetRegion* pClearRegions) = 0; + + /// Clears a color image to the specified clear color. + /// + /// If any Boxes have been specified, all subresource ranges must contain a single, identical mip level. + /// + /// The imageLayout can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick + /// to layouts that are likely to support compression like @ref LayoutColorTarget. + /// + /// This function requires use of the following barrier flags if @ref flags includes @ref ColorClearAutoSync: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// Otherwise the following barrier flags must be used: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherClear + /// + /// @param [in] image Image to be cleared. + /// @param [in] imageLayout Current allowed usages and engines for the target image. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] clearFormat If clearFormat.format is Undefined (e.g. if UndefinedSwizzledFormat is provided), do not + /// reinterpret the subresources' formats. Otherwise, the subresources' formats will be + /// reinterpreted according to this parameter. The specified format needs to have been + /// included in the "pViewFormats" list specified at image-creation time, otherwise + /// corruption may occur. + /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. + /// @param [in] pRanges Array of subresource ranges to clear. + /// @param [in] boxCount Number of volumes within the image to clear; size of the pBoxes array. 
+ /// If zero, entire subresources will be cleared and pBoxes will be ignored. + /// @param [in] pBoxes Array of volumes within the subresources to clear. + /// @param [in] flags Mask of ClearColorImageFlags values controlling behavior of the clear. + virtual void CmdClearColorImage( + const IImage& image, + ImageLayout imageLayout, + const ClearColor& color, + const SwizzledFormat& clearFormat, + uint32 rangeCount, + const SubresRange* pRanges, + uint32 boxCount, + const Box* pBoxes, + uint32 flags) = 0; + + /// Clears the currently bound depth/stencil targets to the specified clear values. + /// + /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. + /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in + /// Vulkan where the depth/stencil attachment is inherited. + /// + /// This requires regionCount being specified since resource size is for sure to be known. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// + /// @param [in] depth Depth clear value. + /// @param [in] stencil Stencil clear value. + /// @param [in] stencilWriteMask Stencil write mask to clear specific stencil planes. + /// @param [in] samples Sample count. + /// @param [in] fragments Fragment count. + /// @param [in] flag Selects whether to clear depth, stencil, or both depth and stencil. + /// @param [in] regionCount Number of volumes within the bound depth/stencil target to clear. + /// @param [in] pClearRegions Array of volumes within the subresources to clear.
+ virtual void CmdClearBoundDepthStencilTargets( + float depth, + uint8 stencil, + uint8 stencilWriteMask, + uint32 samples, + uint32 fragments, + DepthStencilSelectFlags flag, + uint32 regionCount, + const ClearBoundTargetRegion* pClearRegions) = 0; + + /// Clears a depth/stencil image to the specified clear values. + /// + /// If any Rects have been specified, all subresource ranges must contain a single, identical mip level. + /// + /// The layouts can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick to + /// layouts that are likely to support compression like @ref LayoutDepthStencilTarget. + /// + /// This function requires use of the following barrier flags if @ref flags includes @ref DsClearAutoSync: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// Otherwise the following barrier flags must be used: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherClear + /// + /// @param [in] image Image to be cleared. + /// @param [in] depthLayout Current allowed usages and engines for the depth plane. + /// @param [in] stencilLayout Current allowed usages and engines for the stencil plane. + /// @param [in] depth Depth clear value. + /// @param [in] stencil Stencil clear value. + /// @param [in] stencilWriteMask Write-mask to apply to the stencil subresource ranges during the clear. + /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. + /// @param [in] pRanges Array of subresource ranges to clear. + /// @param [in] rectCount Number of areas within the image to clear; size of the pRects array. If zero, + /// the entire subresources will be cleared and pRects will be ignored. + /// @param [in] pRects Array of areas within the subresources to clear.
+ /// @param [in] flags Mask of ClearDepthStencilFlags values controlling behavior of the clear. + virtual void CmdClearDepthStencil( + const IImage& image, + ImageLayout depthLayout, + ImageLayout stencilLayout, + float depth, + uint8 stencil, + uint8 stencilWriteMask, + uint32 rangeCount, + const SubresRange* pRanges, + uint32 rectCount, + const Rect* pRects, + uint32 flags) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 910 + /// Clears a range of GPU memory to the specified clear color using the specified buffer view SRD. + /// + /// The maximum clear range is determined by the view; if any Ranges are specified they must fit within the view's + /// range. The view must support shader writes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs + /// - CacheCoherency: @ref CoherShader + /// + /// @note You may use the more general @ref PipelineStageBlt and @ref CoherClear if you wish but they may result in + /// higher barrier overhead. + /// + /// @param [in] gpuMemory GPU memory to be cleared. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] pBufferViewSrd The buffer view SRD that will be used to interpret the GPU memory. + /// @param [in] rangeCount Number of ranges within the GPU memory to clear; size of the pRanges array. + /// If zero, the entire view will be cleared and pRanges will be ignored. + /// @param [in] pRanges Array of ranges within the GPU memory to clear. + virtual void CmdClearBufferView( + const IGpuMemory& gpuMemory, + const ClearColor& color, + const void* pBufferViewSrd, + uint32 rangeCount = 0, + const Range* pRanges = nullptr) = 0; + + /// Clears an image to the specified clear color using the specified image view SRD. + /// + /// The clear subresource range is determined by the view; if any Rects have been specified, the image view must + /// contain a single mip level. The view must support shader writes.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs but the more general @ref PipelineStageBlt is also OK. + /// - CacheCoherency: @ref CoherShader but the more general @ref CoherClear is also OK. + /// - ImageLayout: @ref LayoutShaderWrite + /// + /// @param [in] image Image to be cleared. + /// @param [in] imageLayout Current allowed usages and engines for the image, must include LayoutShaderWrite. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] pImageViewSrd The image view SRD that will be used to interpret the image. + /// @param [in] rectCount Number of volumes within the image to clear; size of the pRects array. + /// If zero, entire subresources will be cleared and pRects will be ignored. + /// @param [in] pRects Array of volumes within the subresources to clear. The begin and end slices to be + /// cleared are from SubresRange in pImageViewSrd. + virtual void CmdClearImageView( + const IImage& image, + ImageLayout imageLayout, + const ClearColor& color, + const void* pImageViewSrd, + uint32 rectCount = 0, + const Rect* pRects = nullptr) = 0; +#endif + + /// Resolves multiple regions of a multisampled image to a single-sampled image. + /// + /// The source image must be a 2D multisampled image and the destination must be a single-sampled image. + /// The formats of the source and destination images must match unless all regions specify a valid format. + /// + /// For color images, if the source image has an integer numeric format, a single sample is copied (sample 0). + /// + /// For depth/stencil images, the resolve is performed by simply copying sample 0 from every source pixel to the + /// destination pixel. + /// + /// The same subresource may not appear more than once in the specified array of regions.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherResolveSrc for the source and @ref CoherResolveDst for the destination. + /// - ImageLayout: @ref LayoutResolveSrc for the source and @ref LayoutResolveDst for the destination. + /// + /// @param [in] srcImage MSAA source image. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutResolveSrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Single-sample destination image. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutResolveDst and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] resolveMode Resolve mode. + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles. + /// @param [in] flags Mask of ResolveImageFlags values controlling behavior of the resolve. + virtual void CmdResolveImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + ResolveMode resolveMode, + uint32 regionCount, + const ImageResolveRegion* pRegions, + uint32 flags) = 0; + + /// Resolves multiple regions of a Sampler Feedback map to another image. + /// + /// The source image must be: + /// Decode: A 2D Feedback map, in which case the destination image will hold the decoded sampled data + /// Encode: A texture, in which case the destination will hold the encoded sampler map + /// + /// The formats of the source and destination images must be 8bpp. + /// + /// The same subresource may not appear more than once in the specified array of regions.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. + /// - ImageLayout: @ref LayoutShaderRead for the source and @ref LayoutShaderWrite for the destination. + /// + /// @param [in] srcImage Source image, depends on resolve type. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must + /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] dstImage Destination image, depends on resolve type. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] resolveType Resolve type. + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles. + virtual void CmdResolvePrtPlusImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + PrtPlusResolveType resolveType, + uint32 regionCount, + const PrtPlusImageResolveRegion* pRegions) = 0; + + /// Encodes a buffer into a Sampler Feedback image. + /// + /// This interface only supports encoding raw data from a buffer to an encoded Sampler Feedback map. + /// + /// The format of the data in the source buffer and destination image must be 8bpp. + /// + /// The same subresource may not appear more than once in the specified array of regions. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination.
+ /// - ImageLayout: @ref LayoutShaderWrite for the destination. + /// + /// @param [in] srcBuffer Source buffer, depends on the resolve type. + /// @param [in] dstImage Destination image, depends on resolve type. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles. + virtual void CmdResolvePrtPlusBufferToImage( + const IGpuMemory& srcBuffer, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const PrtPlusBufferToImageResolveRegion* pRegions) = 0; + + /// Decodes a Sampler Feedback map to a buffer. + /// + /// This interface only supports decoding a Sampler Feedback map to a buffer. + /// + /// The format of the data in the source image and destination buffer must be 8bpp. + /// + /// The same subresource may not appear more than once in the specified array of regions. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. + /// - ImageLayout: @ref LayoutShaderRead for the source. + /// + /// @param [in] srcImage Source image, depends on the resolve type. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must + /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] dstBuffer Destination buffer, depends on resolve type. + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles.
+ virtual void CmdResolvePrtPlusImageToBuffer( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstBuffer, + uint32 regionCount, + const PrtPlusImageToBufferResolveRegion* pRegions) = 0; + + /// Puts the specified event into the _set_ state when all prior GPU work has progressed past the given stages. + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] gpuEvent GPU event to be set. + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the event is set. The set + /// will be performed at the earliest possible stage after the prior stages. + virtual void CmdSetEvent( + const IGpuEvent& gpuEvent, + uint32 stageMask) = 0; + + /// Puts the specified event into the _reset_ state when all prior GPU work has progressed past the given stages. + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] gpuEvent GPU event to be reset. + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the event is reset. The + /// reset will be performed at the earliest possible stage after the prior stages. + virtual void CmdResetEvent( + const IGpuEvent& gpuEvent, + uint32 stageMask) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// Puts the specified GPU event into the _set_ state when all previous GPU work reaches the specified point in the + /// pipeline. + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. It is only declared when PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900. + /// + /// @param [in] gpuEvent GPU event to be set.
+ /// @param [in] setPoint Point in the graphics pipeline where the GPU event will be _set_, indicating all prior + /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't + /// support this operation at the exact specified point, the set will be performed at the + /// earliest possible point _after_ the specified point. + inline void CmdSetEvent( + const IGpuEvent& gpuEvent, + HwPipePoint setPoint) + { CmdSetEvent(gpuEvent, HwPipePointToStage[setPoint]); } + + /// Puts the specified GPU event into the _reset_ state when all previous GPU work reaches the specified point in + /// the pipeline. + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. + /// + /// @param [in] gpuEvent GPU event to be reset. + /// @param [in] resetPoint Point in the graphics pipeline where the GPU event will be _reset_, indicating all prior + /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't + /// support this operation at the exact specified point, the reset will be performed at the + /// earliest possible point _after_ the specified point. + inline void CmdResetEvent( + const IGpuEvent& gpuEvent, + HwPipePoint resetPoint) + { CmdResetEvent(gpuEvent, HwPipePointToStage[resetPoint]); } +#endif + + /// Predicates the subsequent jobs in the command buffer if the event is set. + /// + /// @param [in] gpuEvent GPU event to be checked. + virtual void CmdPredicateEvent( + const IGpuEvent& gpuEvent) = 0; + + /// Performs the specified 32- or 64-bit memory operation. These operations are atomic with respect to shader + /// atomic operations. + /// + /// The data size (32-bit or 64-bit) is determined by the operation type. For 32-bit atomics, only the lower + /// 32 bits of srcData are used. + /// + /// The destination GPU memory offset must be 4-byte aligned for 32-bit atomics and 8-byte aligned for 64-bit + /// atomics.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherQueueAtomic + /// + /// @param [in] dstGpuMemory Destination GPU memory object. + /// @param [in] dstOffset Offset into the memory object where the atomic will be performed. + /// @param [in] srcData Source data for the atomic operation. Use depends on the atomicOp. + /// @param [in] atomicOp Specifies which atomic operation to perform. @see AtomicOp. + virtual void CmdMemoryAtomic( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + uint64 srcData, + AtomicOp atomicOp) = 0; + + /// Starts a query operation for the given slot of a query pool. + /// + /// The query slot must have been previously cleared with CmdResetQueryPool() before starting a query. + /// + /// @note Queries may not span multiple command buffers. + /// + /// @param [in] queryPool Query pool for this query. + /// @param [in] queryType The type of query this operation will produce. + /// @param [in] slot Slot in queryPool where the results of this query should be accumulated. + /// @param [in] flags Flags controlling query behavior. @see QueryControlFlags. + virtual void CmdBeginQuery( + const IQueryPool& queryPool, + QueryType queryType, + uint32 slot, + QueryControlFlags flags) = 0; + + /// Stops a query operation for the given slot of a query pool. + /// + /// The query slot must have an open query on it when this is called. + /// + /// @param [in] queryPool Query pool for this query. + /// @param [in] queryType The type of query this operation will produce. + /// @param [in] slot Slot in queryPool where the query is running. + virtual void CmdEndQuery( + const IQueryPool& queryPool, + QueryType queryType, + uint32 slot) = 0; + + /// Resolves the results of a range of queries to the specified query type into the specified GPU memory location.
+ /// + /// This function requires use of the following barrier flags on @ref dstGpuMemory: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// This operation does not honor the command buffer's predication state, if active. + /// + /// @param [in] queryPool Query pool holding the source queries. + /// @param [in] flags Flags that control the result data layout and how the results are retrieved. + /// @param [in] queryType The type of queries this resolve will produce. + /// @param [in] startQuery First slot in queryPool to resolve. + /// @param [in] queryCount Number of query pool slots to resolve. + /// @param [in] dstGpuMemory Destination GPU memory object. + /// @param [in] dstOffset 4-byte aligned offset into dstGpuMemory where the results should be written. + /// @param [in] dstStride 4-byte aligned stride between where results are written into dstGpuMemory. + virtual void CmdResolveQuery( + const IQueryPool& queryPool, + QueryResultFlags flags, + QueryType queryType, + uint32 startQuery, + uint32 queryCount, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize dstStride) = 0; + + /// Resets a range of slots in a query pool. A query slot must be reset each time before a query can be started + /// using that slot. + /// + /// @param [in] queryPool Query pool to be reset. + /// @param [in] startQuery First slot in queryPool to be reset. + /// @param [in] queryCount Number of slots to reset. + virtual void CmdResetQueryPool( + const IQueryPool& queryPool, + uint32 startQuery, + uint32 queryCount) = 0; + + /// Writes a GPU performance timestamp to memory when all prior GPU work has progressed past the given stages. + /// + /// The timestamp data is a 64-bit value that increments once per clock. @ref timestampFrequency in DeviceProperties + /// reports the frequency the timestamps are clocked at. Timestamps are only supported by engines that report + /// @ref supportsTimestamps in DeviceProperties.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag(s) specified in @ref stageMask. + /// - CacheCoherency: @ref CoherTimestamp + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the timestamp is written. + /// The timestamp will be performed at the earliest possible stage after the prior stages. + /// Note that the SDMA engine only supports bottom-of-pipe timestamps. + /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. + /// @param [in] dstOffset Offset into dstGpuMemory where the timestamp should be written. Must be aligned to + /// minTimestampAlignment in DeviceProperties. + virtual void CmdWriteTimestamp( + uint32 stageMask, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + + /// Writes an immediate value to memory when all prior GPU work has progressed past the given stages. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag(s) specified in @ref stageMask. + /// - CacheCoherency: @ref CoherCp + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope + /// that restricts which stages of prior GPU work must happen before the immediate + /// value is written. The write will occur at the earliest possible stage after + /// the prior stages. Note that the SDMA engine only supports bottom-of-pipe writes. + /// @param [in] data Value to be written to gpu address. + /// @param [in] dataSize Size of the data to be written out. + /// @param [in] address GPU address where immediate value should be written.
+ virtual void CmdWriteImmediate( + uint32 stageMask, + uint64 data, + ImmediateDataWidth dataSize, + gpusize address) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// Writes a HwPipePostPrefetch or HwPipeBottom timestamp to the specified memory location. + /// + /// The timestamp data is a 64-bit value that increments once per clock. timestampFrequency in DeviceProperties + /// reports the frequency the timestamps are clocked at. + /// + /// Timestamps are only supported by engines that report supportsTimestamps in DeviceProperties. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag specified in @ref pipePoint. + /// - CacheCoherency: @ref CoherTimestamp + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. + /// + /// @param [in] pipePoint Specifies where in the pipeline the timestamp should be sampled and written. The only + /// valid choices are HwPipePostPrefetch and HwPipeBottom. HwPipePostPrefetch timestamps + /// are not supported on the SDMA engine, so all timestamps will be executed as + /// bottom-of-pipe. + /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. + /// @param [in] dstOffset Offset into dstGpuMemory where the timestamp should be written. Must be aligned to + /// minTimestampAlignment in DeviceProperties. + inline void CmdWriteTimestamp( + HwPipePoint pipePoint, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) + { CmdWriteTimestamp(HwPipePointToStage[pipePoint], dstGpuMemory, dstOffset); } + + /// Writes a top-of-pipe or bottom-of-pipe immediate value to the specified memory location. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag specified in @ref pipePoint. + /// - CacheCoherency: @ref CoherCp + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API.
+ /// + /// @param [in] pipePoint Specifies where in the pipeline the immediate value should be written. + /// The only valid choices are HwPipeTop, HwPipePostPrefetch and HwPipeBottom. + /// Top-of-pipe writes are not supported on the SDMA engine, so all writes + /// will be executed as bottom-of-pipe. + /// @param [in] data Value to be written to gpu address. + /// @param [in] dataSize Size of the data to be written out. + /// @param [in] address GPU address where immediate value should be written. + inline void CmdWriteImmediate( + HwPipePoint pipePoint, + uint64 data, + ImmediateDataWidth dataSize, + gpusize address) + { CmdWriteImmediate(HwPipePointToStage[pipePoint], data, dataSize, address); } +#endif + + /// Loads the current stream-out buffer-filled-sizes stored on the GPU from memory, typically from a target of a + /// prior CmdSaveBufferFilledSizes() call. + /// + /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use + /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr Array of GPU virtual addresses to load each counter from. If any of these are zero, + /// the corresponding filled-size counter is not loaded. + virtual void CmdLoadBufferFilledSizes( + const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; + + /// Saves the current stream-out buffer-filled-sizes into GPU memory. + /// + /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use + /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr Array of GPU virtual addresses to save each counter into.
If any of these are zero, + /// the corresponding filled-size counter is not saved. + virtual void CmdSaveBufferFilledSizes( + const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; + + /// Set the offset to buffer-filled-size for a stream-out target. + /// + /// @param [in] bufferId Stream-out buffer ID, it could be in the range [0, MaxStreamOutTargets). + /// @param [in] offset The value to be written into the buffer filled size counter. + /// + virtual void CmdSetBufferFilledSize( + uint32 bufferId, + uint32 offset) = 0; + + /// Binds the specified border color palette for use by samplers. + /// + /// @param [in] pipelineBindPoint Specifies which pipeline type is affected (i.e., graphics or compute). + /// @param [in] pPalette Border color palette object to bind. + virtual void CmdBindBorderColorPalette( + PipelineBindPoint pipelineBindPoint, + const IBorderColorPalette* pPalette) = 0; + + /// Sets predication for this command buffer to use the specified GPU memory location. Any draw, dispatch or copy + /// operation between this command and the corresponding reset/disable call will be skipped if the value in the + /// specified location matches the passed-in predicated value. + /// + /// This function requires use of the following barrier flags on @ref pGpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] pQueryPool pointer to QueryPool obj, not-nullptr means this is a QueryPool based predication + /// - Zpass/Occlusion based predication + /// - or PrimCount/Streamout based predication + /// @param [in] slot Slot to use for setting occlusion predication, valid when pQueryPool is not nullptr + /// @param [in] pGpuMemory GPU memory object for the predication value, only valid when pQueryPool is nullptr + /// @param [in] offset GPU memory offset for the predication value + /// @param [in] predType Predication type.
+ /// @param [in] predPolarity Controls the polarity of the predication test + /// true = draw_if_visible_or_no_overflow + /// false = draw_if_not_visible_or_overflow + /// @param [in] waitResults Hint only valid for Zpass/Occlusion. + /// false = wait_until_final_zpass_written + /// true = draw_if_not_final_zpass_written + /// @param [in] accumulateData true(1) = allow_accumulation of Zpass and PrimCount across command buffer boundaries. + /// + /// pQueryPool and pGpuMemory should be exclusively set; when both are nullptr, other params will be ignored + /// and it means to reset/disable predication so that the following commands can perform normally. + virtual void CmdSetPredication( + IQueryPool* pQueryPool, + uint32 slot, + const IGpuMemory* pGpuMemory, + gpusize offset, + PredicateType predType, + bool predPolarity, + bool waitResults, + bool accumulateData) = 0; + + /// Suspend/resume any active predication for this command buffer + /// + /// @param [in] suspend Controls if predication should be paused + /// true = suspend active predication + /// false = resume active predication + /// + /// Any suspended predication must be resumed prior to disabling predication using CmdSetPredication with pQueryPool + /// and pGpuMemory both set to nullptr. This is only valid on universal and compute command buffers. + virtual void CmdSuspendPredication( + bool suspend) = 0; + + /// Begins a conditional block in the current command buffer. All commands between this and the corresponding + /// CmdEndIf() (or CmdElse() if it is present) command are executed if the specified condition is true. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] offset Offset within the memory object where the tested memory location begins.
+ /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. + virtual void CmdIf( + const IGpuMemory& gpuMemory, + gpusize offset, + uint64 data, + uint64 mask, + CompareFunc compareFunc) = 0; + + /// Begins a conditional block in the current command buffer. All commands between this and the corresponding + /// CmdEndIf() command are executed if the condition specified in the innermost active conditional block is false. + virtual void CmdElse() = 0; + + /// Ends the innermost active conditional block in the current command buffer. + virtual void CmdEndIf() = 0; + + /// Begins a while loop in the current command buffer. All commands between this and the corresponding CmdEndWhile() + /// command are executed repeatedly as long as the specified condition remains true. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] offset Offset within the memory object where the tested memory location begins. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. + virtual void CmdWhile( + const IGpuMemory& gpuMemory, + gpusize offset, + uint64 data, + uint64 mask, + CompareFunc compareFunc) = 0; + + /// Ends the innermost active while loop in the current command buffer.
+ virtual void CmdEndWhile() = 0; + + /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a + /// GPU register. + /// + /// The client (or application) is supposed to do necessary barriers before calling this function, but for now this + /// is only needed to wait on some display or timer related registers. + /// + /// @param [in] registerOffset The offset in bytes of GPU register to be tested. + /// @param [in] data Source data to compare against the value of GPU register. + /// @param [in] mask Mask to apply to the GPU register value (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never shouldn't + /// be used as the hardware does not support it. + virtual void CmdWaitRegisterValue( + uint32 registerOffset, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a + /// GPU memory location. + /// + /// The client (or application) is expected to transition the memory to proper state before calling this function. + /// The memory location for the condition must be 4-byte aligned. + /// This function requires use of the following barrier flags on @ref gpuVirtAddr: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr GPU memory address containing the data to be tested. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not + /// be used as the hardware does not support it.
+ virtual void CmdWaitMemoryValue( + gpusize gpuVirtAddr, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Stalls a command buffer execution until an external device writes to the marker surface in the GPU bus + /// addressable memory location. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not + /// be used as the hardware does not support it. + virtual void CmdWaitBusAddressableMemoryMarker( + const IGpuMemory& gpuMemory, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Begins the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to begin. + virtual void CmdBeginPerfExperiment( + IPerfExperiment* pPerfExperiment) = 0; + + /// Updates the sqtt token mask on the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to update. + /// @param [in] tokenConfig updated token and reg mask to apply. + /// + /// @note: This function is only valid to call if pPerfExperiment is a thread trace experiment that is currently + /// active. + virtual void CmdUpdatePerfExperimentSqttTokenMask( + IPerfExperiment* pPerfExperiment, + const ThreadTraceTokenConfig& tokenConfig) = 0; + + /// Updates the sqtt token mask on all running traces, if any. + /// + /// @note This may overwrite the stall settings (making them more conservative) + /// @param [in] tokenConfig updated token and reg mask to apply.
+ virtual void CmdUpdateSqttTokenMask( + const ThreadTraceTokenConfig& tokenConfig) = 0; + + /// Ends the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to end. + virtual void CmdEndPerfExperiment( + IPerfExperiment* pPerfExperiment) = 0; + + /// Inserts a trace marker into the command buffer. + /// + /// A trace marker can be inserted to mark particular points of interest in a command buffer to be viewed with the + /// trace data collected in a performance experiment. + /// + /// @param [in] markerType Selects one of two generic marker categories ("A" or "B"). + /// @param [in] markerData 32-bit marker value to be inserted. + virtual void CmdInsertTraceMarker( + PerfTraceMarkerType markerType, + uint32 markerData) = 0; + + /// Inserts a set of SQ thread trace markers for consumption by the Radeon GPU Profiler (RGP). + /// + /// Only supported on Universal and Compute engines. + /// + /// @param [in] numDwords Number of dwords in pData to be inserted as SQTT markers. + /// @param [in] pData SQTT marker data. See the RGP SQTT Instrumentation Specification for details on how this + /// data should be formatted. + virtual void CmdInsertRgpTraceMarker( + RgpMarkerSubQueueFlags subQueueFlags, + uint32 numDwords, + const void* pData) = 0; + + /// This function is to be used to copy the DF SPM (MALL SPM) data from the output buffers to an accessible buffer. + /// The buffer that HW outputs to is allocated with a special KMD flag and therefore cannot be the same as the + /// normal IPerfExperiment buffer so we need a special command to get the data. + /// + /// The bulk of the implementation for this is done by the KMD. They are in charge of starting and stopping the + /// trace as well as all of the register programming. When KMD receives a dfSpmTraceEnd bit from a CmdBufInfo + /// flag, they will wait for the command buffer to be completely idle before stopping the trace.
Therefore, a + /// CmdEndPerfExperiment call does not stop this particular sample, the end of a command buffer with a + /// dfSpmTraceEnd does. This means that calling CmdCopyDfSpmTraceData in the same command buffer as + /// dfSpmTraceEnd will give you incorrect data. The sample will still be in progress when the copy happens. + /// You must call CmdCopyDfSpmTraceData in a separate command buffer after one where the dfSpmTraceEnd bit is + /// set. + /// + /// There is also a metadata buffer that does not need a special KMD flag. It is also stored in a separate buffer + /// and is copied along with the output buffer with this command. It contains a uint32 trace size, a uint32 pad, + /// a uint64 start trace GPU timestamp and a uint64 stop trace GPU timestamp and is placed at the beginning of the + /// dstGpuMemory. + /// + /// The minimum size of the dstGpuMemory should be the size of the metadata struct plus the size of the DF SPM + /// ringSize given to the perf experiment. The SPM data may not fill the entire memory, but the client is + /// responsible for parsing the data. + /// + /// This function requires use of the following barrier flags on @ref dstGpuMemory: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// @param [in] perfExperiment The perfExperiment that we will be copying the data from. + /// @param [in] dstGpuMemory The memory location that the DF SPM trace data will be copied to. + /// @param [in] dstOffset The offset into the destination memory that the data will be copied to. + virtual void CmdCopyDfSpmTraceData( + const IPerfExperiment& perfExperiment, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Loads data from the provided GPU Memory object into Constant Engine RAM. + /// + /// @param [in] srcGpuMemory GPU Memory object containing the source data to be loaded to CE RAM.
+ /// @param [in] memOffset Offset within the memory object where the source data is located, + /// must be 32-byte aligned. + /// @param [in] ramOffset Byte offset destination in CE RAM where the data should be loaded, + /// must be 32-byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be loaded into CE RAM, must be a multiple of 8. + void CmdLoadCeRam( + const IGpuMemory& srcGpuMemory, + gpusize memOffset, + uint32 ramOffset, + uint32 dwordSize) {} + + /// Dumps data from Constant Engine RAM to the provided GPU Memory address which may be located in a GPU ring buffer + /// managed by the CE. The CE can be used to automatically handle the synchronization between the DE and CE when + /// manipulating a GPU ring buffer. In order for PAL to instruct the CE to handle this, we need to know the current + /// position (entry) within the ring buffer being dumped to, as well as the total size (in entries) of the ring. + /// + /// @param [in] dstGpuMemory GPU Memory object destination where the data should be dumped from CE RAM. + /// @param [in] memOffset Offset within the memory object where data should be dumped, must be 4-byte aligned. + /// @param [in] ramOffset Byte offset source in CE RAM for data that should be dumped, must be 4-byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be dumped from CE RAM into GPU Memory. + /// @param [in] currRingPos Current position (ring entry) in the GPU ring buffer being managed by the CE which the + /// dump location corresponds to. + /// @param [in] ringSize Number of entries in the GPU ring buffer being managed by the CE. If the memory being + /// dumped into is not managed in a ring-like fashion, this should be set to zero.
+ void CmdDumpCeRam( + const IGpuMemory& dstGpuMemory, + gpusize memOffset, + uint32 ramOffset, + uint32 dwordSize, + uint32 currRingPos, + uint32 ringSize) {} + + /// Writes CPU data to Constant Engine RAM. + /// + /// @param [in] pSrcData Pointer to the source CPU data to be written to CE RAM. + /// @param [in] ramOffset Byte offset in CE RAM where the data should be written, must be 4-byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be written from pSrcData into CE RAM. + virtual void CmdWriteCeRam( + const void* pSrcData, + uint32 ramOffset, + uint32 dwordSize) {} +#endif + + /// Allocates a chunk of command space that the client can use to embed constant data directly in the command + /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address + /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within + /// this command buffer (e.g., as an SRD table address). + /// + /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the + /// value reported by GetEmbeddedDataLimit(). + /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. + /// @param [out] pGpuAddress The GPU address of the embedded space. + /// + /// @returns The DWORD-aligned CPU address of the embedded space. + virtual uint32* CmdAllocateEmbeddedData( + uint32 sizeInDwords, + uint32 alignmentInDwords, + gpusize* pGpuAddress) = 0; + + /// Allocates a chunk of command space that the client can use to embed constant data directly in the command + /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address + /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within + /// this command buffer (e.g., as an SRD table address).
+ /// + /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the + /// value reported by GetLargeEmbeddedDataLimit(). + /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. + /// @param [out] pGpuAddress The GPU address of the embedded space. + /// + /// @returns The DWORD-aligned CPU address of the embedded space. + virtual uint32* CmdAllocateLargeEmbeddedData( + uint32 sizeInDwords, + uint32 alignmentInDwords, + gpusize* pGpuAddress) = 0; + + /// Get memory from scratch memory and bind to GPU event. For now only GpuEventPool and CmdBuffer's internal + /// GpuEvent use this path to allocate and bind GPU memory. These use cases assume the bound GPU memory is GPU access + /// only, so client is responsible for resetting the event from GPU, and cannot call Set(), Reset(), GetStatus(). + /// + /// @param [in] pGpuEvent The GPU event that needs to bind a memory. Must not be nullptr. + /// + /// @returns Success if the GPU event successfully binds a GPU memory. Otherwise, one of the following errors may + /// be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result AllocateAndBindGpuMemToEvent( + IGpuEvent* pGpuEvent) = 0; + + /// Issues commands to prime GPU caches shortly before accessing the specified GPU address range(s). The benefit of + /// this prefetching is likely to be platform-dependent based on the GPU's cache hierarchy, memory subsystem, + /// available prefetching tools in hardware, etc., so caller beware. + /// + /// This operation may read data from memory into caches and therefore counts as a general BLT SRC operation with + /// regard to barrier execution and memory dependencies. + /// + /// @param [in] rangeCount Number of entries in pRanges. + /// @param [in] pRanges Array of structs defining a memory range and properties controlling prefetching of that + /// range.
+ virtual void CmdPrimeGpuCaches( + uint32 rangeCount, + const PrimeGpuCacheRange* pRanges) = 0; + + /// Issues commands which execute the specified group of nested command buffers. The observable behavior of this + /// operation should be indiscernible from directly recording the nested command buffers' commands directly into + /// this command buffer. Naturally, the queue type of the nested command buffers must match this command buffer. + /// + /// Conceptually, executing a nested command buffer is similar to calling a subroutine: the root command buffer is + /// like the "caller", while the nested ones are the "callees". + /// + /// If any nested command buffers were allocated from a @ref ICmdAllocator with @ref autoMemoryReuse enabled, + /// resetting or destroying those nested command buffers will render them retroactively uncallable. This effectively + /// makes the caller command buffer invalid and illegal to submit even if it was otherwise valid and executable in + /// the past. If the nested command allocator has autoMemoryReuse disabled, the calls to reset nested command + /// buffers remain valid until the allocator itself is reset. + /// + /// State inheritance/leakage between the caller and callee(s) has the following behavior: + /// + The callee only inherits the state specified in the callee CmdBufferBuildInfo. It is up to the client to + /// bind any default state necessary when they called @ref ICmdBuffer::Begin() to begin building the callee. + /// By default no state is inherited and all state must be specified by the client. + /// + The callee leaks any render and resource-binding state back into the caller after it completes. It is up to + /// the client to rebind the caller's state after this operation completes if they don't want state leakage. + /// + Both of the above points apply in between callees, if more than one command buffer is being executed by this + /// call. 
+ /// + /// @param [in] cmdBufferCount Number of nested command buffers to execute. (i.e., size of the ppCmdBuffers + /// array). This must be at least one, otherwise making this call is pointless. + /// @param [in,out] ppCmdBuffers Array of nested command buffers to execute. It is an error condition if any + /// of the following are true: (Debug assertions are used to check them.) + /// + ppCmdBuffers is null. + /// + Any member of ppCmdBuffers is null. + /// + Any member of ppCmdBuffers is a root command buffer, or has a different + /// queue type than this command buffer. + virtual void CmdExecuteNestedCmdBuffers( + uint32 cmdBufferCount, + ICmdBuffer*const* ppCmdBuffers) = 0; + + /// Saves a copy of some set of the current command buffer state that is used by compute workloads. This feature is + /// intended to give PAL clients a convenient way to issue their own internal compute workloads without modifying + /// the application-facing state. + /// + /// PAL cannot save multiple layers of state, each call to CmdSaveComputeState must be followed by a call to + /// CmdRestoreComputeState before the next call to CmdSaveComputeState. + /// + /// This function can only be called on command buffers that support compute workloads. All query counters will be + /// disabled until CmdRestoreComputeState is called. + /// + /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to save. + virtual void CmdSaveComputeState( + uint32 stateFlags) = 0; + + /// Restores some set of the command buffer state that is used by compute workloads. This feature is intended to + /// give PAL clients a convenient way to issue their own internal compute workloads without modifying the + /// application-facing state. + /// + /// A call to this function must be preceded by a call to CmdSaveComputeState and the save stateFlags must contain + /// all restore stateFlags, otherwise the values of the restored state are undefined. 
+ /// + /// This function can only be called on command buffers that support compute workloads. All previously disabled + /// query counters will be reactivated. + /// + /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to restore. + virtual void CmdRestoreComputeState( + uint32 stateFlags) = 0; + + /// Issues commands which complete two tasks: using the provided @ref IIndirectCmdGenerator object to translate the + /// indirect argument buffer into a format understandable by the GPU; and then executing the generated commands. + /// + /// The virtual address must be 4-byte aligned. + /// + /// The indirect argument data offset in memory must be 4-byte aligned. The expected layout of the argument data + /// is defined by the @ref IIndirectCmdGenerator object. + /// + /// It is unsafe to call this method on a command buffer which was not begun with either the optimizeOneTimeSubmit + /// or optimizeExclusiveSubmit flags. This is because there is a potential race condition if the same command buffer + /// is generating indirect commands on multiple Queues simultaneously. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] generator Indirect command generator object which can translate the indirect argument buffer + /// into a command buffer format which the GPU can understand. + /// @param [in] gpuVirtAddr Gpu virtual address where the indirect argument data is located. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr is nonzero, the + /// value at that memory location is clamped to this maximum. If countGpuAddr is zero, + /// Then the number of draws issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. Must be 4-byte aligned. 
+ virtual void CmdExecuteIndirectCmds( + const IIndirectCmdGenerator& generator, + gpusize gpuVirtAddr, + uint32 maximumCount, + gpusize countGpuAddr) = 0; + + /// Updates one or more HiS pretests bound to the given stencil image within a range of mip levels. + /// See @ref HiSPretests for a summary of HiS. + /// + /// @warning Improper use of pretests can cause corruption. Please see @ref HiSPretests for more information. + /// + /// @param [in] pImage The stencil image that will receive the new pretest(s). + /// @param [in] pretests The new pretest(s). + /// @param [in] firstMip The beginning of the mip range which will receive the new pretest(s). + /// @param [in] numMips The number of mips in the mip range which will receive the new pretest(s). + virtual void CmdUpdateHiSPretests( + const IImage* pImage, + const HiSPretests& pretests, + uint32 firstMip, + uint32 numMips) = 0; + + /// Reserve @ref CommandDataAlloc space for external command packets up to a size of @ref sizeInDwords. + /// This method is only supported on command buffers for the following queue types: + /// + /// @warning @ref CmdCommitSpace must be called once after this function is called. + /// Failing to pair up these function calls will result in undefined behavior. + /// + /// @param [in] sizeInDwords Size of the command buffer space to reserve in dwords. + /// If this param is 0, the default command stream reserve limit will be used. + /// @param [in] reserveInNewChunk Selection to reserve space in a new chunk or current chunk. + /// + /// @returns A pointer to the reserved command space. + virtual uint32* CmdReserveSpace( + uint32 sizeInDwords, + bool reserveInNewChunk) = 0; + + /// Ensure data is committed to the command buffer and unused space is reclaimed. + /// This method is only supported on command buffers for the following queue types: + /// + /// @param [in] pCmdSpace Pointer to the next unused dword in the command buffer.
+ virtual void CmdCommitSpace( + uint32* pCmdSpace) = 0; + + /// Executes any internal postprocessing commands to be performed on a frame, such as drawing the dev driver + /// overlay. Calling this prior to presenting (via any path) is a requirement, and must be prior to or + /// concurrent with frameEnd if FSFM is applicable. This must be called using the image that will be the + /// source of the present. + /// + /// @param [in] postProcessInfo Information about the frame to be postprocessed. + /// @param [out] pAddedGpuWork (Optional) Set to true if commands were added as part of this call. + virtual void CmdPostProcessFrame( + const CmdPostProcessFrameInfo& postProcessInfo, + bool* pAddedGpuWork) = 0; + + /// Inserts a string embedded inside a NOP packet with a signature that is recognized by tools and can be printed + /// inside a command buffer disassembly. Note that this is a real NOP that will really be submitted to the GPU + /// and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug dumps. + /// + /// The maximum length of a string that may be embedded in the command buffer is currently 128 characters, + /// including the NUL-terminator. This is defined in the internal command buffer class in MaxCommentStringLength. + /// + /// @param [in] pComment Pointer to NUL-terminated string that will be inserted into the command buffer. + virtual void CmdCommentString( + const char* pComment) = 0; + + /// Inserts the specified payload embedded inside a NOP packet. Note that this is a real NOP that will be submitted + /// to the GPU and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug + /// dumps. + /// + /// @param [in] pPayload Pointer to binary data to embed. + /// @param [in] payloadSize Size of the payload in DWORDs, expected to be under MaxPayloadSize. 
+ virtual void CmdNop( + const void* pPayload, + uint32 payloadSize) = 0; + + /// Marks the begin or end of a user-defined region of GPU work; analyzed post-mortem in crash-dump analysis tools. + /// Each 'Begin' marker must be paired with a corresponding 'End' marker; however, markers may be nested by + /// inserting multiple 'Begin' markers consecutively. + /// + /// @warning This function is a no-op if Crash Analysis mode is not enabled. + /// + /// @param [in] isBegin Whether this is a 'Begin' marker (true) or an 'End' marker (false). + /// @param [in] sourceId The application layer ID at which the marker is being created: + /// 0x0 => Application + /// 0x1 => API (e.g. DX12, Vulkan, etc.) + /// 0x2 => PAL + /// Developers may use IDs within the range of 10 - 15 to define a custom + /// application layer. + /// @param [in] pMarkerName A NUL-terminated string containing a name for this marker, used for annotation + /// purposes in external tools. Only valid for 'Begin' markers, and will be ignored if + /// isBegin is false. + /// @param [in] markerNameSize Size of the marker string, in bytes. + /// + /// @returns Non-zero counter value of the embedded execution marker. + /// If Crash Analysis mode is disabled, this will always return zero. + virtual uint32 CmdInsertExecutionMarker( + bool isBegin, + uint8 sourceId, + const char* pMarkerName, + uint32 markerNameSize) = 0; + + /// Performs the virtual queue handshake. The host queue will do the following: + /// - Wait until the parent kernel is done + /// - Change the parent kernel state + /// - CP waits until the child counter is 0 + /// - CP sends the termination signal to the device queue + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo.
+ /// + /// @param [in] parentState Address of the parent kernel state + /// @param [in] newStateValue The new state value of the parent kernel + /// @param [in] parentChildCounter Address of the parent child counter + /// @param [in] signal Address of the virtual queue signal + /// @param [in] dedicatedQueue Runtime uses a dedicated queue for the scheduler + /// + /// @note This function is to support OpenCL AQL submissions. + virtual void CmdVirtualQueueHandshake( + gpusize parentState, + uint32 newStateValue, + gpusize parentChildCounter, + gpusize signal, + bool dedicatedQueue) = 0; + + /// Returns GPU address of the loop start with dispatch templates. The pointer will be passed to the + /// scheduler kernel for the update of dispatch templates. + /// Also initializes common registers for each dispatch template. + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @note This function is to support OpenCL AQL submissions. + virtual gpusize CmdVirtualQueueDispatcherStart() = 0; + + /// Programs CP iterator with dispatch templates for device enqueue in OpenCL2.0 + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @param [in] signal Address for the termination signal + /// @param [in] loopStart GPU address of the loop start for CP + /// @param [in] numTemplates The number of dispatch templates + /// + /// @note This function is to support OpenCL AQL submissions. + virtual void CmdVirtualQueueDispatcherEnd( + gpusize signal, + gpusize loopStart, + uint32 numTemplates) = 0; + + /// Emulates AQL dispatch with PM4 commands. + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @param [in] dispatchInfo Pointer to kernel dispatch info + /// + /// @note This function is to support OpenCL AQL submissions. 
+ void CmdDispatchAql( + const DispatchAqlParams& dispatchInfo) + { + m_funcTable.pfnCmdDispatchAql(this, dispatchInfo); + } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + /// XDMA was retired starting in gfx10 so this function has no use anymore. + inline void CmdXdmaWaitFlipPending() {} +#endif + + /// Starts thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP + /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these + /// functions) + /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. + /// The caller is responsible for setting up valid GPU Profiler panel settings. + virtual void CmdStartGpuProfilerLogging() = 0; + + /// Stops thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP + /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these + /// functions) + /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. + /// The caller is responsible for setting up valid GPU Profiler panel settings. + virtual void CmdStopGpuProfilerLogging() = 0; + + /// Set a mask to control which view instances are enabled for subsequent draws, should only be called on + /// universal command buffers. + /// + /// @param [in] mask The mask to control which view instances are enabled. + virtual void CmdSetViewInstanceMask(uint32 mask) = 0; + + /// Get used size of all chunks in bytes for given CmdAllocType. For CommandDataAlloc with multi-queue scheme, the + /// size reported will be the sum of all command streams associated with the command buffer. It's legal to call + /// this function while in the command building state. + /// + /// @param [in] type Allocation type for ICmdAllocator + /// + /// @returns Used allocation data size in bytes for provided CmdAllocType. 
+ virtual uint32 GetUsedSize( + CmdAllocType type) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Gets the internal unique Id of the command buffer. + /// This function was originally only for internal debugging, and the Id is not unique across different queue types. + /// However, CmdDisassembly::ICmdBufferReporting requires a way to differentiate cmdLists that matches + /// the meaning of UniqueId(), in particular, not requiring uniqueness across different queue types. + /// + /// @returns Unique Id of the command buffer + virtual uint32 UniqueId() const = 0; + + /// Get the number of arrays of tracking data (for correlation) held by this ICmdBuffer. + /// For TrackCmdLocationBefore and TrackCmdLocationAfter below, only (idx < GetNumTrackingArrays()) + /// can yield a non-nullptr result + /// + /// @detail If GetNumTrackingArrays() returns 0 but GetTrackedCmdLocationArraySizeInBytes() does not, it will + /// be possible to create them on this ICmdBuffer using CreateTrackedCmdLocationArray + /// + /// @returns the number of TrackedCmdLocationArray's held by the ICmdBuffer. + virtual uint32 GetNumTrackingArrays() const = 0; + + /// Get the number of bytes required by CreateTrackedCmdLocationArray. + /// + /// @detail The value returned here accommodates the full number of TrackedCmdLocationArray's to be + /// created, from a single contiguous allocation. + /// If allocation has not yet occurred, (GetNumTrackingArrays() == 0).
+ /// If (GetTrackedCmdLocationArraySizeInBytes() > 0) && (GetNumTrackingArrays() == 0) + /// this ICmdBuffer supports TrackedCmdLocationArray's, but has not yet allocated them + /// If (GetTrackedCmdLocationArraySizeInBytes() == 0), this ICmdBuffer does not support + /// TrackedCmdLocationArray's + /// + /// @returns 0 if TrackedCmdLocationArray's are not supported + /// The total number of bytes required by CreateTrackedCmdLocationArray otherwise. + virtual uint32 GetTrackedCmdLocationArraySizeInBytes() const = 0; + + /// Uses the memory pMemory to initialize GetNumTrackingArrays() TrackedCmdLocationArray's on this + /// ICmdBuffer. + /// + /// @param [in] pMemory Address of memory allocated for the purpose of creating TrackedCmdLocationArray's + /// This memory should be at least GetTrackedCmdLocationArraySizeInBytes() + /// bytes. + /// + /// @returns Result::Success: Indicates creation was successful + /// Result::Unsupported: This ICmdBuffer does not support TrackedCmdLocationArray's + /// Confirm (GetTrackedCmdLocationArraySizeInBytes() > 0) before using this function + /// Result::ErrorInvalidPointer: pMemory == nullptr + /// Result::AlreadyExists: TrackedCmdLocationArray's have already been allocated on + /// this ICmdBuffer + /// other: Error values originating from Util::Vector::Reserve() or + /// Util::Vector::PushBack() + /// + virtual Result CreateTrackedCmdLocationArray( + void* pMemory) = 0; + + /// Executes the destructors for all TrackedCmdLocationArray's owned by this ICmdBuffer. This should be + /// called prior to deleting the memory pMemory that was originally provided to CreateTrackedCmdLocationArray + /// + /// @param [out] ppAllocatedMemory + /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() > 0) + /// The original value for pMemory provided in CreateTrackedCmdLocationArray(pMemory) will + /// be returned in *ppAllocatedMemory.
ie *ppAllocatedMemory = pMemory + /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() == 0), + /// *ppAllocatedMemory = nullptr; + /// + virtual void DestroyTrackedCmdLocationArray( + void** ppAllocatedMemory) = 0; + + /// Creates a new TrackedCmdLocation in the TrackedCmdLocationArray corresponding to idx + /// of type TrackedCmdLocationMode::Begin. This location will be furnished with a pointer to the address of + /// the next PM4Packet to be created on the CmdBuffer referred to by idx, and m_event == eventId. + /// + /// @detail Note there are two potentially valid corner cases. + /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == 0) + /// Indicates there is a commandstream but it has not begun building PM4Packets + /// This implicitly refers to the baseAddress of the commandstream, once building begins + /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) + /// Indicates there is not yet an associated commandstream. + /// + /// @param [in] idx The idx corresponding to GetTrackingArray(idx). + /// Only (idx < GetNumTrackingArrays()) will yield non-trivial results.
+ /// @param [in] eventId The code for the event being tracked + /// @param [out] pBeforeResult If successful, returns a TrackedCmdLocationRef to a TrackedCmdLocation within + /// the TrackedCmdLocationArray corresponding to idx + /// Otherwise, if (pBeforeResult != nullptr), is initialized to the default for + /// TrackedCmdLocationRef() + /// + /// @returns + /// Pal::Result::Success if successful + /// Pal::Result::ErrorInvalidPointer if (pBeforeResult == nullptr) + /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer + /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) + /// Pal::Result::NotFound if there is no tracking array corresponding to idx + /// This can occur if CreateTrackedCmdLocationArray has not been + /// called - which may mean the feature is disabled + /// Pal::Result ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable + /// to allocate memory + /// + /// + virtual Pal::Result TrackCmdLocationBefore( + uint32 idx, + uint8 eventId, + CmdDisassembly::TrackedCmdLocationRef* pBeforeResult) = 0; + + /// Similar to TrackCmdLocationBefore, TrackCmdLocationAfter creates a TrackedCmdLocation, of type + /// TrackedCmdLocationMode::End or TrackedCmdLocationMode::Delta. This location will be furnished with a + /// pointer to the address of the next PM4Packet to be created on the CmdBuffer referred to by idx. Type + /// TrackedCmdLocationMode::Delta will only occur if parameter before is the last TrackedCmdLocation + /// for the TrackedCmdLocationArray corresponding to idx, has the same m_event == eventId, and the change in + /// pointer address is small enough to be represented in 6 bits. + /// + /// @detail Note there is two potential valid corner cases. + /// (pAfterResult->Get()->m_correlateInternal.m_ptr. 
== 0) + /// Indicates there is a commandstream but it has not begun building PM4Packets + /// This implicitly refers to the baseAddress of the commandstream, once building begins + /// (pAfterResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) + /// Indicates there is not yet an associated commandstream. + /// In both of these cases, if parameter "before" had the same value for m_correlateInternal.m_ptr + /// and "before" referred to the most recent TrackedCmdLocation, this tracked location will be of + /// type TrackedCmdLocationMode::Delta, with (before.Get()->m_correlateInternal.m_deltaInDWords == 0) + /// + /// @param [in] idx The idx corresponding to GetTrackingArray(idx). + /// Only (idx < GetNumTrackingArrays()) can yield non-trivial results. + /// @param [in] eventId The code for the event being tracked + /// @param [in] before The corresponding location generated by TrackCmdLocationBefore + /// This may be CmdDisassembly::TrackedCmdLocationRef() if no location from + /// TrackCmdLocationBefore exists (such as on Reset) + /// @param [out] pAfterResult Returns a TrackedCmdLocationRef to a TrackedCmdLocation within the TrackedCmdLocationArray + /// corresponding to idx + /// + /// @returns + /// Pal::Result::Success if successful + /// Pal::Result::ErrorInvalidPointer if (pAfterResult == nullptr) + /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer + /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) + /// Pal::Result::ErrorInvalidValue if (eventId != before.m_correlateInternal.m_event) + /// Pal::Result::NotFound if there is no tracking array corresponding to idx + /// This can occur if CreateTrackedCmdLocationArray has not been + /// called - which may mean the feature is disabled + /// Pal::Result::ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable + /// to allocate memory + /// + + virtual Pal::Result TrackCmdLocationAfter( + uint32
idx, + uint8 eventId, + CmdDisassembly::TrackedCmdLocationRef before, + CmdDisassembly::TrackedCmdLocationRef* pAfterResult) = 0; + + /// An accessor function for the TrackedCmdLocationArray corresponding to idx + /// + /// @param idx There is a CmdDisassembly::TrackedCmdLocationArray* corresponding to each + /// sub-cmdBuffer for this cmdBuffer. This idx indexes these in the same fashion. + /// Only idx < GetNumTrackingArrays() can yield non-nullptr results. + /// + /// @returns the TrackedCmdLocationArray corresponding to idx + virtual CmdDisassembly::TrackedCmdLocationArray* GetTrackingArray( + uint32 idx) const = 0; + + /// TrackClientEvent operates similarly to TrackCmdLocationBefore and TrackCmdLocationAfter above but + /// operates on all TrackedCmdLocationArray's on this CmdBuffer. The TrackedCmdLocation generated here + /// corresponds to type TrackedCmdLocationMode::ClientEvent, where clientId is a value the client is using + /// to track this cmdBuffer, and clientEventId refers to some event the client is tracking outside of the driver. + /// + /// @detail The first call to TrackClientEvent will include a TrackedCmdLocation with + /// (m_mode == TrackedCmdLocationMode::ClientId), to define clientId on this cmdBuffer + /// All calls will generate the TrackedCmdLocationMode::ClientEventId with + /// (m_clientEvent.m_clientEventId == clientEventId) + /// and be followed with a TrackedCmdLocationMode::Delta location with (m_eventId == PostClientEvent (0xFF)) + /// + /// @param [in] clientId A value the client is using to track this cmdBuffer + /// @param [in] clientEventId An event the client is tracking outside of the driver + /// + /// @returns + /// Result::Success if the tracked client event was successfully recorded + /// Result::Unsupported if the implementation of ICmdBuffer does not support tracking + /// Result::ErrorInvalidPointer if there was an error encountered determining the cmdList correlation + /// requested. This is likely to be an out-of-memory situation. + /// Result::AlreadyExists if registering clientId occurred multiple times.
This should only occur for + /// race conditions, if the code calling TrackClientEvent is not threadsafe + virtual Result TrackClientEvent( + uint64 clientId, + uint64 clientEventId) = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + ICmdBuffer() : m_pClientData(nullptr) + { + } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~ICmdBuffer() { } + + /// Structure for function pointers for the ICmdBuffer::Cmd* functions. + struct CmdBufferFnTable + { + /// CmdSetUserData function pointers for each pipeline bind point. + CmdSetUserDataFunc pfnCmdSetUserData[static_cast<uint32>(PipelineBindPoint::Count)]; + + CmdDrawFunc pfnCmdDraw; ///< CmdDraw function pointer. + CmdDrawOpaqueFunc pfnCmdDrawOpaque; ///< CmdDrawOpaque function pointer. + CmdDrawIndexedFunc pfnCmdDrawIndexed; ///< CmdDrawIndexed function pointer. + CmdDrawIndirectMultiFunc pfnCmdDrawIndirectMulti; ///< CmdDrawIndirectMulti function pointer. + CmdDrawIndexedIndirectMultiFunc pfnCmdDrawIndexedIndirectMulti; ///< CmdDrawIndexedIndirectMulti func pointer. + CmdDispatchFunc pfnCmdDispatch; ///< CmdDispatch function pointer. + CmdDispatchIndirectFunc pfnCmdDispatchIndirect; ///< CmdDispatchIndirect function pointer. + CmdDispatchOffsetFunc pfnCmdDispatchOffset; ///< CmdDispatchOffset function pointer. + CmdDispatchMeshFunc pfnCmdDispatchMesh; ///< CmdDispatchMesh function pointer. + CmdDispatchMeshIndirectMultiFunc pfnCmdDispatchMeshIndirectMulti; ///< CmdDispatchMeshIndirectMulti function pointer. + CmdDispatchAqlFunc pfnCmdDispatchAql; ///< CmdDispatchAql function pointer. + } m_funcTable; ///< Function pointer table for Cmd* functions. + +private: + /// @internal Client data pointer.
This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// @internal Some back-compat glue for some of the HwPipePoint interfaces in this file. + static constexpr uint32 HwPipePointToStage[] = + { + PipelineStageTopOfPipe, // HwPipeTop = 0x0 + PipelineStagePostPrefetch, // HwPipePostPrefetch = 0x1 + PipelineStageVs, // HwPipePreRasterization = 0x2 + PipelineStagePs, // HwPipePostPs = 0x3 + PipelineStageLateDsTarget, // HwPipePreColorTarget = 0x4 + PipelineStageCs, // HwPipePostCs = 0x5 + PipelineStageBlt, // HwPipePostBlt = 0x6 + PipelineStageBottomOfPipe, // HwPipeBottom = 0x7 + }; +#endif +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h new file mode 100644 index 0000000000..656b412a85 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h @@ -0,0 +1,370 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdTracking.h + * @brief Defines a number of support classes used for construction and storage of struct TrackedCmdLocation + * defined in trackedCmdLocation.h + * + * - struct TrackingEventInfo: A single from uint8 to name, used for logging + * - class TrackedCmdSupportBase A set of TrackingEventInfo, maintained outside of Pal + * - class TrackedCmdLocationArray The arrays for TrackedCmdLocation's used for reporting + * correlation data through ICmdBufferReporting::CorrelationReportOnSubmit + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palVector.h" + +#include "trackedCmdLocation.h" + +namespace Pal +{ + +// forward decl +class Platform; + +namespace CmdDisassembly +{ + +// forward definition +class TrackedCmdLocationArray; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdLocationRef +* A copyable reference to a member in a TrackedCmdLocationArray, invariant to that array be +* re-allocated. 
+* +* @detail Is simply a pointer to a TrackedCmdLocationArray, and an index in to that array +* +************************************************************************************************************************ +*/ +class TrackedCmdLocationRef +{ +public: + TrackedCmdLocationRef() + : m_pSourceArray(nullptr), + m_index(0) + { + } + + TrackedCmdLocationRef( + TrackedCmdLocationArray* pSourceArray, + Util::uint32 index) + : m_pSourceArray(pSourceArray), + m_index(index) + { + } + + TrackedCmdLocationRef( + TrackedCmdLocationRef&& other) = default; + TrackedCmdLocationRef( + TrackedCmdLocationRef const& other) = default; + TrackedCmdLocationRef& operator=( + TrackedCmdLocationRef&& other) = default; + TrackedCmdLocationRef& operator=( + TrackedCmdLocationRef const& other) = default; + + bool operator==( + TrackedCmdLocationRef const& other) const + { return (this->m_pSourceArray == other.m_pSourceArray) && (this->m_index == other.m_index); } + bool operator!=( + TrackedCmdLocationRef const& other) const + { return (this->m_pSourceArray != other.m_pSourceArray) || (this->m_index != other.m_index); } + + TrackedCmdLocation* Use(); + const TrackedCmdLocation* Get() const; + + Util::uint32 GetIndex() const + { + return m_index; + } + + /// Helper functions + /// + + /// Clears the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result Clear(); + + /// @returns + /// TrackedCmdLocationMode::Invalid if (IsValid() == false) + /// Get()->m_mode otherwise + TrackedCmdLocationMode GetMode() const; + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::Before + /// + /// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it + /// Most likely, a value registered to a TrackedCmdSupportBase + /// @param [in] beforePtr The end pointer for the 
cmdList being tracked before the event referred to by eventId + /// Only 48 bits of beforePtr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsBefore( + uint8 eventId, + uint64 beforePtr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::After + /// + /// @param [in] eventId Refers to a uint8 event that has a begin and/or an end associated with it + /// Most likely, a value registered to a TrackedCmdSupportBase + /// @param [in] afterPtr The end pointer for the cmdList being tracked after the event referred to by eventId + /// Only 48 bits of afterPtr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsAfter( + uint8 eventId, + uint64 afterPtr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::Delta, with no begin or end (i.e., no data can be written to + /// the cmdList being tracked "during" the event referred to by eventId) + /// + /// @param [in] eventId Refers to a uint8 event that does not have a begin and/or an end associated with it + /// Such as Pal::CmdDisassembly::TrackedCmdLocation::PostClientEvent + /// @param [in] ptr The end pointer for the cmdList being tracked after the event referred to by eventId + /// Only 48 bits of ptr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsEmptyDelta( + uint8 eventId, + uint64 ptr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::ClientId + /// + /// @param [in] clientId A 61-bit value used by the client application to identify which cmdList is being + /// tracked + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// 
Result::Success if successful + Result SetAsClientId( + uint64 clientId); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::ClientEventId + /// + /// @param [in] clientEventId A 61-bit value used by the client application to identify + /// a client event relative to the current end position of the cmdList being tracked + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsClientEvent( + uint64 clientEventId); + + /// @brief Result TrackedCmdLocationRef::TrySetAsDelta(uint64 afterPtr) + /// Will attempt to set this TrackedCmdLocation to type TrackedCmdLocationMode::Delta + /// + /// @detail If GetMode() == TrackedCmdLocationMode::Before and afterPtr - m_correlateInternal.m_ptr is small + /// enough to be encoded in m_correlateInternal.m_deltaInDWords, the mode will be altered to + /// TrackedCmdLocationMode::Delta, with afterPtr - m_correlateInternal.m_ptr encoded in + /// m_correlateInternal.m_deltaInDWords. + /// If this attempt fails, the calling function should instead create a TrackedCmdLocationMode::After + /// TrackedCmdLocation + /// + /// @param [in] afterPtr The value a TrackedCmdLocationMode::After would have for m_correlateInternal.m_ptr + /// @return Result::Success if it was possible to set this TrackedCmdLocation to type + /// TrackedCmdLocationMode::Delta + /// Result::Unsupported if the conditions described above are not met. 
+ Result TrySetAsDelta( + uint64 afterPtr); + +private: + TrackedCmdLocationArray* m_pSourceArray; + Util::uint32 m_index; + + Result SetMode( + TrackedCmdLocationMode mode); +}; + +/// @brief struct TrackingEventInfo +/// Essentially just a name, plus a boolean to indicate whether the name is valid / has been set +struct TrackingEventInfo +{ + Util::StringView name; + bool isValid; + + TrackingEventInfo() + : isValid(false) + {} +}; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdSupportBase translates eventIds to strings for internal correlation events +* +* @detail For use in Pal::Queue when dumping to text files. Corresponds to +* TrackedCmdLocation::m_correlateInternal.m_event for the cases where TrackedCmdLocation::m_mode +* is not TrackedCmdLocationMode::ClientEvent +* +* The implementation for this lives in whichever client of Pal creates the internal correlation events. +* +************************************************************************************************************************ +*/ +class TrackedCmdSupportBase +{ +public: + virtual ~TrackedCmdSupportBase() = default; + + void SetEventIdName( + uint8 eventId, + const char* name) + { + PAL_ASSERT(static_cast(eventId) < NumUInt8Values); + m_allEventsMap[eventId].name = name; + m_allEventsMap[eventId].isValid = true; + } + + TrackingEventInfo const& GetEventInfo( + uint8 eventId) const + { + PAL_ASSERT(static_cast(eventId) < NumUInt8Values); + return m_allEventsMap[eventId]; + } + +protected: + static constexpr uint32 NumUInt8Values = UINT8_MAX + 1; + + TrackingEventInfo m_allEventsMap[NumUInt8Values]; + + TrackedCmdSupportBase() = default; +}; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdLocationArray is simply a TrackedCmdLocationVec together with a clientId +* and some 
helpers. TrackedCmdLocationArray live on Pal::GfxCmdBuffer +* +* @detail Each Pal::GfxCmdBuffer has at most CmdDisassembly::MaxNumSubCmdBuffers TrackedCmdLocationArray's +* corresponding to Pal::GfxCmdBuffer::NumCmdStreams(); +* +* The clientId used for TrackedCmdLocationArray::m_clientId, corresponds to the client Id used in +* TrackedCmdLocation::m_clientId.m_clientId +* +* For the moment, the underlying implementation used is +* Util::Vector, but could be changed to use a Chunk +* scheme, especially as sizes of cmdLists can become very large. +* The only requirement to a change, is for TrackedCmdLocationRef continues to function as an accessor +* +* Note that the functions in TrackedCmdLocationArray are not designed for thread-safety, as they are +* issued from command-list-building functions that are, in their turn, not thread safe. Adding mutex +* behavior here would potentially hide issues relating to thread-safety. +* +************************************************************************************************************************ +*/ +class TrackedCmdLocationArray +{ +public: + static constexpr uint32 DefaultCapacity = 1024; + static constexpr uint32 BadIndex = UINT32_MAX; + static constexpr uint64 InvalidClientId = UINT64_MAX; + + typedef Util::Vector TrackedCmdLocationVec; + + static uint32 GetTrackedCmdLocationArraySizeInBytes() + { + return sizeof(TrackedCmdLocationArray); + } + + static TrackedCmdLocationArray* CreateTrackedCmdLocationArray( + void* pMemory, + Pal::Platform* pPlatform); + + void Reset() + { + m_lastLocation = TrackedCmdLocationRef(this, BadIndex); + m_clientId = InvalidClientId; + m_locations.Clear(); + } + + void Destroy(); + + uint64 GetClientId() const + { + return m_clientId; + } + + Result SetClientId( + uint64 clientId); + + Util::uint32 GetTotalSize() const + { + return m_locations.size(); + } + + const TrackedCmdLocationVec& GetLocationsVec() const + { + return m_locations; + } + + TrackedCmdLocationVec& 
UseLocationsVec() + { + return m_locations; + } + + Pal::Result MakeNext( + TrackedCmdLocationRef* pResult); + + const TrackedCmdLocationRef GetLast() const + { + return m_lastLocation; + } + + bool IsLast( + TrackedCmdLocationRef const& location) const + { + return location == m_lastLocation; + } + +private: + TrackedCmdLocationVec m_locations; + Pal::Platform* m_pPlatform; + uint64 m_clientId; + TrackedCmdLocationRef m_lastLocation; + + TrackedCmdLocationArray( + Pal::Platform* pPlatform); + + ~TrackedCmdLocationArray() = default; +}; + +} // namespace CmdDisassembly +} // namespace Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h b/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h new file mode 100644 index 0000000000..efb4b60668 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h @@ -0,0 +1,70 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDestroyable.h + * @brief Defines the Platform Abstraction Library (PAL) IDestroyable interface. + *********************************************************************************************************************** + */ + +#pragma once + +namespace Pal +{ + +/** + *********************************************************************************************************************** + * @interface IDestroyable + * @brief Interface inherited by objects that must be explicitly destroyed by the client. + * + * This includes all objects except: + * + * + @ref IColorTargetView, @ref IDepthStencilView - These classes are treated as SRDs by the DX12 runtime. Therefore, + * PAL guarantees that no action needs to be taken at Destroy() - the client should just free the memory backing these + * classes. + * + @ref IDevice - These objects are created during IPlatform::EnumerateDevices() and are automatically destroyed + * along with the Platform object. + * + @ref IPrivateScreen - These objects are created during IPlatform::EnumerateDevices() based on + * which screens are attached to each device. They are automatically destroyed along with the Platform object. + *********************************************************************************************************************** + */ +class IDestroyable +{ +public: + /// Frees all resources associated with this object. 
+ /// + /// It is the client's responsibility to only call this method once there are no more existing references to this + /// object. This method does not free the system memory associated with the object (as specified in pPlacementAddr + /// during creation); the client is responsible for freeing that memory since they allocated it. + virtual void Destroy() = 0; + +protected: + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IDestroyable() { } +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h b/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h new file mode 100644 index 0000000000..f68d1d730f --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h @@ -0,0 +1,626 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDeveloperHooks.h + * @brief Common include for PAL developer callbacks. Defines common enums, typedefs, structures, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palCmdBuffer.h" + +namespace Pal +{ + +// Forward declarations. +class ICmdBuffer; +class IImage; +class IPipeline; + +namespace Developer +{ + +/// The type of the developer callback so the callback can properly perform whatever actions it needs. +/// +/// @see Callback +enum class CallbackType : uint32 +{ + AllocGpuMemory = 0, ///< This callback is to inform that GPU memory has been allocated. + FreeGpuMemory, ///< This callback is to inform that GPU memory has been freed. + PresentConcluded, ///< This callback is to inform that a present has concluded. + ImageBarrier, ///< This callback is to inform that a barrier is being executed. + CreateImage, ///< This callback is to inform that an image has been created. + BarrierBegin, ///< This callback is to inform that a barrier is about to be executed. + BarrierEnd, ///< This callback is to inform that a barrier is done being executed. + DrawDispatch, ///< This callback is to inform that a draw or dispatch command is being recorded. + BindPipeline, ///< This callback is to inform that a pipeline (client or internal) has been bound. 
+ SurfRegData, ///< This callback is to inform tools of the register state of a surface. +#if PAL_DEVELOPER_BUILD + DrawDispatchValidation, ///< This callback is to describe the state validation needed by a draw or dispatch. + BindPipelineValidation, ///< This callback is to describe the state validation needed by a pipeline bind. + OptimizedRegisters, ///< This callback is to describe the PM4 optimizer's removal of redundant register + /// sets. +#endif + BindGpuMemory, ///< This callback is to inform of a new binding to GPU memory. + SubAllocGpuMemory, ///< This callback is to inform of suballocation from base GPU memory allocation. + SubFreeGpuMemory, ///< This callback is to inform that GPU memory suballocation has been freed. +#if PAL_DEVELOPER_BUILD + RpmBlt, ///< This callback is to describe the internal RPM blt calls. +#endif + Count, ///< The number of info types. +}; + +constexpr uint32 AllCallbackTypesMask = Util::BitfieldGenMask(static_cast(CallbackType::Count)); + +constexpr uint32 DefaultDisabledCallbackTypes = (1 << static_cast(CallbackType::BindGpuMemory)) | + (1 << static_cast(CallbackType::SubAllocGpuMemory)) | + (1 << static_cast(CallbackType::SubFreeGpuMemory)); + +constexpr uint32 DefaultEnabledCallbackTypes = AllCallbackTypesMask & ~DefaultDisabledCallbackTypes; + +/// Definition for developer callback. +/// +/// @param [in] pPrivateData Private data that is installed with the callback for use by the installer. +/// @param [in] deviceIndex Unique index for the device so that the installer can properly dispatch the event. +/// @param [in] type Information about the callback so the installer can make informed decisions about +/// what actions to perform. +/// @param [in] pCbData Additional data related to the particular callback type. +typedef void (PAL_STDCALL *Callback)( + void* pPrivateData, + const uint32 deviceIndex, + CallbackType type, + void* pCbData); + +/// Enumeration describing the different ways GPU memory is allocated. 
+enum class GpuMemoryAllocationMethod : uint32 +{ + Unassigned = 0, ///< Unassigned allocation method. + Normal, ///< Virtual memory allocation (not pinned/peer). + Pinned, ///< Pinned memory allocation. + Peer, ///< Peer memory allocation. + MultiDevice, ///< MultiDevice memory allocation. + Opened, ///< Shared memory allocation. + Svm, ///< Shared virtual memory allocation. +}; + +/// Enumeration describing the different Presentation modes an application can take. +enum class PresentModeType : uint32 +{ + Unknown = 0, ///< When the present mode is not known. + Flip, ///< when the presentation surface is used directly as the front buffer. + Composite, ///< When the flipped image is drawn by a window compositor instead + /// of the application. + Blit, ///< when the presentation surface is copied to the front buffer. +}; + +/// Information about the presentation mode an application is in. +struct PresentationModeData +{ + PresentModeType presentationMode; ///< Information about present mode from above enumeration. + UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. +}; + +/// Information for allocation/deallocation of GPU memory. +struct GpuMemoryData +{ + gpusize size; ///< Size, in bytes, of the allocation. + GpuHeap heap; ///< The first requested heap of the allocation. + + /// Allocation description flags + struct Flags + { + uint32 isClient : 1; ///< This allocation is requested by the client. + uint32 isFlippable : 1; ///< This allocation is marked as flippable. + uint32 isUdmaBuffer : 1; ///< This allocation is for a UDMA buffer. + uint32 isVirtual : 1; ///< This allocation is for virtual memory. + uint32 isCmdAllocator : 1; ///< This allocation is for a CmdAllocator. + uint32 isExternal : 1; ///< This allocation is marked as external. + uint32 buddyAllocated : 1; ///< This allocation is buddy allocated. 
+ uint32 appRequested : 1; ///< This allocation is Pal internal, but application requested + uint32 reserved : 24; ///< Reserved for future use. + } flags; ///< Flags describing the allocation. + + GpuMemoryAllocationMethod allocMethod; ///< Allocation method + const IGpuMemory* pGpuMemory; ///< Handle to the Pal::IGpuMemory object of this GPU memory allocation + gpusize offset; ///< Offset, in bytes, of a suballocation within a base allocation. For + /// base allocations, offset is always zero. +}; + +#if PAL_DEVELOPER_BUILD +/// PWS acquire point for barrier logger +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 901 +enum AcquirePoint : uint8 +{ + AcquirePointPfp, + AcquirePointMe, + AcquirePointPreShader, + AcquirePointPreDepth, + AcquirePointPrePs, + AcquirePointPreColor, + AcquirePointEop, + + AcquirePointCount +}; +#else +enum class AcquirePoint : uint8 +{ + Pfp = 0, + Me, + PreShader, + PreDepth, + PrePs, + PreColor, + Eop, // Invalid, for internal optimization purpose. + + Count +}; +#endif +#endif + +/// Information pertaining to the cache flush/invalidations and stalls performed during barrier execution. +struct BarrierOperations +{ + union + { + struct + { + uint16 eopTsBottomOfPipe : 1; ///< Issue an end-of-pipe event that can be waited on. + /// When combined with waitOnTs, makes a full pipeline stall. + uint16 vsPartialFlush : 1; ///< Stall at ME, waiting for all prior VS waves to complete. + uint16 psPartialFlush : 1; ///< Stall at ME, waiting for all prior PS waves to complete. + uint16 csPartialFlush : 1; ///< Stall at ME, waiting for all prior CS waves to complete. + uint16 pfpSyncMe : 1; ///< Stall PFP until ME is at same point in command stream. + /// flushed/invalidated are specified in the caches bitfield. + uint16 syncCpDma : 1; ///< Issue dummy cpDma command to confirm all prior cpDmas have + /// completed. + uint16 eosTsPsDone : 1; ///< Issue an end-of-pixel-shader event that can be waited on. 
+ uint16 eosTsCsDone : 1; ///< Issue an end-of-compute-shader event that can be waited on. + uint16 waitOnTs : 1; ///< Wait on a timestamp event (EOP or EOS) at the ME. + /// Which event is not necessarily specified here, though any + /// that are specified here would be waited on. + uint16 reserved : 7; ///< Reserved for future use. + }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } pipelineStalls; ///< Information about pipeline stalls performed. + + union + { + struct + { + uint16 depthStencilExpand : 1; ///< Decompression of depth/stencil image. + uint16 htileHiZRangeExpand : 1; ///< Expansion of HTile's HiZ range. + uint16 depthStencilResummarize : 1; ///< Resummarization of depth stencil. + uint16 dccDecompress : 1; ///< DCC decompress BLT for color images. + uint16 fmaskDecompress : 1; ///< Fmask decompression for shader readability. + uint16 fastClearEliminate : 1; ///< Expand latest specified clear color into pixel data for the fast + /// cleared color/depth resource. + uint16 fmaskColorExpand : 1; ///< Completely decompresses the specified color resource. + uint16 initMaskRam : 1; ///< Memsets uninitialized memory to prepare it for use as + /// CMask/FMask/DCC/HTile. + uint16 updateDccStateMetadata : 1; ///< DCC state metadata was updated. + uint16 reserved : 7; ///< Reserved for future use. + }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } layoutTransitions; ///< Information about layout transitions performed. + + union + { + struct + { + uint16 invalTcp : 1; ///< Invalidate vector caches. + uint16 invalSqI$ : 1; ///< Invalidate the SQ instruction caches. + uint16 invalSqK$ : 1; ///< Invalidate the SQ constant caches (scalar caches). + uint16 flushTcc : 1; ///< Flush L2 cache. + uint16 invalTcc : 1; ///< Invalidate L2 cache. + uint16 flushCb : 1; ///< Flush CB caches. + uint16 invalCb : 1; ///< Invalidate CB caches. + uint16 flushDb : 1; ///< Flush DB caches. 
+ uint16 invalDb : 1; ///< Invalidate DB caches. + uint16 invalCbMetadata : 1; ///< Invalidate CB meta-data cache. + uint16 flushCbMetadata : 1; ///< Flush CB meta-data cache. + uint16 invalDbMetadata : 1; ///< Invalidate DB meta-data cache. + uint16 flushDbMetadata : 1; ///< Flush DB meta-data cache. + uint16 invalTccMetadata : 1; ///< Invalidate L2 meta-data cache (also called the GLM). + uint16 invalGl1 : 1; ///< Invalidate the global L1 cache. + uint16 placeholder : 1; ///< Reserved for future use. + }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } caches; ///< Information about cache operations performed for the barrier. + +#if PAL_DEVELOPER_BUILD + AcquirePoint acquirePoint; ///< PWS acquire point for the barrier logger (developer builds only). +#endif +}; + +/// Enumeration for PAL barrier reasons +enum BarrierReason : uint32 +{ + BarrierReasonInvalid = 0, ///< Invalid barrier reason + + BarrierReasonFirst = 0x80000000, ///< The first valid barrier reason value + /// The only value that can be smaller than this is the + /// invalid value. + BarrierReasonLast = 0xbfffffff, ///< The last valid barrier reason value + /// The only value that can be larger than this is the + /// unknown value. 
+ + BarrierReasonPreComputeColorClear = BarrierReasonFirst, ///< Barrier issued before a color clear + BarrierReasonPostComputeColorClear, ///< Barrier issued after a color clear + BarrierReasonPreComputeDepthStencilClear, ///< Barrier issued before a depth/stencil clear + BarrierReasonPostComputeDepthStencilClear, ///< Barrier issued after a depth/stencil clear + BarrierReasonMlaaResolveEdgeSync, ///< Barrier issued to sync mlaa edge calculations + BarrierReasonAqlWaitForParentKernel, ///< Barrier issued to wait for the parent kernel to + /// complete in an AQL submission + BarrierReasonAqlWaitForChildrenKernels, ///< Barrier issued to wait for the children kernels to + /// complete in an AQL submission + BarrierReasonP2PBlitSync, ///< Barrier issued to synchronize peer-to-peer blits + BarrierReasonTimeGraphGrid, ///< Barrier issued to wait for the time graph grid + BarrierReasonTimeGraphGpuLine, ///< Barrier issued to wait for the time graph gpu line + BarrierReasonDebugOverlayText, ///< Barrier issued to wait for the debug overlay text + BarrierReasonDebugOverlayGraph, ///< Barrier issued to wait for the debug overlay graph + BarrierReasonDevDriverOverlay, ///< Barrier issued to wait for developer driver overlay + BarrierReasonDmaImgScanlineCopySync, ///< Barrier issued to synchronize between image scanline + /// copies on the dma hardware + BarrierReasonPostSqttTrace, ///< Barrier issued to wait for work from an sqtt trace + BarrierReasonPrePerfDataCopy, ///< Barrier issued to wait for perf data to become + /// available for copy + BarrierReasonFlushL2CachedData, ///< Barrier issued to flush L2 cached data to main memory + BarrierReasonResolveImage, ///< Barrier issued before and after resolve image shader + BarrierReasonPerPixelCopy, ///< Barrier issued between CS copy and per-pixel copy steps + BarrierReasonGenerateMipmaps, ///< Barrier issued between generating mip levels + + /// Newly defined barrier reasons should be before this one. 
+ BarrierReasonInternalLastDefined, ///< Only used for asserts. + BarrierReasonUnknown = 0xFFFFFFFF, ///< Unknown barrier reason + + /// Backwards compatibility reasons + BarrierReasonPreSyncClear = BarrierReasonPreComputeColorClear, + BarrierReasonPostSyncClear = BarrierReasonPostComputeColorClear +}; + +/// Style of barrier +enum class BarrierType : uint32 +{ + Full = 0, ///< A traditional blocking barrier. + Release, ///< A pipelined barrier that flushes caches and starts transitions. + Acquire, ///< A barrier that waits on previous 'Release' barriers. + + Count +}; + +/// Information for barrier executions. +struct BarrierData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the barrier. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 902 + ImgBarrier transition; ///< The particular image barrier with layout transition blt that is currently + /// executing, only used during a CallbackType::ImageBarrier. +#else + BarrierTransition transition; ///< The particular transition with layout transition blt that is currently + /// executing, only used during a CallbackType::ImageBarrier. +#endif + bool hasTransition; ///< Whether or not the transition structure is populated. + BarrierOperations operations; ///< Detailed cache and pipeline operations performed during this barrier execution + uint32 reason; ///< Reason that the barrier was invoked. Only filled at BarrierBegin. + BarrierType type; ///< What style of barrier this is. Only filled at BarrierBegin. +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Enumeration describing the different types of tile mode dimensions +enum class Gfx6ImageTileModeDimension : uint32 +{ + Linear = 0, ///< Linear tile mode. + Dim1d, ///< 1D tile mode. + Dim2d, ///< 2D tile mode. + Dim3d, ///< 3D tile mode. +}; + +/// Tile mode information +struct Gfx6ImageTileMode +{ + Gfx6ImageTileModeDimension dimension; ///< Dimensionality of tile mode. + + union + { + struct + { + uint32 prt : 1; ///< Image is a PRT. 
+ uint32 thin : 1; ///< Thin tiled. + uint32 thick : 1; ///< Thick tiled. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } properties; ///< Bitfield of properties +}; + +/// Enumeration describing the different tile types +enum class Gfx6ImageTileType : uint32 +{ + Displayable = 0, ///< Displayable tiling. + NonDisplayable, ///< Non-displayable tiling. + DepthSampleOrder, ///< Same as non-displayable plus depth-sample-order. + Rotated, ///< Rotated displayable tiling. + Thick, ///< Thick micro-tiling. +}; +#endif + +/// Meta-data-related properties +struct ImageMetaDataInfo +{ + union + { + struct + { + uint32 color : 1; ///< Flag indicates this is a color buffer. + uint32 depth : 1; ///< Flag indicates this is a depth/stencil buffer. + uint32 stencil : 1; ///< Flag indicates this is a stencil buffer. + uint32 texture : 1; ///< Flag indicates this is a texture. + uint32 cube : 1; ///< Flag indicates this is a cubemap. + uint32 volume : 1; ///< Flag indicates this is a volume texture. + uint32 fmask : 1; ///< Flag indicates this is an fmask. + uint32 compressZ : 1; ///< Flag indicates z buffer is compressed. + uint32 overlay : 1; ///< Flag indicates this is an overlay surface. + uint32 noStencil : 1; ///< Flag indicates this depth has no separate stencil. + uint32 display : 1; ///< Flag indicates this should match display controller req. + uint32 opt4Space : 1; ///< Flag indicates this surface should be optimized for space + /// i.e. save some memory but may lose performance. + uint32 prt : 1; ///< Flag for partially resident texture. + uint32 tcCompatible : 1; ///< Image's metadata is TC-compatible. This reduces the maximum + /// compression levels, but allows the shader to read the data without + /// an expensive decompress operation. + uint32 dccCompatible : 1; ///< GFX 8: whether to make MSAA surface support dcc fast clear. 
+ uint32 dccPipeWorkaround : 1; ///< GFX 8: whether to workaround the HW limit that + /// dcc can't be enabled if pipe config of tile mode + /// is different from that of ASIC. + uint32 disableLinearOpt : 1; ///< Disable tile mode optimization to linear. + uint32 reserved : 15; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } properties; ///< Bitfield of properties +}; + +/// Information for allocation of a PAL Image - AddrLib surface info. +struct ImageDataAddrMgrSurfInfo +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + union + { + struct + { + Gfx6ImageTileMode mode; ///< Tile mode. + Gfx6ImageTileType type; ///< Micro tiling type. + } gfx6; + struct + { + uint32 swizzle; ///< Swizzle mode. + } gfx9; + } tiling; +#endif + + ImageMetaDataInfo flags; ///< Metadata info. + uint32 swizzle; ///< HW-specific swizzle mode. + uint64 size; ///< Surface size, in bytes. + uint32 bpp; ///< Bits per pixel. + uint32 width; ///< Width. + uint32 height; ///< Height. + uint32 depth; ///< Depth. +}; + +/// Type of surface for which the register data is being provided +enum class SurfRegDataType : uint32 +{ + RenderTargetView, ///< Render Target View information. +}; + +/// Information for surface addresses for a SurfRegData callback +struct SurfRegDataInfo +{ + SurfRegDataType type; ///< Type of surface to which the register data corresponds. + uint32 regData; ///< Hardware-specific register data for the specific surface type. +}; + +/// Type of draw or dispatch operation for a DrawDispatch callback +enum class DrawDispatchType : uint32 +{ + CmdDraw = 0, ///< Auto-indexed draw. + CmdDrawOpaque, ///< Auto draw. + CmdDrawIndexed, ///< Indexed draw. + CmdDrawIndirectMulti, ///< (Multi) indirect draw. + CmdDrawIndexedIndirectMulti, ///< (Multi) indirect indexed draw. + CmdDispatchMesh, ///< Task/Mesh shader dispatch. + CmdDispatchMeshIndirectMulti, ///< Indirect Task/Mesh shader dispatch. 
+ CmdGenExecuteIndirectDraw, ///< ExecuteIndirect draw. + CmdGenExecuteIndirectDrawIndexed, ///< ExecuteIndirect indexed draw. + CmdGenExecuteIndirectDispatchMesh, ///< ExecuteIndirect Task/Mesh shader dispatch. + CmdDispatch, ///< Direct compute dispatch. + CmdDispatchAce, ///< Direct Compute dispatch through implicit ganged-submit ACE stream. + CmdDispatchIndirect, ///< Indirect compute dispatch. + CmdDispatchOffset, ///< Direct compute dispatch (offsetted start). + CmdGenExecuteIndirectDispatch, ///< ExecuteIndirect dispatch. + CmdDispatchAql, ///< AQL compute dispatch + + Count, + FirstDispatch = CmdDispatch ///< All callbacks with an enum value greater than or equal to this are dispatches +}; + +/// Draw-specific information for DrawDispatch callbacks +struct DrawDispatchDrawArgs +{ + /// Contains information about user data register indices for certain draw parameter state. + /// Some of these values may not be available for all draws on all clients, and in such + /// cases the value will be UINT_MAX. + struct + { + uint32 firstVertex; ///< Vertex offset (first vertex) user data register index + uint32 instanceOffset; ///< Instance offset (start instance) user data register index + uint32 drawIndex; ///< Draw ID SPI user data register index + } userDataRegs; +}; + +/// Dispatch-specific information for DrawDispatch callbacks +struct DrawDispatchDispatchArgs +{ + DispatchDims groupStart; ///< Thread/workgroup start offsets in X/Y/Z dimensions. Only valid for CmdDispatchOffset. + DispatchDims groupDims; ///< Thread/workgroup counts in X/Y/Z dimensions. Only valid for CmdDispatch[Offset]. + DispatchDims logicalSize; ///< Thread/workgroup counts as seen by the shader. Only valid for CmdDispatchOffset. + /// Optional flags to help the client driver understand the dispatch. + /// For example, if the dispatch originated in PAL rather than the client driver.
+ DispatchInfoFlags infoFlags; +}; + +/// Information for DrawDispatch callbacks +struct DrawDispatchData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command + DrawDispatchType cmdType; ///< Draw/dispatch command type. This influences which sub-structure below is valid. + + union + { + /// Draw-specific parameters. Valid when cmdType is CmdDraw*. + DrawDispatchDrawArgs draw; + + /// Dispatch-specific parameters. Valid when cmdType is CmdDispatch* + DrawDispatchDispatchArgs dispatch; + }; + + /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), + /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert + /// the marker. + RgpMarkerSubQueueFlags subQueueFlags; +}; + +/// Information for BindPipeline callbacks +struct BindPipelineData +{ + const IPipeline* pPipeline; ///< The currently-bound pipeline + const PipelineInfo* pPipelineInfo; ///< General information about the bound pipeline + ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command + uint64 apiPsoHash; ///< The hash to correlate APIs and corresponding PSOs. + PipelineBindPoint bindPoint; ///< The bind point of the pipeline within a queue. + + /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), + /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert + /// the marker. + RgpMarkerSubQueueFlags subQueueFlags; +}; + +#if PAL_DEVELOPER_BUILD +/// Information for DrawDispatchValidation callbacks +struct DrawDispatchValidationData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. + uint32 userDataCmdSize; ///< Size of PM4 commands used to validate the current user-data entries (bytes). 
+ uint32 miscCmdSize; ///< Size of PM4 commands for all other draw- or dispatch-time validation (bytes). +}; + +/// Information for BindPipelineValidation callbacks +struct BindPipelineValidationData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. + uint32 pipelineCmdSize; ///< Size of PM4 commands used to validate the current pipeline state (bytes). +}; + +/// Information for OptimizedRegisters callbacks +struct OptimizedRegistersData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering PM4 stream. + /// Array containing the number of times the PM4 optimizer saw a SET packet which modified each register + const uint32* pShRegSeenSets; + /// Array containing the number of times the PM4 optimizer kept a SET packet which modified each register + const uint32* pShRegKeptSets; + uint32 shRegCount; ///< Number of SH registers + uint16 shRegBase; ///< Base address of SH registers + /// Array containing the number of times the PM4 optimizer saw a SET or RMW packet which modified each register + const uint32* pCtxRegSeenSets; + /// Array containing the number of times the PM4 optimizer kept a SET or RMW packet which modified each register + const uint32* pCtxRegKeptSets; + uint32 ctxRegCount; ///< Number of context registers + uint16 ctxRegBase; ///< Base address of context registers +}; + +/// Internal RPM blt type +enum class RpmBltType : uint32 +{ + CpDmaCopy = 0, + CpDmaUpdate, + Draw, + Dispatch, + + Count +}; + +/// Describes the RPM blt call +struct RpmBltData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the blt. + RpmBltType bltType; ///< Type of RPM blt, @ref RpmBltType. +}; +#endif + +/// Describes the binding of a GPU Memory object to a resource +struct BindGpuMemoryData +{ + const void* pObj; ///< Opaque pointer to the resource having memory bound to it. + gpusize requiredGpuMemSize; ///< GPU memory size required by pObj.
+ const IGpuMemory* pGpuMemory; ///< IGpuMemory object being bound to the resource. + gpusize offset; ///< Offset within pGpuMemory where the resource is being bound. + bool isSystemMemory; ///< If true then system memory is being bound to the object. In this case, + /// pGpuMemory and offset should be set to zero. +}; + +/// Describes a user marker operation +enum class UserMarkerOpType : uint8 +{ + Invalid = 0, ///< Invalid user marker operation + Push, ///< Push user marker operation + Pop, ///< Pop user marker operation + Set ///< Set user marker operation +}; + +/// Describes a user marker operation, used in UserMarkerHistoryTraceSource +struct UserMarkerOpInfo +{ + union + { + struct + { + uint32 opType : 2; ///< UserMarkerOpType + uint32 strIndex : 30; ///< Index of the user marker in the corresponding string table + }; + uint32 u32All; + }; +}; + +} // Developer +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h b/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h new file mode 100644 index 0000000000..9d22946b6d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h @@ -0,0 +1,5896 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDevice.h + * @brief Defines the Platform Abstraction Library (PAL) IDevice interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palCmdAllocator.h" +#include "palDestroyable.h" +#include "palFence.h" +#include "palFile.h" +#include "palGpuMemory.h" +#include "palImage.h" +#include "palInlineFuncs.h" +#include "palLib.h" +#include "palPerfExperiment.h" +#include "palPipeline.h" +#include "palQueue.h" +#include + +#if PAL_KMT_BUILD +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Util +{ + class Event; +} + +namespace Pal +{ + +// Forward declarations. 
+class IBorderColorPalette; +class ICmdAllocator; +class ICmdBuffer; +class IColorBlendState; +class IColorTargetView; +class IDepthStencilState; +class IDepthStencilView; +class IDevice; +class IFence; +class IGpuEvent; +class IGpuMemory; +class IImage; +class IIndirectCmdGenerator; +class IMsaaState; +class IPerfExperiment; +class IPipeline; +class IPrivateScreen; +class IQueryPool; +class IQueue; +class IQueueSemaphore; +class IShaderLibrary; +class ISwapChain; +struct BorderColorPaletteCreateInfo; +struct CmdAllocatorCreateInfo; +struct CmdBufferCreateInfo; +struct ColorBlendStateCreateInfo; +struct ColorTargetViewCreateInfo; +struct ComputePipelineCreateInfo; +struct DepthStencilStateCreateInfo; +struct DepthStencilViewCreateInfo; +struct ExternalImageOpenInfo; +struct ExternalGpuMemoryOpenInfo; +struct ExternalQueueSemaphoreOpenInfo; +struct ExternalResourceOpenInfo; +struct GpuEventCreateInfo; +struct GpuMemoryCreateInfo; +struct GpuMemoryOpenInfo; +struct GpuMemoryRef; +struct GraphicsPipelineCreateInfo; +struct ImageCreateInfo; +struct IndirectCmdGeneratorCreateInfo; +struct MsaaStateCreateInfo; +struct MsaaQuadSamplePattern; +struct PeerGpuMemoryOpenInfo; +struct PeerImageOpenInfo; +struct PerfExperimentCreateInfo; +struct PinnedGpuMemoryCreateInfo; +struct PresentableImageCreateInfo; +struct PrivateScreenCreateInfo; +struct PrivateScreenNotifyInfo; +struct QueryPoolCreateInfo; +struct QueueCreateInfo; +struct QueueSemaphoreCreateInfo; +struct QueueSemaphoreOpenInfo; +struct ShaderLibraryCreateInfo; +struct SwapChainCreateInfo; +struct SwapChainProperties; +struct SvmGpuMemoryCreateInfo; +struct GraphicPipelineViewInstancingInfo; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 881 +enum class WsiPlatform : uint32; +#else +enum WsiPlatform : uint32; +#endif +enum class PipelineBindPoint : uint32; +enum class VaRange : uint32; +enum class SwapChainMode : uint32; +namespace CmdDisassembly +{ + class ICmdBufferReporting; +} + +/// Maximum string length for GPU
names. @see DeviceProperties. +constexpr uint32 MaxDeviceName = 256; + +/// Maximum number of indirect user-data tables managed by PAL's command buffer objects. @see DeviceFinalizeInfo. +constexpr uint32 MaxIndirectUserDataTables = 1; + +/// Maximum number of supported entries in the MSAA sample pattern palette. See IDevice::SetSamplePatternPalette(). +constexpr uint32 MaxSamplePatternPaletteEntries = 16; + +/// Maximum number of supported units in the gpu. These can be much larger than the actual values, but useful for arrays. +constexpr uint32 MaxShaderEngines = 32; +/// Maximum number of supported subunits each Shader Engine splits into (SH or SA, depending on generation) +constexpr uint32 MaxShaderArraysPerSe = 2; + +/// Size of the Active Pixel Packer Mask in DWORDs +constexpr uint32 ActivePixelPackerMaskDwords = 4; + +/// Maximum number of pixel packers per SE expected by PAL +constexpr uint32 MaxPixelPackerPerSe = 4; + +/// Defines host flags for Semaphore/Fence Array wait +enum HostWaitFlags : uint32 +{ + HostWaitAny = 0x1, ///< If this bit is set, return after any single semaphore/fence in the array has + /// completed. If not set, wait for completion of all semaphores/fences in the + /// array before returning. +}; + +/// Specifies what type of GPU a particular IDevice is (i.e., discrete vs. integrated). +enum class GpuType : uint32 +{ + Unknown = 0x0, ///< The GPU type can't be determined and is unknown. + Integrated = 0x1, ///< Integrated GPU (i.e., APU). + Discrete = 0x2, ///< Discrete GPU. + Virtual = 0x3, ///< Virtualized GPU. + Count +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Specifies which operating-system-support IP level (OSSIP) this device has. +enum class OssIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an OSSIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro.
Clients have their choice of either + // undefining None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have an OSSIP block, or its level cannot be determined +#endif + + OssIp2_4 = 0x3, + OssIp4 = 0x4, +}; + +/// Specifies which VCE IP level this device has. +enum class VceIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have a VCEIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefining None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have a VCEIP block, or its level cannot be determined +#endif + + VceIp1 = 0x1, + VceIp2 = 0x2, + VceIp3 = 0x3, + VceIp3_1 = 0x4, + VceIp3_4 = 0x5, + VceIp4 = 0x6, +}; + +/// Specifies which UVD IP level this device has. +enum class UvdIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have a UVDIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefining None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have a UVDIP block, or its level cannot be determined +#endif + + UvdIp3_2 = 0x1, + UvdIp4 = 0x2, + UvdIp4_2 = 0x2, ///< NOTE(review): same value as UvdIp4, so the two levels compare equal; confirm the aliasing is intentional. + UvdIp5 = 0x3, + UvdIp6 = 0x4, + UvdIp6_2 = 0x5, + UvdIp6_3 = 0x6, + UvdIp7 = 0x7, + UvdIp7_2 = 0x8, +}; +#endif + +/// Specifies which VCN IP level this device has. +enum class VcnIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have a VCNIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefining None before including this header or using _None when dealing with PAL.
+#ifndef None + None = _None, ///< The device does not have a VCNIP block, or its level cannot be determined +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + VcnIp1 = 0x1, +#endif +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Specifies which SPU IP level this device has. +enum class SpuIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an SPUIP block, or its level cannot be determined +#ifndef None + None = _None, ///< The device does not have an SPUIP block, or its level cannot be determined +#endif + SpuIp = 0x1, +}; +#endif + +/// Specifies which PSP IP level this device has. +enum class PspIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have a PSPIP block, or its level cannot be determined +#ifndef None + None = _None, ///< The device does not have a PSPIP block, or its level cannot be determined +#endif + PspIp10 = 0x1, +}; + +/// Specifies the video decode type +enum class VideoDecodeType : uint32 +{ + H264 = 0x0, ///< H264 VLD + Vc1 = 0x1, ///< VC1 VLD + Mpeg2Idct = 0x2, ///< Partial MPEG2 decode (IT+MP) + Mpeg2Vld = 0x3, ///< Full MPEG2 decode (RE+IT+MP+DB) + Mpeg4 = 0x4, ///< MPEG4 + Wmv9 = 0x5, ///< WMV9 IDCT + Mjpeg = 0x6, ///< Motion JPEG + Hevc = 0x7, ///< HEVC + Vp9 = 0x8, ///< VP9 + Hevc10Bit = 0x9, ///< HEVC 10bit + Vp910Bit = 0xa, ///< VP9 10bit + Av1 = 0xb, ///< AV1 8/10bit + Av112Bit = 0xc, ///< AV1 12bit + Count, +}; + +/// Video CODEC to use for encoding +enum class VideoEncodeCodec : uint32 +{ + H264 = 0x0, ///< H.264 + H265 = 0x1, ///< H.265 + Av1 = 0x2, ///< AV1 + Count +}; + +/// Specifies a virtual address range memory should be allocated in. +enum class VaRange : uint32 +{ + Default, ///< Default VA range. Choose this for most allocations. + DescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for descriptor tables.
Knowing + /// an allocation is allocated in this range, only one user data entry is required to + /// specify a descriptor table. @see ResourceMappingNodeType. + ShadowDescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for "shadow" descriptor tables. + /// A shadow descriptor table is an additional table with the same layout as its parent + /// descriptor table that can hold infrequently needed data like fmask SRDs or UAV counter + /// data. This scheme allows the client and SC to work out a known location for + /// infrequently needed data without wasting a user data entry or wasting half of every + /// descriptor cache line. + /// Only supported if DeviceProperties::gpuMemoryProperties::flags::shadowDescVaSupport is + /// set. + Svm, ///< Place the allocation in a VA range reserved by PAL for shared virtual memory (SVM). + /// This is a GPU VA range that is reserved also on the CPU-side. + /// The size of reserved VA is set by PAL client by calling CreatePlatform. + CaptureReplay, ///< Place the allocation in a VA range reserved for capture and playback. + Count, +}; + +/// Enumerates TMZ (trusted memory zone) support levels. +enum class TmzSupportLevel : uint32 +{ + None = 0, ///< TMZ not supported. NOTE(review): unlike the *IpLevel enums, this None is not guarded against X.h's None macro. + PerQueue = 1, ///< Enable TMZ mode per queue. + PerSubmission = 2, ///< Enable TMZ mode per submission. + PerCommandOp = 3 ///< Enable TMZ mode per command operation. +}; + +/// How to interpret a single bit in a swizzle equation. +union SwizzleEquationBit +{ + struct + { + uint8 valid : 1; ///< Indicates whether this channel setting is valid. + uint8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel. + uint8 index : 5; ///< The channel index. + }; + uint8 u8All; ///< The above values packed in an 8-bit uint. +}; + +constexpr uint32 SwizzleEquationMaxBits = 20; ///< Swizzle equations will consider no more than this many bits.
+constexpr uint8 InvalidSwizzleEqIndex = 0xFF; ///< Indicates an invalid swizzle equation index in the equation table. +constexpr uint8 LinearSwizzleEqIndex = 0xFE; ///< An invalid eq. index indicating a row-major, linear memory layout. + +/// Texture fetch meta-data capabilities bitfield definition, used with tcCompatibleMetaData setting +enum TexFetchMetaDataCaps : uint32 +{ + TexFetchMetaDataCapsNoAaColor = 0x00000001, ///< Single-sample color surface. + TexFetchMetaDataCapsMsaaColor = 0x00000002, ///< MSAA color surface. + TexFetchMetaDataCapsFmask = 0x00000004, ///< FMask data. + TexFetchMetaDataCapsNoAaDepth = 0x00000008, ///< Single-sample depth surface. + TexFetchMetaDataCapsMsaaDepth = 0x00000010, ///< MSAA depth surface. + TexFetchMetaDataCapsAllowStencil = 0x00000020, ///< Allow stencil. + TexFetchMetaDataCapsAllowZ16 = 0x00000040, ///< Allow Z-16 surfaces. +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 +/// Catalyst AI setting enums +enum CatalystAiSettings : uint32 +{ + CatalystAiDisable = 0, + CatalystAiEnable = 1, + CatalystAiMaximum = 2, +}; +#endif + +/// Texture Filter optimization enum values +enum TextureFilterOptimizationSettings : uint32 +{ + TextureFilterOptimizationsDisabled = 0, + TextureFilterOptimizationsEnabled = 1, + TextureFilterOptimizationsAggressive = 2, +}; + +/// Distribution Tess Mode enum values +enum DistributionTessMode : uint32 +{ + DistributionTessOff = 0, ///< No distribution across VGTs (legacy mode). + DistributionTessDefault = 1, ///< Optimal settings are chosen depending on the gfxip. + DistributionTessPatch = 2, ///< Individual patches are distributed to different VGTs. + DistributionTessDonut = 3, ///< Patches are split into donuts and distributed to different VGTs. + DistributionTessTrapezoid = 4, ///< Patches from donuts are split into trapezoids; falls back to donut mode if unsupported by HW. + DistributionTessTrapezoidOnly = 5, ///< Trapezoid distribution; distribution is turned off if unsupported by HW. +}; + +/// Defines the context roll optimization flags +enum ContextRollOptimizationFlags : uint32 +{ + OptFlagNone = 0x00000000, ///< Optimization disabled. + PadParamCacheSpace = 0x00000001, ///< Pad parameter cache space to reduce context rolls. +}; + +/// Defines the initial value to use for DCC metadata +enum class DccInitialClearKind { + Uncompressed = 0x0, ///< Uncompressed (default). + OpaqueBlack = 0x1, ///< Opaque black. + OpaqueWhite = 0x2, ///< Opaque white. + ForceBit = 0x10, ///< Flag bit that turns a clear kind into its "forced" variant. + ForceOpaqueBlack = (ForceBit | OpaqueBlack), ///< Forced opaque black (0x11). + ForceOpaqueWhite = (ForceBit | OpaqueWhite), ///< Forced opaque white (0x12). +}; + +/// Enum defining the different scopes (i.e.
registry locations) where settings values are stored +enum InternalSettingScope : uint32 +{ + PrivateDriverKey = 0x0, + PublicPalKey = 0x1, + PrivatePalKey = 0x2, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + PublicCatalystKey = 0x3, + PrivatePalGfx9Key = 0x4, + PrivatePalGfx12Key = 0x5, +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + PrivatePalGfx6Key = 0x3, + PrivatePalGfx9Key = 0x4, + PublicCatalystKey = 0x5, + PrivatePalGfx12Key = 0x7, +#endif + PublicPalFile = 0x9, + RootDriverKey = 0xa, +}; + +/// Enum defining override states for feature settings. +enum class FeatureOverride : uint32 +{ + Default = 0, ///< Default setting state. + Enabled = 1, ///< (Force) enabled state. Default may change itself to this state. + Disabled = 2 ///< (Force) disabled state. Default may change itself to this state. +}; + +/// Enum bitmask defining externally-controlled (e.g. by Radeon Settings/KMD) driver feature settings. +enum RsFeatureType : uint32 +{ + RsFeatureTypeTurboSync = (1u << 0), ///< TurboSync; see RsFeatureInfo::turboSync. + RsFeatureTypeChill = (1u << 1), ///< Chill; see RsFeatureInfo::chill. + RsFeatureTypeDelag = (1u << 2), ///< Delag; see RsFeatureInfo::delag. + RsFeatureTypeBoost = (1u << 4), ///< Boost; see RsFeatureInfo::boost. Bit 3 is not assigned in this enum. + RsFeatureTypeProVsr = (1u << 5), ///< ProVsr; see RsFeatureInfo::proVsr. +}; + +/// Output structure containing information about the requested RsFeatureType (singular). +union RsFeatureInfo +{ + /// Global TurboSync settings. + struct + { + bool enabled; ///< Specifies whether TurboSync is enabled globally. + } turboSync; + + /// Global Chill settings. + struct + { + bool enabled; ///< Specifies whether Chill is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Chill. + uint32 minFps; ///< Specifies the global Chill minimum FPS limit. + uint32 maxFps; ///< Specifies the global Chill maximum FPS limit. + } chill; + + /// Global Delag settings. + struct + { + bool enabled; ///< Specifies whether Delag is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Delag.
+ uint32 hotkeyInd;///< If nonzero, specifies the virtual key code assigned to Delag's indicator. + uint32 limitFps; ///< Specifies the global Delag FPS limit. + uint32 level; ///< Specifies the global Delag level. + } delag; + + /// Global Boost settings. + struct + { + bool enabled; ///< Specifies whether Boost is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Boost. + uint32 hotkeyInd; ///< If nonzero, specifies the virtual key code assigned to Boost's indicator. + uint32 minRes; ///< Specifies the global Boost minimum resolution. + bool adaptiveVrsEnabled; ///< Specifies whether BoostAdaptiveVrs is enabled globally. + } boost; + + /// Global ProVsr settings. + struct + { + bool enabled; ///< Specifies whether ProVsr is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to ProVsr. + } proVsr; + +}; + +/// High-dynamic range (HDR) surface display modes. Used to indicate the HDR display standard for a particular swap +/// chain texture format and screen colorspace/transfer function combination. +enum class HdrDisplayMode : uint32 +{ + Sdr = 0, ///< Standard dynamic range; non-HDR compatible (default). + Hdr10 = 1, ///< HDR10 PQ. Requires 10:10:10:2 swap chain. + ScRgb = 2, ///< scRGB HDR (Microsoft and FreeSync2 linear mode). 1.0 = 80 nits, 125.0 = 10000 nits. + /// Requires FP16 swapchain. + FreeSync2 = 3, ///< FreeSync2 HDR10 Gamma 2.2. Requires 10:10:10:2 swap chain. +}; + +static constexpr uint32 MaxMiscStrLen = 61; ///< Array size of PalPublicSettings::renderedByString and ::miscellaneousDebugString. + +/// Whether to use graphics or compute for performing fast clears on depth stencil views.
+enum class FastDepthStencilClearMode : uint8 +{ + Default, ///< Compute or graphics will be chosen at the driver's discretion + Graphics, ///< Graphics will always be used + Compute ///< Compute will always be used +}; + +enum DeferredBatchBinMode : uint32 +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 875 + DeferredBatchBinCustom = 0, + DeferredBatchBinAccurate = 1 +#else + DeferredBatchBinDisabled = 0, + DeferredBatchBinCustom = 1, + DeferredBatchBinAccurate = 2 +#endif +}; + +/// PWS enable mode: e.g. disabled, fully enabled or partially enabled. +enum class PwsMode : uint32 +{ + Disabled = 0, ///< PWS feature is disabled + Enabled = 1, ///< PWS feature is fully enabled if HW supports. + NoLateAcquirePoint = 2 ///< PWS feature is enabled with PWS counter only if HW supports, no late acquire points. +}; + +#if defined(__unix__) +/// Defines the supported VM always valid modes. +enum VmAlwaysValidEnable : uint32 +{ + VmAlwaysValidDefaultEnable = 0, ///< Only enable the optimization when kernel driver can support + VmAlwaysValidForceDisable = 1, ///< Force the optimization always disabled + VmAlwaysValidForceEnable = 2 ///< Force the optimization always enabled +}; +#endif + +/// Alignment mode for accessing graphics/compute buffers. +/// This doesn't change the alignment of the whole buffer, just the alignment at which accesses must be made. +enum class BufferAlignmentMode : uint8 +{ + Default, ///< KMD (and therefore PAL) picks alignment requirement. Client should be prepared for anything. + Dword, ///< Hardware will automatically align requests to the smaller of: element-size or DWORD. + Unaligned ///< Any request alignment is allowed. +}; + +enum TemporalHintsMrtBehavior : uint8 +{ + TemporalHintsDynamicRt = 0x0, ///< Enable Dynamic RT Temporal hints. PAL chooses NT vs RT based on heuristics. + TemporalHintsStaticRt = 0x1, ///< Regular temporal for both near and far read/write caches. 
+ TemporalHintsStaticNt = 0x2, ///< Non-temporal (re-use not expected) for both near and far read/write caches. +}; + +/// Client-controllable behavior for Gfx12-specific software workaround to HiSZ hardware bug. +/// Allows the client to override PAL defaults for performance or profiled reasons. +enum class HiSZWorkaroundBehavior : uint8 +{ + Default = 0x0, ///< Let PAL decide what the default is. + ForceDisableAllWar, ///< Force disable all workarounds. + /// Note: This should rarely be chosen and carries an extremely high risk of issue. + /// Should only be used when application has been profiled to guarantee no + /// risk of issue. + ForceHiSZDisableBasedWar, ///< Force the disable HiZ/S based workaround behavior. + ForceHiSZEventBasedWar, ///< Force the event-after-draw workaround behavior. + /// Note: This carries a risk that the hang may still be seen. + ForceHiSZDisableBaseWarWithReZ ///< Force the disable HiZ/S based workaround behavior, but with an added + /// optimization to force ZOrder mode to EarlyZThenReZ to reclaim some performance. + /// This will apply to all graphics pipelines that trigger the workaround + /// condition, except those pipelines that have set + /// GraphicsPipelineCreateInfo::noForceReZ. This ZOrder change will occur only when + /// HiZ/S is forcibly disabled by the workaround; otherwise the ZOrder remains what + /// was chosen by the compiler. +}; + +/// Pal settings that are client visible and editable. +struct PalPublicSettings +{ + /// Maximum border color palette size supported by any queue. + uint32 borderColorPaletteSizeLimit; + + /// Whether to use graphics or compute for performing fast clears on depth stencil views. + FastDepthStencilClearMode fastDepthStencilClearMode; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Forces all serialized loads (LoadPipeline or LoadCompoundState) to fail. 
+ bool forceLoadObjectFailure; +#endif + + /// Controls the distribution mode for tessellation, which affects how patches are processed by different VGT + /// units. 0: None - No distribution across VGTs (legacy mode). 1: Default - Optimal settings are chosen depending + /// on the gfxip. 2: Patch - Individual patches are distributed to different VGTs. 3: Donut - Patches are split + /// into donuts and distributed to different VGTs. 4: Trapezoid - Patches from donuts are split into trapezoids and + /// distributed to different VGTs. Falls back to donut mode if HW does not support this mode. 5: Trapezoid only - + /// Distribution turned off if HW does not support this mode. + uint32 distributionTessMode; + + /// Flags that control PAL optimizations to reduce context rolls. 0: Optimization disabled. 1: Pad parameter cache + /// space. Sets VS export count and PS interpolant number to per-command buffer maximum value. Reduces context rolls + /// at the expense of parameter cache space. + uint32 contextRollOptimizationFlags; + + /// The number of unbound descriptor debug srds to allocate. To detect reads of unbound descriptor within arrays, + /// multiple debug srds can be allocated. + uint32 unboundDescriptorDebugSrdCount; + + /// Disables compilation of internal PAL shaders. It can be enabled only if a PAL client won't use any of PAL blit + /// functionalities on gfx/compute engines. + bool disableResourceProcessingManager; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Controls app detect and image quality altering optimizations exposed by CCC. + uint32 catalystAI; +#endif + + /// Controls texture filtering optimizations exposed by CCC. + uint32 textureOptLevel; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Disables SC initialization. It can be enabled only if a PAL client won't use SC for shader compilation and + /// provide direct ISA binaries(usually AQL path). 
+ bool disableScManager; +#endif + + /// Information about the client performing the rendering. For example: Rendered By PAL (0.0.1) + char renderedByString[MaxMiscStrLen]; + + /// Debug information that the client or tester might want reported. + char miscellaneousDebugString[MaxMiscStrLen]; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Allows SC to make optimizations at the expense of IEEE compliance. + bool allowNonIeeeOperations; + + /// Controls whether shaders should execute one atomic instruction per wave for UAV append/consume operations. + /// If false, one atomic will be executed per thread. + bool appendBufPerWaveAtomic; +#endif + + /// Bitmask of cases where texture compatible meta data will be used Single-sample color surface: 0x00000001 MSAA + /// color surface: 0x00000002 FMask data: 0x00000004 Single-sample depth surface: 0x00000008 MSAA depth surface: + /// 0x00000010 Allow stencil: 0x00000020 Allow Z-16 surfs 0x00000040 + uint32 tcCompatibleMetaData; + + /// Specifies the threshold below which CmdCopyMemory() is executed via a CpDma BLT, in bytes. CPDMA copies have + /// lower overhead than CS/Gfx copies, but less throughput for large copies. + uint32 cpDmaCmdCopyMemoryMaxBytes; + + /// Forces high performance state for allocated queues. Note: currently supported in Windows only. + bool forceHighClocks; + + /// When submitting multiple command buffers in a single grQueueSubmit call, the ICD will patch the command streams + /// so that the command buffers are chained together instead of submitting through KMD multiple times. This setting + /// limits the number of command buffers that will be chained together; reduce to prevent problems due to long + /// running submits. + uint32 cmdBufBatchedSubmitChainLimit; + + /// Flags that control PAL's command allocator residency optimizations. If a command allocation isn't optimized PAL + /// will wait for it to become resident at creation. 
0x1 - Wait for command data to become resident at Submit-time. + /// 0x2 - Wait for embedded data to become resident at Submit-time. 0x4 - Wait for marker data to become resident at + /// Submit-time. + uint32 cmdAllocResidency; + + /// Overrides max queued frames allowed + uint32 maxQueuedFrames; + + /// Maximum number of presentable images per adapter (including LDA chain) which is recommended. If app exceeds the + /// presentable image number threshold, a warning may be reported. + uint32 presentableImageNumberThreshold; + + /// Provides a hint to PAL that client knows that every individual depth stencil surface is always cleared with + /// the same values. If TRUE, per-tile tracking of exp/clear will be enabled (requires HTile). + bool hintInvariantDepthStencilClearValues; + + /// Provides a hint to PAL that PAL should disable color compression on surfaces that are smaller than or equal to + /// this setting (setting * setting) in size. + uint32 hintDisableSmallSurfColorCompressionSize; + + /// Disables Escape call to KMD. This is a temporary setting for experimentation that is expected to break features + /// that currently need the Escape call. + bool disableEscapeCall; + + /// A hint to the Windows OS that this application has submissions that are expected to run for a long time. This + /// tells the OS that checking elapsed execution time is not a good way to judge if the GPU is hung. + /// + /// If this flag is set, Windows shouldn't TDR long submissions on uncontested hardware queues. Submissions on + /// contested queues must preempt within the time limit to avoid a TDR. Setting this may also extend the TDR timer + /// on compute queues. + bool longRunningSubmissions; + + /// Disables MCBP on demand. This is a temporary setting until ATOMIC_MEM packet issue with MCBP is resolved. + bool disableCommandBufferPreemption; + + /// Disable the fast clear eliminate skipping optimization.
This optimization will conservatively track the usage + /// of clear values to allow the vast majority of images that never clear to a value that isn't TC-compatible to + /// skip the CPU and front-end GPU overhead of issuing a predicated fast clear eliminate BLT. + bool disableSkipFceOptimization; + + /// Sets the minimum BPP of surfaces which will have DCC enabled + uint32 dccBitsPerPixelThreshold; + + /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have + /// PAL automatically pad allocation starting virtual address alignments to enable this optimization. By + /// default, PAL will use the KMD-reported limit. + gpusize largePageMinSizeForVaAlignmentInBytes; + + /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have + /// PAL automatically pad allocation sizes to fill an integral number of large pages. By default, PAL will + /// use the KMD-reported limit. + gpusize largePageMinSizeForSizeAlignmentInBytes; + + /// Makes the unbound descriptor debug srd 0 so the hardware drops the load and ignores it instead of pagefaulting. + /// Used to workaround incorrect app behavior. + bool zeroUnboundDescDebugSrd; + + /// Preferred heap for uploading client pipelines. Default is set to @ref GpuHeap::GpuHeapInvisible. Setting is + /// ignored for internal pipelines and are uploaded to @ref GpuHeap::GpuHeapLocal. + GpuHeap pipelinePreferredHeap; + + bool depthClampBasedOnZExport; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 + /// Force the PreColorTarget to an earlier PreRasterization point if used as a wait point. This is to prevent a + /// write-after-read hazard for a corner case: shader exports from distinct packers are not ordered. Advancing + /// wait point from PreColorTarget to PostPrefetch could cause over-sync due to extra VS/PS_PARTIAL_FLUSH + /// inserted. It is default to false, but client drivers may choose to app-detect to enable if see corruption. 
+ bool forceWaitPointPreColorToPostPrefetch; +#endif + + /// Allows the client to disable debug overlay visual confirm after DebugOverlay::Platform is created when the + /// panel setting DebugOverlayEnabled is globally set but a certain application might need to turn off visual + /// confirm to make the screen not too noisy. + bool disableDebugOverlayVisualConfirm; + + bool enableExecuteIndirectPacket; + + /// Offers flexibility to the client to choose Graphics vs Compute engine for Indirect Command Generation + /// (Shader path) based on performance and other factors. The default is false since we have seen perf gains using + /// the ACE. + bool disableExecuteIndirectAceOffload; + + /// Value to initialize metadata for DCC surfaces to, if they are compressable. This has no effect on non-DCC + /// images. Images whose initial layout is not compressable are only affected if this is "forced". + /// 0x00 - Uncompressed (default) + /// 0x01 - Opaque Black + /// 0x02 - Opaque White + /// 0x11 - Forced Opaque Black + /// 0x12 - Forced Opaque White + uint32 dccInitialClearKind; + + /// Allows the client to not create internal VrsImage. Pal internal will create a 16M image as vrsImageSize. + bool disableInternalVrsImage; + + /// Allows the client to control binning persistent and context states per bin. + /// A value of 0 tells PAL to pick the number of states per bin. + uint32 binningPersistentStatesPerBin; + uint32 binningContextStatesPerBin; + + /// This key controls if binning will be disabled when the PS may kill pixels. + OverrideMode disableBinningPsKill; + + /// Controls GS LateAlloc val (for pos/prim allocations NOT param cache) on NGG pipelines. Can be no more than 127. 
+ uint32 nggLateAllocGs; + + /// Bitmask of cases where RPM view memory accesses will bypass the MALL + /// RpmViewsBypassMallOff (0x0): Disable MALL bypass + /// RpmViewsBypassMallOnRead (0x1): Skip MALL for read access of views created in RPM + /// RpmViewsBypassMallOnWrite (0x2): Skip MALL for write access of views created in RPM + /// RpmViewsBypassMallOnCbDbWrite (0x4): Control the RPM CB/DB behavior + RpmViewsBypassMall rpmViewsBypassMall; + + /// Optimize color export format for depth only rendering. Only applicable for RB+ parts + bool optDepthOnlyExportRate; + + /// Controls whether or not we should expand Hi-Z to full range rather than doing fine-grain resummarize + /// operations. Expanding Hi-Z leaves the Hi-Z data in a less optimal state but is a much faster operation + /// than the fine-grain resummarize. + bool expandHiZRangeForResummarize; + + /// Control whether to have command buffer emit SQTT marker events. Useful for client driver to perform SQTT + /// dump without the involvement of dev driver. + bool enableSqttMarkerEvent; + + /// Controls the value of CB_COLOR0_ATTRIB.LIMIT_COLOR_FETCH_TO_256B_MAX. This bit limits CB fetch to 256B on cache + /// miss, regardless of sector size. + bool limitCbFetch256B; + + /// Controls whether or not deferred batch binning is enabled 0 : Batch binning always disabled 1 : Use custom bin + /// sizes 2 : Optimal. + DeferredBatchBinMode binningMode; + + /// Controls the custom batch bin size.Only used when deferredBatchBinMode == 1 High word is for x, low word is for + /// y. Default is 128x128. Values must be power of two between 16 and 512. + uint32 customBatchBinSize; + + /// Maximum number of primitives per batch. The maximum value is 1024. + uint32 binningMaxPrimPerBatch; + + /// Controls PWS enable mode: disabled, fully enabled or partially enabled. Only take effect if HW supports PWS. 
+ PwsMode pwsMode; + + /// Controls the MaxScratchRingSizeBaseline, which is really just the maximum size of the scratch ring + gpusize maxScratchRingSizeBaseline; + + /// Controls the maximum size of the scratch ring allocation + uint32 maxScratchRingSizeScalePct; + +#if defined(__unix__) + /// Whether enable vm-always-valid feature on Linux while allocating Bo + VmAlwaysValidEnable enableVmAlwaysValid; +#endif + + /// Alignment mode for graphics and compute buffers. + /// Clients must check Pal::DeviceProperties::osProperties::flags::forceAlignmentSupported + /// to see if anything other than default will work. + BufferAlignmentMode hardwareBufferAlignmentMode; + + // Disallows putting the shader ring in system memory for performance purposes + // This is done by un-listing GpuHeapGartUswc as a possible heap for the shader rings. + // Instead only allowing GpuHeapInvisible and GpuHeapLocal + bool forceShaderRingToVMem; + + /// If the client sets this to true they promise they've done exhaustive testing on every ASIC to prove that this + /// application can use AC01 fast clears safely. This should never be forced to true unconditionally. + bool ac01WaNotNeeded; + + /// Toggles whether or not image copies will prefer using the graphics pipeline. This setting does not force all + /// copies to use graphics or compute, it changes what method will be selected in cases where either could be used. + bool preferGraphicsImageCopy; + + /// Bitmask to control adding Waits around Flush events + /// This is public setting to allow AppDetect to override of 'WaitOnFlush' setting. + /// Setting 'waitOnFlush' for actual workarounds is strongly discouraged because: + /// 1) It has a negative performance impact. + /// 2) Waits effect the timing and pipeline execution which can hide underlying hw/fw/sw bugs. + /// Issues resolved by added waits should be root caused. 
+ uint32 waitOnFlush; + + /// Provides the ability for mall to be alloc-ed/noalloc-ed using the dynamic or static behavior. + TemporalHintsMrtBehavior temporalHintsMrtBehavior; + + /// Allows the client to change the behavior of the Gfx12 HW Bug that impacts HiS and HiZ. + /// This should never be forced away from default unconditionally. + HiSZWorkaroundBehavior hiSZWorkaroundBehavior; + + /// For event-based HiSZ workarounds (Gfx12), specifies the timeouts supported by the Scan Converter and Depth Block + /// for their tile summarizer controller. + /// Ignored unless @ref hiSZWorkaroundBehavior is set to ForceHiSZEventBasedWar. + /// If set to 0, PAL picks a default value. + /// Note: Choosing any value other than 0 carries with it **significant risk** of hangs, as the timeout value + /// determined by PAL is the most optimal to avoid the most hangs. Any deviation from the default must be + /// thoroughly tested and is not guaranteed to be safe! + uint32 tileSummarizerTimeout; + +#if PAL_KMT_BUILD + /// If client sets it to false, native fence will be disabled in OS wide no matter OS or hardware supports + /// native fence or not. + bool enableNativeFence; +#endif + + /// Whether HiZ/HiS is valid after depth/stencil image is copied as destination. + /// - If false, this is not guaranteed, PAL will issue necessary HiZ/HiS resummarization at later client barrier + /// call time when transition to another HiZ/HiS valid layout like DepthStencilTarget. + /// - If true, PAL does necessary HiZ/HiS fixup after copy if needed and HiZ/HiS will be always valid after copy. + /// At later client barrier call time, no HiZ/HiS resummarization will happen. + bool makeLayoutCopyDstHiSZCompatible; +}; + +/// Defines the modes that the GPU Profiling layer can use when its buffer fills. 
+enum GpuProfilerStallMode : uint32 +{ + GpuProfilerStallAlways = 0, ///< Always stall to get accurate trace data + GpuProfilerStallLoseDetail = 1, ///< Lose register-level detail if under pressure to avoid stalls + GpuProfilerStallNever = 2, ///< Never stall, miss trace packets +}; + +/// Describes the equations needed to interpret the raw memory of a tiled texture. +struct SwizzleEquation +{ + SwizzleEquationBit addr[SwizzleEquationMaxBits]; ///< Address setting: each bit is the result of addr ^ xor ^ xor2. + SwizzleEquationBit xor1[SwizzleEquationMaxBits]; ///< xor setting. + SwizzleEquationBit xor2[SwizzleEquationMaxBits]; ///< xor2 setting. + uint32 numBits; ///< The number of bits in the equation. + bool stackedDepthSlices; ///< True if depth slices are treated as being stacked vertically + /// prior to swizzling. +}; + +/// Specifies the hardware features supported for PRT (sparse images). +enum PrtFeatureFlags : uint32 +{ + PrtFeatureBuffer = 0x00000001, ///< Indicates support for sparse buffers + PrtFeatureImage2D = 0x00000002, ///< Indicates support for sparse 2D images + PrtFeatureImage3D = 0x00000004, ///< Indicates support for sparse 3D images + PrtFeatureImageMultisampled = 0x00000008, ///< Indicates support for sparse multisampled images + PrtFeatureImageDepthStencil = 0x00000010, ///< Indicates support for sparse depth/stencil images + PrtFeatureShaderStatus = 0x00000020, ///< Indicates support for residency status in shader instructions + PrtFeatureShaderLodClamp = 0x00000040, ///< Indicates support for LOD clamping in shader instructions + PrtFeatureUnalignedMipSize = 0x00000080, ///< Indicates support for non-miptail levels with dimensions that + /// aren't integer multiples of the tile size as long as they are + /// at least as large as a single tile + PrtFeaturePerSliceMipTail = 0x00000100, ///< Indicates support for per-slice miptail (slice-major order) + + PrtFeatureTileAliasing = 0x00000200, ///< Indicates support for aliasing tiles (without 
metadata) + PrtFeatureStrictNull = 0x00000400, ///< Indicates whether reads of unmapped tiles always return zero +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + PrtFeatureNonStandardImage3D = 0x00000800, ///< Indicates support for sparse 3D images restricted to + /// non-standard tile shapes that match the tile mode block depth + PrtFeaturePrtPlus = 0x00001000, ///< Indicates that this image supports use of residency maps. +#else + PrtFeaturePrtPlus = 0x00000800, ///< Indicates that this image supports use of residency maps. +#endif +}; + +/// Describe the settings' scope accessible by clients. +enum class SettingScope +{ + Driver, ///< For settings specific to a UMD + Global, ///< For global settings controlled by CCC + File, ///< For settings that are only read from a file + Root, ///< For settings under HKR +}; + +/// Big Software (BigSW) Release information structure +/// Software release management uses this version # to control a rollout of big SW features together. +struct BigSoftwareReleaseInfo +{ + uint32 majorVersion; ///< BigSW Release Major version + uint32 minorVersion; ///< BigSW Release Minor version. + uint32 miscControl; ///< BigSW Release miscellaneous control. +}; + +/// Virtual display capabilities as determined by the OS. The reported values bound the valid ranges of values supported +/// by the @ref VirtualDisplayInfo structure passed in to @ref IDevice::CreateVirtualDisplay. +struct VirtualDisplayCapabilities +{ + uint32 maxVirtualDisplays; ///< The maximum number of virtual display supported + Rational minRefreshRate; ///< The minimum refresh rate + Rational maxRefreshRate; ///< The maximum refresh rate +}; + +/// The properties of a specific virtual display +struct VirtualDisplayProperties +{ + bool isVirtualDisplay; ///< True, if it's a virtual display +}; + +/// Enumerates all of the types of local video memory which could be associated with a GPU. 
+enum class LocalMemoryType : uint32 +{ + Unknown = 0, + Ddr2, + Ddr3, + Ddr4, + Gddr5, + Gddr6, + Hbm, + Hbm2, + Hbm3, + Lpddr4, + Lpddr5, + Ddr5, + Count +}; + +/// Bitmask of all MSAA/EQAA types supported, in terms of samples (S) and shaded fragments (F) +enum MsaaFlags : uint16 +{ + MsaaS1F1 = 0x0001, + MsaaS2F1 = 0x0002, + MsaaS4F1 = 0x0004, + MsaaS8F1 = 0x0008, + MsaaS16F1 = 0x0010, + MsaaAllF1 = 0x001F, + + MsaaS2F2 = 0x0020, + MsaaS4F2 = 0x0040, + MsaaS8F2 = 0x0080, + MsaaS16F2 = 0x0100, + MsaaAllF2 = 0x01E0, + + MsaaS4F4 = 0x0200, + MsaaS8F4 = 0x0400, + MsaaS16F4 = 0x0800, + MsaaAllF4 = 0x0E00, + + MsaaS8F8 = 0x1000, + MsaaS16F8 = 0x2000, + MsaaAllF8 = 0x3000, + + MsaaAll = 0x3FFF, +}; + +/// Supported RTIP version enumeration +enum class RayTracingIpLevel : uint32 +{ + _None = 0x0, ///< The device does not have an RayTracing Ip Level +#ifndef None + None = _None, ///< The device does not have an RayTracing Ip Level +#endif + RtIp1_0 = 0x1, ///< First Implementation of HW RT + RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW + RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, + /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 + RtIp3_1 = 0x5, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, + /// 128 Byte Quantized box node, obb support, wide sort) +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, + /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 + RtIp3_1 = 0x6, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, + /// 128 Byte Quantized box node, obb support, wide sort) +#endif +}; + +/// Which ip version a component 
has reported +struct IpTriple +{ + uint32 major : 8; /// Major revision value + uint32 minor : 8; /// Minor revision value + uint32 stepping : 16; /// Stepping value + + constexpr operator uint32() const + { + return ((stepping ) | + (minor << 16) | + (major << 24)); + } +}; + +/// Reports various properties of a particular IDevice to the client. @see IDevice::GetProperties. +struct DeviceProperties +{ + uint32 vendorId; ///< Vendor ID (should always be 0x1002 for AMD). + uint32 deviceId; ///< GPU device ID (e.g., Hawaii XT = 0x67B0). + uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different + /// SKUs or revisions. Corresponds to one of the PRID_* revision IDs. + uint32 eRevId; ///< GPU emulation/internal revision ID. + AsicRevision revision; ///< ASIC revision. + GpuType gpuType; ///< Type of GPU (discrete vs. integrated) + uint16 gpuPerformanceCapacity; ///< Portion of GPU assigned in virtualized system (SRIOV) + ///< 0-65535, 0 invalid (not virtualized), 1 min, 65535 max + GfxIpLevel gfxLevel; ///< IP level of this GPU's GFX block +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + OssIpLevel ossLevel; ///< IP level of this GPU's OSS block + VceIpLevel vceLevel; ///< IP level of this GPU's VCE block + UvdIpLevel uvdLevel; ///< IP level of this GPU's UVD block +#endif + VcnIpLevel vcnLevel; ///< IP level of this GPU's VCN block +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + SpuIpLevel spuLevel; ///< IP level of this GPU's SPU block +#endif + PspIpLevel pspLevel; ///< IP level of this GPU's PSP block + uint32 gfxStepping; ///< Stepping level of this GPU's GFX block + IpTriple gfxTriple; ///< Full GFX IP level (major.minor.step) of this GPU + char gpuName[MaxDeviceName]; ///< Null terminated string identifying the GPU. + uint32 gpuIndex; ///< Device's index in a linked adapter chain. + uint32 maxGpuMemoryRefsResident; ///< Maximum number of GPU memory references that can be resident + /// at any time. 
Memory references set both via IQueue and IDevice + /// (via AddGpuMemoryReferences() or Submit()) count against this limit. + uint64 timestampFrequency; ///< Frequency of the device's timestamp counter in Hz. + /// @see ICmdBuffer::CmdWriteTimestamp. + uint32 attachedScreenCount; ///< Number of screen attached to the device. + uint32 maxSemaphoreCount; ///< Queue semaphores cannot have a signal count higher than this value. + /// For example, one indicates that queue semaphores are binary. + PalPublicSettings settings; ///< Public settings that the client has the option of overriding +#if PAL_CLIENT_EXAMPLE + AddrHandle hAddrlib; ///< Handle to addrlib for directed image tests. +#endif + + struct + { + union + { + struct + { + /// This engine supports timestamps (ICmdBuffer::CmdWriteTimestamp()). + uint32 supportsTimestamps : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on Streamout/Occlusion query + uint32 supportsQueryPredication : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on a 32-bit GPU memory allocation + uint32 supports32bitMemoryPredication : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on a 64-bit GPU memory allocation + uint32 supports64bitMemoryPredication : 1; + + /// This engine supports ICmdBuffer::If(), Else() and EndIf() calls. + uint32 supportsConditionalExecution : 1; + + /// This engine supports ICmdBuffer::While() and EndWhile() calls. + uint32 supportsLoopExecution : 1; + + /// This engine supports ICmdBuffer::CmdWaitRegisterValue(), WaitMemoryValue() and + /// CopyRegisterToMemory() calls. + uint32 supportsRegMemAccess : 1; + + /// This engine supports ICmdBuffer::CmdCopyImage() between optimally tiled images with + /// mismatched tiling tokens. + uint32 supportsMismatchedTileTokenCopy : 1; + + /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref + /// LayoutUninitializedTarget layout. 
+ uint32 supportsImageInitBarrier : 1; + + /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref + /// LayoutUninitializedTarget layout for individual subresources. If this is not set and + /// supportsImageInitBarrier is set, the subresource range must span the entire image. + uint32 supportsImageInitPerSubresource : 1; + + /// This engine does not support any virtual memory features. IQueue::RemapVirtualMemoryPages and + /// IQueue::CopyVirtualPageMappings are not supported on Queues using this engine. + uint32 runsInPhysicalMode : 1; + + /// Indicates whether this engine can do virtual memory remap or not. + uint32 supportVirtualMemoryRemap : 1; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Indicates whether this Queues using this engine can maintain the contents of CE RAM across + /// consecutive submissions. If this is not set, the client must not specify a nonzero value for + /// either @ref QueueCreateInfo::persistentCeRamSize or @ref QueueCreateInfo::persistentCeRamOffset. + uint32 supportPersistentCeRam : 1; +#else + uint32 reserved914 : 1; +#endif + + /// Indicates whether the engine supports the command allocator tracks which chunk is idle. + uint32 supportsTrackBusyChunks : 1; + + /// Indicates whether the engine can safely access non-resident ranges of resources. + uint32 supportsUnmappedPrtPageAccess : 1; + + /// This engine supports clear or copy with MSAA depth-stencil destination + uint32 supportsClearCopyMsaaDsDst : 1; + + /// Reserved for future use. + uint32 reserved : 16; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Engines property flags. + + struct + { + union + { + struct + { + uint32 exclusive : 1; ///< Engine is exclusively owned by one client at a time. + uint32 mustUseDispatchTunneling : 1; ///< Queues created on this engine must use dispatch + /// tunneling. + /// Indicates whether this engine instance can be used for gang submission workloads via + /// a multi-queue. 
+ /// @see IDevice::CreateMultiQueue. + uint32 supportsMultiQueue : 1; + uint32 hwsEnabled : 1; + uint32 isHighPriority : 1; + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Capabilities property flags. + + uint32 queuePrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue + /// priority levels are supported by this engine. + uint32 dispatchTunnelingPrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue + /// priority levels support dispatch tunneling on this engine. + uint32 maxFrontEndPipes; ///< Up to this number of IQueue objects can be consumed in + /// parallel by the front-end of this engine instance. It will + /// only be greater than 1 on hardware scheduled engine backed + /// by multiple hardware pipes/threads. + } capabilities[MaxAvailableEngines]; ///< Lists each engine of this type (up to engineCount) and their + /// properties. + + uint32 engineCount; ///< Number available engines of this type. + uint32 queueSupport; ///< Mask of QueueTypeSupport flags indicating which queues are + /// supported by this engine. + uint32 maxBorderColorPaletteSize; ///< Maximum size of a border color palette on this engine. + uint32 controlFlowNestingLimit; ///< Maximum depth of command-buffer control flow nesting on this + /// engine. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 ceRamSizeAvailable; ///< Size, in bytes, of constant engine RAM available on this engine. +#endif + Extent3d minTiledImageCopyAlignment; ///< Minimum alignments (pixels) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyImage() between optimally tiled images. + Extent3d minTiledImageMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyImage() with an optimally tiled image and a + /// linearly tiled image. 
Also applies to + /// ICmdBuffer::CmdCopyImageToMemory() or + /// ICmdBuffer::CmdCopyMemoryToImage() with an optimally tiled image. + Extent3d minLinearMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyTypedBuffer(). + uint32 minTimestampAlignment; ///< If supportsTimestamps is set, this is the minimum address alignment + /// in bytes of the dstOffset in ICmdBuffer::CmdWriteTimestamp(). + uint32 maxNumDedicatedCu; ///< The maximum number of dedicated CUs for the real time audio queue + uint32 maxNumDedicatedCuPerQueue; ///< The maximum number of dedicated CUs per queue + uint32 dedicatedCuGranularity; ///< The granularity at which compute units can be dedicated to a queue + /// Specifies the suggested heap preference clients should use when creating an @ref ICmdAllocator that will + /// allocate command space for this engine type. These heap preferences should be specified in the allocHeap + /// parameter of @ref CmdAllocatorCreateInfo. Clients are free to ignore these defaults and use their own + /// heap preferences, but may suffer a performance penalty. + GpuHeap preferredCmdAllocHeaps[CmdAllocatorTypeCount]; + + /// Indicate which queue supports per-command, per-submit, or per-queue TMZ based on the queue type. + TmzSupportLevel tmzSupportLevel; + } engineProperties[EngineTypeCount]; ///< Lists available engines on this device and their properties. + + struct + { + union + { + struct + { + /// This queue supports IQueue::PresentSwapChain() calls. Note that a queue may support swap chain + /// presents even if the supportedDirectPresentModes flags below indicate no support for direct + /// presents; instead swap chain PresentMode support is queried via GetSwapChainInfo. + uint32 supportsSwapChainPresents : 1; + uint32 reserved744 : 1; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 927 + /// Set if the queue supports additional split barrier feature on top of basic acquire/release + /// interface support. 
This provides CmdAcquire() and CmdRelease() to implement split barriers. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 + /// Note: supportReleaseAcquireInterface is a prerequisite to supportSplitReleaseAcquire. +#endif + uint32 supportSplitReleaseAcquire : 1; +#else + uint32 reserved927 : 1; +#endif + + /// Reserved for future use. + uint32 reserved : 29; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Queue property flags. + + uint32 supportedDirectPresentModes; ///< A mask of PresentModeSupport flags indicating support for various + /// PresentModes when calling IQueue::PresentDirect(). + } queueProperties[QueueTypeCount]; ///< Lists the properties of all queues supported by PAL. + + struct + { + union + { + struct + { + /// Indicates support for virtual GPU memory allocations. @see IQueue::RemapVirtualMemoryPages. + uint32 virtualRemappingSupport : 1; + + /// Indicates support for pinning system memory for access as GPU memory. + /// @see IDevice::PinSystemMemory. + uint32 pinningSupport : 1; + + /// Indicates support pinned memory which is host-mapped from foreign device. + uint32 supportHostMappedForeignMemory : 1; + + /// Indicates whether specifying memory references at Submit time is supported. If not supported + /// all memory references must be manged via IDevice or IQueue AddGpuMemoryReferences() + uint32 supportPerSubmitMemRefs : 1; + + /// Indicates support for GPU virtual addresses that are visible to all devices. + uint32 globalGpuVaSupport : 1; + + /// Indicates support for Shared Virtual Memory VA range. + uint32 svmSupport : 1; + + /// Indicates support for shadow desc VA range. + uint32 shadowDescVaSupport : 1; + + /// Indicates support for IOMMUv2. Fine grain SVM is not supported without IOMMU. + /// PAL client needs to check this flag before using fine grain SVM. + /// IOMMU is a memory management unit (MMU) that connects a direct-memory-access-capable + /// (DMA-capable) I/O bus to the main memory. 
+ uint32 iommuv2Support : 1; + + /// Indiciates that the platform supports automatic GPU memory priority management. + uint32 autoPrioritySupport : 1; + + /// Indicates KMD has enabled HBCC(High Bandwidth Cache Controller) page migration support. This means + /// shaders must be compiled such that all memory clauses can be replayed in response to an XNACK. + uint32 pageMigrationEnabled : 1; + /// Indicates TMZ (or HSFB) protected memory allocations are supported. + uint32 supportsTmz : 1; + + /// Memory allocations on this device support MALL (memory access last level); essentially + /// the lowest level cache possible. + uint32 supportsMall : 1; + + /// Support for querying page fault information + uint32 supportPageFaultInfo : 1; + + /// Indicates if this device supports GFX12-style distributed compression. Client can control + /// whether distributed compression is enabled or not per IGpuMemory object using the + /// distributedCompression field in @ref GpuMemoryCreateInfo. + uint32 supportDistributedCompression : 1; + + /// All GPU memory allocations in all heaps are always initialized to zero on creation. + /// The @ref initializeToZero flag is not needed when this is true. + uint32 alwaysInitializedToZero : 1; + + /// Reserved for future use. + uint32 reserved : 17; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory property flags. + + gpusize realMemAllocGranularity; ///< The addresses and sizes of "real" GPU memory objects must be aligned + /// to at least this many bytes. + gpusize virtualMemAllocGranularity; ///< The addresses and sizes of virtual GPU memory objects must be aligned + /// to at least this many bytes. + gpusize virtualMemPageSize; ///< Size in bytes of a virtual GPU memory page. + /// @see IQueue::RemapVirtualMemoryPages. + gpusize fragmentSize; ///< Size in bytes of a video memory fragment. 
If GPU memory object + /// addresses and sizes are aligned to at least this value, VA translation + /// will be a bit faster. It is aligned to the allocation granularities. + gpusize largePageSizeInBytes; ///< The large page optimization will allow compatible allocations to + /// potentially be upgraded to a page size larger than 64KiB to reduce TLB + /// pressure. PAL will automatically pad the size and alignment of some + /// allocations to enable this optimization; + /// see largePageMinSizeForAlignmentInBytes in PalPublicSettings. + gpusize maxVirtualMemSize; ///< Total virtual GPU memory available (total VA space size). + gpusize maxPhysicalMemSize; ///< Total VRAM available (Local + Invisible + non-Local heap sizes). + gpusize vaStart; ///< Starting address of the GPU's virtual address space. + gpusize vaEnd; ///< Ending address of the GPU's virtual address space. + gpusize descTableVaStart; ///< Starting address of the descriptor table's virtual address space + gpusize shadowDescTableVaStart; ///< Starting address of the shadow descriptor table's virtual address space + gpusize privateApertureBase; ///< Private memory base address for generic address space (Windows only). + gpusize sharedApertureBase; ///< Shared memory base address for generic address space (Windows only). + + gpusize busAddressableMemSize; ///< SDI/DirectGMA GPU aperture size set in CCC + gpusize maxLocalMemSize; ///< Total VRAM available on the GPU (Local + Invisible heap sizes). + LocalMemoryType localMemoryType; ///< Type of local memory used by the GPU. + gpusize maxCaptureReplaySize; ///< Total virtual GPU available for Capture/Replay + gpusize barSize; ///< Total VRAM which can be accessed by the CPU. + + struct + { + float maxMemClock; ///< Maximum GPU memory clock in MHz. For DX builds this value is valid only after + /// the device has been finalized. + uint32 memPerfRating; ///< Precomputed performance rating of memory operations. + uint32 vramBusBitWidth; ///< Memory bus width. 
+ uint32 memOpsPerClock; ///< Memory operations per clock. + } performance; ///< Performance-related memory properties. + + } gpuMemoryProperties; ///< Memory properties for this device. + + struct + { + union + { + struct + { + /// Images created on this device supports AQBS stereo mode, this AQBS stereo mode doesn't apply to the + /// array-based stereo feature supported by Presentable images. + uint32 supportsAqbsStereoMode : 1; + + /// Set if images created on this device support being created with corner sampling. + uint32 supportsCornerSampling : 1; + + /// Load Bearing placeholder, do not touch. + uint32 placeholder0 : 1; + + /// Reserved for future use. + uint32 reserved : 29; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory property flags. + + Extent3d maxDimensions; ///< Maximum supported width/height/depth for an image. + uint32 maxArraySlices; ///< Maximum supported number of array slices for a 1D or 2D image. + PrtFeatureFlags prtFeatures; ///< PRT features supported by the hardware. + gpusize prtTileSize; ///< Size, in bytes, of a PRT tile. + MsaaFlags msaaSupport; ///< Bitflags for MSAA sample/fragment count support. + uint8 maxMsaaFragments; ///< Max number of MSAA fragments per pixel (may have more samples). + uint8 numSwizzleEqs; ///< How many swizzle equations are in pSwizzleEqs. + Extent2d vrsTileSize; ///< Pixel dimensions of a VRS tile. 0x0 indicates image-based shading rate + /// is not supported. + const SwizzleEquation* pSwizzleEqs; ///< These describe how to interpret device-dependent tiling modes. + + bool tilingSupported[static_cast(ImageTiling::Count)]; ///< If each image tiling is supported. + } imageProperties; ///< Image properties for this device. + + struct + { + /// Maximum number of available shader-accessible user data entries. @see PipelineShaderInfo. + uint32 maxUserDataEntries; + uint32 maxThreadGroupSize; ///< Per-device limit on threads per threadgroup for compute shaders. 
+ /// Some hardware supported by PAL has a bug which can cause a GPU hang if async compute enginesare used while + /// compute shaders with > maxAsyncComputeThreadGroupSize are in flight on any queue. This reports the + /// maximum "safe" limit on threads per threadgroup for compute shaders for this device if the client wishes to + /// use async compute engines. Note that the bug can occur if the following conditions are met: + /// (a) Async compute workloads are running *somewhere* on the GPU, in any process; + /// (b) Some compute workloads on either the async compute engine or on the universal engine have a threads per + /// threadgroup amount which exceeds maxAsyncComputeThreadGroupSize. + /// + /// It is up to the client to choose how to work around this bug. They are free to either limit applications to + /// only creating compute shaders with <= maxAsyncComputeThreadGroupSize threads per group, or to avoid using + /// the async compute engines at all. + /// + /// If this value equals maxThreadGroupSize, then the device does not have this bug and the client can use + /// any compute shader on any queue. + uint32 maxAsyncComputeThreadGroupSize; + + uint32 maxComputeThreadGroupCountX; ///< Maximum number of thread groups supported + uint32 maxComputeThreadGroupCountY; ///< Maximum number of thread groups supported + uint32 maxComputeThreadGroupCountZ; ///< Maximum number of thread groups supported + + uint32 maxBufferViewStride; ///< Maximum stride, in bytes, that can be specified in a buffer view. + + uint32 hardwareContexts; ///< Number of distinct state contexts available for graphics workloads. Mostly + /// irrelevant to clients, but may be useful to tools. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 ceRamSize; ///< Maximum on-chip CE RAM size in bytes. +#endif + uint32 maxPrimgroupSize; ///< Maximum primitive group size. + uint32 supportedVrsRates; ///< Bitmask of VrsShadingRate enumerations indicating which modes are supported. 
+ + uint32 mallSizeInBytes; ///< Size of total MALL (Memory Attached Last Level - L3) cache in bytes. + + uint32 gl2UncachedCpuCoherency; ///< If supportGl2Uncached is set, then this is a bitmask of all + /// CacheCoherencyUsageFlags that will be coherent with CPU reads/writes. + /// Note that reporting CoherShader only means that GLC accesses will be + /// CPU coherent. + /// Note: Only valid if @ref supportGl2Uncached is true. + + uint32 maxGsOutputVert; ///< Maximum number of GS output vertices. + uint32 maxGsTotalOutputComponents; ///< Maximum number of GS output components totally. + uint32 maxGsInvocations; ///< Maximum number of GS prim instances, corresponding to geometry shader + /// invocation in glsl. + + RayTracingIpLevel rayTracingIp; ///< HW RayTracing IP version + + uint32 cpUcodeVersion; ///< Command processor feature version. + uint32 pfpUcodeVersion; ///< Command processor, graphics prefetch firmware version. + + union + { + struct + { + uint64 support8bitIndices : 1; ///< Hardware natively supports 8bit indices + uint64 support16BitInstructions : 1; ///< Hardware supports FP16 and INT16 instructions + uint64 supportBorderColorSwizzle : 1; ///< Hardware supports border color swizzle + uint64 supportDoubleRate16BitInstructions : 1; ///< Hardware supports double rate packed math + uint64 supportFp16Fetch : 1; ///< Hardware supports FP16 texture fetches + uint64 supportFp16Dot2 : 1; ///< Hardware supports a paired FP16 dot product. + uint64 supportConservativeRasterization : 1; ///< Hardware supports conservative rasterization + uint64 supportImplicitPrimitiveShader : 1; ///< Device supports implicit compiling of the + /// hardware vertex shader as a primitive shader to + /// perform culling and compaction optimizations in + /// the shader. + uint64 supportMeshShader : 1; ///< Indicates support for mesh shaders. + uint64 supportTaskShader : 1; ///< Indicates support for task shaders. 
+ uint64 supportMsFullRangeRtai : 1; ///< HW supports full range render target array + /// index for Mesh Shaders. + uint64 supportPrtBlendZeroMode : 1; ///< Blend zero mode support. + uint64 supports2BitSignedValues : 1; ///< Hardware natively supports 2-bit signed values. + uint64 supportPrimitiveOrderedPs : 1; ///< Hardware supports primitive ordered UAV + /// accesses in the PS. + uint64 supportPatchTessDistribution : 1; ///< Hardware supports patch level tessellation + /// distribution among VGTs. + uint64 supportDonutTessDistribution : 1; ///< Hardware supports donut granularity of + /// tessellation distribution among VGTs. + uint64 supportTrapezoidTessDistribution : 1; ///< Hardware supports trapezoid granularity of + /// tessellation distribution among VGTs. + uint64 supportSingleChannelMinMaxFilter : 1; ///< Hardware supports min/max filtering that can + /// return one channel at a time. + uint64 supportPerChannelMinMaxFilter : 1; ///< Hardware returns min/max value on a per-channel + /// basis. + uint64 supportRgpTraces : 1; ///< Hardware supports RGP traces. + uint64 supportMsaaCoverageOut : 1; ///< Set if HW supports MSAA coverage feature + uint64 supportPostDepthCoverage : 1; ///< Set if HW supports post depth coverage feature + uint64 supportSpiPrefPriority : 1; ///< Set if HW supports preference priority. + uint64 supportWaveBreakSize : 1; ///< The HW supports specifying the wavebreak size + /// in the pixel shader pipeline. + uint64 supportsPerShaderStageWaveSize : 1; ///< If set, the "waveSize" setting in the + /// @ref PipelineShaderInfo structure is meaningful. + uint64 placeholder2 : 1; ///< Placeholder for backward compatibility, no use it. + uint64 supportSpp : 1; ///< Hardware supports Shader Profiling for Power. + uint64 timestampResetOnIdle : 1; ///< GFX timestamp resets after idle between + /// submissions. The client cannot assume that + /// timestamps will increase monotonically across + /// command buffer submissions. 
+ uint64 support1xMsaaSampleLocations : 1; ///< HW supports 1xMSAA custom quad sample patterns +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 + uint64 supportReleaseAcquireInterface : 1; ///< Set if HW supports the basic functionalities of + /// acquire/release-based barrier interface. This + /// provides CmdReleaseThenAcquire() as a convenient + /// way to replace the legacy barrier interface's + /// CmdBarrier() to handle single point barriers. +#else + uint64 placeholder4 : 1; ///< Placeholder for backward compatibility, no use it. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 883 + uint64 supportSplitReleaseAcquire : 1; ///< Set if HW supports additional split barrier feature + /// on top of basic acquire/release interface support. + /// This provides CmdAcquire() and CmdRelease() to + /// implement split barriers. + /// Note: supportReleaseAcquireInterface is a + /// prerequisite to supportSplitReleaseAcquire. +#else + uint64 placeholder3 : 1; ///< Placeholder for backward compatibility, no use it. +#endif + uint64 supportGl2Uncached : 1; ///< Indicates support for the allocation of GPU L2 + /// un-cached memory. @see gl2UncachedCpuCoherency + uint64 supportOutOfOrderPrimitives : 1; ///< HW supports higher throughput for out of order + uint64 supportIntersectRayBarycentrics : 1; ///< HW supports the ray intersection mode which + /// returns triangle barycentrics. 
+ uint64 supportFloat32BufferAtomics : 1; ///< Hardware supports float32 buffer atomics + uint64 supportFloat32ImageAtomics : 1; ///< Hardware supports float32 image atomics + uint64 supportFloat32BufferAtomicAdd : 1; ///< Hardware supports float32 buffer atomic add + uint64 supportFloat32ImageAtomicAdd : 1; ///< Hardware supports float32 image atomic add + uint64 supportFloat64Atomics : 1; ///< Hardware supports float64 atomics + uint64 supportFloat32ImageAtomicMinMax : 1; ///< Hardware supports float32 image atomic min and max + uint64 supportFloat64BufferAtomicMinMax : 1; ///< Hardware supports float64 buffer atomic min and max + uint64 supportFloat64SharedAtomicMinMax : 1; ///< Hardware supports float64 shared atomic min and max + uint64 support64BitInstructions : 1; ///< Hardware supports 64b instructions + uint64 supportShaderSubgroupClock : 1; ///< HW supports clock functions across subgroup. + uint64 supportShaderDeviceClock : 1; ///< HW supports clock functions across device. + uint64 supportAlphaToOne : 1; ///< HW supports forcing PS output alpha channel to 1 + uint64 supportCaptureReplay : 1; ///< HW supports captureReplay + uint64 supportSortAgnosticBarycentrics : 1; ///< HW supports sort-agnostic Barycentrics for PS + uint64 supportVrsWithDsExports : 1; ///< If true, asic support coarse VRS rates + /// when z or stencil exports are enabled + uint64 supportRayTraversalStack : 1; ///< HW assisted ray tracing traversal stack support + uint64 supportPointerFlags : 1; ///< Ray tracing HW supports flags embedded in the node + /// pointer bits + uint64 supportTextureGatherBiasLod : 1; ///< HW supports SQ_IMAGE_GATHER4_L_O + uint64 supportInt8Dot : 1; ///< Hardware supports a dot product 8bit. + uint64 supportInt4Dot : 1; ///< Hardware supports a dot product 4bit. + uint64 supportMixedSignIntDot : 1; ///< Hardware supports a integer dot product with mixed + /// sign inputs. + uint64 support2DRectList : 1; ///< HW supports PrimitiveTopology::TwoDRectList. 
+ uint64 supportHsaAbi : 1; ///< PAL supports HSA ABI compute pipelines. + uint64 supportImageViewMinLod : 1; ///< Indicates image srd supports min_lod. + uint64 supportStaticVmid : 1; ///< Indicates support for static-VMID + uint64 support3dUavZRange : 1; ///< HW supports read-write ImageViewSrds of 3D images + /// with zRange specified. + uint64 supportCooperativeMatrix : 1; ///< HW supports cooperative matrix + uint64 support1dDispatchInterleave : 1; ///< Indicates support for 1D Dispatch Interleave. + uint64 support2dDispatchInterleave : 1; ///< Indicates support for 2D Dispatch Interleave. + uint64 supportBFloat16 : 1; ///< HW supports bf16 instructions. + uint64 supportFloat8 : 1; ///< HW supports float 8-bit instructions. + uint64 supportInt4 : 1; ///< HW supports integer 4-bit instructions. + uint64 supportCooperativeMatrix2 : 1; ///< HW supports Gfx12 extension cooperative matrix. + uint64 placeholder14 : 2; + uint64 reserved : 59; ///< Reserved for future use. + }; + uint64 u64All[2]; ///< Flags packed as 32-bit uint. + } flags; ///< Device IP property flags. + + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 + union + { + uint32 bufferView; +#endif + uint32 typedBufferView; ///< Size in bytes (and required alignment) of a typed buffer view SRD. + /// @see IDevice::CreateTypedBufferViewSrds(). +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 + }; +#endif + uint32 untypedBufferView; ///< Size in bytes (and required alignment) of a untyped buffer view SRD. + /// @see IDevice::CreateUntypedBufferViewSrds(). + uint32 imageView; ///< Size in bytes (and required alignment) of an image view SRD. + /// @see IDevice::CreateImageViewSrds(). + uint32 fmaskView; ///< Size in bytes (and required alignment) of an fmask view SRD. + /// @see IDevice::CreateFmaskViewSrds(). This value can be zero to denote + /// a lack of fMask support. + uint32 sampler; ///< Size in bytes (and required alignment) of a sampler SRD. + /// @see IDevice::CreateSamplerSrds(). 
+ uint32 bvh; ///< Size in bytes (and required alignment) of a BVH SRD + /// Will be zero if HW doesn't support ray-tracing capabilities. + /// @see IDevice::CreateBvhSrds(). + } srdSizes; ///< Sizes for various types of _shader resource descriptor_ (SRD). + + struct + { + const void* pNullBufferView; ///< Pointer to null buffer view srd + const void* pNullImageView; ///< Pointer to null image view srd + const void* pNullFmaskView; ///< Pointer to null fmask view srd. This pointer can be nullptr to + /// indicate a lack of fMask support. + const void* pNullSampler; ///< Pointer to null sampler srd + } nullSrds; ///< Null SRDs are used to drop shader writes or read 0 + + struct + { + float maxGpuClock; ///< Maximum GPU engine clock in MHz. For DX builds this value is valid only after + /// the device has been finalized. + float aluPerClock; ///< Maximum shader ALU operations per clock. + float texPerClock; ///< Maximum texture fetches per clock. + float primsPerClock; ///< Maximum primitives processed per clock. + float pixelsPerClock; ///< Maximum pixels processed per clock. + uint32 gfxipPerfRating; ///< Precomputed performance rating of the GfxIp block. + } performance; ///< Performance-related device properties. + + struct + { + union + { + struct + { + uint32 eccProtectedGprs : 1; ///< Whether or not the GPU has ECC protection + ///< on its VGPR's + uint32 placeholder0 : 1; + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit unsigned integer. + } flags; + + uint32 numShaderEngines; ///< Number of non-harvested shader engines. + uint32 numShaderArrays; ///< Number of shader arrays. + uint32 numCusPerShaderArray; ///< Number of CUs per shader array that are actually usable. + uint32 maxCusPerShaderArray; ///< Maximum number of CUs per shader array. Count of physical CUs prior to + ///< harvesting CUs for yield in certain variants of ASICs (ex: Fiji PRO). 
+ uint32 numSimdsPerCu; ///< Number of SIMDs per compute unit. + uint32 numWavefrontsPerSimd; ///< Number of wavefront slots in each SIMD. + uint32 numActiveRbs; ///< Number of active Renderbackends + uint32 nativeWavefrontSize; ///< The native wavefront size. + uint32 minWavefrontSize; ///< The smallest supported wavefront size. + uint32 maxWavefrontSize; ///< All powers of two between the min size and max size are supported. + uint32 numAvailableSgprs; ///< Number of available SGPRs. + uint32 sgprsPerSimd; ///< Number of physical SGPRs per SIMD. + uint32 minSgprAlloc; ///< Minimum number of SGPRs that can be allocated by a wave. + uint32 sgprAllocGranularity; ///< SGPRs are allocated in groups of this size. Meaning, if your shader + /// only uses 1 SGPR, you will still end up reserving this number of + /// SGPRs. + uint32 numAvailableVgprs; ///< Number of available VGPRs. + uint32 vgprsPerSimd; ///< Number of physical VGPRs per SIMD. + uint32 minVgprAlloc; ///< Minimum number of VGPRs that can be allocated by a wave. + uint32 vgprAllocGranularity; ///< VGPRs are allocated in groups of this size. Meaning, if your shader + /// only uses 1 VGPR, you will still end up reserving this number of + /// VGPRs. On hardware where wave32 is available, the granularity for a + /// wave64 shader is half of this value, but the VGPR allocation is + /// double. The same number of total physical registers is allocated for + /// each unit of allocation with either wave size. + uint32 ldsSizePerCu; ///< Local Data Store size available in bytes per CU. + uint32 ldsSizePerThreadGroup; ///< Local Data Store size available in bytes per thread-group. + uint32 ldsGranularity; ///< Local Data Store allocation granularity expressed in bytes. + uint32 gsPrimBufferDepth; ///< Hardware configuration for the GS prim buffer depth. + uint32 gsVgtTableDepth; ///< Hardware configuration for the GS VGT table depth. 
+ uint32 numOffchipTessBuffers; ///< Number of offchip buffers that are used for offchip tessellation to + /// pass data between shader stages. + uint32 offchipTessBufferSize; ///< Size of each buffer used for passing data between shader stages when + /// tessellation passes data using off-chip memory. + uint32 tessFactorBufSizePerSe; ///< Size of GPU's the tessellatio-factor buffer, per shader engine. + uint32 tccSizeInBytes; ///< Size of total L2 TCC cache in bytes. + uint32 tcpSizeInBytes; ///< Size of one L1 TCP cache in bytes. There is one TCP per CU. + uint32 maxLateAllocVsLimit; ///< Maximum number of VS waves that can be in flight without + /// having param cache and position buffer space. + uint32 shaderPrefetchBytes; ///< Number of bytes the SQ will prefetch, if any. + uint32 gl1cSizePerSa; ///< Size in bytes of GL1 cache per SA. + uint32 instCacheSizePerCu; ///< Size in bytes of instruction cache per CU/WGP. + uint32 scalarCacheSizePerCu; ///< Size in bytes of scalar cache per CU/WGP. + uint32 numAvailableCus; ///< Total number of CUs that are actually usable. + uint32 numPhysicalCus; ///< Count of physical CUs prior to harvesting. + /// Mask of active pixel packers. The mask is 128 bits wide, assuming a max of 32 SEs and a max of 4 pixel + /// packers (indicated by a single bit each) per SE. + uint32 activePixelPackerMask[ActivePixelPackerMaskDwords]; + /// Mask of present, non-harvested CUs (Virtual Layout) + uint32 activeCuMask[MaxShaderEngines][MaxShaderArraysPerSe]; + } shaderCore; ///< Properties of computational power of the shader engine. 
+ + } gfxipProperties; + + struct + { + union + { + struct + { + uint32 supportTurboSync : 1; ///< Whether TurboSync is supported by KMD + uint32 enableUmdFpsCap : 1; ///< Whether UMD FPS CAP enabled + uint32 isCwgSupported : 1; ///< KMD supports Creator Who Game (CWG) feature + uint32 isGamingDriver : 1; ///< KMD works in gaming mode +#if defined(_WIN32) + uint32 canShareSemaphoreKmtHandle : 1; ///< OS supports sharing queue semaphores by KMT handle. +#else + uint32 placeholder0 : 1; +#endif + uint32 ifhModeEnabled : 1; ///< Whether the IFH mode is enabled + uint32 requireFrameEnd : 1; ///< If the client must tag the last command buffer + /// submission in each frame with a @ref CmdBufInfo with + /// the frameEnd flag set. + uint32 supportNativeHdrWindowing : 1; ///< Support HDR presentation that does not require FSE. + uint32 flipQueueSupportsDecodeDst : 1; ///< If set, Decode destination images are supported + /// in the OS flip-queue. + uint32 supportFreeMux : 1; ///< Whether FreeMux is supported by KMD + uint32 isDataCenterBoard : 1; ///< Whether the current board in use is a Data Center board. + /// This is meant to support a unified VDI/CG driver package. +#if defined(__unix__) + uint32 hasPrimaryDrmNode : 1; ///< Set if the device has a primary DRM node. + uint32 hasRenderDrmNode : 1; ///< Set if the device has a render DRM node. +#else + uint32 placeholder1 : 2; +#endif + uint32 forceAlignmentSupported : 1; ///< If PalPublicSettings::hardwareBufferAlignmentMode + /// has any effect. + uint32 reserved : 18; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< OS-specific property flags. + +#if (PAL_KMT_BUILD) + int32 luidHighPart; ///< High 32 bits of the LUID (locally unique identifier) for this device. + uint32 luidLowPart; ///< Low 32 bits of the LUID (locally unique identifier) for this device. + bool supportNtHandle; ///< Support export/import resource with NT handle. 
+ bool supportNativefence; ///< Expose the capability of native fence support to client in DeviceProperties. +#endif + + union + { + struct + { + uint32 support : 1; ///< Support Timeline type semaphore. + uint32 supportHostQuery : 1; ///< Support Timeline type semaphore host query. + uint32 supportHostWait : 1; ///< Support Timeline type semaphore host wait. + uint32 supportHostSignal : 1; ///< Support Timeline type semaphore host signal. + uint32 supportWaitBeforeSignal : 1; ///< Support Timeline type semaphore wait before signal. + + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; + } timelineSemaphore; + +#if defined(__unix__) + bool supportOpaqueFdSemaphore; ///< Support export/import semaphore as opaque fd in linux KMD. + bool supportSyncFileSemaphore; ///< Support export/import semaphore as sync file in linux KMD. + bool supportSyncFileFence; ///< Support export/import fence as sync file in linux KMD. +#endif + + bool supportQueuePriority; ///< Support create queue with priority + bool supportDynamicQueuePriority; ///< Support set the queue priority through IQueue::SetExecutionPriority + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) + bool supportMemoryBudgetQuery; ///< Support memory budget query through IDevice::QueryGpuMemoryBudgetInfo +#endif + +#if defined(_WIN32) + bool supportArbitaryPrtMapUnmap; ///< Support arbitary prt map unmap operation. +#endif + + uint32 umdFpsCapFrameRate; ///< The frame rate of the UMD FPS CAP + VirtualDisplayCapabilities virtualDisplayCaps; ///< Capabilities of virtual display, it's provided by KMD + + union + { + struct + { + uint32 supportDevice : 1; ///< GPU time domain + uint32 supportClockMonotonic : 1; ///< POSIX CLOCK_MONOTONIC time domain + uint32 supportClockMonotonicRaw : 1; ///< POSIX CLOCK_MONOTONIC_RAW time domain + uint32 supportQueryPerformanceCounter : 1; ///< Windows Query Performance Counter time domain + + uint32 reserved : 28; ///< Reserved for future use. 
+ }; + uint32 u32All; + } timeDomains; + +#if defined(_WIN32) + char driverStorePath[Util::MaxPathStrLen]; ///< Driver store path +#endif + +#if defined(__unix__) + int64 primaryDrmNodeMajor; ///< DRM primary node major number. + int64 primaryDrmNodeMinor; ///< DRM primary node minor number. + int64 renderDrmNodeMajor; ///< DRM render node major number. + int64 renderDrmNodeMinor; ///< DRM render node minor number. +#endif + union + { + struct + { + uint32 supportPostflip : 1; ///< KMD support DirectCapture post-flip access + uint32 supportPreflip : 1; ///< KMD support DirectCapture pre-flip access + uint32 supportRSync : 1; ///< KMD support RSync + uint32 maxFrameGenRatio : 4; ///< Maximum frame generation ratio or zero if not supported + uint32 supportNonPrimary : 1; ///< KMD support non-primary DirectCapture auxiliary data + uint32 reserved : 24; ///< Reserved for future use. + }; + uint32 u32All; + } directCapture; + } osProperties; ///< OS-specific properties of this device. + + struct + { + uint32 domainNumber; ///< PCI bus number. + uint32 busNumber; ///< PCI bus number. + uint32 deviceNumber; ///< PCI device number. + uint32 functionNumber; ///< PCI function number. + + union + { + struct + { + uint32 gpuConnectedViaThunderbolt : 1; ///< Device is an externally housed GPU connected to the system + /// via Thunderbolt. This will drastically impact CPU read and + /// write performance of memory in the @ref GpuHeapLocal heap. + uint32 gpuEmulatedInSoftware : 1; ///< Device is really a software package which emulates the + /// GPU. This is meant for pre-silicon development. + uint32 gpuEmulatedInHardware : 1; ///< Device is a hardware emulated GPU. This is meant for + /// pre-silicon development. + uint32 gpuVirtualization : 1; ///< Set if running under VM. + uint32 atomicOpsSupported : 1; ///< Set if pcie atomic is supported. + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. 
+ } flags; ///< PCI bus property flags. + } pciProperties; ///< PCI bus properties of this device. + + BigSoftwareReleaseInfo bigSoftwareReleaseInfo; ///< Big Software (BigSW) Release Version information +}; + +/// Defines callback function to notify client of private screen changes. +typedef void (PAL_STDCALL *TopologyChangeNotificationFunc)(void* pClient); + +/// Defines callback function to notify client of the private screen removal. +typedef void (PAL_STDCALL *DestroyNotificationFunc)(void* pOwner); + +/// Specifies the private screen topology change notification data. +struct PrivateScreenNotifyInfo +{ + void* pClient; ///< Pointer to client, PAL uses this pointer as parameter when PAL + /// calls callback pfnOnTopology. + TopologyChangeNotificationFunc pfnOnTopology; ///< Pointer to client provided function. PAL should call this when + /// the topology change happens and let the client handle the change. + DestroyNotificationFunc pfnOnDestroy; ///< Pointer to client provided function. PAL should call this when + /// a private screen object is to be destroyed. The pOwner data is + /// passed at @ref IPrivateScreen::BindOwner() time. +}; + +/// Specifies fullscreen frame metadata control flags. Used for the KMD to notify clients about which types of frame +/// metadata it needs to send to KMD. The meaning depends on the context: +/// - During device finalization, client can set the flags indicating the specified metadata 'is supported' by client. +/// - During present, client can query these flags that indicate which metadata 'is enabled' currently so that the +/// client should send them to the KMD. +union FullScreenFrameMetadataControlFlags +{ + struct + { + uint32 timerNodeSubmission : 1; ///< Timer node submission, used for cases such as FRTC/FP/PFPA. + uint32 frameBeginFlag : 1; ///< FrameBegin flag on CmdBufInfo, see CmdBufInfo for details. + uint32 frameEndFlag : 1; ///< FrameEnd flag on CmdBufInfo, see CmdBufInfo for details.
+ uint32 primaryHandle : 1; ///< Pending primary handle for pre-flip primary access (PFPA) + uint32 p2pCmdFlag : 1; ///< P2P copy command. See CmdBufInfo comments for details. + uint32 forceSwCfMode : 1; ///< Force software crossfire mode. + uint32 postFrameTimerSubmission : 1; ///< It indicates whether the timer node submission at frame N is to + /// synchronize the flip of frame N (postFrameTimerSubmission == TRUE) + /// or N+1 (postFrameTimerSubmission == FALSE). + /// It's only valid when timerNodeSubmission is also set. + uint32 useHp3dForDwm : 1; ///< KMD informs (DX11) UMD to use HP3D for DWM or not (Output only). + uint32 expandDcc : 1; ///< KMD notifies UMD to expand DCC (Output only). + uint32 enableTurboSyncForDwm : 1; ///< Indicates DWM should turn on TurboSync (Output only). + uint32 enableDwmFrameMetadata : 1; ///< When cleared, no frame metadata should be sent for DWM (Output only). + uint32 flipIntervalOverride : 3; ///< KMD-UMD interface FLIP_INTERVAL_OVERRIDE, for KMD to request flip + /// interval override from UMD. + uint32 disableFreeMux : 1; ///< KMD notifies UMD to disable FreeMux. + uint32 maxFrameLatency : 2; ///< KMD can notify UMD to override the frame latency of an app. + uint32 sendMotionVectors : 1; ///< Send the motion vector in CmdBufInfo once per frame + uint32 sendDepth : 1; ///< Send the depth buffer in CmdBufInfo once per frame + uint32 sendCameraMatrix : 1; ///< Send the camera matrix in CmdBufInfo once per frame + uint32 sendHudLessImage : 1; ///< Send the HUD less image in CmdBufInfo once per frame + uint32 reserved : 11; ///< Reserved for future use. + + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Indicates the desired UMD behavior with timer node submission. +/// This is used to distinguish FP Vsync On + FreeSync Off case from HSync or FreeSync cases, the former case doesn't +/// hold flip while the latter cases do.
+enum class TimerNodeMode : uint32 +{ + Unspecified, ///< Unspecified, client can decide what to do with the timer submission. + ForceFlipHold, ///< Client must hold flip with the timer submission +}; + +/// Specifies fullscreen frame metadata control data, including FullScreenFrameMetadataControlFlags plus extended data. +/// According to KMD's design, the difference is that 'flags' can be used to indicate 'client caps' during device +/// initialization, while the 'data' is only passed from KMD to UMD. +struct PerSourceFrameMetadataControl +{ + FullScreenFrameMetadataControlFlags flags; ///< The frame metadata control flags + TimerNodeMode timerNodeSubmissionMode; ///< Desired UMD behavior with timer node submission +}; + +/// Specifies the texture optimization level to use for an image. +/// +/// @ingroup ResourceBinding +enum class ImageTexOptLevel : uint32 +{ + Default = 0, ///< Use device default setting + Disabled, ///< Disable texture filter optimization + Enabled, ///< Enable texture filter optimization + Maximum, ///< Maximum texture filter optimization + Count +}; + +/// Specifies properties for @ref IDevice finalization. Input structure to IDevice::Finalize(). +struct DeviceFinalizeInfo +{ + union + { + struct + { + uint32 supportPrivateScreens : 1; ///< Initializes private screen support. + uint32 requireFlipStatus : 1; ///< Requires initialization of flip status shared memory. + uint32 requireFrameMetadata : 1; ///< Requires initialization of frame metadata flags shared memory. + /// Clients should only set this flag on the master device in an + /// LDA chain. + uint32 internalGpuMemAutoPriority : 1; ///< Forces internal GPU memory allocation priorities to be + /// determined automatically. It is an error to set this flag + /// if the device does not report that it supports this feature. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Device finalization flags.
+ + /// Specifies which engines of each type should be created for the device. + struct + { + uint32 engines; ///< A mask of which engines are requested. + } requestedEngineCounts[EngineTypeCount]; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Bytes of CE RAM to be used by the client for each engine type. This value must be <= ceRamSizeAvailable reported + /// for that engine type. In the case where more than one engine of a given type is requested it is assumed each + /// engine of that type will use this amount of CE RAM so the total size of (ceRamSizeUsed * queueCounts) must be <= + /// ceRamSizeAvailable for that engine type. Each entry must be either zero or a multiple of 32 bytes. + size_t ceRamSizeUsed[EngineTypeCount]; +#endif + + /// @see PrivateScreenNotifyInfo + /// Private screen notify info, must be filled when supportPrivateScreens=1. The client pointer and callback are to + /// be saved in the device. PAL should call the callback when there is any topology (hotplug) change with the client + /// pointer as parameter. + PrivateScreenNotifyInfo privateScreenNotifyInfo; + + /// Fullscreen frame metadata control flags indicating the types of metadata that the client supports. + /// During adapter initialization, capable KMD notifies clients that it supports frame metadata, + /// clients should then set these flags on device finalization info, indicating which types of metadata the client + /// supports. + FullScreenFrameMetadataControlFlags supportedFullScreenFrameMetadata; + + /// Specifies the texture optimization level which only applies to internally-created views by PAL (e.g., for BLTs), + /// client-created views must use the texOptLevel parameter in ImageViewInfo. + ImageTexOptLevel internalTexOptLevel; +}; + +/// Reports the compatibility and available features when using two particular devices in a multi-GPU system. Output +/// structure from IDevice::GetMultiGpuCompatibility().
+struct GpuCompatibilityInfo +{ + union + { + struct + { + uint32 gpuFeatures : 1; ///< The devices have an exact feature match: same internal tiling, same + /// pipeline binary data, etc. + uint32 iqMatch : 1; ///< Devices produce images with the same precision. + uint32 peerTransferWrite : 1; ///< Peer-to-peer transfer writes are supported. See + /// IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). + uint32 peerTransferRead : 1; ///< Peer-to-peer transfers based on xmgi are supported. + /// See IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). + uint32 sharedMemory : 1; ///< Devices can share memory objects with IDevice::OpenSharedMemory(). + uint32 sharedSync : 1; ///< Devices can share queue semaphores with + /// IDevice::OpenSharedQueueSemaphore(). + uint32 shareThisGpuScreen : 1; ///< Either device can present to this device. Means that the device + /// indicated by the otherDevice param in + /// IDevice::GetMultiGpuCompatibility() can present to the device the + /// method was called on. + uint32 shareOtherGpuScreen : 1; ///< Either device can present to the other device. Means that the + /// device IDevice::GetMultiGpuCompatibility() was called on can present + /// to the GPU indicated by the otherGpu param. + uint32 peerEncode : 1; ///< Whether encoding HW can access FB memory of remote GPU in chain. + uint32 peerDecode : 1; ///< Whether decoding HW can access FB memory of remote GPU in chain. + uint32 peerTransferProtected : 1; ///< Whether protected content can be transferred over P2P. + uint32 crossGpuCoherency : 1; ///< Whether remote FB memory can be accessed without need for cache flush. + uint32 reserved : 20; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU compatibility flags. +}; + +/// Reports properties of a GPU memory heap.
+///
+/// @note The performance ratings represent an approximate memory throughput for a particular access scenario, but
+/// should not be taken as an absolute performance metric.
+struct GpuMemoryHeapProperties
+{
+    union
+    {
+        struct
+        {
+            uint32 cpuVisible       : 1;  ///< Accessible with IGpuMemory::Map()
+            uint32 cpuGpuCoherent   : 1;  ///< Cache coherent between the CPU and GPU.
+            uint32 cpuUncached      : 1;  ///< Not cached by CPU, but could still be GPU cached.
+            uint32 cpuWriteCombined : 1;  ///< CPU write-combined memory.
+            uint32 holdsPinned      : 1;  ///< GPU memory objects created by IDevice::CreatePinnedGpuMemory() are in
+                                          ///  this heap.
+            uint32 shareable        : 1;  ///< GPU memory objects in this heap can be shared between multiple devices.
+            uint32 supportsTmz      : 1;  ///< This heap supports TMZ allocations.
+            uint32 reserved         : 25; ///< Reserved for future use.
+        };
+        uint32 u32All;                    ///< Flags packed as 32-bit uint.
+    } flags;                              ///< GPU memory heap property flags.
+
+    gpusize  logicalSize;                 ///< Size of the heap in bytes. If HBCC is enabled, certain heaps may be
+                                          ///  virtualized and the logical size will exceed the physical size.
+    gpusize  physicalSize;                ///< Physical size of the heap in bytes
+};
+
+/// Reports properties of a specific GPU block required for interpreting performance experiment data from that block.
+/// See @ref PerfExperimentProperties.
+struct GpuBlockPerfProperties
+{
+    bool   available;               ///< If performance data is available for this block.
+    uint32 instanceCount;           ///< How many instances of this block are in the device.
+    uint32 maxEventId;              ///< Maximum event ID for this block.
+    uint32 maxGlobalOnlyCounters;   ///< Number of counters available only for global counts.
+    uint32 maxGlobalSharedCounters; ///< Total counters available including state shared between global and SPM.
+    uint32 maxSpmCounters;          ///< Counters available for streaming only.
+
+    /// If the instance group size is equal to one, every block instance has its own independent counter hardware.
+    /// PAL guarantees this is true for all non-DF blocks.
+    ///
+    /// Otherwise the instance group size will be a value greater than one which indicates how many sequential
+    /// instances share the same counter hardware. The client must take care to not enable too many counters within
+    /// each of these groups.
+    ///
+    /// For example, the DfMall block may expose 16 instances with 8 global counters but define a group size of 16.
+    /// In that case all instances are part of one massive group which uses one pool of counter state such that no
+    /// combination of DfMall counter configurations can exceed 8 global counters.
+    uint32 instanceGroupSize;
+};
+
+/// Reports performance experiment capabilities of a device.  Returned by IDevice::GetPerfExperimentProperties().
+struct PerfExperimentProperties
+{
+    PerfExperimentDeviceFeatureFlags features;  ///< Performance experiment device features.
+
+    size_t maxSqttSeBufferSize;   ///< SQTT buffer size per shader engine.
+    size_t sqttSeBufferAlignment; ///< SQTT buffer size and base address alignment.
+    uint32 shaderEngineCount;     ///< Number of shader engines.
+
+    /// Reports availability and properties of each device block.  The array holds one entry per GpuBlock
+    /// enumerator; the scoped enum's Count value is converted explicitly to size_t to form the array bound.
+    GpuBlockPerfProperties blocks[static_cast<size_t>(GpuBlock::Count)];
+};
+
+/// Reports maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images'
+/// elements are no larger than maxElementSize.
+struct LinearImageAlignments
+{
+    uint16 maxElementSize; ///< Maximum element size in bytes.
+    uint16 baseAddress;    ///< Minimum required base address alignment in bytes.
+    uint16 rowPitch;       ///< Minimum required row pitch alignment in bytes.
+    uint16 depthPitch;     ///< Minimum required depth pitch alignment in bytes.
+};
+
+/// Specifies image view type (i.e., 1D, 2D, 3D, or cubemap).
+///
+/// @ingroup ResourceBinding
+enum class ImageViewType : uint32
+{
+    Tex1d    = 0x0,  ///< 1D image view.
+    Tex2d    = 0x1,  ///< 2D image view.
+    Tex3d    = 0x2,  ///< 3D image view.
+    TexCube  = 0x3,  ///< Cubemap image view.
+
+    Count            ///< Number of image view types listed above.
+};
+
+/// Enumeration which defines the mode for magnification and minification sampling
+///
+/// @ingroup ResourceBinding
+enum XyFilter : uint32
+{
+    XyFilterPoint = 0,          ///< Use single point sampling
+    XyFilterLinear,             ///< Use linear sampling
+    XyFilterAnisotropicPoint,   ///< Use anisotropic with single point sampling
+    XyFilterAnisotropicLinear,  ///< Use anisotropic with linear sampling
+    XyFilterCount               ///< Number of XyFilter modes listed above.
+};
+
+/// Enumeration which defines the mode for volume texture sampling
+///
+/// @ingroup ResourceBinding
+enum ZFilter : uint32
+{
+    ZFilterNone = 0,            ///< Disable Z filtering
+    ZFilterPoint,               ///< Use single point sampling
+    ZFilterLinear,              ///< Use linear sampling
+    ZFilterCount                ///< Number of ZFilter modes listed above.
+};
+
+/// Enumeration which defines the mode for mip-map texture sampling
+///
+/// @ingroup ResourceBinding
+enum MipFilter : uint32
+{
+    MipFilterNone = 0,          ///< Disable Mip filtering
+    MipFilterPoint,             ///< Use single point sampling
+    MipFilterLinear,            ///< Use linear sampling
+    MipFilterCount              ///< Number of MipFilter modes listed above.
+};
+
+/// Specifies parameters for an image view descriptor controlling how a given texture is sampled
+///
+/// @ingroup ResourceBinding
+struct TexFilter
+{
+    union
+    {
+        struct
+        {
+            uint32 magnification : 2;  ///< Used with enum XyFilter for Plane magnification filtering
+            uint32 minification  : 2;  ///< Used with enum XyFilter for Plane minification filtering
+            uint32 zFilter       : 2;  ///< Used with enum ZFilter for volume texture filtering
+            uint32 mipFilter     : 2;  ///< Used with enum MipFilter for mip-map filtering
+            uint32 reserved      : 24; ///< Reserved for future use
+        };
+        uint32 u32All;                 ///< Value of flags bitfield
+    };
+};
+
+/// Determines if "TexFilter" should be ignored or not.
+enum class TexFilterMode : uint32
+{
+    Blend = 0x0,  ///< Use the filter method specified by the TexFilter structure
+    Min   = 0x1,  ///< Use the minimum value returned by the sampler, no blending op occurs
+    Max   = 0x2,  ///< Use the maximum value returned by the sampler, no blending op occurs
+    Count
+};
+
+/// Specifies how texture coordinates outside of texture boundaries are interpreted.
+///
+/// @ingroup ResourceBinding
+enum class TexAddressMode : uint32
+{
+    Wrap                  = 0x0,  ///< Repeat the texture.
+    Mirror                = 0x1,  ///< Mirror the texture by flipping it at every other coordinate interval.
+    Clamp                 = 0x2,  ///< Clamp the texture to the texture's edge pixel.
+    MirrorOnce            = 0x3,  ///< Mirror the texture once then clamp.
+    ClampBorder           = 0x4,  ///< Clamp the texture to the border color specified in the sampler.
+    MirrorClampHalfBorder = 0x5,  ///< Mirror the texture once then clamp the texture to half of the edge color.
+    ClampHalfBorder       = 0x6,  ///< Clamp the texture to half of the edge color.
+    MirrorClampBorder     = 0x7,  ///< Mirror the texture once then clamp the texture to the sampler's border color.
+    Count
+};
+
+/// Specifies how a border color should be chosen when the TexAddressMode::ClampBorder texture addressing is used by a
+/// sampler.
+///
+/// @ingroup ResourceBinding
+enum class BorderColorType : uint32
+{
+    White            = 0x0,  ///< White border color (1.0, 1.0, 1.0, 1.0).
+    TransparentBlack = 0x1,  ///< Transparent black border color (0.0, 0.0, 0.0, 0.0).
+    OpaqueBlack      = 0x2,  ///< Opaque black border color (0.0, 0.0, 0.0, 1.0).
+    PaletteIndex     = 0x3,  ///< Fetch border color from the border color palette.
+    Count
+};
+
+/// Residency maps are helper surfaces used in conjunction with PRT+.  They reflect the resident mip levels
+/// associated with a given UV region of the parent image.
+enum class PrtMapAccessType : uint32
+{
+    Raw                 = 0x0,  ///< Read / write the map image as a normal image.
+    Read                = 0x1,  ///< Read the residency map as floating point data
+    WriteMin            = 0x2,  ///< Write the residency map with min(existing,new)
+    WriteMax            = 0x3,  ///< Write the residency map with max(existing,new)
+    WriteSamplingStatus = 0x4,  ///< Write to the sampling status map.
+    Count
+};
+
+/// Specifies compression behavior for an IImage or image/buffer view.
+enum class CompressionMode : uint32
+{
+    Default                = 0, ///< Let PAL choose behavior.
+    ReadEnableWriteEnable  = 1, ///< Override default to force read and write compression on.
+    ReadEnableWriteDisable = 2, ///< Support reading compressed data, but force any writes to be uncompressed (keeping
+                                ///  physical metadata consistent).
+    ReadBypassWriteDisable = 3, ///< Bypass physical metadata on reads (assume decompressed), all writes will be
+                                ///  uncompressed and will write physical metadata marking updated blocks as being
+                                ///  uncompressed.  This mode is intended to handle placed resources that do not
+                                ///  want compression in memory allocations that have distributed compression enabled.
+                                ///  WARNING: Using this mode to read compressed data will result in corruption.
+    Count,
+};
+
+/// Specifies client compression behavior for an IImage.
+enum class ClientCompressionMode : uint32
+{
+    Default          = 0, ///< Let implementation decide whether to enable or disable
+    Enable           = 1, ///< Force enable
+    Disable          = 2, ///< Force disable
+    DisableClearOnly = 3, ///< Force enable for all image views except for image clears
+    Count,
+};
+
+/// Specifies parameters for a buffer view descriptor that control how a range of GPU memory is viewed by a shader.
+///
+/// Input to either CreateTypedBufferViewSrds() or CreateUntypedBufferViewSrds().  Used for any buffer descriptor,
+/// including read-only shader resources, UAVs, vertex buffers, etc.  The usage of stride and format depends on the
+/// expected shader instruction access:
+///
+/// + _Typed buffer_ access must set a valid format and channel mapping.
+/// + _Raw buffer_ access is indicated by setting an invalid format and setting stride to 1.
+/// + _Structured buffer_ access is indicated by setting an invalid format and setting stride to any value except 1.  A
+///   stride of 0 maps all view accesses to the first structure stored in memory.
+///
+/// _Typed buffer_ SRD's must be created using @ref IDevice::CreateTypedBufferViewSrds().
+/// _Raw buffer_ and _structured buffer_ SRD's must be created using @ref IDevice::CreateUntypedBufferViewSrds().
+///
+/// If necessary, PAL will adjust the out of bounds read/write behavior to match the client's API requirements based on
+/// the client defines - PAL_CLIENT_VULKAN, etc.
+///
+/// @ingroup ResourceBinding
+struct BufferViewInfo
+{
+    gpusize         gpuAddr;          ///< GPU memory virtual address where the buffer view starts, in bytes.
+                                      ///  Must be aligned to bytes-per-element for typed access.
+    gpusize         range;            ///< Restrict the buffer view to this many bytes.  Will be rounded down to a
+                                      ///  multiple of the stride.
+    gpusize         stride;           ///< Stride in bytes.  Must be aligned to bytes-per-element for typed access.
+    SwizzledFormat  swizzledFormat;   ///< Format and channel swizzle for typed access. Must be Undefined for structured
+                                      ///  or raw access.
+    CompressionMode compressionMode;  ///< Specify GFX12-style distributed compression mode override for this view.
+                                      ///  Only relevant if the backing memory pages enable compression.
+
+    union
+    {
+        struct
+        {
+            /// Set to have this surface independently bypass the MALL for read and / or write operations.
+            /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time.  Meaningful
+            /// only on GPUs that have supportsMall set in DeviceProperties.
+            uint32 bypassMallRead  : 1;
+            uint32 bypassMallWrite : 1;  ///< Write counterpart of bypassMallRead; see the comment above.
+            uint32 reserved        : 30; ///< Reserved for future use
+        };
+        uint32 u32All;                   ///< Value of flags bitfield
+    } flags;                             ///< Buffer view flags.
+};
+
+/// D3D12_VERTEX_BUFFER_VIEW structure. Match vertex offset mode.
+/// @ref ICmdBuffer::CmdSetVertexBuffers function.
+struct VertexBufferView
+{
+    gpusize gpuva;          ///< GPU virtual address to vertex buffer.
+    uint32  sizeInBytes;    ///< The size in bytes of the buffer.
+    uint32  strideInBytes;  ///< The size in bytes of each vertex entry.
+};
+
+/// Specifies an array of vertex buffer view descriptors.  PAL supports multiple vertex buffer modes which change how
+/// the GPU memory is viewed by a shader.  This struct wraps up the choice between these modes.
+///
+/// If offsetMode is true PAL will create "vertex offset mode" SRDs using the @ref VertexBufferView array.  This new SRD
+/// has stronger robustness.  It avoids that the last vertex is removed when buffer size is not multiple of the stride.
+///
+/// @see ICmdBuffer::CmdSetVertexBuffers
+struct VertexBufferViews
+{
+    uint32 firstBuffer;  ///< First vertex buffer slot to change.  Must be less than @ref MaxVertexBuffers.
+    uint32 bufferCount;  ///< The length of the buffer view array.
+    bool   offsetMode;   ///< If true, pVertexBufferViews must be used, otherwise pBufferViewInfos is used.
+
+    union
+    {
+        const VertexBufferView* pVertexBufferViews; ///< Offset mode vertex buffer views. Valid if offsetMode is true.
+        const BufferViewInfo*   pBufferViewInfos;   ///< Index mode vertex buffer views. Valid if offsetMode is false.
+    };
+};
+
+/// Specifies parameters for an image view descriptor controlling how a shader will view the specified image.
+///
+/// Input to CreateImageViewSrd().  Used for any image view descriptor, including read-only shader resources and UAVs.
+///
+/// @ingroup ResourceBinding
+struct ImageViewInfo
+{
+    const IImage*  pImage;          ///< Image associated with the view.
+    ImageViewType  viewType;        ///< 1D, 2D, 3D, or Cubemap.  Typically this should match the image type, but a
+                                    ///  Cubemap view can be imposed on a 2D array image.
+    SwizzledFormat swizzledFormat;  ///< Specifies the image view format and channel swizzle.  Must be compatible (same
+                                    ///  bit-widths per channel) with the image's base format.
+                                    ///  @note: YUV formats are invalid for an ImageView. A format should be chosen to be
+                                    ///  compatible with either the luma or chroma plane(s) of the YUV format.
+    SubresRange    subresRange;     ///< Specifies a subset of subresources to include in the view.  If the base Image
+                                    ///  has a YUV planar format, the number of array slices in the range must be 1.
+                                    ///  If zRange feature is used, the number of mips in the range must be 1.
+    float          minLod;          ///< Minimum mip level of detail to use for this view.
+
+    uint32         samplePatternIdx;  ///< Index into the currently bound MSAA sample pattern palette to be
+                                      ///  read/evaluated when samplepos shader instructions are executed on this
+                                      ///  view.  Can be ignored if the samplepos shader instruction will not be used.
+                                      ///  Must be less than MaxSamplePatternPaletteEntries.  See
+                                      ///  IDevice::SetSamplePatternPalette().
+    Range          zRange;            ///< Specifies the z offset and z range.
+
+    ImageTexOptLevel texOptLevel;     ///< Specifies the texture optimization level.
+
+    const IImage*    pPrtParentImg;   ///< Meaningful only if "mapAccess" is not "raw".
+    PrtMapAccessType mapAccess;       ///< Type of access to be done if "pImage" is a PRT+ meta-data image.
+                                      ///  See @ref ImageCreateInfo
+
+    ImageLayout possibleLayouts; ///< Union of all possible layouts this view can be in while accessed by this view.
+                                 ///  (ie. what can be done with this SRD without having a layout transition?)
+                                 ///  In DX, for example, it's possible that a texture SRV could be accessed in a state
+                                 ///  with all other read-only usages allowed, but a UAV must exclusively be accessed
+                                 ///  in the UNORDERED_ACCESS state.
+                                 ///  The primary purpose of this flag is to avoid compressed shader writes if a
+                                 ///  different usage does not support compression and PAL won't get an opportunity to
+                                 ///  decompress it (ie. a transition in a barrier)
+
+    CompressionMode compressionMode;  ///< Specify GFX12-style distributed compression mode override for this view.
+                                      ///  Only relevant if the backing IImage resource and its bound memory pages
+                                      ///  enable compression.  ReadBypassWriteDisable is only valid if compressionMode
+                                      ///  in ImageCreateInfo disables compressed write.
+
+    union
+    {
+        struct
+        {
+            /// Set to have this surface independently bypass the MALL for read and / or write operations.
+            /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time.  Meaningful
+            /// only on GPUs that have supportsMall set in DeviceProperties.
+            uint32 bypassMallRead  : 1;
+            uint32 bypassMallWrite : 1;
+
+            uint32 zRangeValid     : 1;  ///< Whether z offset / range value is valid.
+            uint32 includePadding  : 1;  ///< Whether internal padding should be included in the view range.
+
+            uint32 reserved        : 28; ///< Reserved for future use
+        };
+        uint32 u32All;                   ///< Value of flags bitfield
+    } flags;                             ///< Image view flags.
+};
+
+/// Specifies parameters controlling execution of sample instructions in a shader.  Input to CreateSamplerSrd().
+///
+/// @ingroup ResourceBinding
+struct SamplerInfo
+{
+    TexFilterMode   filterMode;      ///< Min/max filtering modes
+    TexFilter       filter;          ///< Filtering to apply to texture fetches.
+    TexAddressMode  addressU;        ///< Addressing mode for U texture coords outside of the [0..1] range.
+    TexAddressMode  addressV;        ///< Addressing mode for V texture coords outside of the [0..1] range.
+    TexAddressMode  addressW;        ///< Addressing mode for W texture coords outside of the [0..1] range.
+    float           mipLodBias;      ///< Bias for mipmap level of detail selection.
+    uint32          maxAnisotropy;   ///< Anisotropy value clamp when the filter mode is TexFilterAnisotropic.
+    CompareFunc     compareFunc;     ///< Comparison function to apply to fetched data.
+    float           minLod;          ///< High-resolution mipmap LOD clamp.
+    float           maxLod;          ///< Low-resolution mipmap LOD clamp.
+    BorderColorType borderColorType; ///< Selects border color when an address mode is TexAddressMode::ClampBorder.
+    uint32          borderColorPaletteIndex;  ///< Choose color from the border color palette when borderColorType is
+                                              ///  BorderColorType::PaletteIndex.
+    float           anisoThreshold;  ///< Opt-in, flags.useAnisoThreshold == 1 and flags.preciseAniso == 0.
+                                     ///  The value should be computed taking account the maxAnisotropy
+                                     ///  setting. This is a high resolution value which is quantized and
+                                     ///  clamped down to 3 bits to the domain [0.0, 0.875] for current Hw.
+                                     ///  We can interpret the functioning of the threshold value as follows.
+                                     ///  maxAnisotropy per-pixel can be 1, 2, 4, 8 or 16 (N).
+                                     ///  During sampling, the initial count (or S) is computed in Hw for each
+                                     ///  quad and the domain for current Hw is [0-16].
+                                     ///  Final sample count = min(pow(2, ceil(log2(S - anisoThreshold))), N)
+                                     ///  Note: when flags.useAnisoThreshold == 0, Pal will ignore this value
+                                     ///  and instead use a maximum of 0.25 at the highest anisotropic setting.
+                                     ///  It is important to be aware that this feature tunes quality vs
+                                     ///  performance, so care should be taken to not degrade image quality
+                                     ///  'noticeably' when enabling using this feature
+    uint32          perfMip;         ///< Controls the value of the PERF_MIP field in Sampler SRD's.
+                                     ///  This field basically controls the fractional part of the LOD
+                                     ///  calculation. If LOD is fractional, let us say 1.23, in this case
+                                     ///  you must average out your samples from both MIP 1 and 2. But if PERF_MIP
+                                     ///  is set to nonzero the HW will perform an optimization and may fetch
+                                     ///  from only 1 MIP.
+
+    // These values are used to define a filtering line used when sampling a residency map.  The defined
+    // slopes in both the X (U) and Y (V) directions are to avoid visible disconnects when sampling between
+    // different samples.
+    Offset2d        uvOffset;        ///< u/v offset value selectors.  Values specified are in
+                                     ///  log2 of fractions of pixel.  i.e., 1 / (1 << x).  Not all values
+                                     ///  are supported by all HW.
+    Offset2d        uvSlope;         ///< u/v slope value selectors.  Supported slope values are
+                                     ///  specified in degrees.  In the case of a 3D image, the supplied
+                                     ///  uvSlope.y is interpreted as wSlope.
+                                     ///     0      2.5
+                                     ///     1      3
+                                     ///     2      4
+                                     ///     3      5
+                                     ///     4      8
+                                     ///     5      16
+                                     ///     6      32
+                                     ///     7      64
+                                     ///     other values:  unsupported
+
+    union
+    {
+        struct
+        {
+            uint32 mgpuIqMatch              : 1; ///< Enables image compatibility for MGPU scenarios where paired devices
+                                                 ///  come from different hardware families.
+            uint32 preciseAniso             : 1; ///< Anisotropic filtering should prefer precision over speed.
+            uint32 unnormalizedCoords       : 1; ///< If set then always use unnormalized texture coordinates instead of
+                                                 ///  zero to one.  Only works under certain conditions (no mip filtering,
+                                                 ///  no computed LOD, no offsets, only edge or border clamp address modes)
+            uint32 truncateCoords           : 1; ///< If set then hardware will truncate mantissa instead of
+                                                 ///  rounding to nearest even in float point to fixed point
+                                                 ///  texture coordinate conversion
+            uint32 seamlessCubeMapFiltering : 1; ///< If set then there's filtering across the edges of the cube map.
+            uint32 prtBlendZeroMode         : 1; ///< Allow unmapped PRT texels to be treated as zero and blended with
+                                                 ///  mapped texels. If set to 0, the destination of the sample instruction
+                                                 ///  is written with all 0s when TFE == 0; if set to 1, treat unmapped
+                                                 ///  texels as zeros and blend them with other mapped texels, write the
+                                                 ///  result of this sample instruction to the destination GPRs.
+            uint32 useAnisoThreshold        : 1; ///< If set, Hw will use the value assigned in anisoThreshold, but
+                                                 ///  only if preciseAniso is set to 0, also.
+
+            /// This allows the sampler to turn off overriding anisotropic filtering when the resource view contains a
+            /// single mipmap level. Not all graphics IP supports overriding anisotropic filtering, and this flag will
+            /// be ignored for such GPUs.
+            uint32 disableSingleMipAnisoOverride : 1;
+
+            uint32 forResidencyMap          : 1; ///< Set if the surface being sampled is a residency map used in PRTs.
+                                                 ///  Only meaningful if the corresponding ImageView's mapAccess is set to
+                                                 ///  "read".  Only valid for devices that report the "PrtFeaturePrtPlus"
+                                                 ///  flag.
+            uint32 reserved                 : 23; ///< Reserved for future use
+        };
+        uint32 u32All;                           ///< Value of flags bitfield
+    } flags;                                     ///< Sampler flags.
+};
+
+/// Specifies which heuristic should be utilized for sorting children when box sorting is enabled
+enum class BoxSortHeuristic : uint32
+{
+    ClosestFirst    = 0x0,  ///< Traversal is ordered to enter the children that
+                            ///< intersect the ray closer to the ray origin first.
+                            ///< This is good baseline option. Default option for RT IP 1.x.
+    LargestFirst    = 0x1,  ///< Traversal is ordered to enter the children that have the largest
+                            ///< interval where the box intersects the ray first.
+                            ///< Good for shadow rays with terminate on first hit.
+    ClosestMidPoint = 0x2,  ///< Traversal is ordered to enter the children that have a midpoint in the interval
+                            ///< where the box intersects that has the lowest intersection time before clamping.
+                            ///< Good for reflection rays.
+    Disabled        = 0x3,  ///< Box sort and heuristic are disabled.
+    Count
+};
+
+/// Specifies parameter for creating a BvH (bounding volume hierarchy, used by ray-trace) descriptor
+struct BvhInfo
+{
+    const IGpuMemory* pMemory;       ///< Memory object holding the BVH nodes
+    gpusize           offset;        ///< Offset from memory address specified by pMemory. Combination of
+                                     ///  pMemory address and the offset must be 256 byte aligned.
+    gpusize           numNodes;      ///< Number of nodes in the view
+    uint32            boxGrowValue;  ///< Number of ULPs (unit in last place) to be added during ray-box test.
+
+    BoxSortHeuristic  boxSortHeuristic; ///< Specifies which heuristic should be utilized for
+                                        ///< sorting children when box sorting is enabled
+    union
+    {
+        struct
+        {
+            uint32    useZeroOffset        : 1;  ///< If set, SRD address is programmed to zero
+            uint32    returnBarycentrics   : 1;  ///< When enabled, ray intersection will return triangle barycentrics.
+                                                 ///  Note: Only valid if @see supportIntersectRayBarycentrics is true.
+
+            /// Set to have this surface independently bypass the MALL for read and / or write operations.
+            /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time.  Meaningful
+            /// only on GPUs that have supportsMall set in DeviceProperties.
+            uint32    bypassMallRead       : 1;
+            uint32    bypassMallWrite      : 1;
+            uint32    pointerFlags         : 1;  ///< If set, flags are encoded in the node pointer bits
+            uint32    highPrecisionBoxNode : 1;  ///< If set, enable 64-byte high precision box node
+            uint32    wideSort             : 1;  ///< If set, enable wide sort
+            uint32    hwInstanceNode       : 1;  ///< If set, enable hardware instance node
+            uint32    sortTrianglesFirst   : 1;  ///< If set, triangle nodes are treated specially during child sorting
+            uint32    compressedFormatEn   : 1;  ///< If set, enable compressed format support. This includes enabling
+                                                 ///  support for compressed primitive packets, BVH8-128B box nodes,
+                                                 ///  and changes to triangle intersection test return data.
+            uint32    reserved             : 22; ///< Reserved for future HW
+        };
+
+        uint32  u32All;                          ///< Flags packed as 32-bit uint.
+    } flags;                                     ///< BVH creation flags.
+};
+
+/// Specifies parameters for an fmask view descriptor.
+///
+/// Input to CreateFmaskViewSrd().  Allows the client to access fmask from a shader using the load_fptr IL instruction.
+///
+/// @ingroup ResourceBinding
+struct FmaskViewInfo
+{
+    const IImage* pImage;          ///< Image associated with the fmask view.
+    uint32        baseArraySlice;  ///< First slice in the view.
+    uint32        arraySize;       ///< Number of slices in the view.
+
+    union
+    {
+        struct
+        {
+            uint32 shaderWritable : 1;  ///< True if used with an image that has been transitioned to a shader-
+                                        ///  writable image state (e.g. [Graphics|Compute][WriteOnly|ReadWrite])
+            uint32 reserved       : 31; ///< Reserved for future use
+        };
+        uint32 u32All;                  ///< Value of flags bitfield
+    } flags;                            ///< Fmask view flags
+};
+
+/// Element of the multisample pattern representing a sample position (X, Y), type of SamplePatternPalette, which
+/// matches the layout defined by SC.
+struct SamplePos
+{
+    float  x;          ///< x coordinate of sample position.
+    float  y;          ///< y coordinate of sample position.
+    uint32 reserved1;  ///< reserved for future use
+    uint32 reserved2;  ///< reserved for future use
+};
+
+/// Specifies a palette of MSAA sample patterns used by the client.  Input to SetSamplePatternPalette, which is used
+/// to implement samplepos shader instruction support.
+typedef SamplePos SamplePatternPalette[MaxSamplePatternPaletteEntries][MaxMsaaRasterizerSamples];
+
+/// Provides a GPU timestamp along with the corresponding CPU timestamps, for use in calibrating CPU and GPU timelines.
+struct CalibratedTimestamps
+{
+    uint64 gpuTimestamp;                  ///< GPU timestamp value compatible with ICmdBuffer::CmdWriteTimestamp().
+    uint64 cpuClockMonotonicTimestamp;    ///< POSIX CLOCK_MONOTONIC timestamp
+    uint64 cpuClockMonotonicRawTimestamp; ///< POSIX CLOCK_MONOTONIC_RAW timestamp
+    uint64 cpuQueryPerfCounterTimestamp;  ///< Windows QueryPerformanceCounter timestamp
+    uint64 maxDeviation;                  ///< Maximum deviation in nanoseconds between the GPU and CPU timestamps
+};
+
+/// Specifies connector types
+enum class DisplayConnectorType : uint32
+{
+    Unknown = 0,    ///< Unknown connector type
+    Vga,            ///< VGA
+    DviD,           ///< DVI_D
+    DviI,           ///< DVI_I
+    Hdmi,           ///< HDMI
+    Dp,             ///< DP
+    Edp,            ///< EDP
+    Minidp,         ///< MINI_DP
+    Count
+};
+
+/// Specifies properties for display connectors connected to GPU
+struct DisplayConnectorProperties
+{
+    DisplayConnectorType type;      ///< Connector type - VGA, DVI, HDMI, DP etc
+};
+
+/// Specifies pre-defined power profile which is used to communicate with KMD/PPLib and set the corresponding power
+/// states.
+enum class PowerProfile : uint32
+{
+    Default   = 0,  ///< Default power profile.
+    VrCustom  = 1,  ///< Power profile used by custom VR scenario.
+    VrDefault = 2,  ///< Power profile used by default VR scenario.
+    Idle      = 3,  ///< Power profile used for forced DPM0, in case HMD is taken off but the game is still running.
+    Count
+};
+
+/// Fine-grain power switch info.
+struct PowerSwitchInfo
+{
+    uint32 time;         ///< Time in microseconds, relative to the frame start at V-sync.  Clients should consider the
+                         ///  powerSwitchLatency value reported in @ref PrivateScreenProperties when specifying
+                         ///  switch times.
+
+    uint32 performance;  ///< Performance to be set (between 0-100), which is mapped to a certain DPM level by KMD.
+};
+
+/// Maximum number of power switch info allowed in one custom power profile.
+static constexpr uint32 MaxNumPowerSwitchInfo = 5;
+
+/// Fine-grain power management for dynamic power mode.  This structure specifies multiple DPM states to be cycled
+/// through each frame.
+struct CustomPowerProfile
+{
+    IPrivateScreen* pScreen;        ///< Dynamic power mode needs V-sync so a private screen object is needed.
+    uint32          numSwitchInfo;  ///< Number of discrete DPM states to cycle through per frame.  Number of entries
+                                    ///  in switchInfo[] and actualSwitchInfo[].
+
+    PowerSwitchInfo switchInfo[MaxNumPowerSwitchInfo];  ///< Specifies the set of power states to cycle through each
+                                                        ///  frame.  Each entry specifies an offset into the frame where
+                                                        ///  the DPM state should be switched, and a rough performance
+                                                        ///  requirement value which will be translated into an
+                                                        ///  appropriate DPM state by KMD.
+    PowerSwitchInfo actualSwitchInfo[MaxNumPowerSwitchInfo]; ///< The actual set of power states that KMD/PPLib sets.
+};
+
+/// Flags for IDevice::AddGpuMemoryReferences().  Depending on their residency model, a client may set these flags as
+/// directed by the application or hard-code them to a single value.  Driver-internal memory references should be marked
+/// as CantTrim unless the client explicitly handles trim support.
+///
+/// Note that the CantTrim and MustSucceed flags are based on the same WDDM flags; it is expected that PAL will ignore
+/// them on non-WDDM platforms.
+enum GpuMemoryRefFlags : uint32
+{
+    GpuMemoryRefCantTrim    = 0x1, ///< The caller can't or won't free this allocation on OS request.
+    GpuMemoryRefMustSucceed = 0x2, ///< Hint to the OS that we can't process a failure here, this may result in a TDR.
+    GpuMemoryRefAllFlags    = 0x3  ///< Bitwise OR of the two flags above.  Clients should NOT use it, for internal
+                                   ///  static_assert purpose only.
+};
+
+/// Specifies input arguments for IDevice::GetPrimaryInfo().  Client must specify a display ID and properties of the
+/// primary surface that will drive that display in order to query capabilities.
+struct GetPrimaryInfoInput
+{
+    uint32          vidPnSrcId;      ///< Video present source id.
+    uint32          width;           ///< Primary surface width.
+    uint32          height;          ///< Primary surface height.
+    SwizzledFormat  swizzledFormat;  ///< Format and swizzle of the primary surface.
+    Rational        refreshRate;     ///< Video refresh rate, this is only valid if refreshRateValid is set.
+    union
+    {
+        struct
+        {
+            uint32 qbStereoRequest            :  1; ///< Going to set a stereo mode.
+            uint32 refreshRateValid           :  1; ///< Refresh rate is valid.
+            uint32 freeSyncInCrossFireSupport :  1; ///< True if client supports FreeSync in CrossFire.
+            uint32 useKmdCalcFramePacing      :  1; ///< True if client uses KMD frame pacing. If so, the client
+                                                    ///  creates a timer queue to delay the present, and the delay
+                                                    ///  value is calculated by KMD.
+            uint32 reserved                   : 28; ///< Reserved for future use.
+        };
+        uint32 u32All;                              ///< Flags packed as 32-bit uint.
+    } flags;                                        ///< Get primary surface info input flags.
+};
+
+/// Specifies output arguments for IDevice::GetStereoDisplayModes(), returning supported stereo mode
+struct StereoDisplayModeOutput
+{
+    Extent2d       extent;       ///< Dimensions in pixels WxH.
+    Rational       refreshRate;  ///< Refresh rate.
+    SwizzledFormat format;       ///< Format and swizzle of the primary surface.
+};
+
+/// Specifies output arguments for IDevice::GetActive10BitPackedPixelMode(), returning which, if any, 10-bit
+/// display mode is active.
+struct Active10BitPackedPixelModeOutput
+{
+    bool            isInWs10BitMode;         ///< Whether the workstation 10-bit feature is enabled.
+    bool            notifyKmd10bitsPresent;  ///< When in 10-bit mode and at present time, if the
+                                             ///  client driver sees a 10-bit to 8-bit surface blt,
+                                             ///  it needs to call RequestKmdReinterpretAs10Bit() to
+                                             ///  inform the KMD that the dst surface must be reinterpreted
+                                             ///  as 10-bits per channel for all KMD-initiated BLTs.
+    PackedPixelType packedPixelType;         ///< Format of the packed pixels.
+    uint32          pixelPackRatio;          ///< The number of 10-bit pixels that are packed into one 8-8-8-8
+                                             ///  format pixel.
+};
+
+/// Specifies primary surface stereo mode.
+enum StereoMode : uint32
+{
+    StereoModeHwAlignedViews = 0,  ///< The stereo views are HW aligned on the display.
+    StereoModeSwPackedViews  = 1,  ///< The layout of the stereo views on the display are determined by the client.
+    StereoModeNotSupported   = 2,  ///< Stereo mode is not supported.
+    StereoModeSideBySide     = 3,  ///< The two stereo views are put side by side on the display.
+    StereoModeTopBottom      = 4   ///< One stereo view is on the top of the display, and the other is on the bottom.
+};
+
+/// Enumerates the supported workstation stereo modes.
+enum class WorkstationStereoMode : uint32
+{
+    Disabled,                  ///< Workstation stereo is disabled.
+    ViaConnector,              ///< Active Stereo for 3 Pin VESA connector.
+    ViaBlueLine,               ///< Blue line Active Stereo for laptops.
+    Passive,                   ///< Passive Stereo (Dual head).
+    PassiveInvertRightHoriz,   ///< Passive Stereo with Horizontal Invert (Dual Head).
+    PassiveInvertRightVert,    ///< Passive Stereo with Vertical Invert (Dual Head).
+    Auto,                      ///< Auto Stereo Vertical Interleaved.
+    AutoHoriz,                 ///< Auto Stereo Horizontal Interleaved.
+    AutoCheckerboard,          ///< Auto Stereo Checkerboard Interleaved.
+    AutoTsl,                   ///< Tridelity SL Auto Stereo.
+    Count,
+};
+
+/// Specifies output arguments for IDevice::GetPrimaryInfo(), returning capability information for a display in
+/// a particular mode.
+struct GetPrimaryInfoOutput
+{
+    uint32     tilingCaps;           ///< Tiling caps supported by this primary surface.
+    StereoMode stereoMode;           ///< Stereo mode supported by this primary surface.
+    uint32     mallCursorCacheSize;  ///< Size of the mall cursor cache in bytes
+    union
+    {
+        struct
+        {
+            /// MGPU flag: this primary surface supports DVO HW compositing mode.
+            uint32 dvoHwMode                   :  1;
+            /// MGPU flag: this primary surface supports XDMA HW compositing mode.
+            uint32 xdmaHwMode                  :  1;
+            /// MGPU flag: this primary surface supports client doing SW compositing mode.
+            uint32 swMode                      :  1;
+            /// MGPU flag: this primary surface supports freesync.
+            uint32 isFreeSyncEnabled           :  1;
+            /// Single-GPU flag: gives hint to the client that they should use rotated tiling mode.
+            uint32 hwRotationPortraitMode      :  1;
+            /// Single-GPU flag: this primary surface supports non local heap.
+            uint32 displaySupportsNonLocalHeap :  1;
+            /// Reserved for future use.
+            uint32 reserved                    : 26;
+        };
+        uint32 u32All;    ///< Flags packed as 32-bit uint.
+    } flags;              ///< Get primary surface support info output flags.
+};
+
+/// Specifies different clock modes that the device can be set to.
+enum class DeviceClockMode : uint32
+{
+    Default        = 0,  ///< Device clocks and other power settings are restored to default.
+    Query          = 1,  ///< Queries the current device clock ratios. Leaves the clock mode of the device unchanged.
+    Profiling      = 2,  ///< Scale down from peak ratio. Clocks are set to a constant amount which is
+                         ///  known to be power and thermal sustainable. The engine/memory clock ratio
+                         ///  will be kept the same as much as possible.
+    MinimumMemory  = 3,  ///< Memory clock is set to the lowest available level. Engine clock is set to
+                         ///  thermal and power sustainable level.
+    MinimumEngine  = 4,  ///< Engine clock is set to the lowest available level. Memory clock is set to
+                         ///  thermal and power sustainable level.
+    Peak           = 5,  ///< Clocks set to maximum when possible. Fan set to maximum. Note: Under power
+                         ///  and thermal constraints device will clock down.
+    QueryProfiling = 6,  ///< Queries the profiling device clock ratios. Leaves the clock mode of the device unchanged.
+    QueryPeak      = 7,  ///< Queries the peak device clock ratios. Leaves the clock mode of the device unchanged.
+    Count
+};
+
+/// Specifies output argument of IDevice::SetClockMode.  The caller can read the clocks the device is currently
+/// running by querying using the mode DeviceClockMode::Query.
+struct SetClockModeOutput +{ + uint32 memoryClockFrequency; ///< Current mem clock (absolute) value in MHz + uint32 engineClockFrequency; ///< Current gpu core clock (absolute) value in MHz +}; + +/// Specifies input argument to IDevice::SetClockMode. The caller must specify the mode in which to set the device. +struct SetClockModeInput +{ + DeviceClockMode clockMode; ///< Used to specify the clock mode for the device. +}; + +/// Specifies primary surface MGPU compositing mode. +enum MgpuMode : uint32 +{ + MgpuModeOff = 0, ///< MGPU compositing mode off, the client does not do SW compositing at all, e.g. AFR disabled. + MgpuModeSw = 1, ///< MGPU SW compositing mode, the client handles the SW compositing. + MgpuModeDvo = 2, ///< MGPU DVO HW compositing mode + MgpuModeXdma = 3, ///< MGPU XDMA HW compositing mode + MgpuModeCount +}; + +/// Specifies input arguments for IDevice::SetMgpuMode(). A client sets a particular MGPU compositing mode and whether +/// frame pacing is enabled for a display. +struct SetMgpuModeInput +{ + uint32 vidPnSrcId; ///< Video present source id. + MgpuMode mgpuMode; ///< Primary surface MGPU compositing mode. + bool isFramePacingEnabled; ///< True if frame pacing is enabled. If so, the client creates a timer queue + /// to delay the present, and the delay value is calculated by KMD. +}; + +constexpr uint32 XdmaMaxDevices = 8; ///< Maximum number of Devices for XDMA compositing. + +/// Specifies XDMA cache buffer info for each gpu. +struct XdmaBufferInfo +{ + uint32 bufferSize; ///< XDMA cache buffer size of each device + uint32 startAlignment; ///< XDMA cache buffer start alignment of each device +}; + +/// Specifies output arguments for IDevice::GetXdmaInfo(), returning the XDMA cache buffer information of each GPU for +/// a display. +struct GetXdmaInfoOutput +{ + XdmaBufferInfo xdmaBufferInfo[XdmaMaxDevices]; ///< Output XDMA cache buffer info, one entry per device. +}; + +/// Specifies flipping status flags on a specific VidPnSource. It's Windows specific.
+union FlipStatusFlags +{ + struct + { + uint32 immediate : 1; ///< Is immediate flip + uint32 dwmFlip : 1; ///< Is DWM conducted flip + uint32 iFlip : 1; ///< Is independent exclusive flip + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies the VSync mode of virtual display. +enum class VirtualDisplayVSyncMode : uint32 +{ + Default = 0, ///< Using the default VSync mode based on refresh rate + Immediate = 1, ///< The presentation should be executed immediately without waiting for vsync to display + HMD = 2, ///< Using HMD VSync, the HMD is specified by pPrivateScreen + Count +}; + +/// The VirtualDisplayInfo is provided by the application and KMD uses it to create a virtual display. +/// @see IDevice::CreateVirtualDisplay. +struct VirtualDisplayInfo +{ + uint32 width; ///< Horizontal dimension in pixels + uint32 height; ///< Vertical dimension in pixels + Rational refreshRate; ///< Refresh rate of virtual display + VirtualDisplayVSyncMode vsyncMode; ///< VSync mode + uint32 vsyncOffset; ///< VSync front porch location in pixels or lines. + /// It's needed when vsyncMode is HMD. + Pal::IPrivateScreen* pPrivateScreen; ///< A pointer to IPrivateScreen. + /// It's needed when vsyncMode is HMD. +}; + +/// Function pointer type definition for creating a buffer view SRD. +/// +/// @see IDevice::CreateTypedBufferViewSrds()/CreateUntypedBufferViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. +/// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written.
+/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateBufferViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut); + +/// Function pointer type definition for creating an image view SRD. +/// +/// @see IDevice::CreateImageViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of image view SRDs to create; size of the pImgViewInfo array. +/// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateImageViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const ImageViewInfo* pImgViewInfo, + void* pOut); + +/// Function pointer type definition for creating an fmask view SRD. +/// +/// @see IDevice::CreateFmaskViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. +/// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateFmaskViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const FmaskViewInfo* pFmaskViewInfo, + void* pOut); + +/// Function pointer type definition for creating a sampler SRD. +/// +/// @see IDevice::CreateSamplerSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. +/// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written.
+/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateSamplerSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const SamplerInfo* pSamplerInfo, + void* pOut); + +/// Function pointer type definition for creating a ray tracing (BVH) SRD. +/// +/// @see IDevice::CreateBvhSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. +/// @param [in] pBvhInfo Array of BVH descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateBvhSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const BvhInfo* pBvhInfo, + void* pOut); + +/// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. +/// @param [out] pViewInfo Pointer to memory to be filled with decoded info. +typedef void (PAL_STDCALL *DecodeBufferViewSrdFunc)( + const IDevice* pDevice, + const void* pBufferViewSrd, + BufferViewInfo* pViewInfo); + +/// Structure filled in by DecodeImageViewSrd(). +struct DecodedImageSrd +{ + SwizzledFormat swizzledFormat; ///< SRD's swizzled format. + SubresRange subresRange; ///< SRD's subresource range; the array-range is always { 0, 1 } for 3D images. + Range zRange; ///< z-range of the SRD's subresRange.startSubres. +}; + +/// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] pImage The same image the SRD was created on. +/// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. +/// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info.
+typedef void (PAL_STDCALL *DecodeImageViewSrdFunc)( + const IDevice* pDevice, + const IImage* pImage, + const void* pImageViewSrd, + DecodedImageSrd* pDecodedInfo); + +/// Function pointer table for SRD methods. +struct DeviceInterfacePfnTable +{ + CreateBufferViewSrdsFunc pfnCreateTypedBufViewSrds; ///< Typed Buffer view SRD creation function pointer. + CreateBufferViewSrdsFunc pfnCreateUntypedBufViewSrds; ///< Untyped Buffer view SRD creation function ptr. + CreateImageViewSrdsFunc pfnCreateImageViewSrds; ///< Image view SRD creation function pointer. + CreateFmaskViewSrdsFunc pfnCreateFmaskViewSrds; ///< Fmask View SRD creation function pointer. + CreateSamplerSrdsFunc pfnCreateSamplerSrds; ///< Sampler SRD creation function pointer. + CreateBvhSrdsFunc pfnCreateBvhSrds; ///< BVH SRD creation function pointer. + DecodeBufferViewSrdFunc pfnDecodeBufferViewSrd; ///< Buffer SRD decode function pointer. + DecodeImageViewSrdFunc pfnDecodeImageViewSrd; ///< Image SRD decode function pointer. +}; + +/// Specifies output arguments for IDevice::QueryWorkstationCaps(), returning workstation feature information +/// on this device workstation board. +union WorkStationCaps +{ + struct + { + uint32 workStationBoard : 1; ///< Running a workstation driver on a workstation board. + /// On workstation boards that support CWG (Creator Who Game), + /// the user can switch to a Gaming/consumer driver on the + /// workstation board, and then this will be false. + uint32 supportWorkstationAppPerfOpt : 1; ///< Workstation boards have optimizations for kinds of workstation + /// applications. These optimizations are enabled if it is set. + uint32 supportWorkstationEdgeFlag : 1; ///< Workstation boards have a DX9 feature that edge flag can be + /// exported via point size output in VS. The feature is enabled + /// if it is set. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint.
+}; + +/// FrameLock/GenLock support state enum +enum class FlglSupport : uint32 +{ + NotAvailable = 0, ///< FL/GL not supported by the GPU + NotConnected = 1, ///< FL/GL support available in the GPU, but is not connected to a GLSync board + Available = 2, ///< FL/GL support available and connected + Count +}; + +/// Container structure for FrameLock/GenLock state. +struct FlglState +{ + union + { + struct + { + uint32 genLockEnabled : 1; ///< True if genlock is currently enabled. Genlock is a system-wide setting + ///< in CCC. Genlock provides a signal source (which is used in framelock) + uint32 frameLockEnabled : 1; ///< True if (KMD) framelock is currently enabled. + ///< Framelock is the mechanism to sync all presents in multiple adapters. + uint32 isTimingMaster : 1; ///< True if the display being driven by the current adapter is the timing + ///< master in a genlock configuration + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Packed 32-bit uint value.
+ }; + FlglSupport support; ///< The state of the FLGL support in current adapter + uint32 firmwareVersion; ///< Firmware version number of the GLSync hardware (S400 board), if available +}; + +/// GlSync setting mask definition, used with GlSyncConfig +enum GlSyncConfigMask : uint32 +{ + GlSyncConfigMaskSignalSource = 0x00000001, + GlSyncConfigMaskSyncField = 0x00000002, + GlSyncConfigMaskSampleRate = 0x00000004, + GlSyncConfigMaskSyncDelay = 0x00000008, + GlSyncConfigMaskTriggerEdge = 0x00000010, + GlSyncConfigMaskScanRateCoeff = 0x00000020, + GlSyncConfigMaskFrameLockCntl = 0x00000040, + GlSyncConfigMaskSigGenFrequency = 0x00000080 +}; + +/// Specifies GlSync framelock control state +enum GlSyncFrameLockCtrl : uint32 +{ + GlSyncFrameLockCntlNone = 0x00000000, + GlSyncFrameLockCntlEnable = 0x00000001, + GlSyncFrameLockCntlDisable = 0x00000002, + GlSyncFrameLockCntlResetSwapCounter = 0x00000004, + GlSyncFrameLockCntlAckSwapCounter = 0x00000008, + GlSyncFrameLockCntlVersionKmd = 0x00000010 +}; + +/// Specifies GlSync Signal Source +enum GlSyncSignalSource : uint32 +{ + GlSyncSignalSourceGpuMask = 0x0FF, + GlSyncSignalSourceUndefined = 0x100, + GlSyncSignalSourceFreerun = 0x101, + GlSyncSignalSourceBncPort = 0x102, + GlSyncSignalSourceRj45Port1 = 0x103, + GlSyncSignalSourceRj45Port2 = 0x104 +}; + +/// Specifies GlSync Sync Field +enum GlSyncSyncField : uint8 +{ + GlSyncSyncFieldUndefined = 0, + GlSyncSyncFieldBoth = 1, + GlSyncSyncField1 = 2 +}; + +/// Specifies GlSync Sync Trigger Edge +enum GlSyncTriggerEdge : uint8 +{ + GlSyncTriggerEdgeUndefined = 0, + GlSyncTriggerEdgeRising = 1, + GlSyncTriggerEdgeFalling = 2, + GlSyncTriggerEdgeBoth = 3 +}; + +/// Specifies GlSync scan rate coefficient/multiplier options +enum GlSyncScanRateCoeff : uint8 +{ + GlSyncScanRateCoeffUndefined = 0, + GlSyncScanRateCoeffx5 = 1, + GlSyncScanRateCoeffx4 = 2, + GlSyncScanRateCoeffx3 = 3, + GlSyncScanRateCoeffx5Div2 = 4, + GlSyncScanRateCoeffx2 = 5, + GlSyncScanRateCoeffx3Div2 = 6, +
GlSyncScanRateCoeffx5Div4 = 7 +}; + +/// Container structure for FrameLock/GenLock config. +struct GlSyncConfig +{ + uint32 validMask; ///< Mask that specifies which settings are actually referred in the structure. + /// GlSyncConfigMask* + uint32 syncDelay; ///< Delay of sync signal in microseconds + uint32 framelockCntlVector; ///< Vector of Framelock control bits. GlSyncFrameLockCntl* + uint32 signalSource; ///< Source of sync signal. Can be House Sync, RJ45 Port or GPUPort. + /// GlSyncSignalSource* or GPUPort Index + uint8 sampleRate; ///< Number of VSyncs per sample. 0 - no sampling, synchronized by singal VSync. + uint8 syncField; ///< Sync to Field 1 or to both Fields when input signal is interlaced. + /// GlSyncSyncField* + uint8 triggerEdge; ///< Which edge should be used as trigger. GlSyncTriggerEdge* + uint8 scanRateCoeff; ///< Scan Rate Multiplier applied to original sync signal. GlSyncScanRateCoeff* + uint32 sigGenFrequency; ///< Frequency in mHz of internal signal generator +}; + +#if PAL_KMT_BUILD +/// The type of NT object, it decides which KMT function should be called to retrieve the NT handle. +enum class ExternalObjectType : uint32 +{ + Allocation = 0, ///< Specifies the NT object is an allocation, such as GPUMemory etc. + SyncObject = 1, ///< Specifies the NT object is a sync object, such as semaphore and fence etc. +}; + +/// Specifies the NT handle attributes when opening from name. +struct ExternalHandleInfo +{ + ExternalObjectType objectType; ///< The type of NT object, it decides which KMT function should + /// be called to retrieve the NT handle. + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT handle + /// with a name, and then the handle can be acquired via this + /// name. + uint32 accessFlags; ///< Desired access rights of the object the handle refers to.
+}; +#endif + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) +/// Gpu heap group enumeration. One heap group contains several pal GpuHeap. +enum GpuHeapGroup : uint32 +{ + GpuHeapGroupLocal = 0x0, ///< Local heap group includes GpuHeapLocal and GpuHeapInvisible on Windows. + /// But GpuHeapInvisible is not included on Linux. + GpuHeapGroupNonLocal = 0x1, ///< NonLocal heap group includes GpuHeapGartUswc and GpuHeapGartCacheable. +#if PAL_AMDGPU_BUILD + GpuHeapGroupInvisible = 0x2, ///< This is used on Linux as GpuHeapLocal and GpuHeapInvisible are not combined. +#endif + GpuHeapGroupCount, +}; + +/// Struct for querying current gpu memory usage info and budget info. +struct GpuMemoryBudgetInfo +{ +#if PAL_AMDGPU_BUILD + gpusize systemUsage[GpuHeapGroupCount]; ///< Current total memory usage of specified heap group of whole system. +#else + gpusize usage[GpuHeapGroupCount]; ///< Current total memory usage of specified heap group of current process. + gpusize budget[GpuHeapGroupCount]; ///< Current total memory budget of specified heap group of the device which + /// implies how much memory the device can allocate from that heap group + /// before allocations may fail or cause performance degradation, including + /// all allocated memory. Budget might be affected by OS status and other + /// processes. +#endif +}; +#endif + +/// Reclaim allocation result enumeration. +enum class ReclaimResult : uint8 +{ + Ok = 0, ///< Reclaim result is OK. + Discarded = 1, ///< Reclaim result is discarded. + NotCommitted = 2, ///< Reclaim result is not committed. + Count +}; + +#if defined(_WIN32) +/// KMD has an interface in place to support UMD registration of Events to be signaled based on various +/// system-level conditions. Each enum here represents a certain condition which will cause KMD to signal +/// the event Clients previously registered through RegisterEvent.
+enum class EventTrackingType : uint32 +{ + ShaderInterrupt = 0, + EarlyPresent = 1, + Count +}; + +/// Clients specify the Util::Event and EventTrackingType they want to register with PAL in this struct. +/// Under the condition specified by the trackingType, pEvent will be signaled by KMD. +struct RegisterEventInfo +{ + Util::Event* pEvent; ///< Event that KMD signals under the condition given by trackingType. + EventTrackingType trackingType; ///< Condition under which pEvent is signaled. +}; + +/// As an input of UnregisterEvent to specify the event and trackingType Clients want to unregister from KMD. +struct UnregisterEventInfo +{ + Util::Event* pEvent; ///< Event the client wants to unregister. + EventTrackingType trackingType; ///< Tracking type the client wants to unregister. +}; + +/// Output parameter of RegisterEvent. There's one struct for each type of EventTrackingType. +union RegisterEventOutputInfo +{ + struct + { + uint32 eventId; + gpusize eventMailboxGpuVa; + } shaderInterrupt; ///< Output for EventTrackingType::ShaderInterrupt. + + struct + { + uint32 eventId; + } earlyPresent; ///< Output for EventTrackingType::EarlyPresent. +}; +#endif + +/// Contains the page fault status of the GPU. +struct PageFaultStatus +{ + union + { + struct + { + uint32 pageFault : 1; ///< Set if there was a GPU page fault. + uint32 readFault : 1; ///< Set if the page fault was during a read operation. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Page fault status flags. + + gpusize faultAddress; ///< GPU virtual address where page fault occurred. Ignored if @ref pageFault is not set. +}; + +/// Input to the RegisterRuntimeState call, which allows the HIP runtime to pass information to KMD which can be shared +/// with the HIP debugger. +struct HipRuntimeSetup +{ + const void* pRdebug; ///< Address of the r_debug structure in the runtime + uint32 runtimeState; ///< Runtime-specific enum indicating runtime state + uint32 ttmpSetupHint; ///< Hint indicating that ttmp values should be initialized +}; + +/** + *********************************************************************************************************************** + * @interface IDevice + * @brief Interface representing a client-configurable context for a particular GPU.
+ * + * This object becomes the root of all client/PAL interaction to get work done on that GPU. The main functionality + * provided by the device object: + * + * + Creation of all other PAL objects. + * + GPU memory management. + *********************************************************************************************************************** + */ +class IDevice +{ +public: + /// Get the maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images' + /// elements are no larger than pAlignments->maxElementSize. + /// + /// @param [out] pAlignments Its maxElementSize will be used to fill the rest of its members with valid alignments. + /// + /// @returns Success if pAlignments was filled with data. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pAlignments is null. + /// + ErrorInvalidValue if pAlignments->maxElementSize is zero. + virtual Result GetLinearImageAlignments( + LinearImageAlignments* pAlignments) const = 0; + + /// Fills out a structure with details on the properties of this device. This includes capability flags, + /// supported engines/queues, performance characteristics, etc. This should only be called after a client has + /// called @ref CommitSettingsAndInit(). + /// + /// @see DeviceProperties + /// + /// @param [out] pInfo Properties structure to be filled out by PAL based on properties of this device. + /// + /// @returns Success if the device properties were successfully returned in pInfo. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetProperties( + DeviceProperties* pInfo) const = 0; + + /// Checks and returns execution state of the device. Currently unsupported for DX clients and + /// will return Unavailable if called by those clients. + /// + /// @param [out] pPageFaultStatus This struct is filled out when this function returns ErrorGpuPageFaultDetected. 
+ /// + /// @returns Success if device is operational and running. Otherwise, one of the following errors may be + /// + ErrorDeviceLost if device is lost, reset or not responding, + /// + ErrorInvalidValue if failed to get device reset state, + /// + ErrorOutOfGpuMemory if ran out of GPU memory, + /// + ErrorGpuPageFaultDetected if page fault was detected, + /// + ErrorUnknown if device is in unknown state. + virtual Result CheckExecutionState( + PageFaultStatus* pPageFaultStatus) = 0; + + /// Returns this devices client-visible settings structure initialized with appropriate defaults. Clients can + /// modify parameters in this structure as they wish in order to modify PAL's behavior for this device. After + /// modifying settings, the client must call CommitSettingsAndInit() before creating finalizing the device. + /// + /// @warning The returned value points to an internal PAL structure. Modifying data using this pointer after + /// calling CommitSettingsAndInit() will result in undefined behavior. + /// + /// @returns Pointer to this devices public settings for examination and/or modification by the client. + virtual PalPublicSettings* GetPublicSettings() = 0; + + /// Reads a specific setting from the operating system specific source (e.g. registry or config file). + /// + /// @param [in] pSettingName Name of the setting. Must be null-terminated. + /// @param [in] settingScope The scope of settings accessible. + /// @param [in] valueType The type of the setting to return (e.g. bool or int). + /// @param [out] pValue Buffer to write data that was read. Must be non-null. + /// @param [out] bufferSz Size of string buffer (pValue). Only necessary for ValueType::Str. + /// @param [out] pInRegistry If not null, this will be set to true if the setting was read from the registry. + /// + /// @returns True if the read of specified setting is successful. False indicates failure. 
+ virtual bool ReadSetting( + const char* pSettingName, + SettingScope settingScope, + Util::ValueType valueType, + void* pValue, + size_t bufferSz = 0, + bool* pInRegistry = nullptr) const = 0; + +#if defined(_WIN32) + /// Clients call this function to register a Util::Event() object to be signaled based on specified conditions. + /// For now, we will only add one condition: shader interrupt. + /// + /// @param [in] input RegisterEventInfo which contains Util::Event client wants to be signaled and + /// EventTrackingType. + /// @param [out] pOutput Depending on the trackingType in input, the pOutput may refers different info + /// which clients may need. Refer to the definition of RegisterEventOutputInfo + /// for details. + /// @returns Success if no errors occurred. + virtual Result RegisterEvent( + const RegisterEventInfo& input, + RegisterEventOutputInfo* pOutput) = 0; + + /// Clients call this function to unregister an event previously registered with RegisterEvent(). + /// + /// @param [in] input UnregisterEventInfo which contains Util::Event and tracing type the client + /// wants to unregister. + /// @returns Success if no errors occurred. + virtual Result UnregisterEvent( + const UnregisterEventInfo& input) = 0; + +#endif + + /// Indicates that the client has finished overriding public settings so the settings struct can be finalized and + /// any late-stage initialization can be done. This method must be called before @ref IDevice::Finalize() can be + /// called. + /// + /// @note The only functions in IDevice that are able to be called before CommitSettingsAndInit(): + /// + GetLinearImageAlignments() + /// + GetPublicSettings() + /// + ReadSetting() + /// + /// @note Finalizing the settings may override values set by the client. This can occur if: + /// + Invalid settings, either because they are not supported by hardware or are somehow self-conflicting, + /// will be overridden. 
+ /// + Settings specified in the private settings will override client-specified settings. + /// + /// @returns Success if settings have been committed successfully and any late-stage initialization is completed + /// successfully as well. + virtual Result CommitSettingsAndInit() = 0; + + /// Returns the largest possible GPU memory alignment requirement for any IGpuMemoryBindable object created on this + /// device. + /// + /// This is useful for clients that may want to allocate generic GPU memory rafts up front to support many objects + /// without creating all of those objects to query their alignment requirements ahead of time. In practice, most + /// objects have a much smaller alignment requirement than the allocation granularity, but images may require more + /// than the allocation granularity on some devices. + /// + /// @returns Largest possible GPU memory byte alignment for an IGpuMemoryBindable object on this device. + virtual gpusize GetMaxGpuMemoryAlignment() const = 0; + + /// Indicates that the client is able to finalize the initialization of this device with the requisite information. + /// This method must be called before any of the factory creation methods may be called. + /// + /// @note The only functions in IDevice that are able to be called before Finalize(): + /// + The functions listed in IDevice::CommitSettingsAndInit(). + /// + GetMaxGpuMemoryAlignment() + /// + GetProperties() + /// + /// @param [in] finalizeInfo Device finalization properties. + /// + /// @returns Success if final initialization is successful. + virtual Result Finalize( + const DeviceFinalizeInfo& finalizeInfo) = 0; + + /// Cleans up all internal state, undoing any work done by CommitSettingsAndInit() and Finalize(). Following a call + /// to this function, the device will be in its initial state as if it was re-enumerated; the client may requery + /// settings and build up the device for further use. 
If the client doesn't call this function, it will be called + /// automatically when IPlatform::Destroy() is called or when devices are re-enumerated. + /// + /// This function provides clients with a way to return devices to a trival state, one in which they have no + /// lingering OS or kernel driver dependencies. If a client pairs external state (e.g., an OS handle) with their + /// devices they may be required to call this function when they destroy their API device objects. + /// + /// It is expected that all PAL objects created by the device have already been destroyed (e.g. GPU memory, queues), + /// if not, the device may fall into an illegal state and the client will experience undefined behavior. + /// + /// @returns Success if no errors occurred. + virtual Result Cleanup() = 0; + + /// Returns if dual-source blending can be enabled. It checks the ColorBlendStateCreateInfo for any src1 blending + /// options. Then it checks if we are going to override those src1 options because the blend func is + /// min or max. + /// + /// @param [in] createInfo The ColorBlendStateCreateInfo that is checked for conditions that call for dual-source + /// blending. + /// + /// @returns true if the blend state calls for dual-source blending to be enabled. + virtual bool CanEnableDualSourceBlend( + const ColorBlendStateCreateInfo& createInfo) const = 0; + + /// Specifies how many frames can be placed in the presentation queue. This limits how many frames the CPU can get + /// in front of the device. + /// + /// @param [in] maxFrames Maximum number of frames that can be batched. Specifying a value of 0 resets the limit to + /// a default system value (3 frames on Windows). + /// + /// @returns Success if the limit was successfully adjusted. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. 
+ virtual Result SetMaxQueuedFrames( + uint32 maxFrames) = 0; + + /// Compares this device against another device object to determine how compatible they are for multi-GPU + /// operations. + /// + /// @param [in] otherDevice Device to determine MGPU compatibility with. + /// @param [out] pInfo Result compatibility info. + /// + /// @returns Success if the compatibility info was successfully returned in pInfo. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetMultiGpuCompatibility( + const IDevice& otherDevice, + GpuCompatibilityInfo* pInfo) const = 0; + + /// Reports properties of all GPU memory heaps available to this device (e.g., size, whether it is CPU visible or + /// not, performance characteristics, etc.). + /// + /// @param [out] info Properties of each GPU heap available to this device, indexed by the GPU ID defined in + /// @ref GpuHeap. If a particular heap is unavailable, its entry will report a size of 0. + /// + /// @returns Success if the heap properties were successfully queried and returned in info[]. Otherwise, one of the + /// following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occured. + virtual Result GetGpuMemoryHeapProperties( + GpuMemoryHeapProperties info[GpuHeapCount]) const = 0; + + /// Reports all format and tiling mode related properties for this device. + /// + /// @param [out] pInfo Output properties. + /// + /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetFormatProperties( + MergedFormatPropertiesTable* pInfo) const = 0; + + /// Reports performance experiment related properties for this device. + /// + /// Enumerates the GPU family, blocks, capabilities, etc.. + /// + /// @param [out] pProperties Output properties. 
+ /// + /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pProperties is null. + virtual Result GetPerfExperimentProperties( + PerfExperimentProperties* pProperties) const = 0; + + /// Fills out the default MSAA quad sample pattern for the given sample count. + /// + /// @param [in] samples The number of valid samples in the sample pattern. Must be a power of two. + /// @param [out] pQuadSamplePattern Fill this with the default pattern. + /// + /// @returns Success if @ref pQuadSamplePattern was filled with the default sample pattern. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if @ref pQuadSamplePattern is null. + /// + ErrorInvalidValue if @ref samples is not a supported power of two. + /// + ErrorUnavailable if this device lacks GfxIp support. + virtual Result GetDefaultSamplePattern( + uint32 samples, + MsaaQuadSamplePattern* pQuadSamplePattern) const = 0; + + /// Adds a list of per-device memory object references that persist across command buffer submissions. It is the + /// responsibility of the client to make sure that all required memory references have been added before submitting + /// the command buffer that uses on them. References can be added at the device, queue or specified at submit time. + /// gpuMemRefCount and ppGpuMemory cannot be 0/null. PAL will assert and crash if these values are invalid. If + /// multiple references are provided for the same memory, PAL will retain the safest set of GpuMemoryRef flags. + /// + /// see @ref IQueue::Submit() + /// + /// @param [in] gpuMemRefCount Number of memory references in the memory reference list, must be non-zero. + /// @param [in] pGpuMemoryRefs Array of gpuMemRefCount GPU memory references. 
+ /// @param [in] pQueue Optional IQueue that the memory references will be used on, used to optimize + /// residency operations, can be null. Note, if a queue is specified here the same queue + /// should be specified in RemoveGpuMemoryReferences. + /// @param [in] flags Flags from GpuMemoryRefFlags that will apply to all memory object references. + /// + /// @returns Success if the memory references were successfully added. Can also return NotReady if the client + /// passes in a valid pPagingFence pointer and the operation doesn't complete before the function returns. + /// Otherwise, one of the following errors may be returned: + /// + ErrorTooManyMemoryReferences if gpuMemRefCount will cause the total reference count to exceed the + /// limit of this device. + /// + ErrorOutOfMemory if GPU memory objects will not fit in available GPU memory space (i.e. GPU Memory + /// is overcommitted). + virtual Result AddGpuMemoryReferences( + uint32 gpuMemRefCount, + const GpuMemoryRef* pGpuMemoryRefs, + IQueue* pQueue, + uint32 flags + ) = 0; + + /// Removes a list of per-device memory object references that have previously been added via + /// IDevice::AddGpuMemoryReferences(). PAL is responsible for ensuring that timestamps have been retired prior to + /// actually performing any residency operations related to removal of a memory reference, so clients are free to + /// call this function without regard for command buffer use. Memory references are reference counted, so an + /// individual memory reference will only be removed when the total internal reference count reaches zero. + /// gpuMemoryCount and ppGpuMemory cannot be 0/null, PAL will assert and crash if these values are invalid. + /// + /// @param [in] gpuMemoryCount Number of memory objects in the memory reference list (size of ppGpuMemory array). + /// This count must be greater than zero. + /// @param [in] ppGpuMemory Array of GPU memory references. 
+ /// @param [in] pQueue Optional IQueue that the memory references were used on, used to optimize residency + /// operations, can be null. Note, if a queue was specified in AddGpuMemoryReferences, + /// pQueue must match. + /// + /// @returns Success if the memory references were successfully updated. + virtual Result RemoveGpuMemoryReferences( + uint32 gpuMemoryCount, + IGpuMemory*const* ppGpuMemory, + IQueue* pQueue + ) = 0; + + /// Queries the Device for the total amount of referenced GPU memory for each heap type. These totals include all + /// memory added to the Device or any Queue using @ref AddGpuMemoryReferences and not yet removed using @ref + /// RemoveGpuMemoryReferences. Internal PAL allocations are included in these totals, but memory referenced using + /// the per-submit list in @ref IQueue::Submit is not included in these amounts. + /// + /// The intended use for this interface is for clients to be able to manage budgeting of resident GPU memory. + /// + /// @param [out] referencedGpuMemTotal Array containing the total amount of referenced GPU memory for each GPU + /// memory heap. + virtual void GetReferencedMemoryTotals( + gpusize referencedGpuMemTotal[GpuHeapCount]) const = 0; + + /// Get primary surface MGPU support information based upon primary surface create info and input flags provided + /// by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] primaryInfoInput Primary surface info input arguments. + /// @param [in,out] pPrimaryInfoOutput Primary surface info output arguments. + /// + /// @returns Success if the primary surface MGPU support information were successfully queried. 
+ virtual Result GetPrimaryInfo( + const GetPrimaryInfoInput& primaryInfoInput, + GetPrimaryInfoOutput* pPrimaryInfoOutput) const = 0; + + /// Returns the supported stereo modes list. + /// + /// @param [in,out] pStereoModeCount Input value specifies the maximum number of stereo modes to enumerate, and the + /// output value specifies the total number of stereo modes that were enumerated + /// in pStereoModeList. The input value is ignored if pStereoModeList is null. + /// This pointer must not be null. + /// @param [out] pStereoModeList Output list of stereo modes. Can be null, in which case the total number of + /// available modes will be written to pStereoModeCount. + /// + /// @returns Success if the display modes were successfully queried and the results were reported in + /// pStereoModeCount/pStereoModeList. Otherwise, one of the following errors may be returned: + /// + Unsupported if stereo mode is not supported, or the stereo modes can't be queried. + /// + ErrorOutOfMemory if temporary memory allocation failed. + virtual Result GetStereoDisplayModes( + uint32* pStereoModeCount, + StereoDisplayModeOutput* pStereoModeList) const = 0; + + /// Returns the currently selected Workstation stereo mode on Windows OS. + /// + /// @param [out] pWsStereoMode Output currently selected Workstation Stereo mode. + /// + /// @returns Success if the currently selected Workstation stereo mode was successfully queried + /// and the result was reported in pWsStereoMode. + virtual Result GetWsStereoMode(WorkstationStereoMode* pWsStereoMode) const = 0; + + /// Return information about active workstation support for 10-bit (potentially packed pixel) displays. + /// + /// @param [out] pMode Output reports if the workstation 10-bit display feature is enabled, and if so, + /// details on any required pixel packing. + /// + /// @returns Success if the 10-bits and packed-pixel format were successfully queried and the result was + /// reported in pMode.
+ virtual Result GetActive10BitPackedPixelMode( + Active10BitPackedPixelModeOutput* pMode) const = 0; + + /// Inform the KMD that this allocation must be reinterpreted as 10-bits per channel for all + /// KMD-initiated BLTs. + /// + /// When in 10-bit mode and at present time, if the dx9p driver sees a 10-bit to 8-bit surface blt, + /// it will use this interface to inform the KMD that the blt dst surface must be reinterpreted as + /// 10-bits per channel for all KMD-initiated BLTs. + /// + /// @param [in] pGpuMemory The dst GPU memory reference which will be marked as 10 bits format. + /// + /// @returns Success if the KMD has been successfully notified. + virtual Result RequestKmdReinterpretAs10Bit( + const IGpuMemory* pGpuMemory) const = 0; + + /// Set or query device clock mode. + /// + /// This function can be called by clients to set the device engine and memory clocks to certain pre-defined ratios. + /// If a call to restore the device clocks to default does not occur, the device stays in the previously set mode. + /// + /// @param [in] setClockModeInput Specify the clock mode to set the device to. + /// @param [out] pSetClockModeOutput @b Optional - Output device clock mode. If not nullptr, it is used + /// to query the current clock mode the device is running in. + /// + /// @returns Success if the device clock mode query/set request was successful. + virtual Result SetClockMode( + const SetClockModeInput& setClockModeInput, + SetClockModeOutput* pSetClockModeOutput) = 0; + + /// Request to enable/disable static VMID for the device. + /// + /// The function must be called with enable = true before a profiling session starts and enable = false after a + /// profiling session ends. It may be called any time, though it is illegal to disable without a prior corresponding + /// enable/acquire. Only after this returns success (when enabling) can the driver make submissions targeting the + /// static VMID.
+ /// + /// @param [in] enable Specifies whether acquiring or releasing the static VMID. + /// + /// @returns Success if the static VMID acquire/release request was successful. + virtual Result SetStaticVmidMode( + bool enable) = 0; + + /// Set up MGPU compositing mode of a display provided by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] setMgpuModeInput Set MGPU compositing mode input arguments. + /// + /// @returns Success if the MGPU compositing mode was successfully set. + virtual Result SetMgpuMode( + const SetMgpuModeInput& setMgpuModeInput) const = 0; + + /// Get XDMA cache buffer information of each GPU based upon video present source ID provided by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] vidPnSrcId Video present source id. + /// @param [in] gpuMemory Primary surface GPU memory. + /// @param [in,out] pGetXdmaInfoOutput Set XDMA cache buffer info output arguments. + /// + /// @returns Success if the XDMA cache buffer information was successfully queried. + virtual Result GetXdmaInfo( + uint32 vidPnSrcId, + const IGpuMemory& gpuMemory, + GetXdmaInfoOutput* pGetXdmaInfoOutput) const = 0; + + /// Polls current fullscreen frame metadata controls on the given vidPnSrcId, including extended data. + /// + /// The function is used by clients that support frame metadata through KMD-UMD shared memory. + /// It polls the frame metadata shared memory for the given VidPnSource, indicating which types of metadata + /// the UMD should send to KMD. + /// Clients should only call this function on the master device in an LDA chain.
+ /// + /// @param [in] vidPnSrcId Video present source id. + /// @param [out] pFrameMetadataControl @b Optional - Output frame metadata controls. Clients can pass null to + /// check if the buffer is initialized successfully and if the + /// vidPnSrcId is valid. + /// + /// @returns Success if the metadata controls on the given vidPnSrcId were successfully polled. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if vidPnSrcId is invalid (out of range). + /// + ErrorUnavailable if no implementation on current platform or if metadata shared buffer is null. + virtual Result PollFullScreenFrameMetadataControl( + uint32 vidPnSrcId, + PerSourceFrameMetadataControl* pFrameMetadataControl) const = 0; + + /// Get flip status flags and a flag indicating if current device owns the flags. (DX only) + /// + /// The function is used by clients that need flip status polling through KMD-UMD shared memory. + /// It provides the caller with the FlipStatusFlags on the given VidPnSource. Note that the flag returned is only a hint, + /// and can have a one frame delay during flip status transition. + /// + /// @param [in] vidPnSrcId Video present source id. + /// @param [out] pFlipFlags Output flip flags on given vidPnSrcId. Must not be null. + /// @param [out] pIsFlipOwner Output indicating if the current device owns the flip flags. + /// + /// @returns Success if the flip flags on the given vidPnSrcId were successfully polled. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if vidPnSrcId is invalid. + /// + ErrorUnavailable if no implementation on current platform. + /// + ErrorInitializationFailed if flip status shared buffer failed to initialize. + virtual Result GetFlipStatus( + uint32 vidPnSrcId, + FlipStatusFlags* pFlipFlags, + bool* pIsFlipOwner) const = 0; + + /// Resets the specified set of fences. + /// + /// All fences must be reset before passing them to a submission command.
+ /// + /// @param [in] fenceCount Number of fences to reset. + /// @param [in] ppFences Array of fences to reset. + /// + /// @returns Success if the specified fences have been successfully reset. + /// + /// @note The function assumes that neither ppFences is null nor that any of the elements of the array pointed by + /// ppFences are null. + virtual Result ResetFences( + uint32 fenceCount, + IFence*const* ppFences) const = 0; + + /// Stalls the current thread until one or all of the specified fences have been reached by the device. + /// + /// If waitAll is true all fences must have been submitted at least once before this is called; + /// otherwise at least one fence must have been submitted. Using a zero timeout value returns + /// immediately and can be used to determine the status of a set of fences without stalling. + /// + /// @param [in] fenceCount Number of fences to wait for (i.e., size of the ppFences array). + /// @param [in] ppFences Array of fences to be waited on. + /// @param [in] waitAll If true, wait for completion of all fences in the array before returning; if false, + /// return after any single fence in the array has completed. + /// @param [in] timeout This method will return after this many nanoseconds even if the fences do not complete. + /// + /// @returns Success if the specified fences have been reached, or Timeout if the fences have not been reached but + /// the specified timeout time has elapsed. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if: + /// - ppFences is null. + /// - Any member of the ppFences array is null. + /// + ErrorInvalidValue if: + /// - fenceCount is zero. + /// + ErrorFenceNeverSubmitted if: + /// - Any of the specified fences haven't been submitted. 
+ virtual Result WaitForFences( + uint32 fenceCount, + const IFence*const* ppFences, + bool waitAll, + std::chrono::nanoseconds timeout) const = 0; + + /// Stalls the current thread until one or all of the specified Semaphores have been reached by the device. + /// + /// Using a zero timeout value returns immediately and can be used to determine the status of a set of semaphores + /// without stalling. + /// + /// @param [in] semaphoreCount Number of semaphores to wait for (i.e., size of the ppSemaphores array). + /// @param [in] ppSemaphores Array of semaphores to be waited on. + /// @param [in] pValues Array of semaphore values to be waited on. + /// @param [in] flags Combination of zero or more @ref HostWaitFlags values describing the behavior of this + /// wait operation. See @ref HostWaitFlags for more details. + /// @param [in] timeout This method will return after this many nanoseconds even if the semaphores do not + /// complete. + /// + /// @returns Success if the specified semaphores have been reached, or Timeout if the semaphores have not been + /// reached but the specified timeout time has elapsed. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - ppSemaphores is null. + /// - Any member of the ppSemaphores array is null. + /// + ErrorInvalidValue if: + /// - semaphoreCount is zero. + virtual Result WaitForSemaphores( + uint32 semaphoreCount, + const IQueueSemaphore*const* ppSemaphores, + const uint64* pValues, + uint32 flags, + std::chrono::nanoseconds timeout) const = 0; + + /// Correlates a GPU timestamp with the corresponding CPU timestamps, for tighter CPU/GPU timeline synchronization. + /// + /// @param [out] pCalibratedTimestamps Reports a current GPU timestamp along with the CPU timestamps at the time + /// that GPU timestamp was written. The CPU timestamps are OS-specific. Also + /// reports a maximum deviation between the captured timestamps in nanoseconds.
+ /// + /// @returns Success if the request was successful. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if: + /// - pCalibratedTimestamps is null. + /// + ErrorUnavailable if: + /// - unable to capture timestamps for all requested time domains. + virtual Result GetCalibratedTimestamps( + CalibratedTimestamps* pCalibratedTimestamps) const = 0; + + /// Binds the specified GPU memory as a trap handler for the specified pipeline type. This GPU memory must hold + /// shader machine code (i.e., the client must generate HW-specific shader binaries through some external means, + /// probably the SP3 assembler). + /// + /// The same trap handler will be installed for all shader stages that are part of the pipeline. A trap handler + /// will only ever be executed for shaders that set the trapPresent bit in @ref PipelineShaderInfo. + /// + /// @param [in] pipelineType Select compute or graphics pipeline. If graphics, this trap handler will be installed + /// for _all_ hardware shader stages. + /// @param [in] pGpuMemory GPU memory allocation holding the trap handler. + /// @param [in] offset Offset in bytes into pGpuMemory where the trap handler shader code begins. Must be + /// 256 byte aligned. + virtual void BindTrapHandler( + PipelineBindPoint pipelineType, + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Binds the specified GPU memory location as a trap buffer for the specified pipeline type. This GPU memory will + /// be available to the trap handler as scratch memory to use as it chooses. The same trap buffer will be installed + /// for all shader stages that are part of the pipeline. + /// + /// There is no size parameter for the trap buffer. The client is responsible for ensuring that the trap handler + /// only reads/writes data within the bounds designated for trap buffer usage. + /// + /// @param [in] pipelineType Select compute or graphics pipeline. 
If graphics, this trap buffer will be installed + /// for _all_ hardware shader stages. + /// @param [in] pGpuMemory GPU memory allocation holding the trap buffer range. + /// @param [in] offset Offset in bytes into pGpuMemory where the trap buffer range starts. Must be 256 byte + /// aligned. + virtual void BindTrapBuffer( + PipelineBindPoint pipelineType, + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Get the swap chain information for creating a swap chain and presenting an image. + /// + /// @param [in] hDisplay Display handle of the local window system. + /// @param [in] hWindow Window handle of the local window system. + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. + /// @param [in,out] pSwapChainProperties Contains swap chain information. + /// + /// @returns Success if the swap chain information was retrieved successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result GetSwapChainInfo( + OsDisplayHandle hDisplay, + OsWindowHandle hWindow, + WsiPlatform wsiPlatform, + SwapChainProperties* pSwapChainProperties) = 0; + + /// Determines if the given window system requirement is supported by the underlying wsiPlatform. + /// + /// @param [in] hDisplay Display handle of the local window system. + /// @param [in] wsiPlatform WSI Platform the request is supposed to be sent to. + /// @param [in] visualId Requested visual information which may not be needed for some wsiPlatforms. + /// + /// @returns Success if the request is supported. Otherwise, one of the following errors may be returned: + /// + Unsupported + virtual Result DeterminePresentationSupported( + OsDisplayHandle hDisplay, + WsiPlatform wsiPlatform, + int64 visualId) = 0; + + /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each + /// WsiPlatform. + /// + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on.
+ /// @param [in] mode The swap chain will use this present mode. + /// + /// @returns Returns a mask of SwapChainModeSupport. + virtual uint32 GetSupportedSwapChainModes( + WsiPlatform wsiPlatform, + PresentMode mode) const = 0; + + /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each + /// WsiPlatform. + /// + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. + /// @param [in] mode The swap chain will use this present mode. + /// + /// @returns Returns a mask of SwapChainModeSupport. + virtual uint32 GetCompatiblePresentModes( + WsiPlatform wsiPlatform, + SwapChainMode mode) const = 0; + + /// Determines if the given information corresponds to an external shared image. + /// + /// Some clients may not know if a given external shared resource is a simple GPU memory allocation or an image; it + /// is expected they will call this function to determine which set of open functions they must call. + /// + /// @param [in] openInfo The open info describing the external shared resource. + /// @param [out] pIsImage Its contents will be set to true if the external shared resource is an image. + /// + /// @returns Success if PAL was able to determine whether or not the resource is an image. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pIsImage is null. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result DetermineExternalSharedResourceType( + const ExternalResourceOpenInfo& openInfo, + bool* pIsImage) const = 0; + + /// @name FactoryMethods Device Factory Methods + /// + /// The following set of IDevice methods is the interface through which almost all PAL objects are created. + /// + /// PAL does not allocate its own system memory for these objects. Instead, the client must query the amount of + /// system memory required for the object then provide a pointer where PAL will construct the object. 
+ /// + /// This approach allows the client to roll the PAL object into its own allocations without unnecessary heap + /// allocations and cache misses. It can also allocate many objects in a single memory space without PAL + /// involvement. + /// + /// @{ + + /// Determines the amount of system memory required for a queue object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateQueue(). + /// + /// @param [in] createInfo Properties of the new queue such as engine type and engine index. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueue object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueueSize( + const QueueCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a queue object. + /// + /// @param [in] createInfo Properties of the new queue such as engine type and engine index. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueueSize() with the same + /// create info. + /// @param [out] ppQueue Constructed queue object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the queue was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueue is null. + /// + ErrorInvalidValue if the create info's engineType is invalid or if the engineIndex is invalid. + virtual Result CreateQueue( + const QueueCreateInfo& createInfo, + void* pPlacementAddr, + IQueue** ppQueue) = 0; + + /// Determines the amount of system memory required for a multi-queue object. 
An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateMultiQueue(). + /// + /// @param [in] queueCount Number of queues in the gang; matches number of entries in pCreateInfo. + /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). The first + /// entry in this array describes the master queue which will be used to execute all + /// IQueue interfaces except for MultiSubmit(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for a multi-queue IQueue object with the specified + /// properties. A return value of 0 indicates the createInfo was invalid. + virtual size_t GetMultiQueueSize( + uint32 queueCount, + const QueueCreateInfo* pCreateInfo, + Result* pResult) const = 0; + + /// Creates a multi-queue (i.e., gang submission queue) object. The resulting version of the IQueue interface + /// is composed of multiple hardware queues which can be atomically submitted to as a group. When this is done, + /// it is safe to use IGpuEvent objects to tightly synchronize work done across queues in a single call to Submit(). + /// This can allow the client to tightly schedule asynchronous workloads for maximum efficiency that isn't possible + /// across queues using IQueueSemaphore objects. + /// + /// @param [in] queueCount Number of queues in the gang; matches number of entries in the pCreateInfo array. + /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). The first + /// entry in this array describes the master queue which will be used to execute all + /// IQueue interfaces except for the ganged-portion of a Submit() (e.g., Present()). + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + /// much size available here as reported by calling GetMultiQueueSize() with the same + /// arguments. + /// @param [out] ppQueue Constructed multi queue object. + /// + /// @returns Success if the multi queue was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidValue if queueCount is less than 2. + /// + ErrorInvalidQueueType if any of the created sub-queues are not multi-queue compatible. This is + /// indicated by the supportsMultiQueue engineProperties flag in @ref DeviceProperties. + /// + ErrorInvalidPointer if pCreateInfo, pPlacementAddr or ppQueue is null. + /// + ErrorInvalidValue if any create info's configuration is invalid. + virtual Result CreateMultiQueue( + uint32 queueCount, + const QueueCreateInfo* pCreateInfo, + void* pPlacementAddr, + IQueue** ppQueue) = 0; + + /// Determines the amount of system memory required for a GPU memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of CreateGpuMemory(). + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed heaps. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetGpuMemorySize( + const GpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IGpuMemory object with the requested properties. + /// + /// This method can create either _real_ or _virtual_ GPU memory allocations. + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed + /// heaps. 
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGpuMemorySize() with the same + /// createInfo param. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the GPU memory was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + TooManyFlippableAllocations if the GPU memory was successfully created, but the client has reached + /// the limit of flippable allocations for this Device. This is a warning that future flippable GPU + /// memory may fail to be created due to internal OS limitations. + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// + ErrorInvalidMemorySize if createInfo.size is invalid. + /// + ErrorInvalidAlignment if createInfo.alignment is invalid. + /// + ErrorInvalidValue if createInfo.heapCount is 0 for real allocations or non-0 for virtual allocations. + /// + ErrorOutOfGpuMemory if the allocation failed due to a lack of GPU memory. + /// + ErrorUnavailable if the Reserve Gpu Virtual Address failed. + virtual Result CreateGpuMemory( + const GpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a pinned GPU memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// CreatePinnedGpuMemory(). + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object pinned with the specified + /// properties. 
+ virtual size_t GetPinnedGpuMemorySize( + const PinnedGpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Pins a segment of system memory in place and creates an @ref IGpuMemory object allowing access by the GPU. + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPinnedGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the system memory was successfully pinned and a corresponding GPU memory object was created. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if createInfo.pSysMem, pPlacementAddr, or ppGpuMemory is null, or if + /// createInfo.pSysMem is not allocation granularity aligned. + /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. + /// + ErrorOutOfMemory if the creation failed because the system memory could not be pinned. + virtual Result CreatePinnedGpuMemory( + const PinnedGpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an SVM memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// CreateSvmGpuMemory(). + /// + /// @param [in] createInfo Data controlling the SVM memory properties, such as size and type of SVM buffer. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties.
+ virtual size_t GetSvmGpuMemorySize( + const SvmGpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an SVM (Shared Virtual Memory) IGpuMemory object. + /// The basic idea of SVM is to create system memory that has the same CPU and GPU virtual address + /// (i.e., "pointer is a pointer"). This can work in two modes: fine-grain, or coarse-grain. + /// + /// Fine-grain (Single-GPU): The client should just call this function with pReservedGpuVaOwner set to null. + /// PAL will allocate GPU-accessible system memory that will have the same CPU virtual address + /// (as returned by IGpuMemory::Map()) as GPU virtual address + /// (as returned in the gpuVirtAddr value returned by IGpuMemory::Desc()). + /// + /// Fine-grain (MGPU): The client can call this function with pReservedGpuVaOwner set to the IGpuMemory object + /// allocated on the first device and receive a mapping to the same GPU VA location on another device. + /// + /// Coarse-grain: In this mode, there are actually two separate IGpuMemory objects. + /// The client should first create the "local" GPU memory object by calling IDevice::CreateGpuMemory() to create + /// a standard GPU memory object in the VaRange::Svm VA space. Next, the client should create the "staging" GPU + /// memory by calling this function (CreateSvmGpuMemory) with pReservedGpuVaOwner pointing to the "local" GPU + /// memory object. PAL will create system memory for the "staging" GPU memory with a CPU virtual address matching + /// the "local" GPU memory's GPU virtual address. The GPU virtual address of the "staging" GPU memory is + /// arbitrarily assigned. The client is responsible for managing the contents of the two related allocations + /// per their API rules. + /// + /// @param [in] createInfo Data controlling the SVM memory properties, such as size and location of SVM buffer. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + /// much size available here as reported by calling GetSvmGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the SVM buffer was successfully created and a corresponding GPU memory object was created. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr, or ppGpuMemory is null. + /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. + /// + ErrorOutOfMemory if the creation failed because there is not enough GPU memory + /// or the system memory could not be pinned. + virtual Result CreateSvmGpuMemory( + const SvmGpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a GPU memory object created by opening an allocation from a + /// different GPU. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// OpenSharedGpuMemory(). + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetSharedGpuMemorySize( + const GpuMemoryOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable GPU memory object created on another device for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. 
There must be as + /// much size available here as reported by calling GetSharedGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// + ErrorNotShareable if the specified memory object was not marked as shareable on creation. + virtual Result OpenSharedGpuMemory( + const GpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an external GPU memory object created by opening + /// an allocation from a compatible device, such as D3D device. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetExternalSharedGpuMemorySize( + Result* pResult) const = 0; + + /// Opens an external shared memory object which is created by a compatible device, such as D3D device. + /// There could be more than one underlying allocation in the shared memory object; only one allocation + /// created on the device's GPU will be opened, other allocations will be ignored. + /// + /// @param [in] openInfo Open info. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetExternalSharedGpuMemorySize() + /// with the same params. + /// @param [out] pMemCreateInfo Return CreateInfo of the external shared GPU memory.
+ /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null (TODO confirm whether pMemCreateInfo is also checked). + /// + ErrorNotShareable if none of the allocations in the shared memory object is created on the device's GPU. + virtual Result OpenExternalSharedGpuMemory( + const ExternalGpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + GpuMemoryCreateInfo* pMemCreateInfo, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a proxy GPU memory object to a GPU memory object on a + /// different GPU. An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// OpenPeerGpuMemory(). + /// + /// @param [in] openInfo Specifies a handle to a GPU memory object to open for peer-to-peer transfer access. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a peer IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetPeerGpuMemorySize( + const PeerGpuMemoryOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a previously created GPU memory object for peer access on another device. + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPeerGpuMemorySize() with the same + /// params. + /// @param [out] ppGpuMemory Constructed GPU memory object.
When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the memory was successfully opened for peer access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppGpuMemory, or openInfo.pOriginalMem is null. + virtual Result OpenPeerGpuMemory( + const PeerGpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an image object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateImage(). + /// + /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an @ref IImage object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetImageSize( + const ImageCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Determines if the image will have faster clone copy (than normal CS or graphics copy) in @ref CmdCopyImage(). + /// If this function returns true, clients can set @ref ImageCreateInfo::flags::cloneable = 1 to enable clone + /// copy support in @ref CmdCopyImage(); however whether @ref CmdCopyImage() goes through clone copy or not depends + /// on multiple factors, please see doxygen in @ref CmdCopyImage() for more details. + /// + /// Note that clients are free to set this flag to 1 regardless; it still enables clone copy potentially but can't + /// guarantee better performance. If clients find more cases where clone copy is faster, feel free to improve this function + /// to cover these kinds of images.
+ /// + /// Because this function relies on @ref ImageCreateInfo values to make its decision, ideally clients should set all other members + /// of @ref ImageCreateInfo first, and then call this function to set up the cloneable flag last before creating the image. + /// + /// @param [in] createInfo Properties of the new image. + /// + /// @returns true if the image prefers clone copy in CmdCopyImage(); false otherwise (the image goes through normal copy). + virtual bool ImagePrefersCloneCopy( + const ImageCreateInfo& createInfo) const = 0; + + /// Creates an @ref IImage object with the requested properties. + /// + /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetImageSize() with the same + /// createInfo param. + /// @param [out] ppImage Constructed image object. When successful, the returned address will be the same as + /// specified in pPlacementAddr. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if: + /// - The image dimensions are invalid based on the image type. + /// - The image dimensions are not properly aligned for compressed formats. + /// - The number of samples is invalid for the image type and format. + /// - MSAA is enabled for an image that doesn't support color or depth usage. + /// - MSAA images have more than one mip level. + /// - The array size is zero, non-1 for 3D images, or beyond the max number of slices for 1D or 2D + /// images. + /// - The number of mipmaps is invalid for the image dimensions. + /// + ErrorInvalidPointer if pPlacementAddr or ppImage is null. + /// + ErrorInvalidFormat if: + /// - The format doesn't support the usage flags. + /// - A 1D image specifies a compressed format.
+ /// + ErrorInvalidFlags if: + /// - The color target and depth/stencil usages are specified simultaneously. + /// - The color target flag is set for a 1D image. + /// - The depth/stencil flag is set for a non-2D image. + virtual Result CreateImage( + const ImageCreateInfo& createInfo, + void* pPlacementAddr, + IImage** ppImage) = 0; + + /// Determines the amount of system memory required for a presentable image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of CreatePresentableImage(). + /// + /// Only images created through this interface are valid sources for IQueue::Present(). + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. + /// Should be specified to the pImagePlacementAddr argument of CreatePresentableImage(). + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for a IGpuMemory object attached to the + /// presentable IImage. Should be specified to the pGpuMemoryPlacementAddr argument + /// of CreatePresentableImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + virtual void GetPresentableImageSizes( + const PresentableImageCreateInfo& createInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + Result* pResult) const = 0; + + /// Creates a presentable image. Presentable image must have internally bound GPU memory allocated as OS needs the + /// information of image/memory via OS callbacks. + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. 
There must + /// be as much size available here as reported by calling + /// GetPresentableImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct a IGpuMemory associated + /// with this presentable image. There must be as much size available here as + /// reported by calling GetPresentableImageSizes(). + /// @param [out] ppImage Constructed image object. + /// @param [out] ppGpuMemory Constructed memory object. This object is only valid for specifying in a + /// memory reference list. It must be destroyed when the image is destroyed. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + TooManyFlippableAllocations if the image was successfully created, but the client has reached the + /// limit of flippable allocations for this Device. This is a warning that future presentable Images + /// may fail to be created due to internal OS limitations. + /// + ErrorTooManyPresentableImages if the swap chain cannot be associated with more presentable images. + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, or ppGpuMemory is null. + /// + ErrorInvalidValue if: + /// - The image dimensions are invalid. + /// - The refresh rate is invalid for a fullscreen image. + /// + ErrorInvalidFormat if the format doesn't support presentation. + virtual Result CreatePresentableImage( + const PresentableImageCreateInfo& createInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an image object (and an associated memory object) opened for + /// peer access to an image created on another GPU. Allocations of these amounts of memory must be provided in the + /// pImagePlacementAddr and pGpuMemoryPlacementAddr parameters of OpenPeerImage(). 
+ /// + /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. + /// @param [out] pPeerImageSize Size, in bytes, of system memory required for a peer IImage. Should be + /// specified to the pImagePlacementAddr argument to OpenPeerImage(). + /// @param [out] pPeerGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached + /// to a peer IImage. Should be specified to the pGpuMemoryPlacementAddr argument + /// to OpenPeerImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + virtual void GetPeerImageSizes( + const PeerImageOpenInfo& openInfo, + size_t* pPeerImageSize, + size_t* pPeerGpuMemorySize, + Result* pResult) const = 0; + + /// Creates an @ref IImage object as a proxy to an IImage on another GPU to be used for peer-to-peer transfers. + /// + /// @note The @ref IImage object provided in the @ref PeerImageOpenInfo must be bound to an existing + /// @ref IGpuMemory object prior to opening a peer version of it. + /// If the new image needs to bind to an existing peer memory allocation, as when images are suballocated, + /// pGpuMemoryPlacementAddr must equal nullptr and ppGpuMemory must point to an existing IGpuMemory object + /// that was previously opened to reference the same memory from the current device. + /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must + /// be as much size available here as reported by calling GetPeerImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr If nonzero, this is a pointer to the location where PAL should construct a + /// IGpuMemory to be associated with this peer image. There must be as much + /// size available here as reported by calling GetPeerImageSizes(). 
+ /// @param [out] ppImage Constructed image object. + /// @param [in] ppGpuMemory Constructed dummy memory object. This object is only valid for specifying + /// in a memory reference list. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, ppGpuMemory, or + /// openInfo.pOriginalImage is null. + virtual Result OpenPeerImage( + const PeerImageOpenInfo& openInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an external shared image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of OpenExternalSharedImage(). + /// + /// @param [in] openInfo Specifies the external image to be opened. + /// @param [out] pImageSize Size, in bytes, of system memory required for pImagePlacementAddr. + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for pGpuMemoryPlacementAddr. + /// @param [out] pImgCreateInfo If non-null, it will be filled out with information describing the shared image. + /// @returns Success if the shared image was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pImageSize or pGpuMemorySize is null. + /// + ErrorNotShareable if none of allocations in the shared image is created on the device's GPU. + virtual Result GetExternalSharedImageSizes( + const ExternalImageOpenInfo& openInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + ImageCreateInfo* pImgCreateInfo) const = 0; + + /// Opens an external shared image object which was created by a compatible device, such as D3D device. 
+ /// There could be more than one underlying allocations in the shared image object, only one allocation + /// created on the device's GPU will be opened, other allocations will be ignored. + /// + /// @param [in] openInfo Specifies the external image to be opened. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct the image object. + /// There must be as much space available here as reported by calling + /// GetExternalSharedImageSizes() with the same params. + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct the GPU memory object. + /// There must be as much space available here as reported by calling + /// GetExternalSharedImageSizes() with the same params. + /// @param [out] pMemCreateInfo If non-null, it is filled with information describing the external GPU + /// memory that backs this external image. + /// @param [out] ppImage Constructed image object. When successful, the returned address will + /// be the same as specified in pImagePlacementAddr. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will + /// be the same as specified in pGpuMemoryPlacementAddr. + /// @returns Success if the shared image was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage or ppGpuMemory is null. + /// + ErrorNotShareable if none of allocations in the shared image object is created on the device's GPU. + virtual Result OpenExternalSharedImage( + const ExternalImageOpenInfo& openInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + GpuMemoryCreateInfo* pMemCreateInfo, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a color target view object. 
An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateColorTargetView(). + /// + /// Unlike most creation methods in this class, GetColorTargetViewSize() does not require a ColorTargetViewCreateInfo + /// parameter. PAL must guarantee that all color target view objects are the same size in order to support DX12, + /// where these views are treated similarly to SRDs. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IColorTargetView object. + virtual size_t GetColorTargetViewSize( + Result* pResult) const = 0; + + /// Creates an @ref IColorTargetView object with the requested properties. + /// + /// @param [in] createInfo Properties of the color target view to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetColorTargetViewSize(). + /// @param [out] ppColorTargetView Constructed color target view object. When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the color target view was successfully created. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppColorTargetView, or createInfo.pImage is null. + /// + ErrorInvalidValue if: + /// - The base slice is invalid for the given image object and view type. + /// - The number of array slices is zero or the range of slices is too large for the specified image. + /// - The mip level is invalid for the given image object. + /// + ErrorInvalidImage if the image object doesn't have the color target access flag set.
+ virtual Result CreateColorTargetView( + const ColorTargetViewCreateInfo& createInfo, + void* pPlacementAddr, + IColorTargetView** ppColorTargetView) const = 0; + + /// Determines the amount of system memory required for a depth/stencil view object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilView(). + /// + /// Unlike most creation methods in this class, GetDepthStencilViewSize() does not require a + /// DepthStencilViewCreateInfo parameter. PAL must guarantee that all depth/stencil view objects are the same size + /// in order to support DX12, where these views are treated similarly to SRDs. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IDepthStencilView object. + virtual size_t GetDepthStencilViewSize( + Result* pResult) const = 0; + + /// Creates an @ref IDepthStencilView object with the requested properties. + /// + /// @param [in] createInfo Properties of the depth/stencil view to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be + /// as much size available here as reported by calling GetDepthStencilViewSize(). + /// @param [out] ppDepthStencilView Constructed depth/stencil view object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the depth/stencil view was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppDepthStencilView, or createInfo.pImage is null. + /// + ErrorInvalidValue if: + /// - The base slice is invalid for the given image object and view type. + /// - The number of array slices is zero or the range of slices is too large for the specified image.
+ /// - The mip level is invalid for the given image object. + /// + ErrorInvalidImage if the image object doesn't have the depth/stencil target access flag set. + virtual Result CreateDepthStencilView( + const DepthStencilViewCreateInfo& createInfo, + void* pPlacementAddr, + IDepthStencilView** ppDepthStencilView) const = 0; + + /// Creates one or more typed buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.typedBufferView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should + /// be setup based on shader usage as described in @ref BufferViewInfo. The client should put the resulting SRD + /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly + /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// For performance reasons, this method returns void and does minimal error-checking. However, in debug builds, + /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: + /// + If pBufferViewInfo or pOut, is null. + /// + If count is 0. + /// + If pBufferViewInfo[].format is Undefined. + /// + If pBufferViewInfo[].stride does not match the size of an element of that format. + /// + If pBufferViewInfo[].gpuAddr is 0. + /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). 
+ /// + /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. + /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateTypedBufferViewSrds( + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateTypedBufViewSrds(this, count, pBufferViewInfo, pOut); } + + /// Creates one or more untyped buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// These SRDs can be accessed in a shader as either _raw_ or _structured_ views. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.untypedBufferView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should + /// be setup based on shader usage as described in @ref BufferViewInfo. The client should put the resulting SRD + /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly + /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// For performance reasons, this method returns void and does minimal error-checking. However, in debug builds, + /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: + /// + If pBufferViewInfo or pOut, is null. + /// + If count is 0. + /// + If pBufferViewInfo[].format is not Undefined. 
+ /// + If pBufferViewInfo[].gpuAddr is 0. + /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). + /// + /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. + /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateUntypedBufferViewSrds( + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateUntypedBufViewSrds(this, count, pBufferViewInfo, pOut); } + + /// Validates image view SRD input parameters. Error checking for image view SRDs is handled by a separate + /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when + /// rebuilding SRDs. + /// + /// @param [in] viewInfo Input image view SRD parameter info. + /// + /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: + /// + ErrorImagePlaneUnavailable if the requested image plane specified in the view is not available on + /// the image. + /// + ErrorImageNotShaderAccessible if the image does not have a shader-readable or shader-writable usage. + /// + ErrorInvalidFormatSwizzle if the view's channel swizzle specifies components not available in the view + /// format. + /// + ErrorInvalidBaseMipLevel if the view's start subresource has a mip level larger than the number of + /// available mip levels. + /// + ErrorFormatIncompatibleWithImageFormat if the view's format is not compatible with the image's format. + /// This can happen if: + /// - For color plane views, the bit-depths of the two formats are not equal. + /// + ErrorFormatIncompatibleWithImagePlane if the view's format is not compatible with the image's plane.
+ /// This can happen if: + /// - For depth plane views, the bit-depths of the view format and the depth component of the image + /// are not equal. + /// - For stencil plane views, the bit-depths of the view format and the stencil component of the image + /// are not equal. + /// + ErrorInvalidViewArraySize if: + /// - The view array size is 0. + /// - The image type is 3D and the view array size is not 1. + /// + ErrorViewTypeIncompatibleWithImageType if: + /// - The image type is 1D and the view type is not 1D + /// - The image type is 2D and the view type is not 2D or cubemap + /// - The image type is 3D and the view type is not 3D + /// + ErrorInsufficientImageArraySize if the number of viewed array slices is more than available on + /// the image. + /// + ErrorCubemapIncompatibleWithMsaa if the view type is a cubemap view and the image has + /// multiple samples. + /// + ErrorCubemapNonSquareFaceSize if the view type is a cubemap view and the image 2D extents are not + /// square. + /// + ErrorInvalidViewBaseSlice + /// - If the image type is 3D and the view base slice is not 0. + /// + /// @ingroup ResourceBinding + virtual Result ValidateImageViewInfo(const ImageViewInfo& viewInfo) const = 0; + + /// Creates one or more image view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.imageView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated image view SRD allows a set of subresources in an image to be accessed by a shader, and should + /// be setup as described in @ref ImageViewInfo. 
The client should put the resulting SRD in an appropriate + /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @warning SRDs for Planar YUV images will include padding if pImgViewInfo->subresRange.numSlices > 1 + /// + /// @param [in] count Number of image view SRDs to create; size of the pImgViewInfo array. + /// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateImageViewSrds( + uint32 count, + const ImageViewInfo* pImgViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateImageViewSrds(this, count, pImgViewInfo, pOut); } + + /// Validates fmask view SRD input parameters. Error checking for fmask view SRDs is handled by a separate + /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when + /// rebuilding SRDs. + /// + /// @param [in] viewInfo Input fmask view SRD parameter info. + /// + /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: + /// + ErrorImageFmaskUnavailable if the image does not have an FMask. + /// + ErrorInvalidViewArraySize if the view array size is 0. + /// + ErrorViewTypeIncompatibleWithImageType if the image type is not 2D. + /// + ErrorInsufficientImageArraySize if the view base array slice and size define an out of bounds array range. + /// + /// @ingroup ResourceBinding + virtual Result ValidateFmaskViewInfo(const FmaskViewInfo& viewInfo) const = 0; + + /// Creates one or more fmask view _shader resource descriptors (SRDs)_ in memory provided by the client.
+ /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.fmaskView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated fmask view SRD allows a range of image slices to be accessed by the load_fptr IL instruction, + /// which allows a shader to read compressed MSAA data at the expense of a texture indirection. This SRD should be + /// setup as described in @ref FmaskViewInfo. The client should put the resulting SRD in an appropriate + /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. + /// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateFmaskViewSrds( + uint32 count, + const FmaskViewInfo* pFmaskViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateFmaskViewSrds(this, count, pFmaskViewInfo, pOut); } + + /// Validates a sampler SRD input parameters. Error checking for sampler SRDs is handled by a separate function for + /// performance reasons and to avoid rechecking parameters that the client knows are correct when rebuilding SRDs. + /// + /// @param [in] samplerInfo Input sampler SRD parameter info. + /// + /// @returns Success if the parameters pass validation.
Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if: + /// - The max anisotropy or LOD bias value is outside of the legal range. + /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. + /// - The border color palette index is out of the legal range. + /// + /// @ingroup ResourceBinding + virtual Result ValidateSamplerInfo(const SamplerInfo& samplerInfo) const = 0; + + /// Creates one or more sampler _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.sampler in + /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated sampler SRD controls execution of sample instructions in a shader, and should be setup as + /// described in @ref SamplerInfo. The client should put the resulting SRD in an appropriate location based on the + /// shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. + /// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pSamplerInfo or pOut is null.
+ /// + ErrorInvalidValue if: + /// - The max anisotropy or LOD bias value is outside of the legal range. + /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. + /// - The border color palette index is out of the legal range. + /// + /// @ingroup ResourceBinding + void CreateSamplerSrds( + uint32 count, + const SamplerInfo* pSamplerInfo, + void* pOut) const + { m_pfnTable.pfnCreateSamplerSrds(this, count, pSamplerInfo, pOut); } + + /// Creates one or more _BVH resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.bvhInfo in + /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated BVH SRD controls execution of ray trace instructions in a shader, and should be setup as + /// described in @ref BvhInfo. The client should put the resulting SRD in an appropriate location based on + /// the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. + /// @param [in] pBvhInfo Array of BVH (bounding volume hierarchy) descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pBvhInfo or pOut is null. 
+ /// + /// @ingroup ResourceBinding + void CreateBvhSrds( + uint32 count, + const BvhInfo* pBvhInfo, + void* pOut) const + { + m_pfnTable.pfnCreateBvhSrds(this, count, pBvhInfo, pOut); + } + + /// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. + /// + /// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. + /// @param [out] pViewInfo Pointer to memory to be filled with decoded info. + void DecodeBufferViewSrd( + const void* pBufferViewSrd, + BufferViewInfo* pViewInfo) const + { + m_pfnTable.pfnDecodeBufferViewSrd(this, pBufferViewSrd, pViewInfo); + } + + /// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). + /// + /// @param [in] image The same image the SRD was created on. + /// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. + /// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info. + void DecodeImageViewSrd( + const IImage& image, + const void* pImageViewSrd, + DecodedImageSrd* pDecodedInfo) const + { + m_pfnTable.pfnDecodeImageViewSrd(this, &image, pImageViewSrd, pDecodedInfo); + } + + /// The MSAA sample pattern palette is a client-managed table of sample patterns that might be in use by the app. + /// + /// The only purpose of this palette is to implement the samplepos shader instruction. This instruction returns the + /// position of a particular sample based on the sample pattern of the current rasterizer state or a particular + /// specified resource. When this instruction is executed, the shader will determine the correct palette index + /// as specified in the pipeline (see samplePatternIdx in the rsState structure inside GraphicsPipelineCreateInfo) + /// or in the image view SRD (see samplePatternIdx in ImageViewInfo). 
The shader will then return the position + /// for the specified sample in the specified entry of the currently bound sample pattern palette as set with this + /// function. + /// + /// The initial bound sample pattern palette values are undefined. A palette entry must be specified before it is + /// referenced by a samplepos instruction. + /// + /// @warning The samplepos instruction and sample pattern palette both assume every pixel has the same pattern. + /// This may not be accurate if the application uses custom sample patterns that aren't identical for + /// every pixel in the quad. + /// + /// @param [in] palette Small set of sample patterns. Each sample pattern consists of an x,y tuple per sample + /// describing where it is located in a pixel. The coordinate system is described in + /// MsaaStateCreateInfo. + /// + /// @returns Success if the palette was successfully updated. An error is only possible due to an internal error, + /// such as a failure to allocate GPU memory for a new table. + virtual Result SetSamplePatternPalette( + const SamplePatternPalette& palette) = 0; + + /// Determines the amount of system memory required for a border color palette object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateBorderColorPalette(). + /// + /// @param [in] createInfo Border color palette creation info (specifies number of entries). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IBorderColorPalette object with the specified + /// properties. A return value of 0 indicates the createInfo was invalid. + virtual size_t GetBorderColorPaletteSize( + const BorderColorPaletteCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a border color palette object. 
+ /// + /// @param [in] createInfo Border color palette creation info (number of entries). + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetBorderColorPaletteSize() with the + /// same createInfo param. + /// @param [out] ppPalette Constructed border color palette object. When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the border color palette was successfully created. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppPalette is null. + /// + ErrorInvalidValue if the palette size is too large to be used on any queue on this device. + virtual Result CreateBorderColorPalette( + const BorderColorPaletteCreateInfo& createInfo, + void* pPlacementAddr, + IBorderColorPalette** ppPalette) const = 0; + + /// Determines the amount of system memory required for a compute pipeline object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateComputePipeline(). + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetComputePipelineSize( + const ComputePipelineCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a compute @ref IPipeline object with the requested properties. + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. 
There must be as + /// much size available here as reported by calling GetComputePipelineSize() with the + /// same createInfo param. + /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppPipeline is null. + /// - A required shader pointer is null. + /// - The link time constant data pointer is null. + /// + ErrorInvalidValue if: + /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. + /// - The dynamic memory view mapping slot object type is not unused, resource, or UAV. + /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. + virtual Result CreateComputePipeline( + const ComputePipelineCreateInfo& createInfo, + void* pPlacementAddr, + IPipeline** ppPipeline) = 0; + + /// Determines the amount of system memory required for a shader library object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateShaderLibrary(). + /// + /// @param [in] createInfo Library creation parameters including ELF code object and other items. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IShaderLibrary object with the specified properties. + /// A return value of zero indicates the createInfo was invalid. + virtual size_t GetShaderLibrarySize( + const ShaderLibraryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a @ref IShaderLibrary object with the requested properties. + /// + /// @param [in] createInfo Library creation parameters including ELF code object and other items.
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetShaderLibrarySize() with the + /// same createInfo parameter. + /// @param [out] ppLibrary Constructed library object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the library was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppLibrary is null. + /// - Required code object pointer is null. + virtual Result CreateShaderLibrary( + const ShaderLibraryCreateInfo& createInfo, + void* pPlacementAddr, + IShaderLibrary** ppLibrary) = 0; + + /// Determines the amount of system memory required for a graphics pipeline object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateGraphicsPipeline(). + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetGraphicsPipelineSize( + const GraphicsPipelineCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a graphics @ref IPipeline object with the requested properties. + /// If this pipeline is created from a set of @IShaderLibrary objects, it is the caller's responsibility to ensure + /// that the lifetime of the pipeline does not exceed the lifetime of the libraries themselves. + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. 
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGraphicsPipelineSize() with the + /// same createInfo param. + /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppPipeline is null. + /// - A required shader pointer is null. + /// - The link time constant data pointer is null. + /// + ErrorInvalidValue if: + /// - The number of control points is invalid for a tessellation pipeline. + /// - Logic operations are enabled while some of the color targets enable blending. + /// - The dual source blend enable doesn't match expectations for color target and blend enable setup. + /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. + /// - The dynamic memory view mapping slot object type is not unused, resource, or UAV. + /// + ErrorInvalidFormat if: + /// - Blending is enabled but the color target format doesn't support blending. + /// - Logic operations are enabled but an incompatible format is used. + /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. + virtual Result CreateGraphicsPipeline( + const GraphicsPipelineCreateInfo& createInfo, + void* pPlacementAddr, + IPipeline** ppPipeline) = 0; + + /// Determines the amount of system memory required for a MSAA state object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateMsaaState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IMsaaState object. + /// This value will always be non-zero if the device has GfxIp support.
+ virtual size_t GetMsaaStateSize() const = 0; + + /// Creates an @ref IMsaaState object with the requested properties. + /// + /// @param [in] createInfo Properties of the MSAA state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetMsaaStateSize() with the same + /// createInfo param. + /// @param [out] ppMsaaState Constructed MSAA state object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the MSAA state was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppMsaaState is null. + /// + ErrorInvalidValue if: + /// - The number of samples is unsupported. + virtual Result CreateMsaaState( + const MsaaStateCreateInfo& createInfo, + void* pPlacementAddr, + IMsaaState** ppMsaaState) const = 0; + + /// Determines the amount of system memory required for a color blend state object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateColorBlendState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IColorBlendState object. + /// This value will always be non-zero if the device has GfxIp support. + virtual size_t GetColorBlendStateSize() const = 0; + + /// Creates an @ref IColorBlendState object with the requested properties. + /// + /// @param [in] createInfo Properties of the color blend state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetColorBlendStateSize() with the + /// same createInfo param. + /// @param [out] ppColorBlendState Constructed color blend state object. 
When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the color blend state was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppColorBlendState is null. + /// + ErrorInvalidValue if: + /// - An unsupported blend function is used with dual source blending. + virtual Result CreateColorBlendState( + const ColorBlendStateCreateInfo& createInfo, + void* pPlacementAddr, + IColorBlendState** ppColorBlendState) const = 0; + + /// Determines the amount of system memory required for a depth/stencil state object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IDepthStencilState object. + /// This value will always be non-zero if the device has GfxIp support. + virtual size_t GetDepthStencilStateSize() const = 0; + + /// Creates an @ref IDepthStencilState object with the requested properties. + /// + /// @param [in] createInfo Properties of the depth/stencil state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be + /// as much size available here as reported by calling GetDepthStencilStateSize() + /// with the same createInfo param. + /// @param [out] ppDepthStencilState Constructed depth/stencil state object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the depth/stencil state was successfully created. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppDepthStencilState is null. + /// + ErrorInvalidValue if: + /// - Depth bounds is enabled and the depth range is invalid. 
+ virtual Result CreateDepthStencilState( + const DepthStencilStateCreateInfo& createInfo, + void* pPlacementAddr, + IDepthStencilState** ppDepthStencilState) const = 0; + + /// Determines the amount of system memory required for a queue semaphore object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateQueueSemaphore(). + /// + /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore count. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueueSemaphore object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueueSemaphoreSize( + const QueueSemaphoreCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IQueueSemaphore object with the requested properties. + /// + /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore + /// count. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueueSemaphoreSize() with the + /// same createInfo param. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the queue semaphore was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. + /// + ErrorInvalidValue if createInfo.initialCount is outside of the [0..31] range.
+ virtual Result CreateQueueSemaphore( + const QueueSemaphoreCreateInfo& createInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + + /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore + /// from a different device. An allocation of this amount of memory must be provided in the pPlacementAddr + /// parameter of OpenSharedQueueSemaphore(). + /// + /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the + /// specified properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetSharedQueueSemaphoreSize( + const QueueSemaphoreOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable queue semaphore object created on another device for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetSharedQueueSemaphoreSize() with + /// the same params. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. + /// + ErrorNotShareable if the specified queue semaphore object was not marked as shareable on creation. 
+ virtual Result OpenSharedQueueSemaphore( + const QueueSemaphoreOpenInfo& openInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + + /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore from + /// a different API which isn't a PAL client. An allocation of this amount of memory must be provided in the + /// pPlacementAddr parameter of OpenExternalSharedQueueSemaphore(). + /// + /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the + /// specified properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetExternalSharedQueueSemaphoreSize( + const ExternalQueueSemaphoreOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable queue semaphore object created on another API which isn't a PAL client for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open and flags. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling + /// GetExternalSharedQueueSemaphoreSize() with the same params. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null.
+ virtual Result OpenExternalSharedQueueSemaphore( + const ExternalQueueSemaphoreOpenInfo& openInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + +#if PAL_KMT_BUILD + /// Opens an external NT handle from name. + /// + /// @param [in] handleInfo Specifies the NT handle attributes when opening from name. + /// @param [out] pHandle A NT handle returned by runtime. + /// + /// @returns Success if the NT handle was successfully opened. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidValue if the name or attributes is invalid. + virtual Result OpenExternalHandleFromName( + const ExternalHandleInfo& handleInfo, + OsExternalHandle* pHandle) = 0; +#endif + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) + /// Query current gpu memory usage info and budget info of specified heap group of the device. + /// + /// @param [out] pInfo Heap usage and budget info reported from Os. + virtual Result QueryGpuMemoryBudgetInfo( + GpuMemoryBudgetInfo* pInfo) = 0; +#endif + + /// Determines the amount of system memory required for an IFence object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateFence(). + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IFence object. + virtual size_t GetFenceSize( + Result* pResult) const = 0; + + /// Creates a GPU fence object. + /// + /// @param [in] createInfo Data controlling the fence properties + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetFenceSize(). + /// @param [out] ppFence Constructed fence object. When successful, the returned address will be + /// the same as specified in pPlacementAddr.
+ /// + /// @returns Success if the fence was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppFence is null. + virtual Result CreateFence( + const FenceCreateInfo& createInfo, + void* pPlacementAddr, + IFence** ppFence) const = 0; + + /// Opens a fence which was shared by another Device. + /// + /// @param [in] openInfo A reference to FenceOpenInfo, the handle is used if it's not null, or the + /// event is opened via name. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetFenceSize(). + /// @param [out] ppFence Constructed fence object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// @returns Success if the event was successfully reconstructed, otherwise an appropriate error code. + virtual Result OpenFence( + const FenceOpenInfo& openInfo, + void* pPlacementAddr, + IFence** ppFence) const = 0; + + /// Determines the amount of system memory required for an IGpuEvent object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateGpuEvent(). + /// + /// @param [in] createInfo Properties of the GPU event object to create. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuEvent object. + virtual size_t GetGpuEventSize( + const GpuEventCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a GPU event object. + /// + /// @param [in] createInfo Properties of the GPU event object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGpuEventSize().
+ /// @param [out] ppGpuEvent Constructed event object. When successful, the returned address will be the same as + /// specified in pPlacementAddr. + /// + /// @returns Success if the event was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuEvent is null. + virtual Result CreateGpuEvent( + const GpuEventCreateInfo& createInfo, + void* pPlacementAddr, + IGpuEvent** ppGpuEvent) = 0; + + /// Determines the amount of system memory required for a query pool object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateQueryPool(). + /// + /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are in + /// the pool. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueryPool object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueryPoolSize( + const QueryPoolCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IQueryPool object with the requested properties. + /// + /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are + /// in the pool. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueryPoolSize() with the same + /// createInfo param. + /// @param [out] ppQueryPool Constructed query pool object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the query pool was successfully created.
Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueryPool is null. + /// + ErrorInvalidValue if createInfo.numSlots is zero. + virtual Result CreateQueryPool( + const QueryPoolCreateInfo& createInfo, + void* pPlacementAddr, + IQueryPool** ppQueryPool) const = 0; + + /// Determines the amount of system memory required for a command allocator object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateCmdAllocator(). + /// + /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ICmdAllocator object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetCmdAllocatorSize( + const CmdAllocatorCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a command allocator object that can allocate GPU memory with the specified properties for use by command + /// buffer objects. + /// + /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetCmdAllocatorSize() with the + /// same createInfo param. + /// @param [out] ppCmdAllocator Constructed command allocator object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the command allocator was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppCmdAllocator is null. 
+ virtual Result CreateCmdAllocator( + const CmdAllocatorCreateInfo& createInfo, + void* pPlacementAddr, + ICmdAllocator** ppCmdAllocator) = 0; + + /// Determines the amount of system memory required for a command buffer object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateCmdBuffer(). + /// + /// @param [in] createInfo Command buffer properties including the target queue type. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ICmdBuffer object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetCmdBufferSize( + const CmdBufferCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a command buffer object that can build work intended for a particular queue type. + /// + /// @param [in] createInfo Command buffer properties including the target queue type. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetCmdBufferSize() with the same + /// createInfo param. + /// @param [out] ppCmdBuffer Constructed command buffer object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the command buffer was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppCmdBuffer is null. + virtual Result CreateCmdBuffer( + const CmdBufferCreateInfo& createInfo, + void* pPlacementAddr, + ICmdBuffer** ppCmdBuffer) = 0; + + /// Determines the amount of system memory required for an indirect command generator object. 
An allocation of this + /// amount must be provided in the pPlacementAddr parameter of CreateIndirectCmdGenerator(). + /// + /// @param [in] createInfo Indirect command generator properties. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation steps. + /// + /// @returns Size, in bytes, of system memory required for an IIndirectCmdGenerator object with the specified + /// properties. A return value of zero indicates the createInfo was invalid. + virtual size_t GetIndirectCmdGeneratorSize( + const IndirectCmdGeneratorCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an indirect command generator object which can translate an application-specified command buffer into a + /// format understandable by the GPU. + /// + /// @param [in] createInfo Indirect command generator properties. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetIndirectCmdGeneratorSize() with + /// the same createInfo param. + /// @param [out] ppGenerator Constructed indirect command generator object. When successful, the returned + /// address will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the command generator was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGenerator is null. + virtual Result CreateIndirectCmdGenerator( + const IndirectCmdGeneratorCreateInfo& createInfo, + void* pPlacementAddr, + IIndirectCmdGenerator** ppGenerator) const = 0; + + /// Determines the amount of system memory required for a perf experiment object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreatePerfExperiment(). + /// + /// @param [in] createInfo Properties of the performance experiment to be created.
+ /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPerfExperiment object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetPerfExperimentSize( + const PerfExperimentCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a performance experiment object that can gather performance counter and trace data for a specific span + /// of a command buffer. + /// + /// @param [in] createInfo Properties of the performance experiment to be created. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPerfExperimentSize() with the + /// same createInfo param. + /// @param [out] ppPerfExperiment Constructed performance experiment object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the perf experiment was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppPerfExperiment is null. + virtual Result CreatePerfExperiment( + const PerfExperimentCreateInfo& createInfo, + void* pPlacementAddr, + IPerfExperiment** ppPerfExperiment) const = 0; + + /// Gets @ref IPrivateScreen objects owned by this device. Private screens are screens not exposed through standard + /// OS mechanisms. This function should be called again when any of the private screens are plugged or unplugged. + /// The first call to this function enumerates all private screens and stores in device object as well. The next + /// call triggered by hot-plug event enumerates private screens again but only destroys removed ones and creates + /// new private screen objects for newly-added ones.
The hash code generated at enumeration time is used as id of + /// private screens. If the id of an enumerated private screen already exists, it is treated as unchanged. The EDID + /// array and display index are used to generate MD5 hash code. + /// + /// @param [out] pNumScreens Pointer to the number of private screens, note that this number does not mean first + /// *pNumScreens elements in ppScreens are valid but just a hint that total *pNumScreens + /// out of MaxPrivateScreens are valid. + /// @param [out] ppScreens Pointer to the array of private screens. The client must pass in the pointer to an + /// array of at least MaxPrivateScreens pointers to IPrivateScreen. + /// + /// @returns Success if the private screens are correctly retrieved. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if the device does not support private screen functionalities. + virtual Result GetPrivateScreens( + uint32* pNumScreens, + IPrivateScreen** ppScreens) = 0; + + /// Registers an emulated @ref IPrivateScreen object owned by this device. An emulated private screen doesn't have + /// a physical display hardware connected to the GPU. This could be useful for debugging unusual configurations or + /// using in automation systems when no real HMDs are available. + /// + /// @param [in] createInfo Properties used to create an emulated private screen object. + /// @param [out] pTargetId Pointer to returned emulated private screen target id. + /// + /// @returns Success if the emulated private screen is correctly created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorTooManyPrivateScreens if the device cannot create an emulated private screen. + virtual Result AddEmulatedPrivateScreen( + const PrivateScreenCreateInfo& createInfo, + uint32* pTargetId) = 0; + + /// Removes an emulated @ref IPrivateScreen object owned by this device. + /// + /// @param [in] targetId Target id of emulated private screen to be removed.
+ /// + /// @returns Success if the emulated private screen is correctly removed. Otherwise, one of the following errors may + /// be returned: + /// + ErrorUnknown if any unknown error occurs. + virtual Result RemoveEmulatedPrivateScreen( + uint32 targetId) = 0; + + /// Determines the amount of system memory required for a private screen image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of CreatePrivateScreenImage(). + /// + /// Only images created through this interface are valid sources for IPrivateScreen::Present(). + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. + /// Should be specified to the pImagePlacementAddr argument of CreatePrivateScreenImage(). + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached to + /// the private screen IImage. Should be specified to the pGpuMemoryPlacementAddr + /// argument of CreatePrivateScreenImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + virtual void GetPrivateScreenImageSizes( + const PrivateScreenImageCreateInfo& createInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + Result* pResult) const = 0; + + /// Creates private screen presentable image. A private screen presentable image is similar to a regular presentable + /// image but can only be presented on the private screens. It has some implicit properties relative to standard + /// images, such as mipLevels=1, arraySize=1, numSamples=1, etc. It also requires its bound GPU memory to be + /// pinned before presenting. + /// + /// @param [in] createInfo Create info.
+ /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must + /// be as much size available here as reported by calling + /// GetPrivateScreenImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct an IGpuMemory associated + /// with this private screen image. There must be as much size available here as + /// reported by calling GetPrivateScreenImageSizes(). + /// @param [out] ppImage Constructed image object. + /// @param [out] ppGpuMemory Constructed dummy memory object. This object is only valid for specifying + /// in a memory reference list. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorPrivateScreenInvalidFormat if the format isn't supported on the private screen. + /// + ErrorPrivateScreenRemoved if the private screen was removed. + virtual Result CreatePrivateScreenImage( + const PrivateScreenImageCreateInfo& createInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an ISwapChain object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateSwapChain(). + /// + /// @param [in] createInfo All the information related with this swap chain. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ISwapChain object. + virtual size_t GetSwapChainSize( + const SwapChainCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Create swap chain object based on the local window system. It doesn't include creation of presentable images. + /// The presentable images should be associated with SwapChain object when presentable image is created.
+ /// + /// @param [in] createInfo All the information related with this swap chain. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetSwapChainSize(). + /// @param [out] ppSwapChain Constructed swapchain object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the swap chain instance was created successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result CreateSwapChain( + const SwapChainCreateInfo& createInfo, + void* pPlacementAddr, + ISwapChain** ppSwapChain) = 0; + + /// Sets a power profile for this device. + /// + /// @param [in] profile A profile is a pre-defined configuration that indicates how KMD/PPLib is notified to work, + /// e.g. raise or lower the GPU clock etc. + /// @param [in,out] pInfo Custom power profile info needed for VrCustom mode, can be null for other modes. Note the + /// actualSwitchInfo[] field is output part of @ref CustomPowerProfile. + /// + /// @returns Success if the profile is set successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result SetPowerProfile( + PowerProfile profile, + CustomPowerProfile* pInfo) = 0; + + /// Sends an escape call to the KMD to enable power optimizations for DirectML/ROCm workloads. The client must call + /// this function when the context for any ROCm or DirectML workload is created or destroyed. + /// + /// @param [in] enableOptimization Set to true if called during context creation (enable power optimizations), + /// false if context destroyed (restore defaults). + /// + /// @returns Success if the power optimization is set successfully.
Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result SetMlPowerOptimization( + bool enableOptimization) const = 0; + + /// Queries workstation caps on this device. + /// + /// @param [out] pCaps Pointer to location where PAL should write back workstation caps. + /// + /// @returns Success if the workstation caps were retrieved from KMD successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorOutOfMemory if out of system memory. + virtual Result QueryWorkStationCaps( + WorkStationCaps* pCaps) const = 0; + + /// Queries display connectors installed on the GPU + /// + /// @param [in,out] pConnectorCount Input value specifies the maximum number of connectors to enumerate, and the + /// output value specifies the total number of connectors that were enumerated + /// in pConnectors. The input value is ignored if pConnectors is null. + /// This pointer must not be null. + /// @param [out] pConnectors Output list of connectors. Can be null, in which case the total number of + /// available connectors will be written to pConnectorCount. + /// + /// + /// @returns Success if the connectors were queried successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result QueryDisplayConnectors( + uint32* pConnectorCount, + DisplayConnectorProperties* pConnectors) = 0; + + /// Queries the dispatch kernel source code + /// + /// @returns Returns the source code for the kernel dispatch function used in + /// the kernel scheduler for device enqueue (CL C code) or nullptr if it's not available. + /// + /// @note This function is to support OpenCL AQL submissions.
+ virtual const char* GetDispatchKernelSource() const = 0; + + /// @} + + /// Query the Flgl state from the device. Device will query the connectivity of GLSync card and return + /// the state. Pal internal state of Flgl will be updated. + /// + /// @param [out] pState Pointer to the location that PAL should write the internal flgl states back. + /// + /// @returns Success if query returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglQueryState( + FlglState* pState) = 0; + + /// Set the Flgl config of the device. + /// + /// @param [in] glSyncConfig const reference to the config struct. + /// + /// @returns Success if setting returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occurs. + /// + ErrorUnsupported if this GenLock function is not available. + virtual Result FlglSetSyncConfiguration( + const GlSyncConfig& glSyncConfig) = 0; + + /// Get the Flgl config of the device. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [out] pGlSyncConfig Pointer to the location that PAL should write the config back. + /// + /// @returns Success if query returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occurs. + /// + ErrorInvalidPointer if pGlSyncConfig is a null pointer. + virtual Result FlglGetSyncConfiguration( + GlSyncConfig* pGlSyncConfig) const = 0; + + /// Set the Framelock to disable or enable. Client should call this interface first to enable/disable Flgl. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [in] enable If true enables KMD framelock, otherwise disables framelock.
+ /// + /// @returns Success if framelock was enabled/disabled successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not supported on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglSetFrameLock( + bool enable) = 0; + + /// Set the Genlock to disable or enable. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [in] enable If true enables the genlock, otherwise disables genlock. + /// + /// @returns Success if genlock was enabled/disabled successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not supported on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglSetGenLock( + bool enable) = 0; + + /// Reset the framelock HW counter. The following counter operations are submitted directly to hardware via the I2C + /// interface; PAL doesn't store the counter internally. The client should manage the counter. + /// + /// @returns Success if the HW counter is reset successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglResetFrameCounter() const = 0; + + /// Check if a reset/discontinuity of HW framecounter occurs. If returns false, there is no need to query HW frame + /// counter, client should update its software counter instead. If returns true, client is required to query HW + /// counter and adjusts its software counter accordingly. + /// + /// @param [out] pReset Pointer to the location that PAL should write the reset status back. + /// + /// @returns Success if the reset status is returned. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if this function is not available on this Asic.
+ /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglGetFrameCounterResetStatus( + bool* pReset) const = 0; + + /// Get the framelock HW counter. + /// + /// @param [out] pValue Pointer to the location that PAL should write the frame counter value back. + /// @param [out] pReset Pointer to the location that PAL should write the frame counter reset state. + /// + /// @returns Success if the frame counter is returned. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if this function is not available on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglGetFrameCounter( + uint64* pValue, + bool* pReset) const = 0; + + /// Checks if the specified externally-controlled feature settings have changed since the last time the function was + /// called. + /// + /// This is intended to be a lightweight function that can be called per frame per feature. If the function + /// returns Result::Success and (*pRsFeaturesChanged & RsFeatureTypeXX) != 0, then the user changed some related + /// settings in the UI. + /// + /// If TurboSync has updated, the client should first try to re-read the application profile settings by calling + /// IPlatform::QueryRawApplicationProfile() with client = User3D. If that returns Unsupported, then fall back + /// to device-wide TurboSync settings read via GetRsFeatureGlobalSettings(). + /// + /// If Chill has updated, call IPlatform::QueryRawApplicationProfile() with client = Chill to re-read the + /// system app profiles and then with client = User3D for any per-user Chill overrides, and additionally + /// call GetRsFeatureGlobalSettings() to get the Chill enabled state. + /// + /// If Delag has updated, call IPlatform::QueryRawApplicationProfile() with client = User3D to get the enabled + /// state, and additionally call GetRsFeatureGlobalSettings() to get the Delag hotkey. 
+ /// + /// + /// @param [in] rsFeatures Bitmask of RsFeatureType value(s) to query. Use UINT_MAX to poll all. + /// @param [out] pRsFeaturesChanged Bitmask of queried RsFeatureTypes that have changed since last polling. + /// + /// @returns Success if the call succeeded. + virtual Result DidRsFeatureSettingsChange( + uint32 rsFeatures, + uint32* pRsFeaturesChanged) = 0; + + /// Gets externally-controlled per-device settings for the requested RsFeatureType. + /// + /// @param [in] rsFeature Feature type to request information for (singular, not a mask). + /// @param [out] pRsFeatureInfo Settings related to the specified RsFeatureType. + /// + /// @returns Success if the call succeeded. + virtual Result GetRsFeatureGlobalSettings( + RsFeatureType rsFeature, + RsFeatureInfo* pRsFeatureInfo) = 0; + + /// Update Chill Status (last active time stamp). After every frame, UMD needs to generate a time stamp and inform + /// KMD through the shared memory, if the time stamp changes between 2 frames, it means Chill is active and KMD + /// needs to adjust power through PSM. + /// + /// @param [in] lastChillActiveTimeStampUs the last Chill active time stamp in microseconds to set + /// + /// @returns Success if the call succeeded. + virtual Result UpdateChillStatus( + uint64 lastChillActiveTimeStampUs) = 0; + + /// Make the Bus Addressable allocations available to be accessed by remote device. + /// Exposes the surface and marker bus addresses for each allocation. These bus addresses can be accessed by + /// calling @ref IGpuMemory::Desc() on the appropriate object. + /// Client drivers must call @ref AddGpuMemoryReferences() for all relevant allocations before calling this. + /// + /// @param [in] pQueue Queue used by PAL for performing this operation. + /// @param [in] gpuMemCount Number of GPU memory allocations to expose to remote devices. + /// @param [in] ppGpuMemList Array of gpuMemCount IGpuMemory objects. 
+ /// + /// @returns Success if bus addresses are available by calling @ref IGpuMemory::Desc() on all IGpuMemory objects + virtual Result InitBusAddressableGpuMemory( + IQueue* pQueue, + uint32 gpuMemCount, + IGpuMemory*const* ppGpuMemList) = 0; + + /// Create virtual display. Virtual display is similar to the regular display (IScreen), the difference is the + /// virtual display doesn't have a physical monitor connected. When CreateVirtualDisplay is called, KMD will + /// generate a hot-plug-in event to notify application a new display is added. Then the app/client will call PAL to + /// re-query the attached screens and they will find a new one in the list that is pretend, but they can use it just + /// like a normal display. + /// + /// @param [in] virtualDisplayInfo Virtual display creation information. + /// @param [out] pScreenTargetId The screen target ID returned by KMD + /// + /// @returns Success if the call succeeded. + virtual Result CreateVirtualDisplay( + const VirtualDisplayInfo& virtualDisplayInfo, + uint32* pScreenTargetId) = 0; + + /// Destroy virtual display. When DestroyVirtualDisplay is called KMD will generate a hot-plug-out event to notify + /// application a virtual display is removed, it also will be removed from the display list, and app/client can't + /// use it anymore. + /// + /// @param [in] screenTargetId Screen target ID. + /// + /// @returns Success if the call succeeded. + virtual Result DestroyVirtualDisplay( + uint32 screenTargetId) = 0; + + /// Query virtual display Properties from screen target Id. + /// + /// @param [in] screenTargetId Screen target ID. + /// @param [out] pProperties A pointer to VirtualDisplayProperties + /// + /// @returns Success if the call succeeded. + virtual Result GetVirtualDisplayProperties( + uint32 screenTargetId, + VirtualDisplayProperties* pProperties) = 0; + + /// Determines if hardware accelerated stereo rendering can be enabled for given graphic pipeline.
+ /// If hardware accelerated stereo rendering can be enabled, client doesn't need to do shader patching + /// which includes translating view id intrinsic to user data slot, outputting render target + /// array index and viewport array index in shader closest to scan converter. + /// + /// @param [in] viewInstancingInfo Graphic pipeline view instancing information. + /// + /// @returns True if hardware accelerated stereo rendering can be enabled, False otherwise. + virtual bool DetermineHwStereoRenderingSupported( + const GraphicPipelineViewInstancingInfo& viewInstancingInfo) const = 0; + + /// Get the UTF-8 file path used to put all files for cache purpose. + /// On Windows, the returned path depends on Util::GetProcessIntegrityLevel and Util::IsProcessInAppContainer. + /// + /// @returns Pointer to cache file path. + virtual const char* GetCacheFilePath() const = 0; + + /// Get the UTF-8 file path used to put all files for debug purpose (such as logs, dumps, replace shader). + /// + /// @returns Pointer to debug file path. + virtual const char* GetDebugFilePath() const = 0; + + /// Queries the base driver Radeon Software Version string (as shown in Radeon Settings). + /// + /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written. + /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). + /// + /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors + /// may be returned: + /// + Unsupported if this function is not available on this environment. + /// + NotFound if the Radeon Software Version string is not present. + /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. + virtual Result QueryRadeonSoftwareVersion( + char* pBuffer, + size_t bufferLength) const = 0; + + /// Queries the base Driver Release Version string. + /// + /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written.
+ /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). + /// + /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors + /// may be returned: + /// + Unsupported if this function is not available on this environment. + /// + NotFound if the Release Version string is not present. + /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. + virtual Result QueryReleaseVersion( + char* pBuffer, + size_t bufferLength) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +#if defined(__unix__) + /// Gets the modifier list and modifier count according to image format. + /// + /// @param [in] format Image format. + /// @param [out] pModifierCount Modifier count. + /// @param [out] pModifiersList A pointer to the modifier list. + virtual void GetModifiersList( + ChNumFormat format, + uint32* pModifierCount, + uint64* pModifiersList) const = 0; +#endif + + /// Passes the HIP runtime state setup to KMD on behalf of the HIP runtime. + /// + /// @param [in] runtimeState A structure containing runtime state information to pass to KMD + /// + /// @returns Result for error handling. 
+ virtual Result RegisterHipRuntimeState(const HipRuntimeSetup& runtimeState) const = 0; + + /// Sets the second-level trap handler for HIP + /// + /// @param [in] pTrapHandlerCode A pointer to the piece of memory containing the trap handler code + /// This may be nullptr, which indicates that there is no secondary trap handler. + /// @param [in] codeOffset An offset, in bytes, into the pTrapHandlerCode's memory region + /// @param [in] pTrapHandlerMemory A pointer to the piece of memory containing the trap handler's memory + /// This may be nullptr, which indicates that there is no valid trap handler + /// memory. + /// @param [in] memoryOffset An offset, in bytes, into the pTrapHandlerMemory's memory region + /// + /// @returns Result for error handling. + virtual Result SetHipTrapHandler( + const IGpuMemory* pTrapHandlerCode, + gpusize codeOffset, + const IGpuMemory* pTrapHandlerMemory, + gpusize memoryOffset) const = 0; + + /// Sets the CmdDisassembly::ICmdBufferReporting associated with this device + /// Currently only one can be registered at a time - but this could be changed in the future + /// + /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting being + /// registered + /// @return Result::Success if successful + /// Result::Unsupported if feature is not supported + /// Result::AlreadyExists if there is already a CmdDisassembly::ICmdBufferReporting + /// registered with this device + /// Result::ErrorInvalidValue if pInterface == nullptr + /// + virtual Result RegisterCmdReportingInterface( + CmdDisassembly::ICmdBufferReporting* pInterface) + { + return Result::Unsupported; + } + + /// Resets the CmdDisassembly::ICmdBufferReporting associated with this device to nullptr + /// Currently only one can be registered at a time - but this could be changed in the future + /// + /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting that + /// was registered + /// @return Result::Success if successful + ///
Result::Unsupported if feature is not supported + /// Result::ErrorInvalidValue if pInterface == nullptr or pInterface was not the last + /// CmdDisassembly::ICmdBufferReporting registered + /// + virtual Result UnregisterCmdReportingInterface( + CmdDisassembly::ICmdBufferReporting* pInterface) + { + return Result::Unsupported; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IDevice() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Device objects will be destroyed when + /// @ref IPlatform::Destroy() is called. + virtual ~IDevice() { } + + DeviceInterfacePfnTable m_pfnTable; ///< SRD function pointer table. + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +/** + *********************************************************************************************************************** + * @defgroup ResourceBinding Resource Binding Model + * + * _Resource binding_ refers to the process of binding resources (textures, UAVs, samplers, etc.) for access by shaders + * in a pipeline. This is an area where 3D APIs diverge significantly. PAL's model is designed to minimally abstract + * the underlying hardware in a way that allows performant implementations by each client driver. + * + * ### Hardware User Data + * GCN hardware has 16 _user data_ registers that act as a generic interface for passing values from a command buffer to + * a shader. User data registers are set to their desired value via packets in a command buffer, then the specified + * values are loaded from the user data registers into shader GPRs when a wave is launched. 
Since the user data is just + * arbitrary generic data, this method can be used to pass any type of data a client may want to specify directly from + * a command buffer, for example: + * + * - __Constant value__ - a 32-bit floating point or integer constant could be written into user data then be used + * directly by the shader. + * - __Shader resource descriptor (SRD)__ - 4 or 8 dwords of consecutive data could be an SRD which will be used as a + * t#, s#, etc. by the shader. + * - __Pointer__ - The user data could be an arbitrary GPU virtual address where a table of constants, SRDs, etc. are + * stored. + * + * The 3D driver and shader compiler are responsible for working together to define how resources referenced in a shader + * should be mapped to user data bound in a command buffer. + * + * ### PAL User Data + * PAL only lightly abstracts the hardware user data concept. DeviceProperties reports the number of user data entries + * supported on the device in maxUserDataEntries. Note that some clients may require more user data entries than there + * are physical user data registers - PAL will manage "spilling" of user data entries to GPU memory if necessary. + * + * User data entries are set in a command buffer by calling ICmdBuffer::CmdSetUserData(). + * + * ### Shader User Data Mapping + * When creating a pipeline, the client must specify how the user data entries set in a command buffer map to resources + * referenced by each shader in the pipeline. This is done in the pUserDataNodes array of PipelineShaderInfo. + * + * The resource mapping is built as a graph of _resource mapping nodes_ where the root nodes in the graph correspond + * to the user data entries. Each node fits in one of the following categories: + * + * - __SRD__: A 4 or 8 dword descriptor describing a shader resource. The mapping specifies the type and slot the SRD + * corresponds to (e.g., UAV 3 or sampler 7). 
+ * - __Descriptor table pointer__: A GPU virtual address pointing at an array of other nodes. Typically this will be a + * pointer to GPU memory containing just SRDs, but tables are free to be built hierarchically such that tables have + * pointers to other tables in them. + * - __Inline constants__: 32-bit constants loaded directly by the shader. The mapping specifies the CB slot that + * should load the constant (e.g., cb3[1]). + * - __Unused__: A particular shader may not use all entries in a user data layout, and those should be marked unused. + * + * The following image illustrates a simple user data mapping: + * + * @image html userDataMapping.png + * + * ### Building Descriptor Tables + * The client is responsible for building and specifying SRDs and pointers to GPU memory in order to execute the shader + * resource mapping specified during pipeline creation. SRDs can be created with several methods provided by IDevice: + * + * - CreateTypedBufferViewSrds() + * - CreateUntypedBufferViewSrds() + * - CreateImageViewSrds() + * - CreateFmaskViewSrds() + * - CreateSamplerSrds() + * + * The size required for each of these SRD types is returned in the srdSizes structure in DeviceProperties. + * + * When building descriptor tables in GPU memory, the client will need to retrieve a virtual address of the GPU memory + * where the tables exist in order to reference them from user data or from other descriptor tables. IGpuMemory + * provides the GetVirtAddr() method for this purpose.
+ *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFence.h b/shared/amdgpu-windows-interop/pal/inc/core/palFence.h new file mode 100644 index 0000000000..0d2e3012f9 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFence.h @@ -0,0 +1,171 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFence.h + * @brief Defines the Platform Abstraction Library (PAL) IFence interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +/// Specifies properties for @ref IFence creation. Input structure to IDevice::CreateFence(). +struct FenceCreateInfo +{ + union + { + struct + { + uint32 signaled : 1; ///< Specify whether the initial status of the fence is signaled or not. + uint32 eventCanBeInherited : 1; ///< The event handle can be inherited by a child process. + uint32 shareable : 1; ///< This fence may be opened for use by a different device. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Fence creation flags. +#if defined(_WIN32) + const wchar_t* pName; ///< The name of the event object. Windows uses this name to uniquely identify fence objects + /// across processes. +#endif +}; + +/// Specifies properties for fence opening. Input structure to IDevice::OpenFence(). +struct FenceOpenInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the opened fence will reference the same sync object + ///< in the kernel. Otherwise, the object is copied to the new Fence. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; + + OsExternalHandle externalFence; ///< External shared fence handle. +#if defined(_WIN32) + const wchar_t* pName; ///< The name of the event object. Windows uses this name to uniquely identify + /// fence objects across processes.
+#endif +}; + +/// Specifies properties for fence exporting. Input structure to IFence::ExportExternalHandle(). +struct FenceExportInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the exported handle references the same sync + ///< object in the kernel. Otherwise, the object is copied to the new Fence. + uint32 implicitReset : 1; ///< If set, a fence reset will be done for the sync fd exported. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; +}; + +/** + *********************************************************************************************************************** + * @interface IFence + * @brief Represents a command buffer fence the client can use for coarse-level synchronization between the GPU and + * CPU. + * + * Fences can be specified when calling IQueue::Submit() and will be signaled when certain prior queue operations have + * completed. The status of the fence can be queried by the client to determine when the GPU work of interest has + * completed. + * + * Fences are guaranteed to wait for: + * + Prior command buffer submissions. + * + Prior queue semaphore signals and waits. + * + Prior direct presents. + * + * @see IDevice::CreateFence() + *********************************************************************************************************************** + */ +class IFence : public IDestroyable +{ +public: + /// Gets the status (completed or not) of the fence. + /// + /// @returns Success if the fence has been reached, or NotReady if the fence hasn't been reached. Other return + /// codes indicate an error: + /// + ErrorFenceNeverSubmitted if the fence hasn't been submitted yet and the fence was not created with + /// FenceCreateInfo::flags.signaled set. + virtual Result GetStatus() const = 0; + + /// Export the event handle or sync object handle of the fence for external usage.
+ /// If @ref FenceExportInfo::isReference is not set, then this also performs an implicit reset operation on + /// the Fence. + /// + /// @param [in] exportInfo Information describing how the Fence handle should be exported. + /// @returns the handle in the type OsExternalHandle + virtual OsExternalHandle ExportExternalHandle( + const FenceExportInfo& exportInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IFence() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IFence() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object.
+ void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h b/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h new file mode 100644 index 0000000000..a36ce67d64 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h @@ -0,0 +1,506 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFormat.h + * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines format types.
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" + +/// Library-wide namespace encapsulating all PAL entities. +namespace Pal +{ + +/// Specifies the format for an image or GPU memory view. +/// +/// This defines the bit layout of the channels and how the value in each channel is interpreted. +/// +/// Channels are listed in DX10+-style order, where the least significant channels are listed first. For example, for +/// a uint32 val with an X8Y8Z8W8 value: X = val & 0xFF, Y = (val >> 8) & 0xFF, Z = (val >> 16) & 0xFF, +/// W = (val >> 24) & 0xFF. Enums without a more detailed explanation can be decoded in this manner. Multimedia, +/// or "YUV" formats are all exceptions to this rule. Each of those formats explicitly describes how their channels are +/// organized. +/// +/// Many of the multimedia (i.e., "YUV") formats are encoded such that the chrominance (chroma, CbCr, UV) samples are +/// stored at a lower resolution than the luminance (luma, Y) samples as a form of compression. The ratio of the +/// subsampling is often referred to using an A:B:C notation, where the trio of numbers A,B,C are used to describe the +/// number of luma and chroma samples in a hypothetical region which is A pixels wide and 2 pixels high. The three +/// numbers usually refer to the following quantities by convention: +/// A --> Width of the conceptual region of pixels, and is usually 4. +/// B --> Number of chroma samples in the first of two rows of A pixels. +/// C --> Number of changes of chroma samples between the first and second rows of A pixels. +/// +/// Common examples of this notation are described below: +/// 4:4:4 --> No chroma subsampling because luma and chroma both have 4 samples per row, and the number of chroma and +/// luma rows is the same. +/// 4:1:1 --> Each row of 4 pixels has 1 chroma sample, and the number of chroma and luma rows is the same. 
+/// 4:2:0 --> Each row of 4 pixels has 2 chroma samples, and there is only 1 chroma row for every 2 luma rows. +/// 4:2:2 --> Each row of 4 pixels has 2 chroma samples, and the number of chroma and luma rows is the same. +/// +/// Because of the subsampling ratios for multimedia formats, there are some restrictions on what dimensions can be used +/// when creating Images of these formats. 4:1:1 formats must have widths specified as a multiple of 4. 4:2:0 formats +/// must have widths and heights specified as multiples of 2. 4:2:2 formats must have widths specified as a multiple of +/// 2. 4:4:4 formats have no dimensional restrictions. +/// +/// Additionally, the YUV formats are broadly grouped into two categories: packed and planar formats. Packed formats +/// interleave the luma and chroma samples in each row of pixels. Planar formats are organized so that all of the luma +/// samples are together, followed by all of the chroma samples. Some planar formats interleave the U and V chroma +/// data, while some choose to have separate U and V planes. Both packed and planar formats can have any subsampling +/// ratio between the luma and chroma data. +enum class ChNumFormat : Util::uint32 +{ + Undefined = 0x0, ///< Used in situations where no format is needed, like raw memory views, or to + /// indicate no color/depth target will be attached when creating a graphics + /// pipeline. 
+ X1_Unorm = 0x1, ///< _Untested._ + X1_Uscaled = 0x2, ///< _Untested._ + X4Y4_Unorm = 0x3, + X4Y4_Uscaled = 0x4, + L4A4_Unorm = 0x5, + X4Y4Z4W4_Unorm = 0x6, + X4Y4Z4W4_Uscaled = 0x7, + X5Y6Z5_Unorm = 0x8, + X5Y6Z5_Uscaled = 0x9, + X5Y5Z5W1_Unorm = 0xA, + X5Y5Z5W1_Uscaled = 0xB, + X1Y5Z5W5_Unorm = 0xC, + X1Y5Z5W5_Uscaled = 0xD, + X8_Unorm = 0xE, + X8_Snorm = 0xF, + X8_Uscaled = 0x10, + X8_Sscaled = 0x11, + X8_Uint = 0x12, + X8_Sint = 0x13, + X8_Srgb = 0x14, + A8_Unorm = 0x15, + L8_Unorm = 0x16, + P8_Unorm = 0x17, + X8Y8_Unorm = 0x18, + X8Y8_Snorm = 0x19, + X8Y8_Uscaled = 0x1A, + X8Y8_Sscaled = 0x1B, + X8Y8_Uint = 0x1C, + X8Y8_Sint = 0x1D, + X8Y8_Srgb = 0x1E, + L8A8_Unorm = 0x1F, + X8Y8Z8W8_Unorm = 0x20, + X8Y8Z8W8_Snorm = 0x21, + X8Y8Z8W8_Uscaled = 0x22, + X8Y8Z8W8_Sscaled = 0x23, + X8Y8Z8W8_Uint = 0x24, + X8Y8Z8W8_Sint = 0x25, + X8Y8Z8W8_Srgb = 0x26, + U8V8_Snorm_L8W8_Unorm = 0x27, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats + /// are X8Y8Z8W8_Snorm (to target U8V8_Snorm) and X8Y8Z8W8_Unorm (to target + /// L8W8_Unorm). + X10Y11Z11_Float = 0x28, + X11Y11Z10_Float = 0x29, + X10Y10Z10W2_Unorm = 0x2A, + X10Y10Z10W2_Snorm = 0x2B, + X10Y10Z10W2_Uscaled = 0x2C, + X10Y10Z10W2_Sscaled = 0x2D, + X10Y10Z10W2_Uint = 0x2E, + X10Y10Z10W2_Sint = 0x2F, + X10Y10Z10W2Bias_Unorm = 0x30, ///< A four-component, 32-bit 2.8-biased fixed-point format that supports 10 + /// bits for each color channel and 2-bit alpha. A shader must be aware of + /// *Bias* and must perform its own bias and scale on any data that is read + /// from or written. + U10V10W10_Snorm_A2_Unorm = 0X31, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats + /// are X10Y10Z10W2_Snorm (to target U10V10W10_Snorm) and X10Y10Z10W2_Unorm + /// (to target A2_Unorm). 
+ X16_Unorm = 0x32, + X16_Snorm = 0x33, + X16_Uscaled = 0x34, + X16_Sscaled = 0x35, + X16_Uint = 0x36, + X16_Sint = 0x37, + X16_Float = 0x38, + L16_Unorm = 0x39, + X16Y16_Unorm = 0x3A, + X16Y16_Snorm = 0x3B, + X16Y16_Uscaled = 0x3C, + X16Y16_Sscaled = 0x3D, + X16Y16_Uint = 0x3E, + X16Y16_Sint = 0x3F, + X16Y16_Float = 0x40, + X16Y16Z16W16_Unorm = 0x41, + X16Y16Z16W16_Snorm = 0x42, + X16Y16Z16W16_Uscaled = 0x43, + X16Y16Z16W16_Sscaled = 0x44, + X16Y16Z16W16_Uint = 0x45, + X16Y16Z16W16_Sint = 0x46, + X16Y16Z16W16_Float = 0x47, + X32_Uint = 0x48, + X32_Sint = 0x49, + X32_Float = 0x4A, + X32Y32_Uint = 0x4B, + X32Y32_Sint = 0x4C, + X32Y32_Float = 0x4D, + X32Y32Z32_Uint = 0x4E, + X32Y32Z32_Sint = 0x4F, + X32Y32Z32_Float = 0x50, + X32Y32Z32W32_Uint = 0x51, + X32Y32Z32W32_Sint = 0x52, + X32Y32Z32W32_Float = 0x53, + D16_Unorm_S8_Uint = 0x54, + D32_Float_S8_Uint = 0x55, + X9Y9Z9E5_Float = 0x56, ///< Three partial-precision floating-point numbers encoded into a single 32-bit + /// value all sharing the same 5-bit exponent (variant of s10e5, which is sign + /// bit, 10-bit mantissa, and 5-bit biased (15) exponent). There is no sign + /// bit, and there is a shared 5-bit biased (15) exponent and a 9-bit mantissa + /// for each channel. Shared exponent format. + Bc1_Unorm = 0x57, ///< BC1 compressed texture format. + Bc1_Srgb = 0x58, ///< BC1 compressed texture format. + Bc2_Unorm = 0x59, ///< BC2 compressed texture format. + Bc2_Srgb = 0x5A, ///< BC2 compressed texture format. + Bc3_Unorm = 0x5B, ///< BC3 compressed texture format. + Bc3_Srgb = 0x5C, ///< BC3 compressed texture format. + Bc4_Unorm = 0x5D, ///< BC4 compressed texture format. + Bc4_Snorm = 0x5E, ///< BC4 compressed texture format. + Bc5_Unorm = 0x5F, ///< BC5 compressed texture format. + Bc5_Snorm = 0x60, ///< BC5 compressed texture format. + Bc6_Ufloat = 0x61, ///< BC6 unsigned compressed texture format. + Bc6_Sfloat = 0x62, ///< BC6 signed compressed texture format.
+ Bc7_Unorm = 0x63, ///< BC7 compressed texture format. + Bc7_Srgb = 0x64, ///< BC7 compressed texture format. + Etc2X8Y8Z8_Unorm = 0x65, + Etc2X8Y8Z8_Srgb = 0x66, + Etc2X8Y8Z8W1_Unorm = 0x67, + Etc2X8Y8Z8W1_Srgb = 0x68, + Etc2X8Y8Z8W8_Unorm = 0x69, + Etc2X8Y8Z8W8_Srgb = 0x6A, + Etc2X11_Unorm = 0x6B, + Etc2X11_Snorm = 0x6C, + Etc2X11Y11_Unorm = 0x6D, + Etc2X11Y11_Snorm = 0x6E, + AstcLdr4x4_Unorm = 0x6F, + AstcLdr4x4_Srgb = 0x70, + AstcLdr5x4_Unorm = 0x71, + AstcLdr5x4_Srgb = 0x72, + AstcLdr5x5_Unorm = 0x73, + AstcLdr5x5_Srgb = 0x74, + AstcLdr6x5_Unorm = 0x75, + AstcLdr6x5_Srgb = 0x76, + AstcLdr6x6_Unorm = 0x77, + AstcLdr6x6_Srgb = 0x78, + AstcLdr8x5_Unorm = 0x79, + AstcLdr8x5_Srgb = 0x7A, + AstcLdr8x6_Unorm = 0x7B, + AstcLdr8x6_Srgb = 0x7C, + AstcLdr8x8_Unorm = 0x7D, + AstcLdr8x8_Srgb = 0x7E, + AstcLdr10x5_Unorm = 0x7F, + AstcLdr10x5_Srgb = 0x80, + AstcLdr10x6_Unorm = 0x81, + AstcLdr10x6_Srgb = 0x82, + AstcLdr10x8_Unorm = 0x83, + AstcLdr10x8_Srgb = 0x84, + AstcLdr10x10_Unorm = 0x85, + AstcLdr10x10_Srgb = 0x86, + AstcLdr12x10_Unorm = 0x87, + AstcLdr12x10_Srgb = 0x88, + AstcLdr12x12_Unorm = 0x89, + AstcLdr12x12_Srgb = 0x8A, + AstcHdr4x4_Float = 0x8B, + AstcHdr5x4_Float = 0x8C, + AstcHdr5x5_Float = 0x8D, + AstcHdr6x5_Float = 0x8E, + AstcHdr6x6_Float = 0x8F, + AstcHdr8x5_Float = 0x90, + AstcHdr8x6_Float = 0x91, + AstcHdr8x8_Float = 0x92, + AstcHdr10x5_Float = 0x93, + AstcHdr10x6_Float = 0x94, + AstcHdr10x8_Float = 0x95, + AstcHdr10x10_Float = 0x96, + AstcHdr12x10_Float = 0x97, + AstcHdr12x12_Float = 0x98, + X8Y8_Z8Y8_Unorm = 0x99, ///< _Untested._ + X8Y8_Z8Y8_Uscaled = 0x9A, ///< _Untested._ + Y8X8_Y8Z8_Unorm = 0x9B, ///< _Untested._ + Y8X8_Y8Z8_Uscaled = 0x9C, ///< _Untested._ + AYUV = 0x9D, ///< YUV 4:4:4 packed format. Valid Image and Color-Target view formats are + /// { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully maps the + /// entire YUV subresource, with the V,U,Y,A channels mapped to the X,Y,Z,W + /// channels respectively. 
Additionally, Image views can use the { X32, Uint } + /// format where all four channels are packed into a single uint32. + UYVY = 0x9E, ///< YUV 4:2:2 packed format. The Image data is subsampled such that each 32bit + /// element contains two Y samples and one U and V sample. Valid Image view + /// formats are { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully + /// maps the entire YUV subresource, with the X,Y,Z,W channels mapped to the + /// U0,Y0,V0,Y1 channels respectively. Additionally, Image views can use the + /// { X32, Uint } format where all four channels are packed into a single + /// uint32. Image views can also use the { X8Y8_Z8Y8, Unorm } format to access + /// these as well. In this case, the width of the Image view would appear to be + /// twice as wide as it normally does, and the X0,Y0,Z0,Y1 channels map to the + /// U0,Y0,V0,Y1 channels respectively. + VYUY = 0x9F, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::UYVY format, except with a different channel ordering. + /// Image views with X8Y8Z8W8 channel formats map the X,Y,Z,W channels to the + /// V0,Y0,U0,Y1 channels respectively. Image views with the X8Y8_Z8Y8 channel + /// format map the X0,Y0,Z0,Y1 channels to the V0,Y0,U0,Y1 channels + /// respectively. + YUY2 = 0xA0, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::UYVY format, except with a different channel ordering. + /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,U0,Y1,V0 + /// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format + /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,U0,Y1,V0 channels. + YVY2 = 0xA1, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::YUY2 format, except with a different channel ordering. + /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,V0,Y1,U0 + /// channels respectively. 
Image views can use the { Y8X8_Y8Z8, Unorm } format + /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,V0,Y1,U0 channels. + YV12 = 0xA2, ///< YVU 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is the V plane and the U + /// plane, both of which have a uint8 per sample. Valid Image view formats are + /// { X8, Unorm } and { X8, Uint }. Each view only has access to one of the Y, + /// V, or U planes. + NV11 = 0xA3, ///< YUV 4:1:1 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is a UV plane which + /// has interleaved U and V samples, each stored as a uint8. Valid Image and + /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } + /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view + /// only has access to the Y plane. When using X8Y8, the view only has access + /// to the UV plane. + NV12 = 0xA4, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is a UV plane which + /// has interleaved U and V samples, each stored as a uint8. Valid Image and + /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } + /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view + /// only has access to the Y plane. When using X8Y8, the view only has access + /// to the UV plane. + NV21 = 0xA5, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::NV12, except that the second plane swaps the + /// ordering of the U and V samples. Image views behave just like with + /// @ref ChNumFormat::NV12. + P016 = 0xA6, ///< YUV 4:2:0 planar format, with 16 bits per luma and chroma sample. The + /// plane ordering is identical to @ref ChNumFormat::NV12.
Instead of uint8 + /// samples, this format uses 8.8 fixed point sample encoding. Image views + /// behave just like with @ref ChNumFormat::NV12, except X16 channel formats + /// are used for the Y plane, and X16Y16 channel formats are used for the UV + /// plane. + P010 = 0xA7, ///< YUV 4:2:0 planar format, with 10 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P016, except that the lowest 6 bits of each + /// luma and chroma sample are ignored. This allows the source data to be + /// interpreted as either P016 or P010 interchangeably. + P210 = 0xA8, ///< YUV 4:2:2 planar format, with 10 bits per luma and chroma sample. This is + /// similar to @ref ChNumFormat::P010, except that the UV planes are sub-sampled + /// only in the horizontal direction, but still by a factor of 2 so the UV plane + /// ends up having the same number of lines as the Y plane. + X8_MM_Unorm = 0xA9, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces. Such as the Y plane or any plane in YV12. + X8_MM_Uint = 0xAA, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces. Such as the Y plane or any plane in YV12. + X8Y8_MM_Unorm = 0xAB, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces. + X8Y8_MM_Uint = 0xAC, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces. + X16_MM10_Unorm = 0xAD, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (10-bit). Such as the Y plane or any plane in YV12. + X16_MM10_Uint = 0xAE, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (10-bit). Such as the Y plane or any plane in YV12. + X16Y16_MM10_Unorm = 0xAF, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (10-bit).
+ X16Y16_MM10_Uint = 0xB0, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (10-bit). + P208 = 0xB1, ///< YUV 4:2:2 planar format, with 8 bits per luma and chroma sample. This is + /// similar to @ref ChNumFormat::NV12, except that the UV planes are sub-sampled + /// only in the horizontal direction, but still by a factor of 2 so the UV plane + /// ends up having the same number of lines as the Y plane. This format is + /// sometimes referred to as NV16. + X16_MM12_Unorm = 0xB2, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (12-bit). + X16_MM12_Uint = 0xB3, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (12-bit). + X16Y16_MM12_Unorm = 0xB4, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (12-bit). + X16Y16_MM12_Uint = 0xB5, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (12-bit). + P012 = 0xB6, ///< YUV 4:2:0 planar format, with 12 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P010, except that the lowest 4 bits of each + /// luma and chroma sample are ignored. + P212 = 0xB7, ///< YUV 4:2:2 planar format, with 12 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P210, except that the lowest 4 bits of each + /// luma and chroma sample are ignored. + P412 = 0xB8, ///< YUV 4:4:4 planar format, with 12 bits per luma and chroma sample. It consists + /// of a Y-plane followed by an interleaved UV plane. + X10Y10Z10W2_Float = 0xB9, ///< RGBA format with three 10-bit floats (6e4) and a 2-bit unorm as alpha. + Y216 = 0xBA, ///< YUV 4:2:2 packed, with 16 bits per luma or chroma sample. No alpha. + Y210 = 0xBB, ///< YUV 4:2:2 packed, with 10 bits per luma or chroma sample. No alpha. + /// Same memory layout as @ref ChNumFormat::Y216. 
+ /// The lowest 6 bits of each sample are ignored. + Y416 = 0xBC, ///< YUV 4:4:4 packed, with 16 bits per luma or chroma sample. + Y410 = 0xBD, ///< YUV 4:4:4 packed, with 10 bits per luma or chroma sample and 2 bits for alpha. + _ReservedBE = 0xBE, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 + P216 = 0xBF, ///< YUV 4:2:2 planar format, with 16 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + YUV_420P10 = 0xC0, ///< YUV 4:2:0 tri-planar format, with 10 bits per luma and chroma sample. + YUV_422P10 = 0xC1, ///< YUV 4:2:2 tri-planar format, with 10 bits per luma and chroma sample. + YUV_444P10 = 0xC2, ///< YUV 4:4:4 tri-planar format, with 10 bits per luma and chroma sample. + YUV_420P12 = 0xC3, ///< YUV 4:2:0 tri-planar format, with 12 bits per luma and chroma sample. + YUV_422P12 = 0xC4, ///< YUV 4:2:2 tri-planar format, with 12 bits per luma and chroma sample. + YUV_444P12 = 0xC5, ///< YUV 4:4:4 tri-planar format, with 12 bits per luma and chroma sample. + YUV_420P16 = 0xC6, ///< YUV 4:2:0 tri-planar format, with 16 bits per luma and chroma sample. + YUV_422P16 = 0xC7, ///< YUV 4:2:2 tri-planar format, with 16 bits per luma and chroma sample. + YUV_444P16 = 0xC8, ///< YUV 4:4:4 tri-planar format, with 16 bits per luma and chroma sample. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + YV16 = 0xC9, ///< YVU 4:2:2 tri-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled in + /// vertical direction. + YV24 = 0xCA, ///< YVU 4:4:4 tri-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled. + NV24 = 0xCB, ///< YUV 4:4:4 bi-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::NV12, except chroma is not subsampled. 
+ /// This format is sometimes referred to as P408. + P410 = 0xCC, ///< YUV 4:4:4 planar format, with 10 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. + P416 = 0xCD, ///< YUV 4:4:4 planar format, with 16 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 929 + X16Y16Z16W16_MM10_Unorm = 0xCE, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of + /// each component are ignored. + X16Y16Z16W16_MM10_Uint = 0xCF, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of + /// each component are ignored. + X16Y16Z16W16_MM12_Unorm = 0xD0, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of + /// each component are ignored. + X16Y16Z16W16_MM12_Uint = 0xD1, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of + /// each component are ignored. +#endif + Count, + +}; + +/// Specifies which channel of a resource should be mapped to a particular component of an image view. +/// +/// @ingroup ResourceBinding +enum class ChannelSwizzle : Util::uint8 +{ + Zero = 0x0, ///< Ignore resource data and always fetch a 0 into this component. + One = 0x1, ///< Ignore resource data and always fetch a 1 into this component. + X = 0x2, ///< Use the X channel from resource for this component. + Y = 0x3, ///< Use the Y channel from resource for this component. + Z = 0x4, ///< Use the Z channel from resource for this component. + W = 0x5, ///< Use the W channel from resource for this component. + Count +}; + +/// Specifies a mapping for each component of an image or buffer view to a channel in its associated resource. +/// +/// @ingroup ResourceBinding +struct ChannelMapping +{ + union + { + struct + { + ChannelSwizzle r; ///< Red component swizzle. + ChannelSwizzle g; ///< Green component swizzle. 
+ ChannelSwizzle b; ///< Blue component swizzle. + ChannelSwizzle a; ///< Alpha component swizzle. + }; + ChannelSwizzle swizzle[4]; ///< All four swizzles packed into one array. + Util::uint32 swizzleValue; + }; +}; + +/// Specifies a pixel format for an image or memory view and its corresponding channel swizzle. +struct SwizzledFormat +{ + ChNumFormat format; ///< Pixel format. + ChannelMapping swizzle; ///< Compatible channel swizzle for the above pixel format. +}; + +inline constexpr bool operator==(const SwizzledFormat& lhs, const SwizzledFormat& rhs) +{ + return (lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue); +} + +/// Constant for undefined formats. +constexpr SwizzledFormat UndefinedSwizzledFormat = +{ + ChNumFormat::Undefined, + { { { ChannelSwizzle::X, ChannelSwizzle::Zero, ChannelSwizzle::Zero, ChannelSwizzle::One } } }, +}; + +/// Flags structure reporting available capabilities of a particular format. +enum FormatFeatureFlags : Util::uint32 +{ + FormatFeatureCopy = 0x00001, ///< Images of this format can be used as a copy source or destination. + FormatFeatureFormatConversion = 0x00002, ///< Images of this format support format conversion in copy + /// operations. + FormatFeatureImageShaderRead = 0x00004, ///< Images of this format can be read from a shader. + FormatFeatureImageShaderWrite = 0x00008, ///< Images of this format can be written from a shader. + FormatFeatureImageShaderAtomics = 0x00010, ///< Images of this format can be written atomically from a shader. + FormatFeatureMemoryShaderRead = 0x00020, ///< Memory views of this format can be read from a shader. + FormatFeatureMemoryShaderWrite = 0x00040, ///< Memory views of this format can be written from a shader. + FormatFeatureMemoryShaderAtomics = 0x00080, ///< Memory views of this format can be written atomically from a + /// shader. + FormatFeatureColorTargetWrite = 0x00100, ///< Images of this format can be bound as a color target. 
+ FormatFeatureColorTargetBlend = 0x00200, ///< Images of this format can be bound as a color target for blending. + FormatFeatureDepthTarget = 0x00400, ///< Images of this format can be bound as a depth target. + FormatFeatureStencilTarget = 0x00800, ///< Images of this format can be bound as a stencil target. + FormatFeatureMsaaTarget = 0x01000, ///< Images of this format can support multisampling. + FormatFeatureWindowedPresent = 0x02000, ///< Images of this format can support windowed-mode presents. + /// Fullscreen present capability is queried using the @ref + /// IScreen::GetScreenModeList method. + FormatFeatureImageFilterLinear = 0x04000, ///< Images of this format can be linearly filtered. + FormatFeatureImageFilterMinMax = 0x08000, ///< Images of this format can be min/max filtered. + FormatFeatureFormatConversionSrc = 0x10000, ///< Images of this format support format conversion in copy + /// operations as the source image. + /// @note This is aliased to FormatFeatureFormatConversionDst for + /// backwards compatibility. + FormatFeatureFormatConversionDst = 0x20000, ///< Images of this format support format conversion in copy + /// operations as the destination image. + /// @note This is aliased to FormatFeatureFormatConversionSrc for + /// backwards compatibility. +}; + +/// Enumeration for indexing into the format properties table based on tiling. +enum FormatPropertiesTiling : Util::uint32 +{ + IsLinear = 0, ///< Format properties requested is for linearly-tiled surfaces. + IsNonLinear, ///< Format properties requested is for non-linearly tiled surfaces. + Count, ///< Number of format property tile types. +}; + +/// The format properties lookup table. Contains information about which device access features are available for all +/// formats and tiling modes. The tiling features for non-linear tiling modes are identical so we only store linear +/// and non-linear tiling features. From left to right, it is indexed by format and "is-non-linear". 
+/// Returned by IDevice::GetFormatProperties(). +struct MergedFormatPropertiesTable +{ + FormatFeatureFlags features[static_cast(ChNumFormat::Count)][FormatPropertiesTiling::Count]; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h b/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h new file mode 100644 index 0000000000..023a7c0eeb --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h @@ -0,0 +1,881 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFormatInfo.h + * @brief Defines the Platform Abstraction Library (PAL) Format utility functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palDevice.h" +#include "palImage.h" +#include "palInlineFuncs.h" +#include "palMath.h" + +namespace Pal +{ + +/// Namespace encapsulating all PAL format utility functions. +namespace Formats +{ + +/// Specifies flags which indicate properties of each PAL channel format. +enum PropertyFlags : uint32 +{ + BitCountInaccurate = 0x1, ///< Indicates that format's bit count array is inaccurate + BlockCompressed = 0x2, ///< Indicates channel format is block-compressed + MacroPixelPacked = 0x4, ///< Indicates channel format has multiple pixels' data packed together into + /// one "macro pixel" + YuvPlanar = 0x8, ///< Indicates channel format is YUV-planar + YuvPacked = 0x10, ///< Indicates channel format is YUV packed +}; + +/// Specifies numeric support of a specified format. +enum class NumericSupportFlags : uint32 +{ + Undefined, ///< No numeric support. + Unorm, ///< Unsigned normalized. + Snorm, ///< Signed normalized. + Uscaled, ///< _Untested._ Treated as an unsigned integer inside the resource, but received by + /// the shader as a floating point number. + Sscaled, ///< _Untested._ Treated as a signed integer inside the resource, but received by + /// the shader as a floating point number. + Uint, ///< Unsigned integer. + Sint, ///< Signed integer. + Float, ///< Floating point number. + Srgb, ///< sRGB. + DepthStencil, ///< Depth/stencil support. + Yuv, ///< YUV support. 
+}; + +/// Specifies flags which indicate the presence of each color channel in a PAL channel format. +enum ChannelFlags : uint32 +{ + X = 0x1, ///< Indicates the X channel is present. + Y = 0x2, ///< Indicates the Y channel is present. + Z = 0x4, ///< Indicates the Z channel is present. + W = 0x8, ///< Indicates the W channel is present. +}; + +/// An entry in the channel-format info lookup table. Contains intrinsic properties describing a channel format. +struct FormatInfo +{ + uint32 bitsPerPixel; ///< Total count of bits in a single pixel (or block). + uint32 componentCount; ///< Number of color components (channels) present. + + uint32 bitCount[4]; ///< Number of bits for each component in the format. These members are + /// only reliable if the @ref BitCountInaccurate flag is not set. + /// Listed in order: X, Y, Z, and W. + + uint32 channelMask; ///< Mask of @ref ChannelFlags values indicating which channels are present. + uint32 properties; ///< Mask of @ref PropertyFlags values indicating which properties a format + /// has. + NumericSupportFlags numericSupport; ///< Which numeric format this format represents. Used for easy identification. +}; + +/// BC block dimension (4x4) +static constexpr uint32 CompressedBcBlockDim = 4; + +/// ETC block dimension (4x4) +static constexpr uint32 CompressedEtcBlockDim = 4; + +/// Lookup table for intrinsic properties describing each channel format. Callers should access the members of this +/// table via BitsPerPixel() and related functions. +extern const FormatInfo FormatInfoTable[static_cast(ChNumFormat::Count)]; + +/// Convert a floating-point representation of a color value in RGBA order to the appropriate bit representation for +/// each channel based on the specified format. Swizzling is enabled by default to maintain backwards compatibility. +/// There will be no swizzling functionality going forwards.
+extern void ConvertColor( + SwizzledFormat format, + const float* pColorIn, + uint32* pColorOut); + +/// Convert an unsigned integer representation of a color value in YUVA order to the appropriate bit representation for +/// each channel based on the specified format. +extern void ConvertYuvColor( + SwizzledFormat format, + uint32 plane, + const uint32* pColorIn, + uint32* pColorOut); + +/// Packs a clear color value in RGBA order to a single element of the provided format and stores it in the +/// memory provided. Swizzling is enabled by default to maintain backwards compatibility. There will be +/// no swizzling functionality going forwards. +extern void PackRawClearColor( + SwizzledFormat format, + const uint32* pColor, + void* pBufferMemory); + +/// Swizzles the color according to the provided format swizzle. +extern void SwizzleColor(SwizzledFormat format, const uint32* pColorIn, uint32* pColorOut); + +/// Compares two SwizzledFormats and checks for equality. +/// +/// @param [in] lhs Left hand side of comparison +/// @param [in] rhs Right hand side of comparison +/// +/// @return True if the formats are equal, false otherwise. +constexpr bool IsSameFormat( + const SwizzledFormat& lhs, + const SwizzledFormat& rhs) +{ + return ((lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue)); +} + +/// Queries the number of components for a particular channel format. +/// +/// @param [in] format The channel format to query for. +/// +/// @returns The number of components of the specified channel format. +inline uint32 NumComponents( + ChNumFormat format) +{ + return FormatInfoTable[static_cast(format)].componentCount; +} + +/// Queries the component mask for a particular format. +/// +/// @param [in] format The format to query for. +/// +/// @returns The component mask of @ref ChannelFlags for the specified format.
+inline uint32 ComponentMask( + ChNumFormat format) +{ + uint32 mask = FormatInfoTable[static_cast(format)].channelMask; + PAL_ASSERT((mask & 0xF) == mask); + return mask; +} + +/// Checks if a format is undefined. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is undefined. False otherwise. +constexpr bool IsUndefined( + ChNumFormat format) +{ + return (format == ChNumFormat::Undefined); +} + +/// Checks if a format's numeric representation is unsigned normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned normalized. False otherwise. +inline bool IsUnorm( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Unorm); +} + +/// Checks if a format's numeric representation is signed normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed normalized. False otherwise. +inline bool IsSnorm( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Snorm); +} + +/// Checks if a format's numeric representation is unsigned scaled. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned scaled. False otherwise. +inline bool IsUscaled( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uscaled); +} + +/// Checks if a format's numeric representation is signed scaled. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed scaled. False otherwise. +inline bool IsSscaled( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sscaled); +} + +/// Checks if a format's numeric representation is unsigned integer. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned integer. 
False otherwise. +inline bool IsUint( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uint); +} + +/// Checks if a format's numeric representation is signed integer. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed integer. False otherwise. +inline bool IsSint( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sint); +} + +/// Checks if a format's numeric representation is floating point. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is floating point. False otherwise. +inline bool IsFloat( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Float); +} + +/// Checks if a format's numeric representation is gamma-corrected sRGB. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is sRGB. False otherwise. +inline bool IsSrgb( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Srgb); +} + +/// Checks if a format's numeric representation is normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is normalized. False otherwise. +inline bool IsNormalized( + ChNumFormat format) +{ + return IsUnorm(format) || IsSnorm(format); +} + +/// Checks if a format's numeric representation is an integer format. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is an integer format. False otherwise. +inline bool IsInteger( + ChNumFormat format) +{ + return IsUint(format) || IsSint(format); +} + +/// Checks if a format is a depth/stencil only format. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is a depth/stencil only format. False otherwise. 
+inline bool IsDepthStencilOnly( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::DepthStencil); +} + +/// Checks if the specified format is one of the YUV-planar ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is YUV-planar. False otherwise. +inline bool IsYuvPlanar( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & YuvPlanar) != 0); +} + +/// Checks if the specified format is one of the YUV-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is YUV-packed. False otherwise. +inline bool IsYuvPacked( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & YuvPacked) != 0); +} + +/// Checks if the specified format is one of the YUV ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is for YUV data. False otherwise. +inline bool IsYuv( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Yuv); +} + +/// Checks if a format has alpha. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format has an alpha channel. False otherwise. +constexpr bool HasAlpha( + SwizzledFormat format) +{ + return ((format.format == ChNumFormat::A8_Unorm) || + (format.format == ChNumFormat::L4A4_Unorm) || + (format.format == ChNumFormat::L8A8_Unorm) || + ((ComponentMask(format.format) & ChannelFlags::W) != 0) || + ((format.swizzle.a != ChannelSwizzle::Zero) && (format.swizzle.a != ChannelSwizzle::One))); +} + +/// Checks if a format has an unused alpha channel. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is a four channel format and has an unused alpha channel. False otherwise. 
+inline bool HasUnusedAlpha( + SwizzledFormat format) +{ + return ((NumComponents(format.format) == 4) && + (format.swizzle.r != ChannelSwizzle::W) && + (format.swizzle.g != ChannelSwizzle::W) && + (format.swizzle.b != ChannelSwizzle::W) && + (format.swizzle.a != ChannelSwizzle::W)); +} + +/// Converts format into its Unorm equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUnorm(ChNumFormat format); + +/// Converts format into its Snorm equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSnorm(ChNumFormat format); + +/// Converts format into its Uscaled equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUscaled(ChNumFormat format); + +/// Converts format into its Sscaled equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSscaled(ChNumFormat format); + +/// Converts format into its Uint equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Uint format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUint(ChNumFormat format); + +/// Converts format into its Sint equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Sint format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSint(ChNumFormat format); + +/// Converts format into its Float equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Float format equivalent of input format. Undefined if none exist. 
+extern ChNumFormat PAL_STDCALL ConvertToFloat(ChNumFormat format); + +/// Converts format into its Srgb equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Srgb format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSrgb(ChNumFormat format); + +/// Converts source numeric format to the provided destination numeric format. +/// +/// @param [in] srcFormat Source Pixel format. +/// @param [in] dstFormat Destination Pixel format. +/// +/// @returns Source format with equivalent numeric format of destination format. Undefined if none exist. +extern ChNumFormat ConvertToDstNumFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); + +/// Determines whether the srcFormat and the dstFormat have the same channel formats. +/// +/// @param [in] srcFormat Source channel pixel format. +/// @param [in] dstFormat Destination channel pixel format. +/// +/// @returns True if both formats share the same channel format. False otherwise. +extern bool ShareChFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); + +/// Determines whether the srcFormat and the dstFormat have the same numeric formats. +/// +/// @param [in] srcFormat Source channel pixel format. +/// @param [in] dstFormat Destination channel pixel format. +/// +/// @returns True if both formats share the same numeric format. False otherwise. +inline bool HaveSameNumFmt( + ChNumFormat srcFormat, + ChNumFormat dstFormat) +{ + return (FormatInfoTable[static_cast(srcFormat)].numericSupport == + FormatInfoTable[static_cast(dstFormat)].numericSupport); +} + +/// Returns the block dimension for a compressed format. +/// +/// @param [in] format Format. +/// +/// @returns Corresponding block dimensions for the compressed format. 
+inline Extent3d CompressedBlockDim( + ChNumFormat format) +{ + Extent3d blockDim = {}; + + switch (format) + { + case ChNumFormat::Bc1_Unorm: + case ChNumFormat::Bc1_Srgb: + case ChNumFormat::Bc2_Unorm: + case ChNumFormat::Bc2_Srgb: + case ChNumFormat::Bc3_Unorm: + case ChNumFormat::Bc3_Srgb: + case ChNumFormat::Bc4_Unorm: + case ChNumFormat::Bc4_Snorm: + case ChNumFormat::Bc5_Unorm: + case ChNumFormat::Bc5_Snorm: + case ChNumFormat::Bc6_Ufloat: + case ChNumFormat::Bc6_Sfloat: + case ChNumFormat::Bc7_Unorm: + case ChNumFormat::Bc7_Srgb: + blockDim.width = CompressedBcBlockDim; + blockDim.height = CompressedBcBlockDim; + blockDim.depth = 1; + break; + case ChNumFormat::Etc2X8Y8Z8_Unorm: + case ChNumFormat::Etc2X8Y8Z8_Srgb: + case ChNumFormat::Etc2X8Y8Z8W1_Unorm: + case ChNumFormat::Etc2X8Y8Z8W1_Srgb: + case ChNumFormat::Etc2X8Y8Z8W8_Unorm: + case ChNumFormat::Etc2X8Y8Z8W8_Srgb: + case ChNumFormat::Etc2X11_Unorm: + case ChNumFormat::Etc2X11_Snorm: + case ChNumFormat::Etc2X11Y11_Unorm: + case ChNumFormat::Etc2X11Y11_Snorm: + blockDim.width = CompressedEtcBlockDim; + blockDim.height = CompressedEtcBlockDim; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr4x4_Unorm: + case ChNumFormat::AstcLdr4x4_Srgb: + case ChNumFormat::AstcHdr4x4_Float: + blockDim.width = 4; + blockDim.height = 4; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr5x4_Unorm: + case ChNumFormat::AstcLdr5x4_Srgb: + case ChNumFormat::AstcHdr5x4_Float: + blockDim.width = 5; + blockDim.height = 4; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr5x5_Unorm: + case ChNumFormat::AstcLdr5x5_Srgb: + case ChNumFormat::AstcHdr5x5_Float: + blockDim.width = 5; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr6x5_Unorm: + case ChNumFormat::AstcLdr6x5_Srgb: + case ChNumFormat::AstcHdr6x5_Float: + blockDim.width = 6; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr6x6_Unorm: + case ChNumFormat::AstcLdr6x6_Srgb: + case 
ChNumFormat::AstcHdr6x6_Float: + blockDim.width = 6; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x5_Unorm: + case ChNumFormat::AstcLdr8x5_Srgb: + case ChNumFormat::AstcHdr8x5_Float: + blockDim.width = 8; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x6_Unorm: + case ChNumFormat::AstcLdr8x6_Srgb: + case ChNumFormat::AstcHdr8x6_Float: + blockDim.width = 8; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x8_Unorm: + case ChNumFormat::AstcLdr8x8_Srgb: + case ChNumFormat::AstcHdr8x8_Float: + blockDim.width = 8; + blockDim.height = 8; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x5_Unorm: + case ChNumFormat::AstcLdr10x5_Srgb: + case ChNumFormat::AstcHdr10x5_Float: + blockDim.width = 10; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x6_Unorm: + case ChNumFormat::AstcLdr10x6_Srgb: + case ChNumFormat::AstcHdr10x6_Float: + blockDim.width = 10; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x8_Unorm: + case ChNumFormat::AstcLdr10x8_Srgb: + case ChNumFormat::AstcHdr10x8_Float: + blockDim.width = 10; + blockDim.height = 8; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x10_Unorm: + case ChNumFormat::AstcLdr10x10_Srgb: + case ChNumFormat::AstcHdr10x10_Float: + blockDim.width = 10; + blockDim.height = 10; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr12x10_Unorm: + case ChNumFormat::AstcLdr12x10_Srgb: + case ChNumFormat::AstcHdr12x10_Float: + blockDim.width = 12; + blockDim.height = 10; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr12x12_Unorm: + case ChNumFormat::AstcLdr12x12_Srgb: + case ChNumFormat::AstcHdr12x12_Float: + blockDim.width = 12; + blockDim.height = 12; + blockDim.depth = 1; + break; + default: + // This function should not be called on a non-compressed format. 
+ PAL_ASSERT_ALWAYS(); + break; + } + + return blockDim; +} + +/// Convert a compressed format block coordinate to texels. +/// +/// @param [in] format Format. +/// @param [in] width Block width. +/// @param [in] height Block height. +/// @param [in] depth Block depth. +/// +/// @returns Structure containing the texel width, height and depth +inline Extent3d CompressedBlocksToTexels( + ChNumFormat format, + uint32 width, + uint32 height, + uint32 depth) +{ + Extent3d dims = CompressedBlockDim(format); + dims.width *= width; + dims.height *= height; + dims.depth *= depth; + return dims; +} + +/// Convert a compressed format texel coordinate to blocks. +/// +/// @param [in] format Format. +/// @param [in] width Texel width. +/// @param [in] height Texel height. +/// @param [in] depth Texel depth. +/// +/// @returns Structure containing the block width, height and depth +inline Extent3d CompressedTexelsToBlocks( + ChNumFormat format, + uint32 width, + uint32 height, + uint32 depth) +{ + Extent3d dims = CompressedBlockDim(format); + dims.width = Util::RoundUpQuotient(width, dims.width); + dims.height = Util::RoundUpQuotient(height, dims.height); + dims.depth = Util::RoundUpQuotient(depth, dims.depth); + return dims; +} + +/// Queries the number of bits in a pixel or element for the given format. +/// +/// @param format The format to query for. +/// +/// @return The number of bits per pixel for the given channel format. +inline uint32 BitsPerPixel( + ChNumFormat format) +{ + return FormatInfoTable[static_cast(format)].bitsPerPixel; +} + +/// Queries the number of bytes in a pixel or element for the given format (BitsPerPixel() >> 3, rounded down). +/// +/// @param format The format to query for. +/// +/// @return The number of bytes per pixel for the given channel format. +inline uint32 BytesPerPixel( + ChNumFormat format) +{ + return (BitsPerPixel(format) >> 3); +} + +/// Checks if the specified channel swizzle is allowed with the given format.
+/// +/// @param [in] format The pixel format to check against. +/// @param [in] swizzle The specified channel swizzle to check with. +/// +/// @returns True if the specified channel swizzle is valid for the given format. False otherwise. +inline bool IsValidChannelSwizzle( + ChNumFormat format, + ChannelSwizzle swizzle) +{ + const uint32 mask = ComponentMask(format); + + bool valid = false; + switch (swizzle) + { + case ChannelSwizzle::Zero: + case ChannelSwizzle::One: + valid = true; + break; + case ChannelSwizzle::X: + valid = ((mask & ChannelFlags::X) != 0); + break; + case ChannelSwizzle::Y: + valid = ((mask & ChannelFlags::Y) != 0); + break; + case ChannelSwizzle::Z: + valid = ((mask & ChannelFlags::Z) != 0); + break; + case ChannelSwizzle::W: + valid = ((mask & ChannelFlags::W) != 0); + break; + default: + PAL_NEVER_CALLED(); + break; + } + + return valid; +} + +/// Queries the per-component bit counts for a particular format. +/// +/// @param [in] format The format to query for. +/// +/// @returns The per-component bit counts for the specified format. Returned as an array of four counts. +inline const uint32* ComponentBitCounts( + ChNumFormat format) +{ + return &FormatInfoTable[static_cast(format)].bitCount[0]; +} + +/// Determines the maximum bit-count of any component in the format. +/// +/// @param [in] format The channel format to query for. +/// +/// @returns The maximum bit-count of any component in the format. +inline uint32 MaxComponentBitCount( + ChNumFormat format) +{ + const FormatInfo& info = FormatInfoTable[static_cast(format)]; + + return Util::Max(Util::Max(info.bitCount[0], info.bitCount[1]), Util::Max(info.bitCount[2], info.bitCount[3])); +} + +/// Checks if the specified format is one of the block-compressed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is block-compressed. False otherwise.
+inline bool IsBlockCompressed( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & BlockCompressed) != 0); +} + +/// Checks if the specified format is one of the macro-pixel-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is macro-pixel-packed. False otherwise. +inline bool IsMacroPixelPacked( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & MacroPixelPacked) != 0); +} + +/// Checks if the specified format is one of the RGB (non-YUV) macro-pixel-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is an RGB macro-pixel-packed format. False otherwise. +inline bool IsMacroPixelPackedRgbOnly( + ChNumFormat format) +{ + return (IsMacroPixelPacked(format) && (IsYuv(format) == false)); +} + +/// Returns the base-2 logarithm of the subsampling ratio between the luma plane and chroma plane(s) of a YUV planar +/// format. The dimensions of the luma plane should be right-shifted by these amounts to determine the dimensions of the +/// chroma plane(s). +/// +/// @param [in] format Format. +/// @param [in] plane Image plane to query for. +/// +/// @returns Corresponding scaling factors between the luma plane and chroma plane(s). +inline Extent3d Log2SubsamplingRatio( + ChNumFormat format, + uint32 plane) +{ + // All planes for formats which are not YUV planar, and the 0th plane of a YUV planar format (the luma plane) are + // sampled at full rate, so the ratio is { log2(1), log2(1), log2(1) }, which equates to { 0,0,0 }. + Extent3d ratio = { }; + + if (IsYuvPlanar(format) && (plane != 0)) + { + PAL_ASSERT((plane == 1) || (plane == 2)); + switch (format) + { + // 4:4:4 formats have the same number of samples in every direction.
+ case ChNumFormat::P412: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_444P10: + case ChNumFormat::YUV_444P12: + case ChNumFormat::YUV_444P16: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + case ChNumFormat::YV24: + case ChNumFormat::NV24: + case ChNumFormat::P410: + case ChNumFormat::P416: +#endif + break; + // 4:2:0 formats have 1/2 as many samples in both the horizontal and vertical directions. + case ChNumFormat::YV12: + case ChNumFormat::NV12: + case ChNumFormat::NV21: + case ChNumFormat::P010: + case ChNumFormat::P012: + case ChNumFormat::P016: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_420P10: + case ChNumFormat::YUV_420P12: + case ChNumFormat::YUV_420P16: +#endif + ratio.width = 1; // log2(1/2) = -1 + ratio.height = 1; + break; + // 4:2:2 formats have 1/2 as many samples in the horizontal direction, and the same number of samples + // in the vertical direction. + case ChNumFormat::P208: + case ChNumFormat::P210: + case ChNumFormat::P212: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 + case ChNumFormat::P216: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_422P10: + case ChNumFormat::YUV_422P12: + case ChNumFormat::YUV_422P16: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + case ChNumFormat::YV16: +#endif + ratio.width = 1; + break; + // 4:1:1 formats have 1/4 as many samples in the horizontal direction, and the same number of samples + // in the vertical direction. + case ChNumFormat::NV11: + ratio.width = 2; // log2(1/4) = -2 + break; + default: + PAL_NEVER_CALLED(); // Did we miss a new YUV planar format? + break; + } + } + + return ratio; +} + +/// Converts a linearly-scaled color value to gamma-corrected sRGB. +/// +/// @param [in] linear Linear color value +/// +/// @returns Gamma-corrected sRGB color value +extern float LinearToGamma(float linear); + +/// Converts a gamma-corrected sRGB color value to linear color space. 
+/// +/// @param [in] gammaCorrectedVal Gamma-corrected sRGB color value +/// +/// @returns Linear color value +extern float GammaToLinear(float gammaCorrectedVal); + +/// Checks to see if a given format is an MM format +/// +/// @param [in] format The format to check. +/// @returns bool is it an MM format +extern bool IsMmFormat(ChNumFormat format); + +/// Checks to see if a given format is an MM12 format +/// +/// @param [in] format The format to check. +/// @returns bool is it an MM12 format +extern bool IsMm12Format(ChNumFormat format); + +/// Checks to see if a given format is an MM10 format +/// +/// @param [in] format The format to check. +/// @returns bool is it an MM10 format +extern bool IsMm10Format(ChNumFormat format); + +} // Formats +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h new file mode 100644 index 0000000000..17d7093043 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h @@ -0,0 +1,711 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuMemory.h + * @brief Defines the Platform Abstraction Library (PAL) IGpuMemory interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +#if defined(_WIN32) +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Pal +{ + +// Forward declarations. +class IGpuMemory; +class IDevice; +class IImage; +enum class VaRange : uint32; + +/// Specifies Base Level priority per GPU memory allocation as a hint to the memory manager in the event it needs to +/// select allocations to page out of their preferred heaps. +enum class GpuMemPriority : uint32 +{ + Unused = 0x0, ///< Indicates that the allocation is not currently being used at all, and should be the first + /// choice to be paged out. + VeryLow = 0x1, ///< Lowest priority to keep in its preferred heap. + Low = 0x2, ///< Low priority to keep in its preferred heap. + Normal = 0x3, ///< Normal priority to keep in its preferred heap. + High = 0x4, ///< High priority to keep in its preferred heap (e.g., render targets). + VeryHigh = 0x5, ///< Highest priority to keep in its preferred heap. Last choice to be paged out (e.g., page + /// tables, displayable allocations).
+ Count ///< Number of base priority levels listed above. +}; + +/// Specifies a finer granularity to the base Level priority per GPU memory allocation as a hint to the memory manager +/// in the event it needs to select allocations to page out of their preferred heaps. +enum class GpuMemPriorityOffset : uint32 +{ + Offset0 = 0x0, ///< Same priority as Base Level. + Offset1 = 0x1, ///< Next priority from Base Level. + Offset2 = 0x2, ///< Next priority from Base Level. + Offset3 = 0x3, ///< Next priority from Base Level. + Offset4 = 0x4, ///< Next priority from Base Level. + Offset5 = 0x5, ///< Next priority from Base Level. + Offset6 = 0x6, ///< Next priority from Base Level. + Offset7 = 0x7, ///< Highest priority from Base Level. + Count ///< Number of priority offsets listed above. +}; + +/// Specifies the access mode for unmapped pages in a virtual GPU memory object. +enum class VirtualGpuMemAccessMode : uint32 +{ + Undefined = 0x0, ///< Used in situations where no special accessMode is needed. + NoAccess = 0x1, ///< All accesses of unmapped pages will trigger a GPU page fault. + ReadZero = 0x2, ///< Reads of unmapped pages return zero, and writes are discarded. + Count ///< Number of access modes listed above. +}; + +/// Controls the behavior of this allocation with respect to the MALL. +enum class GpuMemMallPolicy : uint32 +{ + Default = 0x0, ///< MALL policy is decided by the driver. + Never = 0x1, ///< This allocation is never put through the MALL. + Always = 0x2, ///< This allocation is always put through the MALL. +}; + +/// Bitmask of cases where RPM view memory accesses will bypass the MALL. +enum RpmViewsBypassMall : uint32 +{ + RpmViewsBypassMallOff = 0x0, ///< Disable MALL bypass + RpmViewsBypassMallOnRead = 0x1, ///< Skip MALL for read access of views created in RPM + RpmViewsBypassMallOnWrite = 0x2, ///< Skip MALL for write access of views created in RPM + RpmViewsBypassMallOnCbDbWrite = 0x4, ///< Control the RPM CB/DB behavior + +}; + +/// Used for specifying a subregion of the allocation as having a different mall policy from the rest of the +/// allocation.
+struct GpuMemMallRange +{ + uint32 startPage; ///< Starting 4k page that will obey the specified mallPolicy. + uint32 numPages; ///< Number of 4k pages that will obey the specified mallPolicy. +}; + +/// Specifies flags for @ref IGpuMemory creation. +union GpuMemoryCreateFlags +{ + struct + { + uint64 virtualAlloc : 1; ///< Create a _virtual_ as opposed to _real_ GPU memory allocation. + /// Only VA space will be allocated, and pages must be mapped via + /// IQueue::RemapVirtualMemoryPages(). + uint64 shareable : 1; ///< Memory can be shared between devices in the same process that + /// report the sharedMemory flag from + /// IDevice::GetMultiGpuCompatibility(). + uint64 interprocess : 1; ///< Memory will be visible to other processes + /// (they may choose to open it). + uint64 presentable : 1; ///< Memory can be bound to an image that will be used by presents. + uint64 flippable : 1; ///< Memory can be bound to an image that will be used by flip + /// presents. + uint64 stereo : 1; ///< Memory will be used for stereo (DXGI or AQBS stereo). + uint64 globallyCoherent : 1; ///< Memory needs to be globally coherent, + /// indicating the driver must manage both + /// CPU caches and GPU caches that are not flushed on + /// command buffer boundaries. + uint64 xdmaBuffer : 1; ///< GPU memory will be used for an XDMA cache buffer for + /// transferring data + /// between GPUs in a multi-GPU configuration. + uint64 turboSyncSurface : 1; ///< The memory will be used for TurboSync private swapchain primary. + uint64 typedBuffer : 1; ///< GPU memory will be permanently considered a single + /// typed buffer pseudo-object + /// with the properties given in typedBufferInfo. + uint64 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. + uint64 useReservedGpuVa : 1; ///< Use GPU virtual address previously reserved by another + /// memory object. It is invalid when using the shadow descriptor + /// table VA range.
+ uint64 autoPriority : 1; ///< Allow the platform to automatically determine the priority of + /// this GPU memory allocation. Flag is only valid if the device + /// reports that it supports this feature, and will result in an + /// error otherwise. + uint64 busAddressable : 1; ///< Create Bus Addressable memory. Allow memory to be used by other + /// devices on the PCIe bus by exposing a write-only bus address. + uint64 sdiExternal : 1; ///< Create External Physical memory from an already allocated memory + /// on remote device. Similar to virtual allocations (no physical + /// backing) but have an immutable page mapping. The client must + /// specify surfaceBusAddr and markerBusAddr either at creation time + /// in GpuMemoryCreateInfo or by calling SetSdiRemoteBusAddress + /// once before using the GPU memory. The page mappings for an + /// allocation with this flag set must be initialized by including a + /// reference to it in the ppExternPhysMem list for the first + /// submission that references it. + uint64 sharedViaNtHandle : 1; ///< Memory will be shared by using an NT handle. + uint64 peerWritable : 1; ///< The memory can be opened as peer memory and be writable. + uint64 tmzProtected : 1; ///< The memory is protected using TMZ (Trusted Memory Zone) or HSFB + /// (Hybrid Secure Framebuffer). It is not CPU accessible, + /// and GPU access is restricted by the hardware such that data + /// cannot be copied from protected memory into unprotected memory. + uint64 placeholder0 : 1; ///< Placeholder. + uint64 externalOpened : 1; ///< Specifies the GPUMemory is opened. + uint64 restrictedContent : 1; ///< Specifies the GPUMemory is protected content. + uint64 restrictedAccess : 1; ///< Specifies the GPUMemory is restricted shared access resource. + uint64 crossAdapter : 1; ///< Specifies the GPUMemory is shared cross-adapter resource.
+ uint64 cpuInvisible : 1; ///< By default, PAL makes every allocation CPU visible if all of its + /// preferred + /// heaps are CPU visible. This flag can be used to override this + /// behavior when the client knows the memory will never be mapped + /// for CPU access. If this flag is set, calls to IGpuMemory::Map() + /// on this object will fail. + uint64 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. + /// But the memory still would be cached by other cache hierarchy + /// like L0, RB caches, L1, and L3. + uint64 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the + /// MALL. If this is set, then the mallPolicy enumeration must be set + /// to either "always" or "never". + uint64 explicitSync : 1; ///< If set, shared memory will skip syncs in the kernel and all + /// drivers that use this memory must handle syncs explicitly. + uint64 privPrimary : 1; ///< This is a private primary surface gpu memory. + uint64 privateScreen : 1; ///< GPU memory will be used for a private screen image. + uint64 kmdShareUmdSysMem : 1; ///< UMD will allocate/free a memory buffer to be shared with KMD. + uint64 deferCpuVaReservation : 1; ///< KMD will allocate with the "CpuVisibleOnDemand" alloc flag. + /// Ignored for non-CPU-visible allocations. + uint64 placeholder1 : 1; ///< Placeholder. + uint64 startVaHintFlag : 1; ///< startVaHintFlag is set to 1 for passing startVaHint address + /// to set baseVirtAddr as startVaHint for memory allocation. +#if PAL_AMDGPU_BUILD + uint64 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes + /// the allocation upon creation, currently, only GpuHeapLocal and + /// GpuHeapInvisible are supported. + uint64 discardable : 1; ///< If set, this gpu memory object can be discarded under memory + /// pressure without keeping the content.
+#else + uint64 placeholder2 : 2; ///< Placeholder for the two bits that PAL_AMDGPU_BUILD uses for initializeToZero and discardable. +#endif + uint64 directCaptureSource : 1; ///< Memory will be mapped to DirectCapture resource's KMD-managed + /// private VA. + uint64 reserved : 28; ///< Reserved for future use. + }; + uint64 u64All; ///< Flags packed as 64-bit uint. +}; + +/// Specifies properties of a typed buffer pseudo-object. When this is specified in GpuMemoryCreateInfo along with the +/// typedBuffer flag, the GPU memory object has been permanently cast as a single typed buffer. A typed buffer is very +/// similar to a linear 3D image: it has a format, extent, and row/depth pitch values. +/// +/// Note that the typed buffer concept is used in other parts of the PAL interface and some of those instances may not +/// require a permanent typed buffer association. In such cases multiple typed buffers can be "bound" to one GPU memory +/// object at arbitrary offsets without any need to set the typedBuffer flag or fill out a TypedBufferCreateInfo. +struct TypedBufferCreateInfo +{ + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + Extent3d extent; ///< Dimensions in pixels WxHxD. + uint32 rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + uint32 depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + bool depthIsSubres; ///< True if the depth slices should be treated as an array of 2D subresources. +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size, alignment, vaRange, and descrVirtAddr fields. +struct GpuMemoryCreateInfo +{ + GpuMemoryCreateFlags flags; ///< GPU memory flags. + gpusize size; ///< Amount of GPU memory to allocate in bytes. + gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the + /// allocation granularity will be used.
+ VaRange vaRange; ///< Virtual address range for the GPU memory allocation. + + union + { + const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to the + /// memory object which previously reserved the GPU VA range to be used + /// by the new memory object. + gpusize descrVirtAddr; ///< Must be zero unless vaRange is ShadowDescriptorTable, in which case + /// it must specify the GPU VA of the corresponding DescriptorTable. + /// It doesn't need to be the base VA of the DescriptorTable allocation + /// but must be aligned to "alignment". + gpusize replayVirtAddr; ///< Must be zero unless vaRange is CaptureReplay, in which case it must + /// specify the GPU VA of the corresponding memory object. + gpusize startVaHint; ///< Client passes a start VA hint to set as baseVirtAddr. If the given + /// hint is not properly aligned, find next higher aligned address as + /// hint. If the hint is available and within right vaRange where + /// vaRange is VaRange::Default then set baseVirtAddr as hint. If the + /// hint is unavailable, find the higher available address between + /// startVaHint and max vaRange. If either of the two cases fails, + /// set baseVirtAddr as normal. + }; + + GpuMemPriority priority; ///< Hint to the OS paging process on how important it is to keep this + /// allocation in its preferred heap. + GpuMemPriorityOffset priorityOffset; ///< Offset from the base level priority. A higher offset means higher priority + /// within same base Level. Currently supported on Windows only. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via the + /// MALL (memory access last level). Only valid if "supportsMall" is set in + /// DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. + /// Any pages outside of this range will use the opposite MALL policy from + /// what is specified in "mallPolicy".
+ + /// Describes how the allocation will be accessed. If heapAccess is set to something other than @ref + /// GpuHeapAccessExplicit, then PAL decides the appropriate heap to allocate memory from based on this member and + /// @ref heaps is ignored. Otherwise heap selection respects the selection in @ref heaps. + GpuHeapAccess heapAccess; + uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. + GpuHeap heaps[GpuHeapCount]; ///< List of allowed memory heaps, in order of preference. It will be ignored if + /// @ref heapAccess is set to something other than @ref GpuHeapAccessExplicit. + + /// The pointer to an Image object the memory object will be bound to. It must be specified in special cases where + /// a memory object is permanently linked to an Image such as presentable images or shared resources on Windows. + /// For other cases, it's highly encouraged to provide the image object pointer. + /// + /// When @ref compression is set to Default, clients can benefit from PAL's internal logic to choose the optimal + /// GFX12-style distributed compression setting (enabled or disabled) based on properties of this image (e.g., usage + /// flags); otherwise PAL will disable the compression by default when pImage is nullptr. + IImage* pImage; + + TypedBufferCreateInfo typedBufferInfo; ///< This struct must be filled out if the "typedBuffer" flag is set. + /// This GPU memory will be permanently considered a typed buffer. + + VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages, WDDM only. + gpusize surfaceBusAddr; ///< Surface bus address of Bus Addressable Memory. + /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. + gpusize markerBusAddr; ///< Marker bus address of Bus Addressable Memory. The client can: + /// 1. Write to marker + /// 2. Let GPU wait until a value is written to marker before issuing + /// the next command.
+ /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. + + /// Client override for GFX12-style distributed compression. Only meaningful on devices that set the + /// supportDistributedCompression DeviceProperties flag. By default, PAL will apply a heuristic to determine + /// whether or not to enable compression based on properties of the memory allocation (e.g., CPU-visibility or + /// properties of the attached pImage). This mode does not apply to virtual IGpuMemory objects (such objects + /// inherit their compression behavior from their backing physical memory on a page-by-page basis). + /// + /// @note If allocating memory intended to back multiple resources (e.g., a heap for DX12 "placed resources") where + /// pImage is nullptr, the client should set compression to Enable then rely on per-resource or per-view controls to + /// disable compression for resources as needed (use compressionMode fields in ImageCreateInfo, BufferViewInfo, + /// ImageViewInfo, etc.). + /// + /// If the client intends to enable distributed compression on buffers, buffer base alignment and size should be + /// aligned to DCC minimum compression unit size (256 bytes); otherwise there is a potential corruption issue. + /// e.g. An allocation holding two buffers (placed resource or driver suballocation). Suppose the first buffer has + /// size 4 KiB + 128 bytes with compression enabled and the second buffer has size 128 bytes with compression + /// disabled (buffer view compressionMode is ReadBypassWriteDisable). When compressing the trailing 128 bytes of + /// the first buffer, it will compress the second buffer as well. Reading the second buffer will result in corrupted + /// content. + TriState compression; +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreatePinnedGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size and vaRange fields.
+struct PinnedGpuMemoryCreateInfo +{ + const void* pSysMem; ///< Pointer to the system memory that should be pinned for GPU access. Must be + /// aligned to realMemAllocGranularity in DeviceProperties. + size_t size; ///< Amount of system memory to pin for GPU access. + VaRange vaRange; ///< Virtual address range for the GPU memory allocation. + gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the + /// Platform's allocation granularity will be used. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be + /// accessed via the MALL (memory access last level). Only valid + /// if "supportsMall" is set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive + /// is set. Any pages outside of this range will use the opposite + /// MALL policy from what is specified in "mallPolicy". NOTE(review): this struct declares no flags member - confirm how the range is activated for pinned memory. +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateSvmGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size and alignment. +struct SvmGpuMemoryCreateInfo +{ + GpuMemoryCreateFlags flags; ///< GPU memory flags. + gpusize size; ///< Amount of SVM memory to allocate in bytes. + /// The total amount of SVM memory can't exceed the value set in + /// maxSvmSize when the platform is created. + gpusize alignment; ///< Byte alignment of the allocation's SVM VA. If zero, an + /// alignment matching the allocation granularity will be used. + const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to + /// the memory object which previously reserved the GPU VA range + /// to be used by the new memory object. + bool isUsedForKernel; ///< Memory will be used to store kernel and execute on gpu. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be + /// accessed via the MALL (memory access last level).
Only valid + /// if "supportsMall" is set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive + /// is set. Any pages outside of this range will use the opposite + /// MALL policy from what is specified in "mallPolicy". +}; + +/// Specifies parameters for opening a shared GPU memory object on another device. +struct GpuMemoryOpenInfo +{ + IGpuMemory* pSharedMem; ///< Shared GPU memory object from another device to open. +}; + +/// Specifies parameters for opening a GPU memory object on another device for peer-to-peer memory transfers. +struct PeerGpuMemoryOpenInfo +{ + IGpuMemory* pOriginalMem; ///< GPU memory object from another device to open for peer-to-peer memory transfers. +}; + +/// Specifies parameters for opening another non-PAL device's gpu memory for access from this device. Input structure to +/// IDevice::OpenExternalSharedGpuMemory(). +struct ExternalGpuMemoryOpenInfo +{ + ExternalResourceOpenInfo resourceInfo; ///< Information describing the external gpuMemory. + TypedBufferCreateInfo typedBufferInfo; ///< Properties of the typed buffer; see the flags.typedBuffer description. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via + /// the MALL (memory access last level). Only valid if "supportsMall" is + /// set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. + /// Any pages outside of this range will use the opposite MALL policy + /// from what is specified in "mallPolicy". + union + { + struct + { + uint32 typedBuffer : 1; ///< GPU memory will be permanently considered a single typed buffer pseudo-object + /// with the properties given in typedBufferInfo. + uint32 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. + uint32 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the MALL.
+ /// If this is set, then the mallPolicy enumeration must be set to either + /// "always" or "never". + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External Gpu memory open info flags. +}; + +/// The fundamental information that describes a GPU memory object that is stored directly in each IGpuMemory. +/// It can be accessed without a virtual call via IGpuMemory::Desc(). +struct GpuMemoryDesc +{ + gpusize gpuVirtAddr; ///< GPU virtual address of the GPU memory allocation. + gpusize size; ///< Size of the GPU memory allocation, in bytes. + gpusize clientSize; ///< Size of the client requested GPU memory allocation, in bytes. + gpusize alignment; ///< Required GPU virtual address alignment, in bytes. + uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. + GpuHeap heaps[GpuHeapCount]; ///< List of preferred memory heaps, in order of preference. + gpusize surfaceBusAddr; ///< Bus address of the SDI memory surface. Not initialized until the memory is made resident. + gpusize markerBusAddr; ///< Bus address of the SDI marker. Not initialized until the memory is made resident. Client needs to call + /// InitBusAddressableGpuMemory() to query and update both addresses before they are valid. + union + { + struct + { + uint32 isVirtual : 1; ///< GPU memory is not backed by physical memory and must be remapped before the + /// GPU can safely access it. Will also be set for sdiExternal allocations. See + /// GpuMemoryCreateFlags::sdiExternal + uint32 isPeer : 1; ///< GPU memory object was created with @ref IDevice::OpenPeerGpuMemory. + uint32 isShared : 1; ///< GPU memory object was created either with + /// @ref IDevice::OpenExternalSharedGpuMemory or OpenSharedGpuMemory. + /// This IGpuMemory references memory created either by another process or + /// another device with the exception of peer access. + uint32 isExternal : 1; ///< GPU memory object was created with @ref IDevice::OpenExternalSharedGpuMemory.
+ /// This IGpuMemory references memory that was created either by another process + /// or by a device that doesn't support sharedMemory with this object's device + /// (i.e., MDA sharing on Windows). + uint32 isSvmAlloc : 1; ///< GPU memory is allocated in system memory. + /// Valid only when IOMMUv2 is supported + uint32 isExecutable : 1; ///< GPU memory is used for execution. Valid only when IOMMUv2 is supported + uint32 isExternPhys : 1; ///< GPU memory is External Physical memory + + uint32 placeholder0 : 1; ///< Reserved for future memory flag + + uint32 isCompressed : 1; ///< Set for physical allocations where UMD requested PTE.D=1 to enable + /// GFX12-style distributed compression. + uint32 reserved : 23; ///< Reserved for future use + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory desc flags. + + uint64 uniqueId; ///< Unique ID given to each GPU memory object, allows client tracking of GPU memory allocations. +}; + +/// Defines GPU memory sub allocation info. Contains the start address of the whole memory; the offset and size +/// show where the sub-allocated memory is located within it. +struct GpuMemSubAllocInfo +{ + gpusize address; ///< Start address of the memory, not including the offset. + gpusize offset; ///< Offset from the start address of the memory. + gpusize size; ///< Size of the memory. +}; + +/// Specifies a GPU memory object and flags with more specific usage details. An array of these structures is specified +/// to PAL residency operations. +/// +/// @see IDevice::AddGpuMemoryReferences +/// @see IQueue::Submit +struct GpuMemoryRef +{ + union + { + struct + { + uint32 readOnly : 1; ///< The allocation will not be written using this reference. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory reference flags. + + IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation.
+}; + +/// Specifies a Display Output Post-Processing (DOPP) allocation that will be referenced by a submission along with +/// additional info describing how it will be used. +/// +/// @see IQueue::Submit +struct DoppRef +{ + union + { + struct + { + uint32 pfpa : 1; ///< Access to this DOPP allocation will be redirected to the primary pending + /// present (i.e., pre-flip primary access). If not set, access will + /// refer to the current onscreen primary. + uint32 lastPfpaCmd : 1; ///< This submission will be the last access of this pfpa allocation + /// for this frame. The pfpa interval will end once this submit + /// completes, allowing the corresponding vidPnSource to flip. + /// This flag is invalid if the pfpa flag is not set. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< DOPP reference flags. + + IGpuMemory* pGpuMemory; ///< The DOPP GPU memory object referenced by the submission. +}; + +/// Specifies the handle types that can be used when exporting memory. +enum class ExportHandleType : uint32 +{ + Default = 0, ///< Let PAL choose the export type +#if PAL_AMDGPU_BUILD + FileDescriptor, ///< Export using a Linux file descriptor + Kms, ///< Export through KMS +#endif +}; + +/// Specifies parameters for exporting a GPU memory handle (the NT handle fields exist only when PAL_KMT_BUILD is set). +struct GpuMemoryExportInfo +{ +#if PAL_KMT_BUILD + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT + /// handle with a name, and then the handle can be acquired + /// via this name. + uint32 accessFlags; ///< Desired access rights of GPU memory. +#endif + ExportHandleType exportType; ///< Type of handle to use for exporting the memory.
+}; + +/** + *********************************************************************************************************************** + * @interface IGpuMemory + * @brief Interface representing a GPU-accessible memory allocation. + * + * Depending on creation parameters, this could correspond to: + * + * + A _real_ memory object, corresponding directly to a physical allocation made on this device (whether it resides in + * a local or non-local heap). + * + A _virtual_ memory object, only consisting of virtual address space that can be mapped on a page basis to pages in + * _real_ memory objects via IQueue::RemapVirtualMemoryPages. + * + Pinned memory, a _real_ memory object created by pinning down client system memory. + * + Peer memory, a _real_ memory object corresponding to GPU memory that is likely local to another GPU. Only copy + * operations (peer-to-peer transfers) are allowed with this memory. + * + Opened/shared memory, a _real_ memory object that is fully shared between multiple GPUs, residing in a non-local + * heap. + * + External shared memory, a _real_ memory object that was created by an external process and is fully shared between + * multiple GPUs. + * + * @see IDevice::CreateGpuMemory + * @see IDevice::CreatePinnedGpuMemory + * @see IDevice::OpenSharedGpuMemory + * @see IDevice::OpenPeerGpuMemory + * @see IDevice::OpenExternalSharedGpuMemory + * + * + * All of these kinds of GPU memory are assigned a set of fundamental properties specified in GpuMemoryDesc which are + * either specified by the client or by PAL. There are specific rules these properties must follow; those rules are + * documented here to avoid duplication. Violating these rules will cause the device's corresponding "get size" + * functions to return an error code; the create/open functions may not validate their arguments. + * + * + * With the exception of external memory objects being opened, PAL will adjust size and base alignments as necessary + * to meet device requirements.
Typically this means going out to OS page boundaries. The client is no longer required +* to query device requirements and align for PAL. + * + * Note that the device alignment requirements apply equally to GPU VAs. However, other kinds of alignment + * restrictions (e.g., IGpuMemoryBindable's requirements) may only apply to one of those two properties. When creating + * GPU memory objects the client must be careful to set the "alignment" field to the alignment of the GPU VA. + * + * + * Second, the client can't directly specify a memory object's GPU VA but must specify its VA range, limiting which + * portions of the VA space can be used. Note that non-external shared and peer GPU memory objects will use the + * original memory's VA range. External shared GPU memory always uses the default VA range. + * + * The ShadowDescriptorTable VA range is special because it pairs the shadow GPU memory to an existing descriptor GPU + * memory. The client must specify the GPU VA of the corresponding DescriptorTable memory when creating a shadow GPU + * memory object via descrVirtAddr; it must satisfy the alignment requirements of the shadow GPU memory. Both GPU + * memory objects must be created on the same device. Note that descrVirtAddr can be offset into the descriptor + * allocation such that multiple shadow GPU memory objects correspond to one larger descriptor GPU memory object. + * + * + * The client can further influence the GPU VA of shared and peer GPU memory objects. If the globalGpuVa flag is set + * when the original GPU memory object is created, PAL will assign any shared or peer GPU memory objects that same VA. + * Note that globalGpuVa is only supported if globalGpuVaSupport is set in DeviceProperties. + *********************************************************************************************************************** + */ +class IGpuMemory : public IDestroyable +{ +public: + /// Sets a new priority for this GPU memory object. 
+ /// + /// This call is not available for virtual or pinned memory. + /// + /// @param [in] priority New base priority for the GPU memory object. + /// + /// @param [in] priorityOffset New priority offset for the GPU memory object. This is a small bias that can be + /// used by the OS to raise the importance of an allocation when there are + /// multiple allocations in the same base priority level. You can think of it as + /// the fractional bits of the priority level. + /// + /// @returns Success if the priority was successfully updated. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this is a virtual or pinned GPU memory object. + virtual Result SetPriority( + GpuMemPriority priority, + GpuMemPriorityOffset priorityOffset) = 0; + + /// Makes the GPU memory available for CPU access and gives the client a pointer to reference it. + /// + /// The allocation should be unmapped by the client once CPU access is complete, although it _is_ legal to keep an + /// allocation mapped while the GPU references the allocation from a command buffer. + /// + /// It is legal to map the allocation multiple times concurrently. Mapping is not available for pinned or virtual + /// memory objects. This call is thread safe for calls referencing this memory object. + /// + /// @see Unmap. + /// + /// @param [out] ppData CPU pointer to the GPU memory object. + /// + /// @returns Success if the map succeeded. Otherwise, *ppData will not be valid and one of the following errors may + /// be returned. + /// + ErrorInvalidPointer if ppData is null. + /// + ErrorGpuMemoryMapFailed if the object is busy and cannot be mapped by the OS. + /// + ErrorNotMappable if the memory object cannot be mapped due to some of its heaps not having the CPU + /// visible flag set. + /// + ErrorUnavailable if the memory object is not a real allocation. + virtual Result Map( + void** ppData) = 0; + + /// Removes CPU access from a previously mapped GPU memory object. 
+ /// + /// This call is thread safe for calls referencing the same memory object. + /// + /// @see Map + /// + /// @returns Success if the unmap succeeded. Otherwise, one of the following errors may be returned: + /// + ErrorGpuMemoryUnmapFailed if the GPU memory object cannot be unlocked. + /// + ErrorUnavailable if the GPU memory object is not a real allocation. + virtual Result Unmap() = 0; + +#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD + /// Returns an OS-specific handle which can be used to refer to this GPU memory object across processes. This will + /// return a null or invalid handle if the object was not created with the @ref interprocess create flag set. + /// + /// @note This function is only available for Linux builds or KMT builds. + /// + /// @param [in] exportInfo Parameters describing how the handle should be exported. + /// + /// @returns An OS-specific handle which can be used to access the GPU memory object across processes. + virtual OsExternalHandle ExportExternalHandle(const GpuMemoryExportInfo& exportInfo) const = 0; +#endif + + /// Returns a structure containing some fundamental information that describes this GPU memory object. + /// + /// @returns A reference to this allocation's GpuMemoryDesc. + const GpuMemoryDesc& Desc() const { return m_desc; } + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Set SDI remote surface bus address and marker bus address.
+ /// + /// This GPU memory object must have been created with the sdiExternal flag set and with the GpuMemoryCreateInfo + /// surfaceBusAddr and markerBusAddr fields both set to zero. This function allows clients to defer setting those + /// addresses until after creation. It must be called exactly once to permanently bind the given SDI addresses to + /// this GPU memory object. + /// + /// @warning An sdiExternal GPU memory object is not complete until it is given its SDI addresses! The gpuVirtAddr + /// field in this GPU memory's GpuMemoryDesc will not be valid until this function is called! + /// + /// @param [in] surfaceBusAddr Surface bus address of Bus Addressable Memory. + /// @param [in] markerBusAddr Marker bus address of Bus Addressable Memory. The client can write to the marker + /// and have the GPU wait until a value is written to marker before continuing. + /// + /// @returns Success if succeeded. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if the GPU memory object is not external physical memory or it has already been set. + /// + ErrorInvalidValue if one of the input params is 0. + /// + One of the escape call failed error. + virtual Result SetSdiRemoteBusAddress(gpusize surfaceBusAddr, gpusize markerBusAddr) = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IGpuMemory() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IGpuMemory() { } + + GpuMemoryDesc m_desc; ///< Information that describes this GPU memory object. + +private: + /// @internal Client data pointer.
This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h new file mode 100644 index 0000000000..086556f815 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h @@ -0,0 +1,139 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuMemoryBindable.h + * @brief Defines the Platform Abstraction Library (PAL) IGpuMemoryBindable interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +// Forward declarations. +class IGpuMemory; + +/// Reports required properties of a GPU memory object bound to a specific object. The client must query these +/// properties via IGpuMemoryBindable::GetGpuMemoryRequirements() and bind an @ref IGpuMemory object matching these +/// requirements to the @ref IGpuMemoryBindable object using IGpuMemoryBindable::BindGpuMemory(). +struct GpuMemoryRequirements +{ + union + { + struct + { + uint32 cpuAccess : 1; ///< CPU access is required. If set, the client must not set cpuInvisible in + /// GpuMemoryCreateFlags and must provide CPU visible heaps or CPU visible heap + /// access mode. If not set, it's strongly recommended to set cpuInvisible. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags specifying required GPU memory properties. + + gpusize size; ///< Amount of GPU memory required, in bytes. + gpusize alignment; ///< Required GPU memory virtual address alignment, in bytes. + uint32 heapCount; ///< Number of valid entries in heaps[]. + GpuHeap heaps[GpuHeapCount]; ///< List of allowed heaps for the GPU memory in order of predicted performance. 
+}; + +/** + *********************************************************************************************************************** + * @interface IGpuMemoryBindable + * @brief Interface inherited by objects that may require GPU memory be bound to them. + * + * In the future, PAL may discover a need to allocate GPU memory for a class that currently doesn't require it. In that + * situation, that class will be updated to inherit from IGpuMemoryBindable. This change would break backward + * compatibility and would result in the major interface version being incremented. + *********************************************************************************************************************** + */ +class IGpuMemoryBindable : public IDestroyable +{ +public: + /// Queries the GPU memory properties required by this object. The client should query properties with this method, + /// create/sub-allocate a memory range matching the requirements, then bind the memory to the object via + /// @ref BindGpuMemory(). + /// + /// @note Not all objects may actually need GPU memory, and in that case the memory properties will reflect a 0 size + /// and alignment. + /// + /// @param [out] pGpuMemReqs Required properties of GPU memory to be bound to this object. Includes properties like + /// size, alignment, and allowed heaps. + virtual void GetGpuMemoryRequirements( + GpuMemoryRequirements* pGpuMemReqs) const = 0; + + /// Binds GPU memory to this object according to the requirements queried via GetGpuMemoryRequirements(). + /// + /// Binding memory to objects other than images automatically initializes the object memory as necessary. Image + /// objects used as color or depth-stencil targets have to be explicitly initialized in command buffers using a + /// ICmdBuffer::CmdReleaseThenAcquire() command to transition them out of the LayoutUninitializedTarget usage. + /// + /// Binding memory to an object automatically unbinds any previously bound memory. 
There is no need to bind null to + /// an object to explicitly unbind a previously bound allocation before binding a new allocation. + /// + /// This call is invalid on objects that have no memory requirements, even if binding null. + /// + /// @param [in] pGpuMemory GPU memory to be bound. If null, the previous binding will be released. + /// @param [in] offset Offset into the GPU memory where the object's memory range should begin. This allows + /// sub-allocating many object's GPU memory from the same IGpuMemory object. + /// + /// @returns Success if the specified GPU memory was successfully bound to the object. Otherwise, one of the + /// following errors may be returned: + /// + ErrorUnavailable if binding a non-image to a virtual allocation. + /// + ErrorInvalidAlignment if the offset does not match the alignment requirements of the object. + /// + ErrorInvalidMemorySize if the object's required memory size does not fit completely within the given + /// memory object at the specified offset. + virtual Result BindGpuMemory( + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Returns the GPU memory object and offset that this object is bound to or nullptr and 0 if not bound. + /// + /// @param [out] ppGpuMemory Returns the GPU memory object to the address specified in this pointer. + /// Returns nullptr if this object is not bound to any GPU memory. + /// @param [out] pOffset Returns the GPU memory offset to the address specified in this pointer. + /// Returns 0 if this object is not bound to any GPU memory. + /// + /// @returns Success if the GPU memory and offset was successfully returned. Otherwise, one of the following errors + /// may be returned: + /// + ErrorGpuMemoryNotBound if this object is not bound to any GPU memory. + /// + ErrorInvalidPointer if either ppGpuMemory or pOffset is nullptr. 
+ /// + ErrorUnavailable if binding is not supported in the derived class + virtual Result GetGpuMemory( + IGpuMemory** ppGpuMemory, + gpusize* pOffset) const = 0; + +protected: + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IGpuMemoryBindable() { } +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palImage.h b/shared/amdgpu-windows-interop/pal/inc/core/palImage.h new file mode 100644 index 0000000000..0fcdb16204 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palImage.h @@ -0,0 +1,983 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palImage.h + * @brief Defines the Platform Abstraction Library (PAL) IImage interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +// Forward declarations. +class IImage; +class IPrivateScreen; +class IScreen; +class ISwapChain; +enum class CompressionMode : uint32; +enum class ClientCompressionMode : uint32; +enum SwizzleMode : uint32; +struct ImageCopyRegion; + +/// When used as the value of the viewFormatCount parameter of image creation it indicates that all compatible formats +/// can be used for views of the created image. +constexpr uint32 AllCompatibleFormats = UINT32_MAX; + +/// Specifies dimensionality of an image (i.e., 1D, 2D, or 3D). +enum class ImageType : uint32 +{ + Tex1d = 0x0, + Tex2d = 0x1, + Tex3d = 0x2, + Count +}; + +/// Specifies the tiling (address swizzling) to use for an image. When a linear tiled image is mapped its contents will +/// be laid out in row-major ordering. All other tiling modes require the use of swizzles equation to locate texels. +enum class ImageTiling : uint32 +{ + Linear = 0x0, ///< Image is laid out in scan-line (row-major) order. + Optimal = 0x1, ///< Image is laid out in a GPU-optimal order. + Standard64Kb = 0x2, ///< Image is laid out in the cross-IHV, 64KB, standard swizzle tiling. 
+ Count +}; + +/// Hints to pal to identify a preference for how this image is organized. This is a preference setting, and may be +/// ignored if pal believes better options exist. +enum class ImageTilingPattern : uint32 +{ + Default = 0x0, ///< No swizzle mode is preferred. + Standard = 0x1, ///< Prefer standard swizzle modes. + XMajor = 0x2, ///< Prefer x-coordinate major swizzle modes. + YMajor = 0x3, ///< Prefer y-coordinate major swizzle modes. + Interleaved = 0x4, ///< Prefer interleaved coordinate swizzle modes. + Count +}; + +/// Hints to pal to select the appropriate tiling mode for an optimization target. +enum class TilingOptMode : uint32 +{ + Balanced = 0x0, ///< Balance memory footprint and rendering performance. + OptForSpace = 0x1, ///< Optimize tiling mode for saving memory footprint + OptForSpeed = 0x2, ///< Optimize tiling mode for rendering performance. + Count +}; + +/// Image metadata modes. +enum class MetadataMode : uint16 +{ + Default = 0, ///< Default behavior. PAL chooses if metadata should be present or not. + ForceEnabled, ///< Optimization Hint: The client would prefer Metadata if possible. Useful for scenarios where + /// metadata isn't an obvious win and clients can enable based on some heuristic or app-detect. + Disabled, ///< The Image will not contain any compression metadata. + FmaskOnly, ///< The color msaa Image will only contain Cmask/Fmask metadata; this mode is only valid for color + /// msaa Image. On GPUs with GFX12-style distributed compression (see supportDistributedCompression + /// flag in @ref DeviceProperties), metadataMode only controls UMD metadata (Hi-Z and Hi-S). On such + /// GPUs, the FmaskOnly enum is ignored and treated like Default. + Count, +}; + +/// Image metadata TC compat modes. +enum class MetadataTcCompatMode : uint16 +{ + Default = 0, ///< Default behavior. PAL chooses if TC compat should be enabled (if compressed).
+ ForceEnabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is TC compat. + Disabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is not TC compat. + Count, +}; + +/// Image shared metadata support level +enum class MetadataSharingLevel : uint32 +{ + FullExpand = 0, ///< The metadata need to be fully expanded at ownership transition time. + ReadOnly = 1, ///< The metadata are expected to have read-only usage after the ownership is transitioned. + FullOptimal = 2, ///< The metadata can remain as-is if possible at ownership transition time. +}; + +/// Specifies the type of PRT map image being created. +enum class PrtMapType : uint32 +{ + None = 0, ///< This is not an auxiliary image used for PRT plus functionality. + Residency = 1, ///< Image data is really a low-resolution map containing the finest populated LOD + /// for a particular UV space region. + SamplingStatus = 2, ///< Indicates the validity of a given tile on a per-mip level basis. + Count, +}; + +/// Specifies how to interpret a clear color. +enum class ClearColorType : uint32 +{ + Uint = 0, ///< The color is stored as an unsigned integer in RGBA order in u32Color. It will be swizzled and + /// compacted before it is written to memory. + Sint = 1, ///< The color is stored as a signed integer in RGBA order in i32Color. It will be swizzled and + /// compacted before it is written to memory. + Float = 2, ///< The color is stored as floating point in RGBA order. It will be swizzled and converted to the + /// appropriate numeric format before it is written to memory. + Yuv = 3, ///< The color is stored as an unsigned integer in YUVA order in u32Color. It will be swizzled and + /// compacted before it is written to memory. The client must clamp the clear color within the + /// valid range, e.g. [0, 255] for 8-bit. + Count +}; + +/// Contains everything necessary to store and interpret a clear color.
+struct ClearColor +{ + ClearColorType type; ///< How to interpret this clear color. + uint8 disabledChannelMask; ///< These 4 bits are used to selectively disable the A,B,G,R channels + /// from being written. 0 means write ABGR. 0xF means write nothing. + /// 0x8 means write Blue, Green, Red. 0x7 means write Alpha. etc... + + union + { + uint32 u32Color[4]; ///< The clear color, interpreted as four unsigned integers. + float f32Color[4]; ///< The clear color, interpreted as four floating point values. + }; +}; + +/// Specifies a set of image creation flags. +union ImageCreateFlags +{ + struct + { + uint32 invariant : 1; ///< Images with this flag set and all other creation identical are + /// guaranteed to have a consistent data layout. + uint32 cloneable : 1; ///< Image is valid as a source or destination of a clone operation. + /// See @ref IDevice::ImagePrefersCloneCopy() for more details. + uint32 shareable : 1; ///< Image can be shared between compatible devices. + uint32 presentable : 1; ///< Indicates this image can be used in presents. + uint32 flippable : 1; ///< Image can be used for flip presents. + uint32 stereo : 1; ///< Indicates AMD quad buffer stereo extension (AQBS extension) image + uint32 dxgiStereo : 1; ///< Indicates DXGI stereo (Win8 stereo) image + uint32 cubemap : 1; ///< Image will be used as a cubemap. + uint32 prt : 1; ///< Image is a partially resident texture (aka, sparse image or tiled + /// resource) + uint32 needSwizzleEqs : 1; ///< Image requires valid swizzle equations. + uint32 perSubresInit : 1; ///< The image may have its subresources initialized independently using + /// barrier calls out of the uninitialized layout. + uint32 separateDepthPlaneInit : 1; ///< If set, the caller may transition the stencil and depth planes from + /// "Uninitialized" state at any time. Otherwise, both planes must be + /// transitioned in the same barrier call. Only meaningful if + /// "perSubresInit" is set.
+ uint32 repetitiveResolve : 1; ///< Optimization: Is this image resolved multiple times to an image which + /// is mostly similar to this image? + uint32 preferSwizzleEqs : 1; ///< Image prefers valid swizzle equations, but an invalid swizzle + /// equation is also acceptable. + uint32 fixedTileSwizzle : 1; ///< Fix this image's tile swizzle to ImageCreateInfo::tileSwizzle. This + /// is only supported for single-sampled color images. + uint32 videoReferenceOnly : 1; ///< Image is used by video hardware for reference buffer only. + /// It uses a different tiling format than the decoder output buffer. + uint32 optimalShareable : 1; ///< Indicates metadata information is to be added into private data on + /// creation time and honored on open time. + uint32 sampleLocsAlwaysKnown : 1; ///< Sample pattern is always known in client driver for MSAA depth image. + uint32 fullResolveDstOnly : 1; ///< Indicates any ICmdBuffer::CmdResolveImage using this image as a + /// destination will overwrite the entire image (width and height of + /// resolve region is same as width and height of resolve dst). + uint32 fullCopyDstOnly : 1; ///< Indicates any copy to this image will overwrite the entire image. + /// A perf optimization of using post-copy metadata fixup to replace heavy + /// expand at barrier to LayoutCopyDst. Unsafe to enable it if there is + /// potential partial copy to the image. + uint32 pipSwapChain : 1; ///< Indicates this image is PIP swap-chain. It is only supported on + /// Windows platforms. + uint32 view3dAs2dArray : 1; ///< If set client can view 3D image as 2D with its depth as array slices. + /// Note that not all 3D images support it. The image creation will + /// return error if we fail to create a compatible image. + + uint32 tmzProtected : 1; ///< Indicate this image is protected or not.
+ uint32 sharedWithMesa : 1; ///< Indicate this Image was opened from a Mesa shared Image + uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes + uint32 hasModifier : 1; ///< Set if the image uses drm format modifier. + uint32 disableDccStateTracking : 1; ///< Disable a PAL optimization which is commonly broken by app bugs. + /// Setting this flag may increase DCC decompress overhead. +#if PAL_CLIENT_EXAMPLE + uint32 useFixedSwizzleMode : 1; ///< If set, require the fixed swizzle mode provided. + /// Fails creation on incompatible swizzles. +#else + uint32 reservedSwMode : 1; ///< Reserved for future use. +#endif + uint32 reserved : 4; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies a set of ways an image might be used by the GPU (color target, shader read, etc.). +union ImageUsageFlags +{ + struct + { + uint32 shaderRead : 1; ///< Image will be read from shader (i.e., texture). + uint32 shaderWrite : 1; ///< Image will be written from a shader (i.e., UAV). + uint32 resolveSrc : 1; ///< Image will be used as resolve source image + uint32 resolveDst : 1; ///< Image will be used as resolve dst image + uint32 colorTarget : 1; ///< Image will be bound as a color target. + uint32 depthStencil : 1; ///< Image will be bound as a depth/stencil target. + uint32 noStencilShaderRead : 1; ///< Image will be neither read as stencil nor resolved on stencil plane. + /// Note that if resolveSrc bit has been set to indicate that the image + /// could be adopted as a resolveSrc image and there could be stencil + /// resolve, noStencilShaderRead must be set to 0, since shader-read + /// based stencil resolve might be performed. + uint32 hiZNeverInvalid : 1; ///< Hint to PAL indicating the client will guarantee that no operations + /// performed on this Image while it is in a decompressed state will cause + /// Hi-Z metadata to become invalid. 
This allows PAL to avoid an expensive + /// resummarization blit in some resource barriers. + uint32 depthAsZ24 : 1; ///< Use a 24-bit format for HW programming of a native 32-bit surface. + /// If set, border color and Z-reference values are treated as Z-24. + uint32 firstShaderWritableMip : 4; ///< Only relevant if the shaderWrite flag is set. Typically set to 0 so + /// entire image is writable. If non-zero, such as an image where only level0 + /// is used as a color target and compute is used to generate mipmaps, PAL + /// may be able to enable additional compression on the baseLevels which + /// are used exclusively as color target and shader read. + uint32 cornerSampling : 1; ///< Set if this image will use corner sampling in image-read scenarios. + /// With corner sampling, the extent refers to the number of pixel corners + /// which will be one more than the number of pixels. Border color is + /// ignored when corner sampling is enabled. + + uint32 vrsDepth : 1; ///< Set if this depth image will be bound when VRS rendering is enabled. + uint32 disableOptimizedDisplay: 1; ///< Do not create Display Dcc + uint32 useLossy : 1; ///< Set if this image may use lossy compression. + uint32 stencilOnlyTarget : 1; ///< This must be set if a stencil-only IDepthStencilView will be created + ///< for this image. + uint32 vrsRateImage : 1; ///< This image is potentially used with CmdBindSampleRateImage + uint32 videoDecoder : 1; ///< Indicating this Image is video decoder target + uint32 reserved : 12; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies properties for @ref IImage creation. Input structure to IDevice::CreateImage(). +/// +/// Note that by default PAL may instruct the hardware to swizzle the contents of an image in memory; if this occurs +/// two images created with identical properties will not map their texels to the same offsets in GPU memory and may +/// even have different sizes.
At the expense of performance this behavior can be limited by setting the invariant flag, +/// which guarantees that images with identical properties will have identical GPU memory layouts. +/// +/// For single-sampled color images, there is a middle ground between these two modes. If the fixedTileSwizzle flag is +/// set, PAL will use the tileSwizzle property instead of generating its own swizzle value. The tileSwizzle value must +/// be obtained from the base subresource of a single-sampled color image with identical properties (excluding +/// fixedTileSwizzle and tileSwizzle). This allows the client to force certain similar images to share the same GPU +/// memory layouts without forcing all similar images to a single GPU memory layout. +struct ImageCreateInfo +{ + ImageCreateFlags flags; ///< Image creation flags. + ImageUsageFlags usageFlags; ///< Image usage flags. + ImageType imageType; ///< Dimensionality of image (1D/2D/3D). + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + Extent3d extent; ///< Dimensions in pixels WxHxD. + uint32 mipLevels; ///< Number of mipmap levels. Cannot be 0. + uint32 arraySize; ///< Number of slices. Set to 1 for non-array images. + uint32 samples; ///< Number of coverage samples. Set to 1 for single sample images. Must be + /// greater than or equal to the number of fragments. + uint32 fragments; ///< Number of color/depth fragments. Set to 1 for single sample images. + ImageTiling tiling; ///< Controls layout of pixels in the image. + ImageTilingPattern tilingPreference; ///< Controls preferred tile swizzle organization for this image. + TilingOptMode tilingOptMode; ///< Hints to pal to select the appropriate tiling mode. + uint32 tileSwizzle; ///< If fixedTileSwizzle is set, use this value for the image's base swizzle. +#if PAL_CLIENT_EXAMPLE + SwizzleMode fixedSwizzleMode; ///< For directed image tests, force a particular swizzle mode. +#endif + + /// Metadata behavior mode for this image. 
On GPUs with GFX12-style distributed compression + /// (see supportDistributedCompression flag in @ref DeviceProperties), metadataMode only controls UMD metadata + /// (Hi-Z and Hi-S). On such GPUs, the FmaskOnly enum is ignored and treated like Default. + MetadataMode metadataMode; + MetadataTcCompatMode metadataTcCompatMode; ///< TC compat mode for this image. + + /// Distributed compression contains GL2/DF DCC compression and RB backend client compression which includes + /// fragment client compression (previous FMASK compression alike) on color MSAA images and Z Plane client + /// compression on depth stencil images. Only relevant if the backing memory pages enable compression, controllable + /// by client with @ref GpuMemoryCreateInfo::compression. + CompressionMode compressionMode; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 876 + /// Client compression is part of distributed compression (aka physical compression); it can only be enabled if + /// physical compression is enabled. + /// + /// On Gfx12, controls (legacy FMask based) color fragment compression and Z plane compression. + ClientCompressionMode clientCompressionMode; ///< Controls client compression behavior for this resource. +#else + TriState clientCompressionMode; ///< Controls client compression behavior for this resource. +#endif + + uint32 maxBaseAlign; ///< Maximum address alignment for this image or zero for an unbounded alignment. + float imageMemoryBudget; ///< The memoryBudget value used in SW addrlib to determine the minSizeBlk for textures. + /// It must be >= 0.0. When in [0.0, 1.0) addrlib uses legacy logic to decide minSizeBlk. + /// When == 1.0, addrlib uses minimizeAlign. When > 1.0, addrlib applies memory budget + /// algorithm. Despite 1.5 in tests show significant texture allocation size reduction, + /// default value 0.0 (legacy behavior) is recommended if not specified by client. 
+ + struct + { + PrtMapType mapType; ///< Indicates what sort of PRT metadata is stored in this image. If this image is PRT + /// metadata, then it can only be associated with an image that is a power-of-two + /// multiple bigger (or the same size). Image properties needs to include + /// "PrtFeaturePrtPlus" to create PRT map images. Format must be set to X8_Unorm for + /// residency map and sampling-status map types. + Extent3d lodRegion; ///< Useful only if mapType is not "none". Defines the region size of the parent image + /// that one pixel of this image matches with. The map image can only be paired with a + /// parent image of matching dimensions. This parameter can be left at zero. + } prtPlus; + + /// The following "pitch" members must be zeroed unless the client is creating a @ref ImageTiling::Linear image and + /// wishes to directly specify the image's row and depth pitches. In that case, they must be integer multiples of + /// the alignments given by @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. + uint32 rowPitch; ///< The image must have this row pitch for the first mip level (in bytes). + uint32 depthPitch; ///< The image must have this depth pitch for the first mip level (in bytes). + + Rational refreshRate; ///< The expected refresh rate when presenting this flippable or stereo image. + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. 
+ /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. + +#if defined(__unix__) + uint64 modifier; ///< Drm format modifier. Ignored if flags.hasModifier unset. + uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. + gpusize modifierMemoryPlaneOffset[3]; ///< Offset of main surface, display Dcc surface and gfx Dcc surface. +#endif +}; + +inline constexpr bool operator==(const ImageCreateInfo& lhs, const ImageCreateInfo& rhs) +{ + bool same = (lhs.flags.u32All == rhs.flags.u32All) && + (lhs.usageFlags.u32All == rhs.usageFlags.u32All) && + (lhs.imageType == rhs.imageType) && + (lhs.swizzledFormat == rhs.swizzledFormat) && + (lhs.extent == rhs.extent) && + (lhs.mipLevels == rhs.mipLevels) && + (lhs.arraySize == rhs.arraySize) && + (lhs.samples == rhs.samples) && + (lhs.fragments == rhs.fragments) && + (lhs.tiling == rhs.tiling) && + (lhs.tilingPreference == rhs.tilingPreference) && + (lhs.tilingOptMode == rhs.tilingOptMode) && + (lhs.tileSwizzle == rhs.tileSwizzle) && +#if PAL_CLIENT_EXAMPLE + (lhs.fixedSwizzleMode == rhs.fixedSwizzleMode) && +#endif + (lhs.metadataMode == rhs.metadataMode) && + (lhs.metadataTcCompatMode == rhs.metadataTcCompatMode) && + (lhs.compressionMode == rhs.compressionMode) && + (lhs.clientCompressionMode == rhs.clientCompressionMode) && + (lhs.maxBaseAlign == rhs.maxBaseAlign) && + (lhs.imageMemoryBudget == rhs.imageMemoryBudget) && + (lhs.prtPlus.mapType == rhs.prtPlus.mapType) && + (lhs.prtPlus.lodRegion == rhs.prtPlus.lodRegion) && + (lhs.rowPitch == rhs.rowPitch) && + 
(lhs.depthPitch == rhs.depthPitch) && + (lhs.refreshRate.numerator == rhs.refreshRate.numerator) && + (lhs.refreshRate.denominator == rhs.refreshRate.denominator) && + (lhs.viewFormatCount == rhs.viewFormatCount) && + true; + +#if defined(__unix__) + if (same && (lhs.flags.hasModifier != 0)) + { + same = (lhs.modifier == rhs.modifier) && + (lhs.modifierPlaneCount == rhs.modifierPlaneCount) && + (memcmp(&lhs.modifierMemoryPlaneOffset[0], + &rhs.modifierMemoryPlaneOffset[0], + sizeof(gpusize) * lhs.modifierPlaneCount) == 0); + } +#endif + + if (same && (lhs.viewFormatCount > 0) && (lhs.viewFormatCount != AllCompatibleFormats)) + { + same = (memcmp(lhs.pViewFormats, rhs.pViewFormats, lhs.viewFormatCount * sizeof(SwizzledFormat)) == 0); + } + + return same; +} + +/// Specifies properties for presentable @ref IImage creation. Input structure to IDevice::CreatePresentableImage(). +struct PresentableImageCreateInfo +{ + union + { + struct + { + uint32 fullscreen : 1; ///< Image supports fullscreen presentation. + uint32 stereo : 1; ///< Image supports stereoscopic rendering and display. + /// Implies an array size of 2. Fullscreen must be set. + uint32 turbosync : 1; ///< Image supports turbosync flip + uint32 peerWritable : 1; ///< Indicates if the memory allocated will be writable by other devices + uint32 tmzProtected : 1; ///< Indicates this presenatble image's memory is tmz Protected. +#if PAL_AMDGPU_BUILD + uint32 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes + /// the allocation upon creation, currently, only GpuHeapLocal and + /// GpuHeapInvisible are supported. +#else + uint32 placeholder0 : 1; ///< Placeholder. +#endif + uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes. + uint32 reserved : 25; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Presentable image creation flags. + + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. 
+ ImageUsageFlags usage; ///< Image usage flags. + Extent2d extent; ///< Width/height of the image. + CompressionMode compressionMode; ///< Specify GFX12-style distributed compression behavior for this resource. + /// Only relevant if the backing memory pages enable compression (controllable by + /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). + const IScreen* pScreen; ///< Target screen for fullscreen presentable images. Can be null if the fullscreen + /// flag is 0. + OsDisplayHandle hDisplay; ///< Display handle of the local display system only for WSI. + OsWindowHandle hWindow; ///< Window handle only for WSI. + ISwapChain* pSwapChain; ///< SwapChain object which the presentable image belongs to. + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. + /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. +}; + +/// Specifies properties for private screen @ref IImage image creation. Input structure to +/// IDevice::CreatePrivateScreenImage(). 
+struct PrivateScreenImageCreateInfo +{ + union + { + struct + { + uint32 invariant : 1; ///< Images with this flag set and all other creation identical are guaranteed + /// to have a consistent data layout. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Private screen image creation flags. + + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + ImageUsageFlags usage; ///< Image usage flags. + Extent2d extent; ///< Width/height of the image. + IPrivateScreen* pScreen; ///< Private screen this image is created on (then this image can be used to be + /// presented on this private screen). + CompressionMode compressionMode;///< Specify GFX12-style distributed compression behavior for this resource. + /// Only relevant if the backing memory pages enable compression (controllable by + /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. + /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. 
+}; + +/// Specifies parameters for opening another device's image for peer access from this device. Input structure to +/// IDevice::OpenPeerImage(). +struct PeerImageOpenInfo +{ + const IImage* pOriginalImage; ///< Other device's image to be opened for peer access. +}; + +/// Specifies parameters for opening another non-PAL device's image for access from this device. Input structure to +/// IDevice::OpenExternalSharedImage(). +struct ExternalImageOpenInfo +{ + ExternalResourceOpenInfo resourceInfo; ///< Information describing the external image. + Extent3d extent; ///< Expected extent for the external image. This reference value would be + /// ignored and use extents from shared metadata if any dimension of the + /// reference extent is zero. + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. Or UndefinedFormat to infer the + /// format internally. + ImageCreateFlags flags; ///< Image Creation flags. + ImageUsageFlags usage; ///< Image usage flags. + IPrivateScreen* pScreen; ///< Private screen this image is created on, or null. + gpusize gpuMemOffset; ///< GpuMemory offset +#if defined(__unix__) + gpusize dccOffset; ///< Offset of gfx Dcc surface if nonzero. + gpusize displayDccOffset; ///< Offset of display Dcc surface if nonzero. + uint64 modifier; ///< Drm format modifier, if flags.hasModifier is set. + uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. +#endif + /// The following members must be set to zero unless the client is opening a @ref ImageTiling::Linear image with + /// specified row and depth pitches. In that case, they must be integer multiples of the alignments given by + /// @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines + /// of the subresource. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive + /// slices. 
+}; + +/// Reports the overall GPU memory layout of the entire image. Output structure for IImage::GetMemoryLayout(). Unused +/// sections will have a size of zero, an offset of zero, and an alignment of one. The layout is split into: +/// + Image Data: The raw texel values for all subresources of the image. +/// + Image Metadata: Additional data that will be used to optimize GPU operations that access the image. +/// + Image Metadata Header: A special subsection of the metadata for small bits of data with weaker alignment. +struct ImageMemoryLayout +{ + gpusize dataSize; ///< The size, in bytes, of the image's core data section. + gpusize dataAlignment; ///< The alignment, in bytes, of the image's core data section. + + gpusize metadataOffset; ///< The offset, in bytes, of the image's metadata section. + gpusize metadataSize; ///< The size, in bytes, of the image's metadata section. + gpusize metadataAlignment; ///< The alignment, in bytes, of the image's metadata section. + + gpusize metadataHeaderOffset; ///< The offset, in bytes, of the image's metadata header. + gpusize metadataHeaderSize; ///< The size, in bytes, of the image's metadata header. + gpusize metadataHeaderAlignment; ///< The alignment, in bytes, of the image's metadata header. + + uint8 swizzleEqIndices[2]; ///< Which swizzle equations this image uses or InvalidSwizzleEqIndex if + /// there are no swizzle equations for this image's layout. + uint8 swizzleEqTransitionMip; ///< Before this mip level, the image uses swizzleEqIndices[0]; from this + /// mip level onwards, the image uses swizzleEqIndices[1]. + uint8 swizzleEqTransitionPlane; ///< Before this mip plane, the image uses swizzleEqIndices[0]; from this + /// plane onward, the image uses swizzleEqIndices[1]. 
+ + uint32 prtTileWidth; ///< Width, in texels, of a PRT tile + uint32 prtTileHeight; ///< Height, in texels, of a PRT tile + uint32 prtTileDepth; ///< Depth, in texels, of a PRT tile + uint32 prtMinPackedLod; ///< First mip level that is packed into the PRT mip tail. + uint32 prtMipTailTileCount; ///< Number of tiles in the packed mip tail. This may either indicate the + /// size per slice or per image depending on the support for + /// PrtFeaturePerLayerMipTail (@see PrtFeatureFlags) + uint32 stereoLineOffset; ///< Y offset to the right eye data, in texels +}; + +/// Collection of bitmasks specifying which operations are currently allowed on an image, and which queues are allowed +/// to perform those operations. Based on this information, PAL can determine the best compression state of the image. +struct ImageLayout +{ + uint32 usages : 24; ///< Bitmask of @ref ImageLayoutUsageFlags values. + uint32 engines : 8; ///< Bitmask of @ref ImageLayoutEngineFlags values. +}; + +/** +**************************************************************************************************** +* @brief +* Enumerates swizzle modes useable on any supported GPU. 
+* @note +* For details please check _AddrSwizzleMode +* +**************************************************************************************************** +*/ +enum SwizzleMode : uint32 +{ + SwizzleModeLinear = 0, + SwizzleMode256BS, + SwizzleMode256BD, + SwizzleMode256BR, + SwizzleMode4KbZ, + SwizzleMode4KbS, + SwizzleMode4KbD, + SwizzleMode4KbR, + SwizzleMode64KbZ, + SwizzleMode64KbS, + SwizzleMode64KbD, + SwizzleMode64KbR, + SwizzleMode64KbZT, + SwizzleMode64KbST, + SwizzleMode64KbDT, + SwizzleMode64KbRT, + SwizzleMode4KbZX, + SwizzleMode4KbSX, + SwizzleMode4KbDX, + SwizzleMode4KbRX, + SwizzleMode64KbZX, + SwizzleMode64KbSX, + SwizzleMode64KbDX, + SwizzleMode64KbRX, + SwizzleMode256KbVarZX, + SwizzleMode256KbVarSX, + SwizzleMode256KbVarDX, + SwizzleMode256KbVarRX, + + ///< The meaning of swizzle mode varies by generation; + /// do not compare directly with the _R / _S / _D / _Z types. + SwizzleMode256B2D, + SwizzleMode4Kb2D, + SwizzleMode4Kb3D, + SwizzleMode64Kb2D, + SwizzleMode64Kb3D, + SwizzleMode256Kb2D, + SwizzleMode256Kb3D, + SwizzleMode64Kb2Dz, + SwizzleMode256Kb2Dz, + SwizzleModeCount, +}; + +/// Reports position and memory layout information for a specific subresource in an image. Output structure for +/// IImage::GetSubresourceLayout(). +struct SubresLayout +{ + uint32 elementBytes; ///< size of each element in bytes + gpusize offset; ///< Offset in bytes from the base of the image's GPU memory where the subresource starts. + gpusize swizzleOffset; ///< Offset in bytes used for supporting parameterized swizzle + gpusize size; ///< Size of the subresource in bytes. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines of the subresource. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + uint32 tileToken; ///< Token representing various tiling information necessary for determining compatible + /// optimally tiled copies.
+ uint32 tileSwizzle; ///< Bank/Pipe swizzle bits for macro-tiling modes. + Extent3d blockSize; ///< Size of a tile block in texels - micro tile for 1D tiling and macro tile for 2D tiling. + Offset3d mipTailCoord; ///< coords of the subresource within the mip tail + + Extent3d extentTexels; ///< Unpadded extent of the subresource in texels. + Extent3d extentElements; ///< Unpadded extent of the subresource in elements. + Extent3d paddedExtent; ///< Extent of the subresource in elements, including all internal padding for this subresource. + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 912 + /// Reports supported engines and usages for this subresource while it can remain in its optimal compression state. + /// Clients using CmdRelease()/CmdAcquire() without complete knowledge of the application's next usage during + /// CmdRelease() or its previous usage at CmdAcquire() can treat this layout as a performant target for an + /// intermediate state that will avoid unnecessary decompressions. + /// + /// This value is only valid if supportSplitReleaseAcquire is set in @ref DeviceProperties. + ImageLayout defaultGfxLayout; +#endif + + SwizzledFormat planeFormat; ///< Swizzled format for plane. Planar resource like D32-S8 + /// will have different swizzled format per plane. + SwizzleMode swizzleMode; ///< Swizzle mode for plane, based on AddrSwizzleMode + uint32 hwSwizzleMode; ///< Hardware Swizzle enum, enum type dependent on gfx version +}; + +/// Selects a specific subresource of an image resource. +/// +/// Most images only have a single data plane but in some cases conceptually related data will be stored in physically +/// separate locations which we call planes. If an image only has a single plane it will always be plane 0. +/// We define the following fixed mappings for all multi-plane formats. +/// + Depth-stencil: if the image format contains depth and stencil data, plane 0 is depth and plane 1 is stencil. 
+/// + YUV-planar: if the image format is @ref YuvPlanar it has either two or three planes. The luma plane +/// is always plane 0. If the format is @ref ChNumFormat::YV12 it has three planes where plane 1 is the +/// red-difference chrominance plane and plane 2 is the blue-difference chrominance plane. Otherwise, plane 1 +/// interleaves blue-difference and red-difference chrominance values. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 886 +struct SubresId +{ + uint8 plane; ///< Selects a data plane. + uint8 mipLevel; ///< Selects a mip level. + uint16 arraySlice; ///< Selects an array slice. +}; + +/// Defines a range of subresources. +struct SubresRange +{ + SubresId startSubres; ///< First subresource in the range. + uint8 numPlanes; ///< Number of planes in the range. + uint8 numMips; ///< Number of mip levels in the range. + uint16 numSlices; ///< Number of slices in the range. +}; + +#else +struct SubresId +{ + uint32 plane; ///< Selects a data plane. + uint32 mipLevel; ///< Selects a mip level. + uint32 arraySlice; ///< Selects an array slice. +}; + +/// Defines a range of subresources. +struct SubresRange +{ + SubresId startSubres; ///< First subresource in the range. + uint32 numPlanes; ///< Number of planes in the range. + uint32 numMips; ///< Number of mip levels in the range. + uint32 numSlices; ///< Number of slices in the range. +}; + +#endif + +/// A variant struct of MemoryImageCopyRegion +/// Specifies parameters for a copy from CPU memory to Image. +/// An input for Image::CopyMemoryToImage(). +struct MemoryToImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in elements. + uint32 numSlices; ///< Number of slices the copy will span. + const void* pHostPtr; ///< Pointer to the host memory copy from. 
+ gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// A variant struct of MemoryImageCopyRegion +/// Specifies parameters for a copy from Image to CPU memory. +/// An input for Image::CopyImageToMemory(). +struct ImageToMemoryCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in elements. + uint32 numSlices; ///< Number of slices the copy will span. + void* pHostPtr; ///< Pointer to the host memory copy to. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +inline constexpr bool operator==(const SubresId& lhs, const SubresId& rhs) +{ + return (lhs.plane == rhs.plane) && + (lhs.mipLevel == rhs.mipLevel) && + (lhs.arraySlice == rhs.arraySlice); +} + +inline constexpr bool operator!=(const SubresId& lhs, const SubresId& rhs) +{ + return ((lhs == rhs) == false); +} + +inline constexpr bool operator==(const SubresRange& lhs, const SubresRange& rhs) +{ + return (lhs.startSubres == rhs.startSubres) && + (lhs.numPlanes == rhs.numPlanes) && + (lhs.numMips == rhs.numMips) && + (lhs.numSlices == rhs.numSlices); +} + +/// Determines if two subresource ranges are overlapped. +/// +/// @returns True if two subresource ranges are overlapped, false otherwise. 
+inline constexpr bool OverlappedSubresRanges( + const SubresRange& a, + const SubresRange& b) +{ + const SubresId aStart = a.startSubres; + const SubresId bStart = b.startSubres; + + return (aStart.plane < (bStart.plane + b.numPlanes)) && + (bStart.plane < (aStart.plane + a.numPlanes)) && + (aStart.mipLevel < (bStart.mipLevel + b.numMips)) && + (bStart.mipLevel < (aStart.mipLevel + a.numMips)) && + (aStart.arraySlice < (bStart.arraySlice + b.numSlices)) && + (bStart.arraySlice < (aStart.arraySlice + a.numSlices)); +} + +/** + *********************************************************************************************************************** + * @interface IImage + * @brief Represents an image resource that can be accessed by the GPU. + * + * @see IDevice::CreateImage() + * @see IDevice::OpenPeerImage() + *********************************************************************************************************************** + */ +class IImage : public IGpuMemoryBindable +{ +public: + /// Reports information on the layout of the image in memory such as core data size and metadata alignment. + /// + /// @returns A reference to the image's ImageMemoryLayout. + virtual const ImageMemoryLayout& GetMemoryLayout() const = 0; + + /// Reports information on the full range of the image's subresources. + /// + /// @param [out] pRange Reports info on the full range of the image's subresources such as number of mips and + /// planes. + /// + /// @returns Success if the range was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidPointer if pRange is null. + virtual Result GetFullSubresourceRange(SubresRange* pRange) const = 0; + + /// Reports information on the layout of the specified subresource in memory. + /// + /// @param [in] subresId Selects a subresource from the image (plane/mip/slice). + /// @param [out] pLayout Reports info on the subresource layout such as size and pitch.
+ /// + /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidPointer if pLayout is null. + /// + ErrorInvalidValue if the subresId is out of range for this image. + virtual Result GetSubresourceLayout( + SubresId subresId, + SubresLayout* pLayout) const = 0; + +#if defined(__unix__) + /// Reports information on the memory plane layout of the specified subresource in memory for image with modifier. + /// + /// @param [in] memoryPlane Selects a memory plane from the image. + /// @param [out] pLayout Reports info on the subresource layout such as size and pitch. + /// + /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidValue if the memory plane is out of range for this image. + virtual Result GetModifierSubresourceLayout( + uint32 memoryPlane, + SubresLayout* pLayout) const = 0; +#endif + + /// Reports the create info retained by this image. + /// + /// @warning The pViewFormats member of the returned structure should not be accessed; that array is only + /// consumed at image creation time. + /// + /// @returns A reference to the retained ImageCreateInfo. + const ImageCreateInfo& GetImageCreateInfo() const { return m_createInfo; } + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +#if defined(_WIN32) + /// This method checks if the image is an opened cross-adapter shared image on MS hybrid graphics system. + /// + /// @returns True if the image is an opened cross-adapter shared image. False otherwise.
+ virtual bool IsCrossAdapter() const = 0; + + /// Returns a special resource ID. Can be used to associate an opened resource with its original resource. + /// + /// @returns The optimal sharing ID. + virtual uint64 GetOptimalSharingId() const = 0; +#endif + + /// Sets level of optimal sharing by opening APIs using this optimal sharable image and pass this information to the + /// creator. This function is supposed to be called by openers only. The call by creator is ignored. + /// + /// @param [in] level Level to be set to specified client API. + virtual void SetOptimalSharingLevel( + MetadataSharingLevel level) = 0; + + /// Returns support level set by all possible opening APIs. + /// + /// @returns A summarized supporting level. + virtual MetadataSharingLevel GetOptimalSharingLevel() const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. + virtual const void* GetResourceId() const = 0; + + /// Copies data directly from CPU memory to an Image. + /// + /// @param [in] pRegions Pointer to an array of regions describing the host memory sources and the image areas + /// to copy to. + /// @param [in] regionCount Number of regions to copy. + /// @param [in] useMemcpy Indicates that it could be copied more efficiently from host memory to image with + /// image data that is already swizzled in host memory. + /// + /// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the + /// following error codes may be returned: + /// + ErrorInvalidPointer if the memory object bound to image is null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorInvalidValue if copy between image and memory failed.
+ virtual Result CopyMemoryToImage( + const MemoryToImageCopyRegion* pRegions, + const uint32 regionCount, + bool useMemcpy) const = 0; + + /// Copies data directly from an Image to CPU memory. + /// + /// @param [in] pRegions Pointer to an array of regions describing the image areas to copy from and the host + /// memory destinations. + /// @param [in] regionCount Number of regions to copy. + /// @param [in] useMemcpy Indicates that it could be copied more efficiently from image to host memory and the + /// image data will be obtained while retaining the physical layout of the image. + /// + /// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the + /// following error codes may be returned: + /// + ErrorInvalidPointer if the memory object bound to image is null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorInvalidValue if copy between image and memory failed. + virtual Result CopyImageToMemory( + const ImageToMemoryCopyRegion* pRegions, + const uint32 regionCount, + bool useMemcpy) const = 0; + + /// Copies data between two images with specified regions. + /// + /// @param [in] pDstImage Pointer to the destination image where the data will be copied. + /// @param [in] pImgRegions Pointer to an array of regions specifying the area of image to be copied. + /// @param [in] regionCount Number of regions to copy between the source and destination images. + /// + /// @returns Success if the copy operation was successfully performed. Otherwise, one of the following error codes + /// may be returned: + /// + ErrorInvalidPointer if any of the input pointers are null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if there is insufficient memory to perform the operation. + /// + ErrorInvalidValue if copy between images failed.
+ virtual Result CopyBetweenImages( + IImage* pDstImage, + const ImageCopyRegion* pImgRegions, + const uint32 regionCount) const = 0; + +protected: + /// @internal Constructor. + /// + /// @param [in] createInfo App-specified parameters describing the desired image properties. + IImage(const ImageCreateInfo& createInfo) : m_createInfo(createInfo), m_pClientData(nullptr) { } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IImage() { } + + /// Retained Image create info + const ImageCreateInfo m_createInfo; + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palLib.h b/shared/amdgpu-windows-interop/pal/inc/core/palLib.h new file mode 100644 index 0000000000..8f42570262 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palLib.h @@ -0,0 +1,508 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palLib.h + * @brief Defines the Platform Abstraction Library (PAL) initialization and destruction functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palSysMemory.h" +#include "palDbgPrint.h" + +/// Major interface version. Note that the interface version is distinct from the PAL version itself, which is returned +/// in @ref Pal::PlatformProperties. 
+/// +/// @attention Updates to the major version indicate an interface change that is not backward compatible and may require +/// action from each client during their next integration. When determining if a change is backward +/// compatible, it is assumed that the client will default-initialize all structs. +/// +/// @ingroup LibInit +#define PAL_INTERFACE_MAJOR_VERSION 942 + +/// Minimum major interface version. This is the minimum interface version PAL supports in order to support backward +/// compatibility. When it is equal to PAL_INTERFACE_MAJOR_VERSION, only the latest interface version is supported. +/// +/// @ingroup LibInit +#define PAL_MINIMUM_INTERFACE_MAJOR_VERSION 872 + +/// Minimum supported major interface version for devdriver library. This is the minimum interface version of the +/// devdriver library that PAL is backwards compatible to. +/// +/// @ingroup LibInit +#define PAL_MINIMUM_GPUOPEN_INTERFACE_MAJOR_VERSION 38 + +/** + *********************************************************************************************************************** + * @def PAL_INTERFACE_VERSION + * @ingroup LibInit + * @brief Current PAL interface version packed into a 32-bit unsigned integer. The low 16 bits are always zero. + * They used to contain the interface minor version and remain as a placeholder in case we add it back. + * + * @see PAL_INTERFACE_MAJOR_VERSION + * + * @hideinitializer + *********************************************************************************************************************** + */ +#define PAL_INTERFACE_VERSION (PAL_INTERFACE_MAJOR_VERSION << 16) + +namespace Pal +{ + +// Forward declarations +class IPlatform; + +/// This is a list of GPUs that the NULL OS layer can compile shaders to in offline mode. +enum class NullGpuId : uint32 +{ + Default = 0, ///< PAL gives the client an arbitrary supported null device. 
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + Polaris10, ///< 8.0.3 + Polaris11, ///< 8.0.3 + Polaris12, ///< 8.0.3 + Vega10, ///< 9.0.0 + Raven, ///< 9.0.2 + Vega12, ///< 9.0.4 + Vega20, ///< 9.0.6 + Raven2, ///< 9.0.9 + Renoir, ///< 9.0.9 +#endif + Navi10, ///< 10.1.0 + Navi12, ///< 10.1.1 + Navi14, ///< 10.1.2 + Navi21, ///< 10.3.0 + Navi22, ///< 10.3.1 + Navi23, ///< 10.3.2 + Navi24, ///< 10.3.4 + Rembrandt, ///< 10.3.5 + Raphael, ///< 10.3.6 + Navi31, ///< 11.0.0 + Navi32, ///< 11.0.1 + Navi33, ///< 11.0.2 + Phoenix1, ///< 11.0.3 + Phoenix2, ///< 11.0.3 + Strix1, ///< 11.5.0 + StrixHalo, ///< 11.5.1 + Krackan1, ///< 11.5.2 + Navi44, ///< 12.0.0 + Navi48, ///< 12.0.1 +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION>= 888) +#endif + Max, ///< The maximum count of null devices. + All, ///< If you want to enumerate all null devices. +}; + +/// Specifies which graphics IP level (GFXIP) this device has. +enum class GfxIpLevel : uint32 +{ + _None = 0, ///< @internal The device does not have a GFXIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have a GFXIP block, or its level cannot be determined +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + GfxIp10_1, ///< GFXIP 10.1 (Navi1x) + GfxIp10_3, ///< GFXIP 10.3 (Navi2x, Rembrandt, Raphael, Mendocino) + GfxIp11_0, ///< GFXIP 11.0 (Navi3x, Phoenix) + GfxIp11_5, ///< GFXIP 11.5 (Strix) + GfxIp12, ///< GFXIP 12.0 (Navi4x) +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + GfxIp6 = 0x1, + GfxIp7 = 0x2, + GfxIp8 = 0x3, + GfxIp8_1 = 0x4, + GfxIp9 = 0x5, + GfxIp10_1 = 0x7, + GfxIp10_3 = 0x9, + GfxIp11_0 = 0xC, + GfxIp11_5 = 0xF, + GfxIp12 = 0x11, +#endif +}; + +/// Specifies the hardware revision. Some AMD tools hard-code these values so we cannot change them.
New ASICs should +/// be added at the end of the list and be given the next highest value. +enum class AsicRevision : uint32 +{ + Unknown = 0x00, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + Tahiti = 0x01, + Pitcairn = 0x02, + Capeverde = 0x03, + Oland = 0x04, + Hainan = 0x05, + Bonaire = 0x06, + Hawaii = 0x07, + HawaiiPro = 0x08, + Kalindi = 0x0A, + Godavari = 0x0B, + Spectre = 0x0C, + Spooky = 0x0D, + Carrizo = 0x0E, + Bristol = 0x0F, + Stoney = 0x10, + Iceland = 0x11, + Tonga = 0x12, + TongaPro = Tonga, + Fiji = 0x13, + Polaris10 = 0x14, + Polaris11 = 0x15, + Polaris12 = 0x16, + Vega10 = 0x18, + Vega12 = 0x19, + Vega20 = 0x1A, + Raven = 0x1B, + Raven2 = 0x1C, + Renoir = 0x1D, +#endif + Navi10 = 0x1F, ///< 10.1.0 + Navi12 = 0x21, ///< 10.1.1 + Navi14 = 0x23, ///< 10.1.2 + Navi21 = 0x24, ///< 10.3.0 + Navi22 = 0x25, ///< 10.3.1 + Navi23 = 0x26, ///< 10.3.2 + Navi24 = 0x27, ///< 10.3.4 + Navi31 = 0x2C, ///< 11.0.0 + Navi32 = 0x2D, ///< 11.0.1 + Navi33 = 0x2E, ///< 11.0.2 + Rembrandt = 0x2F, ///< 10.3.5 + Strix1 = 0x33, ///< 11.5.0 + Raphael = 0x34, ///< 10.3.6 + Phoenix1 = 0x35, ///< 11.0.3 + Phoenix2 = 0x38, ///< 11.0.3 + HawkPoint1 = 0x39, ///< 11.0.3 + HawkPoint2 = 0x3A, ///< 11.0.3 + Krackan1 = 0x3B, ///< 11.5.2 + StrixHalo = 0x3C, ///< 11.5.1 + Navi44 = 0x3D, ///< 12.0.0 + Navi48 = 0x3E, ///< 12.0.1 +}; + +/// Maps a null GPU ID to its associated text name. +struct NullGpuInfo +{ + NullGpuId nullGpuId; ///< ID of an ASIC that PAL supports for override purposes + const char* pGpuName; ///< Text name of the ASIC specified by nullGpuId +}; + +/// Various IDs and info associated with a particular GPU. +struct GpuInfo +{ + AsicRevision asicRev; ///< PAL specific ASIC revision identifier. + NullGpuId nullId; ///< PAL specific GPU ID supported by the NULL OS layer. + GfxIpLevel gfxIpLevel; ///< PAL specific identifier for the device's graphics IP level (GFXIP). + uint32 familyId; ///< Hardware family ID. 
Driver-defined identifier for a particular family of devices. + uint32 eRevId; ///< GPU emulation/internal revision ID. + uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different SKUs or revisions. + uint32 gfxEngineId; ///< Coarse-grain GFX engine ID (R800, SI, etc.). + uint32 deviceId; ///< PCI device ID (e.g., Hawaii XT = 0x67B0). + const char* pGpuName; ///< ASIC name and AMDGPU target name (e.g., "NAVI31:gfx1100"). +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 915 +/// PAL client APIs. +enum class ClientApi : uint32 +{ + Pal = 0, + Dx9 = 1, + Dx12 = 3, + Vulkan = 4, + OpenCl = 7, + Hip = 8, + Amf = 9, +}; +#else +/// The client UMD must identify its API using this enum. Some UMD builds may implement multiple APIs so they must +/// specify which API they're implementing at runtime. Note that the PAL_CLIENT macros are the preferred way to +/// implement client-specific behavior; runtime ClientApi checks should only be used when necessary. +enum class ClientApi : uint32 +{ + OpenCl, + Hip +}; +#endif + +/// Specifies properties for @ref IPlatform creation. Input structure to Pal::CreatePlatform(). +struct PlatformCreateInfo +{ + const Util::AllocCallbacks* pAllocCb; ///< Optional client-provided callbacks. If non-null, PAL will call the + /// specified callbacks to allocate and free all internal system + /// memory. If null, PAL will manage memory on its own through the C + /// runtime library. + const Util::LogCallbackInfo* pLogInfo; ///< Optional client-provided callback info. If non-null, Pal will + /// call the callback to pass debug prints to the client. + + const char* pSettingsPath; ///< A null-terminated string describing the path to where settings are + /// located on the system. For example, on Windows, this will refer to + /// which UMD subkey to look in under a device's key. For Linux, this + /// is the path to the settings file. 
+ + union + { + struct + { + uint32 disableGpuTimeout : 1; ///< Disables GPU timeout detection (Windows only) + uint32 force32BitVaSpace : 1; ///< Forces 32bit VA space for the flat address with 32bit ISA + uint32 createNullDevice : 1; ///< Set to create a null device, see "nullGpuId" below for the + /// ID of the GPU the created device will be based on. Null + /// devices operate in IFH mode; useful for off-line shader + /// compilations. + uint32 enableSvmMode : 1; ///< Enable SVM mode. When this bit is set, PAL will reserve + /// cpu va range with size "maxSvmSize", and allow client to + /// create gpu or pinned memory for use of Svm. + /// For detail of SVM, please refer to CreateSvmGpuMemory + uint32 requestShadowDescriptorVaRange : 1; ///< Requests that PAL provides support for the client to use + /// the @ref VaRange::ShadowDescriptorTable virtual-address + /// range. Some GPUs may not be capable of supporting this, + /// even when requested by the client. + uint32 disableInternalResidencyOpts : 1; ///< Disables residency optimizations for internal GPU memory + /// allocations. Some clients may wish to have them turned + /// off to save on system resources. + uint32 supportRgpTraces : 1; ///< Indicates that the client supports RGP tracing. PAL will + /// use this flag and the hardware support flag to set up the + /// DevDriver RgpServer. + uint32 dontOpenPrimaryNode : 1; ///< No primary node is needed (Linux only) + uint32 disableDevDriver : 1; ///< Set if no DevDriverMgr should be created with this Platform. + uint32 reserved : 23; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Platform-wide creation flags. + + ClientApi clientApiId; ///< Client API ID. + NullGpuId nullGpuId; ///< ID for the null device. Ignored unless the above flags.createNullDevice bit is set. + uint16 apiMajorVer; ///< Major API version number to be used by RGP. Should be set by client based on their + /// contract with RGP.
+ uint16 apiMinorVer; ///< Minor API version number to be used by RGP. Should be set by client based on their + /// contract with RGP. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 916 + uint32 instrApiVer; ///< Instrumentation specification version for API-specific SQTT instrumentation fields. + /// Should be set by client based on the SQTT instrumentation spec version being targeted. +#endif + gpusize maxSvmSize; ///< Maximum amount of virtual address space that will be reserved for SVM +}; + +/** +************************************************************************************************************************ +* @brief Determines the amount of system memory required for a Platform object. +* +* This function must be called before any other interaction with PAL. An allocation of this amount of memory must be +* provided in the pPlacementAddr parameter of Pal::CreatePlatform. +* +* @ingroup LibInit +* +* @returns Size, in bytes, of system memory required for an IPlatform object. +************************************************************************************************************************ +*/ +size_t PAL_STDCALL GetPlatformSize(); + +/** + *********************************************************************************************************************** + * @brief Creates the Platform Abstraction Library. + * + * On execution of CreatePlatform(), PAL will establish a connection for OS and KMD communication, install the specified + * system memory allocation callbacks, and initialize any global internal services. Finally, the client will be + * returned an object pointer to the instantiated platform object, which is used to query the capabilities of the + * system. + * + * @ingroup LibInit + * + * @param [in] createInfo Parameters indicating the client requirements for the platform such as allocation + * callbacks or the settings path. + * @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + * much size available here as reported by calling GetPlatformSize(). + * @param [out] ppPlatform Platform object pointer to the instantiated platform. Must not be null. + * + * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be + * returned: + * + ErrorInvalidPointer will be returned if: + * - ppPlatform is null. + * - pPlacementAddr is null. + * - createInfo.pAllocCb is non-null but pfnAlloc and/or pfnFree is null. + * - createInfo.pSettingsPath is null. + * + ErrorInitializationFailed will be returned if PAL is unable to open a connection to the OS. + *********************************************************************************************************************** + */ +Result PAL_STDCALL CreatePlatform( + const PlatformCreateInfo& createInfo, + void* pPlacementAddr, + IPlatform** ppPlatform); + +/** + *********************************************************************************************************************** + * @brief Provides an association of NULL devices and their associated text name. NULL devices operate in IFH mode + * and are primarily intended for off-line shader compilation mode. The text name is provided for end-user + * identification of the GPU device being created. + * + * @param [in,out] pNullDeviceCount On input, this is the size of the "pNullDevices" array. On output, this + * reflects the number of valid entries in the "pNullDevices" array. + * @param [out] pNullDevices Includes information on the valid NULL devices supported by the system. If + * this is NULL, then pNullDeviceCount reflects the maximum possible size of the + * null-devices array. + * + * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be + * returned: + * + ErrorInvalidPointer will be returned if either input is NULL.
+ *********************************************************************************************************************** + */ +Result PAL_STDCALL EnumerateNullDevices( + uint32* pNullDeviceCount, + NullGpuInfo* pNullDevices); + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified NullGpuId. + * + * @param [in] nullGpuId Null GPU ID to lookup. + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. + * + NotFound will be returned if the Null GPU ID was not found. + *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForNullGpuId( + NullGpuId nullGpuId, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForNullGpuId( + NullGpuId nullGpuId, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForNullGpuId(nullGpuId, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified GPU name string. + * + * @param [in] pGpuName Name string of the GPU to lookup (e.g., "NAVI10"). + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuName or pGpuInfo are NULL. + * + NotFound will be returned if the Name string was not found. 
+ *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForName( + const char* pGpuName, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForName( + const char* pGpuName, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForName(pGpuName, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified hardware revision. + * + * @param [in] asicRevision Hardware revision to lookup. + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. + * + NotFound will be returned if the hardware revision was not found. + *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForAsicRevision( + AsicRevision asicRevision, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForAsicRevision( + AsicRevision asicRevision, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForAsicRevision(asicRevision, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @defgroup LibInit Library Initialization and Destruction + * + * Before initializing PAL, it is important to make sure that the interface version is consistent with the client's + * expectations. The client should check @ref PAL_INTERFACE_MAJOR_VERSION to ensure the major interface version has not + * changed since the last PAL integration. 
Ideally, this should be performed with a compile-time assert comparing + * @ref PAL_INTERFACE_MAJOR_VERSION against a client-maintained expected major version. Minor interface version + * changes should be backward compatible, and do not require a client change to maintain previous levels of + * functionality. + * + * On startup, the client's first call to PAL must be GetPlatformSize() followed by CreatePlatform(). This function + * gives an opportunity for PAL to perform any necessary platform-wide initialization such as opening a connection for + * communication with the operating system and kernel mode driver or initializing tracking facilities for system memory + * management. CreatePlatform() returns a created IPlatform object for future interaction with PAL. + * + * PAL optionally allows the client to specify a set of memory management callbacks during initialization. If + * specified, PAL will not allocate or free any memory directly from the runtime, instead calling back to the client. + * The client (or application, if the client forwards on the requests) may be able to implement a more efficient + * allocation scheme. + * + * After a successful call to CreatePlatform(), the client should call @ref IPlatform::EnumerateDevices() in order to + * get a list of supported devices attached to the system. This function returns an array of @ref IDevice objects + * which are used by the client to query properties of the devices and eventually execute work on those devices. + * IPlatform::EnumerateDevices() is not available to util-only clients (PAL_BUILD_CORE=0). + * + * The client may re-enumerate devices at any time by calling IPlatform::EnumerateDevices(). The client must make sure + * there is no active work on any device and that all objects associated with those devices have been destroyed. + * IPlatform::EnumerateDevices() will destroy all previously reported @ref IDevice objects and return a fresh set.
+ * The client is required to re-enumerate devices when it receives an ErrorDeviceLost error from PAL. + * + * After enumerating devices, either during start-up or when recovering from an ErrorDeviceLost error, the client must + * set up and finalize PAL's per-device settings. See IDevice::GetPublicSettings(), IDevice::SetDxRuntimeData(), + * IDevice::CommitSettingsAndInit(), and IDevice::Finalize() for details. + * + * After enumerating devices and finalizing them, the client may query the set of available screens. This is done by + * calling the @ref IPlatform::GetScreens() function. Note that screens are not available for DX clients. Each screen + * is accessible by zero or more of the enumerated devices. Most screens are accessible from a "main" device as well as + * several other devices which can perform cross-display Flip presents to the screen. In some configurations, screens + * may not be directly accessible to any of PAL's devices, in which case fullscreen presents are unavailable to that screen. (This + * typically only occurs in PowerExpress configurations.) Note that when IPlatform::EnumerateDevices() is called, any + * enumerated @ref IScreen objects which existed prior to that call are invalidated for the specified platform and + * IPlatform::GetScreens() needs to be called again to get the updated list of screens. + * + * On shutdown, the client should call @ref IPlatform::Destroy() to allow PAL to clean up and free any remaining + * platform-wide resources. The client must ensure this call is not made until all other created objects are idle and + * destroyed (if destroyable). + * + * When the client is asked to destroy a device it may call IDevice::Cleanup() to explicitly clean up the device. Some + * clients will find it necessary to call Cleanup(), for example, if their devices have OS handles that become invalid. + * Note that Cleanup() doesn't destroy the device; it will return to its initial state, as if it was newly enumerated.
+ *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h b/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h new file mode 100644 index 0000000000..0441f723b0 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h @@ -0,0 +1,187 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMsaaState.h + * @brief Defines the Platform Abstraction Library (PAL) IMsaaState interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +/// Specifies conservative rasterization mode +enum class ConservativeRasterizationMode : uint8 +{ + Overestimate = 0x0, ///< Fragments will be generated if the primitive area covers any portion of the pixel. + Underestimate = 0x1, ///< Fragments will be generated if all of the pixel is covered by the primitive. + Count +}; + +/// Maximum supported number of MSAA color samples. +constexpr uint32 MaxMsaaColorSamples = 16; + +/// Maximum supported number of MSAA depth samples. +constexpr uint32 MaxMsaaDepthSamples = 8; + +/// Maximum supported number of MSAA fragments. +constexpr uint32 MaxMsaaFragments = 8; + +/// Sampling pattern grid size. This is a quad of pixels, i.e. 2x2 grid of pixels. +constexpr Extent2d MaxGridSize = { 2, 2 }; + +/// The positions are rounded to 1/Pow2(SubPixelBits) +constexpr uint32 SubPixelBits = 4; + +/// Each pixel is subdivided into Pow2(SubPixelBits) x Pow2(SubPixelBits) grid of possible sample locations. +constexpr Extent2d SubPixelGridSize = { 16, 16 }; + +/// Represents a 2D coordinate with each component in [-8/16, 7/16] +struct SampleLocation +{ + int8 x; ///< X offset. + int8 y; ///< Y offset. + + /// Conversion operator that does sign-extension. + operator Offset2d() const { return { x, y }; } +}; + +/// Specifies a custom multisample pattern for a pixel quad. 
+struct MsaaQuadSamplePattern +{ + SampleLocation topLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for TL pixel of quad. + SampleLocation topRight[MaxMsaaRasterizerSamples]; ///< Sample locations for TR pixel of quad. + SampleLocation bottomLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for BL pixel of quad. + SampleLocation bottomRight[MaxMsaaRasterizerSamples]; ///< Sample locations for BR pixel of quad. +}; + +/// Specifies properties for creation of an @ref IMsaaState object. Input structure to IDevice::CreateMsaaState(). +struct MsaaStateCreateInfo +{ + uint8 coverageSamples; ///< Number of rasterizer samples. Must be greater than or equal to all sample + /// rates in the pipeline. Valid values are 1, 2, 4, 8, and 16. + uint8 exposedSamples; ///< Number of samples exposed in the pixel shader coverage mask. Must be less + /// than or equal to coverageSamples. Valid values are 1, 2, 4, and 8. + uint8 pixelShaderSamples; ///< Controls the pixel shader execution rate. Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. Note that value with + /// greater than 1 doesn't mean sample rate shading is enabled. Sample rate + /// shading is enabled by either @ref forceSampleRateShading or pixel shader. + uint8 depthStencilSamples; ///< Number of samples in the bound depth target. Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. + uint8 shaderExportMaskSamples; ///< Number of samples to use in the shader export mask. Should match the number + /// of color target fragments clamped to + /// @ref DeviceProperties imageProperties.maxMsaaFragments. + uint8 sampleClusters; ///< Number of sample clusters to control over-rasterization (all samples in a + /// cluster are rasterized if any are hit). Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. + uint8 alphaToCoverageSamples; ///< How many samples of quality to generate with alpha-to-coverage. 
Must be + /// less than or equal to coverageSamples. Valid values are 1, 2, 4, 8, and 16. + uint8 occlusionQuerySamples; ///< Controls the number of samples to use for occlusion queries. + /// This value must never exceed the MSAA rate. + uint16 sampleMask; ///< Bitmask of which color target and depth/stencil samples should be updated. + /// The lowest bit corresponds to sample 0. + + /// Selects overestimate or underestimate conservative rasterization mode. Used only if + /// @ref MsaaStateCreateInfo::flags::enableConservativeRasterization is set to true. + ConservativeRasterizationMode conservativeRasterizationMode; + + union + { + struct + { + uint8 enableConservativeRasterization : 1; ///< Set to true to enable conservative rasterization + uint8 enable1xMsaaSampleLocations : 1; ///< Set to true to enable 1xMSAA quad sample pattern + uint8 disableAlphaToCoverageDither : 1; ///< Disables coverage dithering. + uint8 enableLineStipple : 1; ///< Set to true to enable line stippling + uint8 forceSampleRateShading : 1; ///< Sample rate shading can be enabled by either the pixel + /// shader, or forced here with forceSampleRateShading = 1. + /// Value 0 means sample rate shading is decided by pixel shader + /// and value 1 means sample rate shading is forced enabled. + /// This bit is for openGL glMinSampleShading, where sample rate + /// shading can be enabled by glEnable(GL_SAMPLE_SHADING) + /// instead of by the pixel shader. + uint8 reserved : 3; ///< Reserved for future use + }; + uint8 u8All; + } flags; +}; + +/** + *********************************************************************************************************************** + * @interface IMsaaState + * @brief Dynamic state object controlling fixed function MSAA state. + * + * Configures sample counts of various portions of the pipeline, specifies sample positions, etc. The full range of + * EQAA hardware features are exposed. 
+ * + * @see IDevice::CreateMsaaState + *********************************************************************************************************************** + */ +class IMsaaState : public IDestroyable +{ +public: + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IMsaaState() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IMsaaState() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object.
+ void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h b/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h new file mode 100644 index 0000000000..eea8b32fd6 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h @@ -0,0 +1,619 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPerfExperiment.h + * @brief Defines the Platform Abstraction Library (PAL) IPerfExperiment interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +/// Specifies a particular block on the GPU to gather counters for. +enum class GpuBlock : uint32 +{ + Cpf = 0x0, + Ia = 0x1, + Vgt = 0x2, + Pa = 0x3, + Sc = 0x4, + Spi = 0x5, + Sq = 0x6, + Sx = 0x7, + Ta = 0x8, + Td = 0x9, + Tcp = 0xA, + Tcc = 0xB, + Tca = 0xC, + Db = 0xD, + Cb = 0xE, + Gds = 0xF, + Srbm = 0x10, + Grbm = 0x11, + GrbmSe = 0x12, + Rlc = 0x13, + Dma = 0x14, + Mc = 0x15, + Cpg = 0x16, + Cpc = 0x17, + Wd = 0x18, + Tcs = 0x19, + Atc = 0x1A, + AtcL2 = 0x1B, + McVmL2 = 0x1C, + Ea = 0x1D, + Rpb = 0x1E, + Rmi = 0x1F, + Umcch = 0x20, + Ge = 0x21, + Gl1a = 0x22, + Gl1c = 0x23, + Gl1cg = 0x24, + Gl2a = 0x25, // TCA is used in Gfx9, and changed to GL2A in Gfx10 + Gl2c = 0x26, // TCC is used in Gfx9, and changed to GL2C in Gfx10 + Cha = 0x27, + Chc = 0x28, + Chcg = 0x29, + Gus = 0x2A, + Gcr = 0x2B, + Ph = 0x2C, + UtcL1 = 0x2D, + Ge1 = Ge, + GeDist = 0x2E, + GeSe = 0x2F, + DfMall = 0x30, // The DF subblocks have unique instances and event IDs but they all share the DF's perf counters. + SqWgp = 0x31, // SQ counters that can be sampled at WGP granularity. + Pc = 0x32, + Gl1xa = 0x33, + Gl1xc = 0x34, + Wgs = 0x35, + EaCpwd = 0x36, + EaSe = 0x37, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 926 + RlcUser = 0x38, +#else + RlcLocal = 0x38, +#endif + Count +}; + +/// Distinguishes between global and streaming performance monitor (SPM) counters. 
+enum class PerfCounterType : uint32 +{ + Global = 0x0, ///< Represents the traditional summary perf counters. + Spm = 0x1, ///< Represents streaming performance counters. + Spm32 = 0x2, ///< Represents 32bit streaming performance counters + Count +}; + +/// Reports the type of data the hardware writes for a particular counter. +enum class PerfCounterDataType : uint32 +{ + Uint32 = 0x0, + Uint64 = 0x1, + Count +}; + +/// Distinguishes between normal thread traces and streaming performance monitor (SPM) traces. +enum class PerfTraceType : uint32 +{ + ThreadTrace = 0x0, + SpmTrace = 0x1, + Count +}; + +/// Mask values ORed together to choose which shader stages a performance experiment should sample. +enum PerfExperimentShaderFlags +{ + PerfShaderMaskPs = 0x01, + PerfShaderMaskVs = 0x02, + PerfShaderMaskGs = 0x04, + PerfShaderMaskEs = 0x08, + PerfShaderMaskHs = 0x10, + PerfShaderMaskLs = 0x20, + PerfShaderMaskCs = 0x40, + PerfShaderMaskAll = 0x7f, +}; + +/// Selects one of generic performance trace markers, which the client can use to track data of its own choosing. +enum class PerfTraceMarkerType : uint32 +{ + SqttA = 0x0, + SqttB = 0x1, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 874 + A = SqttA, + B = SqttB, +#endif + SpmA = 0x2, + SpmB = 0x3, + SpmC = 0x4, + SpmD = 0x5, + Count +}; + +/// Specifies available features in device for supporting performance measurements. +union PerfExperimentDeviceFeatureFlags +{ + struct + { + uint32 counters : 1; ///< Device supports performance counters. + uint32 threadTrace : 1; ///< Device supports thread traces. + uint32 spmTrace : 1; ///< Device supports streaming perf monitor traces. + uint32 dfSpmTrace : 1; ///< Device supports streaming df perf monitor traces. + uint32 supportPs1Events : 1; ///< The thread trace HW of this Device is capable of producing event tokens + /// from the second PS backend of SC. 
+ uint32 sqttBadScPackerId : 1; ///< Hardware is affected by bug causing the packer ID specified in new PS waves + /// to be incorrect in SQ thread trace data. + uint32 reserved : 26; ///< Reserved for future use. + }; + uint32 u32All; ///< Feature flags packed as 32-bit uint. +}; + +/// Specifies properties for a perf counter being added to a perf experiment. Input structure to +/// IPerfExperiment::AddCounter(). +/// +/// A note for GpuBlock::SqWgp +/// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity +/// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock gating (CGCG) +/// and power gating. PAL exposes this feature to clients. +/// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same +/// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. +/// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features +/// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. +/// The counter data is still reported per WGP (not aggregated for the whole SE). +/// +struct PerfCounterInfo +{ + PerfCounterType counterType; ///< Type of counter to add. + GpuBlock block; ///< Which block to reference. + uint32 instance; ///< Instance of that block in the device. + uint32 eventId; ///< Which event ID to track. + + // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for + // the relevant blocks. It's recommended to zero them out when not in use. + union + { + struct + { + uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. + } df; + + struct + { + uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold. 
+ uint8 eventThresholdEn; ///< Threshold enable (0 for disabled, 1 for threshold). + uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). + } umc; + + uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events + + uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds 32 bits. + } subConfig; +}; + +/// Specifies properties for setting up a streaming performance counter trace. Input structure to +/// IPerfExperiment::AddSpmTrace(). +struct SpmTraceCreateInfo +{ + uint32 spmInterval; ///< Interval between each sample in terms of GPU sclks. Minimum of 32. + gpusize ringSize; ///< Suggested size of the SPM output ring buffer in bytes. PAL may use + /// a smaller ring in practice but it cannot exceed this size. + uint32 numPerfCounters; ///< Number of performance counters to be collected in this trace. + const PerfCounterInfo* pPerfCounterInfos; ///< Array of size numPerfCounters of PerfCounterInfo(s). +}; + +/// Reports layout of a single global perf counter sample. +struct GlobalSampleLayout +{ + GpuBlock block; ///< Type of GPU block. + uint32 instance; ///< Which instance of that type of GPU block. + uint32 slot; ///< Slot varies in meaning per block. + uint32 eventId; ///< Sampled event ID. + PerfCounterDataType dataType; ///< What type of data is written (e.g., 32-bit uint). + gpusize beginValueOffset; ///< Offset in bytes where the sample data begins. + gpusize endValueOffset; ///< Offset in bytes where the sample data ends. +}; + +/// Describes the layout of global perf counter data in memory. +struct GlobalCounterLayout +{ + uint32 sampleCount; ///< Number of samples described in samples[]. + GlobalSampleLayout samples[1]; ///< Describes the layout of each sample. This structure is repeated (sampleCount + /// - 1) additional times. +}; + +/// Enumeration of SQ Thread trace token types. All versions of Thread Trace (TT) are represented. 
If an unsupported +/// token is enabled, no error is reported. +enum ThreadTraceTokenTypeFlags : Pal::uint32 +{ + Misc = 0x00000001, ///< A miscellaneous event has been sent. TT 2.3 + Timestamp = 0x00000002, ///< Timestamp tokens. TT 2.3 + Reg = 0x00000004, ///< Register activity token. TT 2.3 + WaveStart = 0x00000008, ///< A wavefront has started. TT 2.3 + WaveAlloc = 0x00000010, ///< Output space has been allocated for vertex position or color/Z. TT 2.3. + RegCsPriv = 0x00000020, ///< There has been a compute pipeline private data, state or threadgroup update. TT 2.3. + WaveEnd = 0x00000040, ///< Wavefront completion. TT 2.3 + Event = 0x00000080, ///< An event has reached the top of a shader stage. TT 2.3 + EventCs = 0x00000100, ///< An event has reached the top of a compute shader stage. TT 2.3 + EventGfx1 = 0x00000200, ///< An event has reached the top of a shader stage for the second GFX pipe. TT 2.3 + Inst = 0x00000400, ///< The shader has executed an instruction. TT 2.3 + InstPc = 0x00000800, ///< The shader has explicitly written the PC value. TT 2.3 + InstUserData = 0x00001000, ///< The shader has written user data into the thread trace buffer. TT 2.3 + Issue = 0x00002000, ///< Provides information about instruction scheduling. TT 2.3 + Perf = 0x00004000, ///< The performance counter delta has been updated. TT 2.3 and below only. + RegCs = 0x00008000, ///< A compute state update packet has been received by the SPI. TT 2.3 + VmemExec = 0x00010000, ///< A previously issued VMEM instruction is now being sent to LDS/TA. TT 3.0 + AluExec = 0x00020000, ///< A previously issued VALU instruction is now being executed. TT 3.0 + ValuInst = 0x00040000, ///< A VALU instruction has been issued. TT 3.0. + WaveRdy = 0x00080000, ///< Mask of which waves became ready this cycle but did not issue an instruction. TT 3.0 + Immed1 = 0x00100000, ///< One wave issued an immediate instruction this cycle. TT 3.0. 
+ Immediate = 0x00200000, ///< One or more waves have issued an immediate instruction this cycle. TT 3.0. + UtilCounter = 0x00400000, ///< A new set of utilization counter values. TT 3.0. + RealTime = 0x00800000, ///< Output realtime. TT 3.3. + All = 0xFFFFFFFF ///< Enable all the above tokens. +}; + +/// Enumeration of register types whose reads/writes can be traced. Register reads are disabled by default as it can +/// generate a lot of traffic and cause the GPU to hang. +enum ThreadTraceRegTypeFlags : Pal::uint32 +{ + EventRegs = 0x00000001, ///< Event registers. TT 2.3. + DrawRegs = 0x00000002, ///< Draw registers. TT 2.3. + DispatchRegs = 0x00000004, ///< Dispatch registers. TT 2.3. + UserdataRegs = 0x00000008, ///< UserData Registers. Must be explicitly requested in TT 2.3. + MarkerRegs = 0x00000010, ///< Thread trace marker data regs. TT 2.3. + ShaderConfigRegs = 0x00000020, ///< Shader configuration state. TT 3.0. + ShaderLaunchStateRegs = 0x00000040, ///< Shader program launch state. TT 3.0. + GraphicsPipeStateRegs = 0x00000080, ///< Graphics pipeline state. TT 3.0. + AsyncComputeRegs = 0x00000100, ///< Async compute registers. TT 3.0. + GraphicsContextRegs = 0x00000200, ///< Graphics context registers. TT 3.0. + OtherConfigRegs = 0x00000400, ///< Other regs. TT 2.3. + AllRegWrites = 0x000007FF, ///< All reg writes other than OtherBusRegs. + OtherBusRegs = 0x00000800, ///< All write activity over gfx and compute buses. Debug only. TT 3.0. + AllRegReads = 0x00001000, ///< Not encouraged to be enabled. This can cause a GPU hang. + AllReadsAndWrites = 0xFFFFFFFF ///< All reads and writes. Not encouraged. This can cause a GPU hang. +}; + +/// Represents thread trace token types and register types that can be enabled to be reported in the trace data. If +/// a particular token type or reg type is unsupported, no error is returned and the thread trace is configured with +/// the minimum supported tokens in the user provided config. 
+struct ThreadTraceTokenConfig +{ + /// Mask of ThreadTraceTokenTypeFlags + uint32 tokenMask; + + /// Mask of ThreadTraceRegTypeFlags + uint32 regMask; +}; + +/// Specifies properties for a perf trace being added to a perf experiment. Input structure to +/// IPerfExperiment::AddThreadTrace(). +struct ThreadTraceInfo +{ + PerfTraceType traceType; ///< Type of trace to add. + uint32 instance; ///< Selected trace instance. + + union + { + struct + { + // Options common to all traces + uint32 bufferSize : 1; + + // Thread trace only options + uint32 threadTraceTargetSh : 1; + uint32 threadTraceTargetCu : 1; + uint32 threadTraceSh0CounterMask : 1; + uint32 threadTraceSh1CounterMask : 1; + uint32 threadTraceSimdMask : 1; + uint32 threadTraceVmIdMask : 1; + uint32 threadTraceRandomSeed : 1; + uint32 threadTraceShaderTypeMask : 1; + uint32 threadTraceIssueMask : 1; + uint32 threadTraceWrapBuffer : 1; + uint32 threadTraceStallBehavior : 1; + uint32 threadTraceTokenConfig : 1; + uint32 threadTraceStallAllSimds : 1; + uint32 threadTraceExcludeNonDetailShaderData : 1; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + uint32 threadTraceEnableExecPop : 1; +#else + uint32 placeholder2 : 1; +#endif + uint32 reserved : 16; + }; + uint32 u32All; + } optionFlags; + + struct + { + // Options common to all traces + size_t bufferSize; + + // Thread trace only options + ThreadTraceTokenConfig threadTraceTokenConfig; + uint32 threadTraceTargetSh; + uint32 threadTraceTargetCu; + uint32 threadTraceSh0CounterMask; + uint32 threadTraceSh1CounterMask; + uint32 threadTraceSimdMask; + uint32 threadTraceVmIdMask; + uint32 threadTraceRandomSeed; + PerfExperimentShaderFlags threadTraceShaderTypeMask; + uint32 threadTraceIssueMask; + bool threadTraceWrapBuffer; + uint32 threadTraceStallBehavior; + bool threadTraceStallAllSimds; + bool threadTraceExcludeNonDetailShaderData; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + bool threadTraceEnableExecPop; +#endif + } optionValues; +}; + +/// Reports 
thread trace data written when the trace is stopped (copied from internal SQ registers). +struct ThreadTraceInfoData +{ + uint32 curOffset; ///< Contents of SQ_THREAD_TRACE_WPTR register. + uint32 traceStatus; ///< Contents of SQ_THREAD_TRACE_STATUS register. + uint32 writeCounter; ///< Contents of SQ_THREAD_TRACE_CNTR register. +}; + +/// Describes the layout of a single shader engine's thread trace data. +struct ThreadTraceSeLayout +{ + uint32 shaderEngine; ///< Shader engine index. + uint32 computeUnit; ///< Compute unit index. + gpusize infoOffset; ///< Offset to ThreadTraceInfoData in memory. + gpusize infoSize; ///< Size in bytes reserved for ThreadTraceInfoData. + gpusize dataOffset; ///< Offset in bytes to the actual trace data. + gpusize dataSize; ///< Amount of trace data, in bytes. +}; + +/// Describes how the thread trace data is laid out. +struct ThreadTraceLayout +{ + uint32 traceCount; ///< Number of entries in traces[]. + ThreadTraceSeLayout traces[1]; ///< ThreadTraceSeLayout repeated (traceCount - 1) times. +}; + +/// Describes a single SPM counter instance. +struct SpmCounterData +{ + GpuBlock gpuBlock; ///< The kind of GPU block this counter measured. + uint32 instance; ///< Which specific global block instance this counter measured. + uint32 eventId; ///< The event that was measured by this counter. + uint32 offsetLo; ///< Byte offset within each sample to the lower 16-bit half of the counter data. + uint32 offsetHi; ///< Byte offset within each sample to the upper 16-bit half of the counter data. + bool is32Bit; ///< If the client must combine the independent 16-bit halves into a single 32-bit value. + /// If this is false offsetLo points to the full 16-bit data value and offsetHi is ignored. +}; + +/// All information required to parse the counter data out of a SpmTrace results buffer. +/// +/// Note that the hardware will continue to write samples to the SPM ring buffer even if it runs out of unused space. 
+/// The hardware will simply wrap the ring's write pointer back around to the first sample's location. Each subsequent +/// sample will overwrite the oldest sample in the ring. When the trace is finished we will have at most @ref +/// maxNumSamples valid samples. +/// +/// PAL doesn't zero out the ring memory so it's generally hard for the client to distinguish valid samples from random +/// data present in unused sample locations. PAL does guarantee that the final sample location in the ring has its +/// timestamp zeroed out before the SPM trace starts. This means this last timestamp will only be non-zero if the ring +/// has completely filled up and the WrPtr has wrapped one or more times. The client must inspect this timestamp when +/// parsing the sample data: +/// 1. The last timestamp is zero. The ring did not wrap. The oldest sample is at @ref sampleOffset. The ring's write +/// pointer tells us how many samples were written. From the write pointer onwards the ring contains invalid data. +/// 2. The last timestamp is non-zero. The ring did wrap. The ring's write pointer points to the oldest sample, +/// effectively a random sample offset into the ring. The full ring contains valid sample data but it's not in +/// oldest-to-newest order, it's shifted. The client can walk the ring from the write pointer's location (wrapping +/// as they go) to parse all @ref maxNumSamples samples out in oldest-to-newest order. +struct SpmTraceLayout +{ + gpusize offset; ///< Byte offset into the bound GPU memory where the spm trace data begins. + /// The @ref wrPtrOffset and @ref sampleOffset are relative to this value. + uint32 wrPtrOffset; ///< Byte offset within SPM trace data to the HW's write pointer (WrPtr) DWORD. + /// The WrPtr's value is an offset relative to @ref sampleOffset. Don't assume this is + /// a byte offset (see @ref wrPtrGranularity). The WrPtr's value shows where the HW's + /// theoretical next sample would go. 
This value may wrap back to zero if the HW runs out of + /// space in the SPM ring buffer. + uint32 wrPtrGranularity; ///< The WrPtr's granularity. Multiply WrPtr's value by this value to get a byte offset. + uint32 sampleOffset; ///< Byte offset within the SPM trace data to the array of samples. The HW will write the + /// first sample here but it will be overwritten if the ring wraps (see the top comment). + uint32 sampleStride; ///< The distance between consecutive samples in bytes. May include empty padding. + uint32 maxNumSamples; ///< The maximum number of samples the HW can write before wrapping. The SPM ring buffer + /// ends at sampleOffset + sampleStride * maxNumSamples. + uint32 numCounters; ///< The true length of counterData. The client must allocate extra memory for the array. + + SpmCounterData counterData[1]; ///< The layout and identity of the counters in the samples. +}; + +/// Represents the information that is stored in the DF SPM trace metadata buffer. +struct DfSpmTraceMetadataLayout +{ + uint32 numRecordPairs; ///< The number of 64-byte blocks written by this trace. There are two time segments + ///< per 64-byte block so we have to check the lastSpmPkt bit to see which half of + ///< the last 64-byte block is the last packet. + uint32 padding; ///< Padding to match what the compiler does by default. + uint64 beginTimestamp; ///< The DF timestamp at the start of the DF SPM trace. + uint64 endTimestamp; ///< The DF timestamp at the finish of the DF SPM trace. +}; + +/// Specifies properties for creation of an @ref IPerfExperiment object. Input structure to +/// IDevice::CreatePerfExperiment(). 
+struct PerfExperimentCreateInfo +{ + union + { + struct + { + uint32 cacheFlushOnCounterCollection : 1; + uint32 sampleInternalOperations : 1; + uint32 sqShaderMask : 1; + uint32 sqWgpShaderMask : 1; + uint32 reserved : 28; + }; + uint32 u32All; + } optionFlags; + + struct + { + bool cacheFlushOnCounterCollection; + bool sampleInternalOperations; + PerfExperimentShaderFlags sqShaderMask; ///< GpuBlock::Sq counters only look at these shader types. + PerfExperimentShaderFlags sqWgpShaderMask; ///< GpuBlock::SqWgp counters only look at these shader types. + } optionValues; +}; + +/** + *********************************************************************************************************************** + * @interface IPerfExperiment + * @brief Set of performance profiling activities to be performed over a specific range of commands in a command + * buffer. + * + * @warning The details of building a performance experiment are not very well documented here. Please see your local + * hardware performance expert for more details until this documentation can be fully fleshed out. + * + * @see IDevice::CreatePerfExperiment + *********************************************************************************************************************** + */ +class IPerfExperiment : public IGpuMemoryBindable +{ +public: + /// Adds the specified performance counter to be tracked as part of this perf experiment. + /// + /// @param [in] counterInfo Specifies which counter to add: which hardware block, instance, any options, etc. + /// + /// @returns Success if the counter was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddCounter( + const PerfCounterInfo& counterInfo) = 0; + + /// Queries the layout of counter results in memory for this perf experiment. + /// + /// @param [out] pLayout Layout describing the begin and end offset of each counter in the resulting GPU memory once + /// this perf experiment is executed. 
Should correspond with counters added via AddCounter(). + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. + virtual Result GetGlobalCounterLayout( + GlobalCounterLayout* pLayout) const = 0; + + /// Adds the specified thread trace to be recorded as part of this perf experiment. + /// + /// @param [in] traceInfo Specifies what type of trace to record, which block instance to trace, and options, etc. + /// + /// @returns Success if the trace was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddThreadTrace( + const ThreadTraceInfo& traceInfo) = 0; + + /// Adds the specified DfSpmTrace to be recorded as part of this perf experiment. + /// + /// @param [in] dfSpmCreateInfo Specifies the parameters of the df spm trace and + /// provides the list of perf counters. + /// + /// @returns Success if the df spm trace was successfully added to the experiment, + /// otherwise an appropriate error code. + virtual Result AddDfSpmTrace( + const SpmTraceCreateInfo& dfSpmCreateInfo) = 0; + + /// Adds the specified SpmTrace to be recorded as part of this perf experiment. + /// + /// @param [in] spmCreateInfo Specifies the parameters of the spm trace and provides the list of perf counters. + /// + /// @returns Success if the spm trace was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddSpmTrace( + const SpmTraceCreateInfo& spmCreateInfo) = 0; + + /// Queries the layout of thread trace results in memory for this perf experiment. + /// + /// @param [out] pLayout Layout describing how the results of each thread trace will be written to GPU memory when + /// this perf experiment is executed. Should correspond with traces added via AddThreadTrace(). + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. 
+ virtual Result GetThreadTraceLayout( + ThreadTraceLayout* pLayout) const = 0; + + /// Queries the layout of streaming counter trace results in memory for this perf experiment. + /// + /// The caller is expected to call this function twice. The first time with pLayout->numCounters = 0 which prompts + /// PAL to only set numCounters to the correct number of SPM counters and return. The second call with a non-zero + /// numCounters prompts PAL to fill out the full structure and counterData array. + /// + /// Note that @ref SpmTraceLayout contains a variable length array. The caller must allocate enough memory for + /// an additional "numCounters - 1" copies of @ref SpmCounterData. + /// + /// @param [out] pLayout Layout describing the layout of the streaming counter trace results in the resulting + /// GPU memory once this perf experiment is executed. + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. + virtual Result GetSpmTraceLayout( + SpmTraceLayout* pLayout) const = 0; + + /// Finalizes the performance experiment preparing it for execution. + /// + /// @returns Success if the operation executed successfully, otherwise an appropriate error code. + virtual Result Finalize() = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. 
+ IPerfExperiment() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IPerfExperiment() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h b/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h new file mode 100644 index 0000000000..54c32b8b80 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h @@ -0,0 +1,896 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPipeline.h + * @brief Defines the Platform Abstraction Library (PAL) IPipeline interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" +#include "palDestroyable.h" +#include "palImage.h" +#include "palShaderLibrary.h" +#include "palSpan.h" +#include + +namespace Util +{ +namespace Abi +{ +union ApiHwShaderMapping; +enum class HardwareStage : uint32; +} + +namespace HsaAbi +{ +struct KernelArgument; +} +} + +namespace Pal +{ +struct GpuMemSubAllocInfo; +enum class PrimitiveTopology : uint8; + +/// Specifies a shader type (i.e., what stage of the pipeline this shader was written for). +enum class ShaderType : uint32 +{ + Compute = 0, + Task, + Vertex, + Hull, + Domain, + Geometry, + Mesh, + Pixel, + + Count +}; + +/// Number of shader program types supported by PAL. +constexpr uint32 NumShaderTypes = static_cast<uint32>(ShaderType::Count); + +/// Maximum number of viewports. +constexpr uint32 MaxViewports = 16; + +/// Maximum number of supported stream-output declaration entries by any PAL device. +constexpr uint32 MaxStreamOutEntries = 512; + +/// Specifies a general primitive category without differentiating between a strip or list and without specifying +/// whether the primitive will include adjacency info or not. 
+enum class PrimitiveType : uint32 +{ + Point = 0x0, + Line = 0x1, + Triangle = 0x2, + Rect = 0x3, + Quad = 0x4, + Patch = 0x5, + Count +}; + +/// Specifies the target range of Z values after viewport transform. +enum class DepthRange : uint32 +{ + ZeroToOne = 0x0, + NegativeOneToOne = 0x1, +}; + +/// Specifies whether the v/t texture coordinates of a point sprite map 0 to 1 from top to bottom or bottom to top. +enum class PointOrigin : uint32 +{ + UpperLeft = 0x0, + LowerLeft = 0x1, + Count +}; + +/// Specifies primitive's shade mode. +enum class ShadeMode : uint32 +{ + Gouraud = 0x0, ///< Gouraud shading mode, pixel shader input is interpolation of vertex + Flat = 0x1, ///< Flat shading mode, pixel shader input from provoking vertex + Count +}; + +/// Defines a logical operation applied between the color coming from the pixel shader and the current value in the +/// target image. +enum class LogicOp : uint32 +{ + Copy = 0x0, + Clear = 0x1, + And = 0x2, + AndReverse = 0x3, + AndInverted = 0x4, + Noop = 0x5, + Xor = 0x6, + Or = 0x7, + Nor = 0x8, + Equiv = 0x9, + Invert = 0xA, + OrReverse = 0xB, + CopyInverted = 0xC, + OrInverted = 0xD, + Nand = 0xE, + Set = 0xF, +}; + +/// Shader Engine Dispatch Interleave Size +/// +/// This determines how many Threads or Threadgroups are sent to one SE before switching to the next SE. +/// Work is always distributed in Threadgroups though. +/// +/// The 1D values are specified in Threads and the Threadgroups are walked in a 1D typewriter fashion. +/// The 2D values are specified in Threadgroups and also walked in typewriter fashion (in groups of the 2D pattern). +/// +/// Clients should check for 1D and 2D support separately in: +/// - DeviceProperties::gfxipProperties::flags::support1dDispatchInterleave +/// - DeviceProperties::gfxipProperties::flags::support2dDispatchInterleave +/// +/// Default will result in "Disable" for chips which do not support 1D or 2D. +/// Disable means that every Threadgroup is issued to the next SE. 
+enum class DispatchInterleaveSize : uint32
+{
+    Default,
+    Disable,
+
+    // 1D modes: the interleave size is given in threads.
+    _1D_64_Threads,
+    _1D_128_Threads,
+    _1D_256_Threads,
+    _1D_512_Threads,
+
+    // 2D modes: the interleave size is given in thread groups.
+    _2D_1x1_ThreadGroups,
+    _2D_1x2_ThreadGroups,
+    _2D_1x4_ThreadGroups,
+    _2D_1x8_ThreadGroups,
+    _2D_1x16_ThreadGroups,
+
+    _2D_2x1_ThreadGroups,
+    _2D_2x2_ThreadGroups,
+    _2D_2x4_ThreadGroups,
+    _2D_2x8_ThreadGroups,
+
+    _2D_4x1_ThreadGroups,
+    _2D_4x2_ThreadGroups,
+    _2D_4x4_ThreadGroups,
+
+    _2D_8x1_ThreadGroups,
+    _2D_8x2_ThreadGroups,
+
+    _2D_16x1_ThreadGroups,
+
+    Count,  ///< Number of interleave sizes listed above.
+};
+
+/// Specifies whether to override binning setting for pipeline. Enum value of Default follows the PBB global setting.
+/// Enable or Disable value overrides PBB global setting for the pipeline and sets binning accordingly.
+enum class BinningOverride : uint32
+{
+    Default = 0x0,  ///< Follow the PBB global setting.
+    Disable = 0x1,  ///< Override the PBB global setting and disable binning for the pipeline.
+    Enable  = 0x2,  ///< Override the PBB global setting and enable binning for the pipeline.
+    Count           ///< Number of binning overrides listed above.
+};
+
+/// GPU behavior is controlled by LDS_GROUP_SIZE.
+enum class LdsPsGroupSizeOverride : uint32
+{
+    Default     = 0x0,
+    SingleWave  = 0x1,
+    DoubleWaves = 0x2
+};
+
+/// Tri-state enum which controls enabling or disabling a feature or behavior, or letting PAL select a sensible default
+enum class OverrideMode : int32
+{
+    Default  = -1, ///< PAL selects the default behavior, which could be either enabled or disabled.
+    Disabled = 0,  ///< Force to disabled. Equal to set to False.
+    Enabled  = 1,  ///< Force to enabled. Equal to set to True.
+};
+
+/// Enumerates the depth clamping modes a pipeline can use.
+enum class DepthClampMode : uint32
+{
+    Viewport = 0x0, ///< Clamps to the viewport min/max depth bounds
+    _None    = 0x1, ///< Disables depth clamping
+#if PAL_BUILD_SUPPORT_DEPTHCLAMPMODE_ZERO_TO_ONE
+    ZeroToOne = 0x2, ///< Clamps between 0.0 and 1.0.
+#endif
+
+    // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either
+    // undefining None before including this header or using _None when dealing with PAL.
+#ifndef None
+    None = _None, ///< Disables depth clamping
+#endif
+};
+
+/// Common flags controlling creation of both compute and graphics pipeline.
+union PipelineCreateFlags
+{
+    struct
+    {
+        uint32 clientInternal        : 1;  ///< Internal pipeline not created by the application.
+        uint32 reverseWorkgroupOrder : 1;  ///< Indicates that any Dispatch using this pipeline should execute in
+                                           /// reverse workgroup order. This supersedes the flag on the CommandBuffer
+                                           /// (dispatchPingPongWalk) - always forcing reverse workgroup order! This
+                                           /// is a best effort as not all implementations or Queues may support this.
+        uint32 reserved              : 30; ///< Reserved for future use.
+    };
+    uint32 u32All; ///< Flags packed as 32-bit uint.
+};
+
+/// Constant defining the maximum number of view instances that is supported.
+constexpr uint32 MaxViewInstanceCount = 6;
+
+/// Specifies graphic pipeline view instancing state.
+struct ViewInstancingDescriptor
+{
+    uint32 viewInstanceCount;                          ///< The view instance count of the graphic pipeline
+    uint32 viewId[MaxViewInstanceCount];               ///< The view instance ids.
+    uint32 renderTargetArrayIdx[MaxViewInstanceCount]; ///< The instance render target array index, can be
+                                                       /// used in hardware accelerated stereo rendering.
+    uint16 viewportArrayIdx[MaxViewInstanceCount];     ///< The instance viewport array index, can be
+                                                       /// used in hardware accelerated stereo rendering.
+    bool enableMasking;                                ///< Indicate whether instance masking is enabled.
+};
+
+/// Specifies the input parameters for the MSAA coverage out feature. MSAA coverage out is used in conjunction with a
+/// single sampled color image. This feature exports a mask indicating which samples would have been used if the
+/// image had been multi-sampled. The mask is exported to the specified channel of the MRT pointing to the rendered
+/// image. That is, the MRT must be an active bound render target. This MSAA mask data can then be post-processed.
+struct MsaaCoverageOutDescriptor
+{
+    union
+    {
+        struct
+        {
+            uint32 enable     :  1; ///< Set to true to enable render target channel output
+            uint32 numSamples :  4; ///< Number of samples to export
+            uint32 mrt        :  3; ///< Which MRT to export to.
+            uint32 channel    :  2; ///< Which channel to export to (x = 0, y = 1, z = 2, w = 3)
+            uint32 reserved   : 22; ///< Reserved for future use.
+        };
+
+        uint32 u32All; ///< All flags combined as a single uint32.
+    } flags;
+};
+
+/// Specifies properties about an indirect function belonging to a compute @ref IPipeline object. Part of the input
+/// structure to IDevice::CreateComputePipeline().
+struct ComputePipelineIndirectFuncInfo
+{
+    const char* pSymbolName; ///< ELF Symbol name for the associated function. Must not be null.
+    gpusize     gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during
+                             /// pipeline creation.
+};
+
+/// Specifies properties for creation of a compute @ref IPipeline object. Input structure to
+/// IDevice::CreateComputePipeline().
+struct ComputePipelineCreateInfo
+{
+    PipelineCreateFlags flags; ///< Flags controlling pipeline creation.
+
+    const void* pPipelineBinary;      ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI
+                                      /// interface. The Pipeline ELF contains pre-compiled shaders,
+                                      /// register values, and additional metadata.
+    size_t      pipelineBinarySize;   ///< Size of Pipeline ELF binary in bytes.
+    uint32      maxFunctionCallDepth; ///< Maximum depth for indirect function calls. Not used for a new
+                                      /// path ray-tracing pipeline as the compiler has pre-calculated
+                                      /// stack requirements.
+    bool disablePartialDispatchPreemption; ///< Prevents scenarios where a subset of the dispatched thread groups are
+                                           /// preempted and the remaining thread groups run to completion. This
+                                           /// can occur when thread group granularity preemption is available and
+                                           /// instruction level (CWSR) is not. This setting is useful for allowing
+                                           /// dispatches with interdependent thread groups.
+    DispatchInterleaveSize interleaveSize; ///< Controls how many thread groups are sent to one SE before switching to
+                                           /// the next one.
+
+    /// PAL expects a fixed 3D thread group size for each compute pipeline but the HSA ABI supports dynamic group sizes.
+    /// If this pipeline's ELF binary metadata doesn't specify a fixed thread group size, this should be used to force
+    /// a particular thread group size. If this extent is set to all zeros PAL will use the metadata's group size.
+    /// This field is not supported on PAL ABI ELFs, it should be set to all zeros.
+    Extent3d threadsPerGroup;
+    TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle
+                                   /// issuing of low priority waves when it detects too many higher priority waves are
+                                   /// failing to schedule due to resource constraints.
+
+    const char* pKernelName; ///< When create pipeline with hsa ELF binary of multiple kernels, need to set one
+                             /// kernel to create the pipeline. null means only one kernel in ELF binary.
+
+};
+
+/// Specifies information about the viewport behavior of an assembled graphics pipeline. Part of the input
+/// structure @ref GraphicsPipelineCreateInfo.
+struct ViewportInfo
+{
+    bool       depthClipNearEnable; ///< Enable clipping based on Near Z coordinate.
+    bool       depthClipFarEnable;  ///< Enable clipping based on Far Z coordinate.
+    DepthRange depthRange;          ///< Specifies Z dimensions of screen space (i.e., post viewport transform:
+                                    /// 0 to 1 or -1 to 1).
+};
+
+/// Specifies edgeRule for rasterization
+enum class EdgeRuleMode : uint32
+{
+    D3dCompliant  = 0x0, ///< Use rasterization edge-rules which comply with the D3D spec.
+    OpenGlDefault = 0x1, ///< Use rasterization edge-rules compatible with the default OpenGL driver.
+};
+
+/// Specifies Rasterizer state in properties for creation of a graphics pipeline.
+struct RasterizerState
+{
+    PointOrigin pointCoordOrigin;       ///< Controls texture coordinate orientation for point sprites.
+    bool        expandLineWidth;        ///< If true, line primitives will have their width expanded by 1/cos(a)
+                                        /// where a is the minimum angle from horizontal or vertical.
+                                        /// This can be used in conjunction with PS patching for a client to
+                                        /// implement line antialiasing.
+    ShadeMode   shadeMode;              ///< Specifies shading mode, Gouraud or Flat
+    bool        rasterizeLastLinePixel; ///< Specifies whether to draw last pixel in a line.
+    bool        outOfOrderPrimsEnable;  ///< Enables out-of-order primitive rasterization. PAL silently
+                                        /// ignores this if it is unsupported in hardware.
+    bool        perpLineEndCapsEnable;  ///< Forces the use of perpendicular line end caps as opposed to
+                                        /// axis-aligned line end caps during line rasterization.
+    BinningOverride binningOverride;    ///< Binning setting for this pipeline.
+
+    DepthClampMode depthClampMode;      ///< Depth clamping behavior
+
+    union
+    {
+        struct
+        {
+            uint8 clipDistMaskValid : 1; ///< Whether or not @ref clipDistMask, below, is valid.
+            uint8 cullDistMaskValid : 1; ///< Whether or not @ref cullDistMask, below, is valid.
+            uint8 reserved          : 6; ///< Reserved for future use.
+        };
+        uint8 u8All; ///< All the flags as a single value.
+    } flags;
+
+    uint8 cullDistMask;           ///< Mask of which cullDistance exports to leave enabled.
+    uint8 clipDistMask;           ///< Mask of which clipDistance exports to leave enabled.
+    bool  dx10DiamondTestDisable; ///< Disable DX10 diamond test during line rasterization.
+    EdgeRuleMode edgeRule;        ///< Rasterization edge rule to use. See @ref EdgeRuleMode.
+};
+
+/// Specifies Per-MRT color target info in color target state
+struct ColorTargetInfo
+{
+    SwizzledFormat swizzledFormat;   ///< Color target format and channel swizzle. Set the format to invalid
+                                     /// if no color target will be bound at this slot.
+    uint8          channelWriteMask; ///< Color target write mask. Bit 0 controls the red channel, bit 1 is
+                                     /// green, bit 2 is blue, and bit 3 is alpha.
+    bool           forceAlphaToOne;  ///< Treat alpha as one regardless of the shader output. Ignored unless
+                                     /// supportAlphaToOne is set in DeviceProperties.
+};
+
+/// Specifies color target state in properties for creation of a graphics pipeline.
+struct ColorTargetState
+{
+    bool    alphaToCoverageEnable; ///< Enable alpha to coverage.
+    bool    dualSourceBlendEnable; ///< Blend state bound at draw time will use a dual source blend mode.
+    LogicOp logicOp;               ///< Logic operation to perform.
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 904
+    bool    uavExportSingleDraw;   ///< When UAV export is enabled, acts as a hint that only a single draw
+                                   /// is done on a color target with this or subsequent pipelines before
+                                   /// a barrier. Improves performance by allowing pipelines to overlap.
+#endif
+
+    ColorTargetInfo target[MaxColorTargets]; ///< Per-MRT color target info.
+};
+
+/// Specifies properties for creation of a graphics @ref IPipeline object. Input structure to
+/// IDevice::CreateGraphicsPipeline().
+struct GraphicsPipelineCreateInfo
+{
+    PipelineCreateFlags flags; ///< Flags controlling pipeline creation.
+
+    const void* pPipelineBinary;              ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI
+                                              /// interface. The Pipeline ELF contains pre-compiled shaders,
+                                              /// register values, and additional metadata.
+    size_t      pipelineBinarySize;           ///< Size of Pipeline ELF binary in bytes.
+    const IShaderLibrary** ppShaderLibraries; ///< An array of graphics @ref IShaderLibrary object. pPipelineBinary
+                                              /// and ppShaderLibraries can't be valid at the same time.
+    size_t      numShaderLibraries;           ///< Number of graphics shaderLibrary object in ppShaderLibraries.
+    bool        useLateAllocVsLimit;          ///< If set, use the specified lateAllocVsLimit instead of PAL internally
+                                              /// determining the limit.
+    uint32      lateAllocVsLimit;             ///< The number of VS waves that can be in flight without having param
+                                              /// cache and position buffer space. If useLateAllocVsLimit flag is set,
+                                              /// PAL will use this limit instead of the PAL-specified limit.
+    bool        useLateAllocGsLimit;          ///< If set, use the specified lateAllocGsLimit instead of PAL internally
+                                              /// determining the limit. NOTE(review): this comment previously named
+                                              /// lateAllocVsLimit, presumably a copy-paste slip - confirm.
+    uint32      lateAllocGsLimit;             ///< Controls GS LateAlloc val (for pos/prim allocations NOT param cache)
+                                              /// on NGG pipelines. Can be no more than 127.
+    struct
+    {
+        struct
+        {
+            PrimitiveType primitiveType;      ///< Basic primitive category: points, line, triangles, patches.
+            bool          topologyIsPolygon;  ///< Indicates that triangle primitives are combined to represent more
+                                              /// complex polygons. Only valid for triangle primitive types.
+            uint32        patchControlPoints; ///< Number of control points per patch. Only required if primitiveType
+                                              /// is PrimitiveType::Patch.
+        } topologyInfo; ///< Various information about the primitive topology that will be used with this pipeline.
+                        /// All of this info must be consistent with the full topology specified by
+                        /// ICmdBuffer::SetPrimitiveTopology() when drawing with this pipeline bound.
+
+        /// Number of vertex buffer slots which are accessed by this pipeline. Behavior is undefined if the pipeline
+        /// tries to access a vertex buffer slot outside the range [0, vertexBufferCount). It is generally advisable
+        /// to make this the minimum value possible because that reduces the number of vertex buffer slots PAL has to
+        /// maintain for this pipeline when recording command buffers.
+        uint32 vertexBufferCount;
+    } iaState; ///< Input assembler state.
+
+    RasterizerState  rsState; ///< Rasterizer state.
+    ColorTargetState cbState; ///< Color target state.
+
+    ViewInstancingDescriptor  viewInstancingDesc;  ///< Descriptor describes view instancing state
+                                                   /// of the graphics pipeline
+    MsaaCoverageOutDescriptor coverageOutDesc;     ///< Descriptor describes input parameters for MSAA coverage out.
+    ViewportInfo              viewportInfo;        ///< Viewport info.
+    DispatchInterleaveSize    taskInterleaveSize;  ///< Ignored for pipelines without a task shader. For pipelines with
+                                                   /// a task shader, controls how many thread groups are sent to one
+                                                   /// SE before switching to the next one.
+    LdsPsGroupSizeOverride ldsPsGroupSizeOverride; ///< Whether to override ldsPsGroupSize setting for pipeline.
+
+    TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle
+                                   /// issuing of low priority waves when it detects too many higher priority waves are
+                                   /// failing to schedule due to resource constraints.
+    bool     noForceReZ;           ///< Disables the ability for PAL to force ReZ modes outside of what was chosen by
+                                   /// the compiler for this pipeline.
+};
+
+/// The graphic pipeline view instancing information. This is used to determine if hardware accelerated stereo rendering
+/// can be enabled for a graphic pipeline.
+struct GraphicPipelineViewInstancingInfo
+{
+    union
+    {
+        struct
+        {
+            uint32 shaderUseViewId                : 1;  ///< If any shader in pipeline uses view id.
+            uint32 gsExportRendertargetArrayIndex : 1;  ///< If gs exports render target array index,
+                                                        /// must be 0 if there is no gs.
+            uint32 gsExportViewportArrayIndex     : 1;  ///< If gs exports viewport array index,
+                                                        /// must be 0 if there is no gs.
+            uint32 reserved                       : 29; ///< Reserved for future use.
+        };
+        uint32 apiShaderFlags; ///< All of the flags above combined as a single uint32.
+    };
+
+    const ViewInstancingDescriptor* pViewInstancingDesc; ///< View Instancing descriptor
+};
+
+/// Reports properties of a compiled pipeline. This includes hashes for the pipeline and shaders that the client can
+/// use to correlate PAL pipeline/shader dumps with corresponding API-level pipelines/shaders.
+struct PipelineInfo
+{
+    const char* pName; ///< Non-owning pointer to a null-terminated string containing human-readable
+                       ///< name of this pipeline. nullptr indicates no name was provided.
+
+    PipelineHash internalPipelineHash; ///< 128-bit identifier extracted from this pipeline's ELF binary, composed of
+                                       /// the state the compiler decided was appropriate to identify the compiled
+                                       /// shaders. The lower 64 bits are "stable"; the upper 64 bits are "unique".
+
+    struct
+    {
+        ShaderHash hash; ///< Unique 128-bit identifier for this shader. 0 indicates there is no shader bound for
+                         /// the corresponding shader stage.
+    } shader[NumShaderTypes]; ///< Array of per-shader pipeline properties.
+
+    union
+    {
+        struct
+        {
+            uint32 hsaAbi    : 1;  ///< This pipeline uses the HSA ABI (i.e. bind arguments not user-data)
+            uint32 usesCps   : 1;  ///< This pipeline uses continuations passing shaders (CPS). In an archive pipeline,
+                                   /// this bit is set iff it is set in any constituent pipeline.
+            uint32 cpsGlobal : 1;  ///< If using continuations passing shaders (CPS), stack is in global rather than
+                                   /// scratch. In an archive pipeline, this bit is set iff it is set in any
+                                   /// constituent pipeline.
+            uint32 reserved  : 29; ///< Reserved for future use.
+        };
+        uint32 u32All; ///< All flags combined as a single uint32.
+    } flags; ///< Pipeline properties.
+
+    struct
+    {
+        union
+        {
+            struct
+            {
+                uint32 perSampleShading : 1;  ///< Shader instructions want per-sample execution.
+                uint32 usesSampleMask   : 1;  ///< Shader is using sample mask.
+                uint32 enablePops       : 1;  ///< Primitive order pixel shader is enabled.
+                uint32 reserved         : 29; ///< Reserved for future use.
+            };
+            uint32 u32All; ///< All flags combined as a single uint32.
+        } flags;
+    } ps; ///< Pixel shader properties.
+
+    uint64 resourceMappingHash; ///< 64-bit hash of the resource mapping used when compiling the pipeline,
+                                /// if available (0 otherwise).
+
+    /// The GPU address and size in bytes of the traceRays table, or { 0, 0 } if none.
+    /// The format of the table is private to the compiler.
+    /// Compiled code requires a buffer descriptor for the traceRays table to be passed in to the launch kernel
+    /// in a particular place.
+    gpusize traceRaysTable;
+    gpusize traceRaysTableSize;
+
+    /// Pointer to the traceRays table for debugging purposes, or nullptr if none.
+    const char* pTraceRaysTable;
+
+    /// Pointer to and size in bytes of the shader identifiers table, or { nullptr, 0 } if none.
+    /// The table has an entry for each export and then each hit group specified when the pipeline was compiled.
+    /// Each entry has a pointer to the shader or hit group name in the same block of data (always 64 bits even
+    /// on a 32-bit host), then the 32 byte shader identifier whose format is agreed between the compiler and
+    /// GPURT's library code, for a total of 40 bytes per entry.
+    const void* pShaderIdentifiers;
+    size_t      shaderIdentifiersSize;
+
+    uint32 unifiedRgsNameHash; ///< 32-bit hash of unified RGS name, 0 otherwise
+
+};
+
+/// A structure that represents any 3D arrangement of threads or thread groups as part of a compute shader dispatch.
+///
+/// This structure is halfway between Extent3d and Offset3d, depending on the context it may represent an offset or
+/// an extent. Essentially its meaning is tied to the concept of 3D thread or thread group grids rather than generic
+/// contexts like "extent" or "offset". Whether it represents threads or thread groups is also context specific.
+struct DispatchDims
+{
+    uint32 x; ///< Threads or thread groups in the X dimension.
+    uint32 y; ///< Threads or thread groups in the Y dimension.
+    uint32 z; ///< Threads or thread groups in the Z dimension.
+
+    /// Computes the volume of this 3D arrangement of threads or thread groups.
+    ///
+    /// @note The product is computed in uint32 arithmetic, so it wraps modulo 2^32 if x * y * z does not fit.
+    ///
+    /// @returns the total number of threads or thread groups this struct represents.
+    uint32 Flatten() const { return x * y * z; }
+};
+
+// There are some places where we'd like to directly cast DispatchDims to an array of three uint32s.
+static_assert((3 * sizeof(uint32)) == sizeof(DispatchDims), "DispatchDims not castable to uint32*");
+
+/// Adds two DispatchDims dimension by dimension.
+///
+/// @param [in] l The left-hand argument.
+/// @param [in] r The right-hand argument.
+///
+/// @returns A new DispatchDims whose x, y and z are the sums of the matching members of 'l' and 'r'.
+inline DispatchDims operator+(DispatchDims l, DispatchDims r)
+{
+    const DispatchDims sum = { l.x + r.x, l.y + r.y, l.z + r.z };
+    return sum;
+}
+
+/// Adds one DispatchDims into another, dimension by dimension.
+///
+/// @param [in] l The left-hand argument; updated in place.
+/// @param [in] r The right-hand argument.
+///
+/// @returns A reference to 'l' after it is updated to the sum of 'l' and 'r'.
+inline DispatchDims& operator+=(DispatchDims& l, DispatchDims r)
+{
+    // 'r' is a by-value copy, so updating 'l' member by member is safe even when the caller passes the same object.
+    l.x += r.x;
+    l.y += r.y;
+    l.z += r.z;
+    return l;
+}
+
+/// Multiplies two DispatchDims dimension by dimension.
+///
+/// @param [in] l The left-hand argument.
+/// @param [in] r The right-hand argument.
+///
+/// @returns A new DispatchDims whose x, y and z are the products of the matching members of 'l' and 'r'.
+inline DispatchDims operator*(DispatchDims l, DispatchDims r)
+{
+    const DispatchDims product = { l.x * r.x, l.y * r.y, l.z * r.z };
+    return product;
+}
+
+/// Multiplies one DispatchDims into another, dimension by dimension.
+///
+/// @param [in] l The left-hand argument; updated in place.
+/// @param [in] r The right-hand argument.
+///
+/// @returns A reference to 'l' after it is updated to the product of 'l' and 'r'.
+inline DispatchDims& operator*=(DispatchDims& l, DispatchDims r)
+{
+    // 'r' is a by-value copy, so updating 'l' member by member is safe even when the caller passes the same object.
+    l.x *= r.x;
+    l.y *= r.y;
+    l.z *= r.z;
+    return l;
+}
+
+/// Used to represent API level shader stage.
+enum ShaderStageFlagBits : uint32 +{ + ApiShaderStageCompute = (1u << static_cast(ShaderType::Compute)), + ApiShaderStageTask = (1u << static_cast(ShaderType::Task)), + ApiShaderStageVertex = (1u << static_cast(ShaderType::Vertex)), + ApiShaderStageHull = (1u << static_cast(ShaderType::Hull)), + ApiShaderStageDomain = (1u << static_cast(ShaderType::Domain)), + ApiShaderStageGeometry = (1u << static_cast(ShaderType::Geometry)), + ApiShaderStageMesh = (1u << static_cast(ShaderType::Mesh)), + ApiShaderStagePixel = (1u << static_cast(ShaderType::Pixel)), +}; + +/// Reports shader stats. Multiple bits set in the shader stage mask indicates that multiple shaders have been combined +/// due to HW support. The same information will be repeated for both the constituent shaders in this case. +struct ShaderStats +{ + uint32 shaderStageMask; ///< Indicates the stages of the pipeline this shader is + /// used for. If multiple bits are set, it implies + /// shaders were merged. See @ref ShaderStageFlagBits. + CommonShaderStats common; ///< The shader compilation parameters for this shader. + /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableVgprs; + /// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableSgprs; + size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. + ShaderHash palShaderHash; ///< Internal hash of the shader compilation data used by PAL. + + union + { + struct + { + uint32 writesUAV : 1; ///< This shader performs writes to UAVs. + uint32 writesDepth : 1; ///< Indicates explicit depth writes performed by the shader stage. 
+ uint32 streamOut : 1; ///< The shader performs stream out of shader generated data. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< All flags combined as a single uint32. + + } shaderOperations; ///< Flags depicting shader operations. + + struct + { + DispatchDims numThreadsPerGroup; ///< Number of compute threads per thread group in X, Y, and Z dimensions. + } cs; ///< Parameters specific to compute shader only. + + union + { + struct + { + uint8 copyShaderPresent : 1; ///< Indicates that the copy shader data is valid. + uint8 reserved : 7; ///< Reserved for future use. + }; + uint8 u8All; ///< All the flags as a single value. + } flags; ///< Flags related to this shader data. + + CommonShaderStats copyShader; ///< This data is valid only when the copyShaderPresent flag above is set. +}; + + /** + *********************************************************************************************************************** + * @interface IPipeline + * @brief Monolithic object containing all shaders and a large amount of "shader adjacent" state. Separate concrete + * implementations will support compute or graphics pipelines. + * + * @see IDevice::CreateComputePipeline() + * @see IDevice::CreateGraphicsPipeline() + * @see IDevice::LoadPipeline() + *********************************************************************************************************************** + */ +class IPipeline : public IDestroyable +{ +public: + /// Returns PAL-computed properties of this pipeline and its corresponding shaders. + /// + /// @returns Property structure describing this pipeline. + virtual const PipelineInfo& GetInfo() const = 0; + + /// Returns a list of GPU memory allocations used by this pipeline. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. 
+ /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this pipeline. If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. + /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. + virtual const void* GetResourceId() const = 0; + + /// Obtains the binary code object for this pipeline. + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the pipeline ELF is written in the buffer. If null, the size required + /// for the pipeline ELF is given out in the location pSize. + /// + /// @returns Success if the pipeline binary was fetched successfully. + /// +ErrorUnavailable if the pipeline binary was not fetched successfully. + virtual Result GetCodeObject( + uint32* pSize, + void* pBuffer) const = 0; + + /// Obtains the pointer of code object with ELF format according to the shader type. Returned ELF object is not + /// guaranteed to be unique with different shader type, because a single code object can contain multiple shaders. + /// + /// @param [in] shaderType The shader stage for which the code object are requested. + /// @param [out] pSize The size of the ELF binary. 
+ /// + /// @returns The pointer of ELF binary which contains requested shader stage. + virtual const void* GetCodeObjectWithShaderType( + ShaderType shaderType, + size_t* pSize) const = 0; + + /// Obtains the shader pre and post compilation stats/params for the specified shader stage. + /// + /// @param [in] shaderType The shader stage for which the stats are requested. + /// + /// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for + /// the shader stage mentioned in shaderType. This cannot be nullptr. + /// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the + /// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0. + /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. + /// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error + /// occured. + virtual Result GetShaderStats( + ShaderType shaderType, + ShaderStats* pShaderStats, + bool getDisassemblySize) const = 0; + + /// Obtains the compiled shader ISA code for the shader stage specified. + /// + /// @param [in] shaderType The shader stage for which the shader cache entry is requested. + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required + /// for the shader ISA is given out in the location pSize. + /// + /// @returns Success if the shader ISA code was fetched successfully. + /// +ErrorUnavailable if the shader ISA code was not fetched successfully. + virtual Result GetShaderCode( + ShaderType shaderType, + size_t* pSize, + void* pBuffer) const = 0; + + /// Obtains the generated performance data for the shader stage specified. 
+ /// + /// @param [in] hardwareStage The hardware stage of the shader which the performance data is requested. + /// @param [in, out] pSize Represents the size of the performance data. + /// @param [out] pBuffer If non-null, the performance data is written in the buffer. If null, the size + /// required for the performance data is given out in the location pSize. + /// + /// @returns Success if the performance data was fetched successfully. + /// +ErrorUnavailable if the performance data was not fetched successfully. + virtual Result GetPerformanceData( + Util::Abi::HardwareStage hardwareStage, + size_t* pSize, + void* pBuffer) = 0; + + /// Notifies PAL that this pipeline may make indirect function calls to any function contained within any of the + /// specified @ref IShaderLibrary objects. This gives PAL a chance to perform any late linking steps required to + /// valid execution of the possible function calls (this could include adjusting hardware resources such as GPRs + /// or LDS space for the pipeline). + /// + /// This may be called multiple times on the same pipeline object. Subsequent calls do not invalidate the result + /// of previous calls. + /// + /// This must be called prior to binding this pipeline to a command buffer which will make function calls into any + /// shader function contained within any of the specified libraries. Failure to comply is an error and will result + /// in undefined behavior. + /// + /// Currently only supported on compute pipelines. + /// + /// @param [in] ppLibraryList List of @ref IShaderLibrary object to link with. + /// @param [in] libraryCount Number of valid library objects in the ppLibraryList array. + /// + /// @returns Success if the operation is successful. Other return codes may include: + /// + ErrorUnavailable if called on a graphics pipeline. + /// + ErrorBadPipelineData if any of the libraries in ppLibraryList are not compatible with this pipeline. 
+ /// Reasons for incompatibility include (but are not limited to) different user-data mappings, different + /// wavefront sizes, and other reasons. + virtual Result LinkWithLibraries( + const IShaderLibrary*const* ppLibraryList, + uint32 libraryCount) = 0; + + /// Sets the stack size for indirect function calls made by this pipeline. This may be smaller than or equal to the + /// stack size already determined during pipeline creation or during an earlier call to LinkWithLibraries() because + /// the client has access to more information about which functions contained in those libraries (or in the pipeline + /// itself) are actually going to be called. + /// + /// Note that a future call to LinkWithLibraries() will invalidate this value and this should + /// be called again. + /// + /// @param [in] stackSizeInBytes Client-specified stack size, in bytes. + virtual void SetStackSizeInBytes( + uint32 stackSizeInBytes) = 0; + + /// Retrieve the stack sizes managed by compiler, including the frontend stack and the backend stack. + /// + /// @param [out] pSizes To be filled with both the frontend stack size and the backend stack size, in bytes. + /// + /// @returns SUCCESS + virtual Result GetStackSizes( + CompilerStackSizes* pSizes) const = 0; + + /// Returns the API shader type to hardware stage mapping for the pipeline. + /// + /// @returns The appropriate mapping for this pipeline. + virtual Util::Abi::ApiHwShaderMapping ApiHwShaderMapping() const = 0; + + /// Given the zero-based position of a kernel argument, return a pointer to that argument's metadata. + /// + /// @note Only compute pipelines using the HSA ABI have kernel arguments. + /// + /// @param [in] index The zero-based position of the kernel argument to query. + /// + /// @returns A pointer to the kernel argument's metadata, or null if this pipeline doesn't have this argument. 
+ virtual const Util::HsaAbi::KernelArgument* GetKernelArgument(uint32 index) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const { return m_pClientData; } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Gets the array of underlying pipelines that this pipeline contains. For a normal non-multi-pipeline, + /// this returns a single-entry array pointing to the same IPipeline. For a multi-pipeline compiled in + /// dynamic launch mode, this returns an empty array. The contents of the returned array remain valid + /// until the IPipeline is destroyed. + /// + /// @returns The array of underlying pipelines. NOTE(review): Util::Span appears here without template arguments; they look lost in extraction -- confirm the element type against upstream. + virtual Util::Span GetPipelines() const = 0; + + /// Gets the array of underlying shader libraries that this pipeline contains. For a normal non-multi-pipeline, + /// this returns an empty array. The contents of the returned array remain valid until the IPipeline is + /// destroyed. + /// + /// @returns The array of underlying shader libraries. NOTE(review): same missing Util::Span template arguments as GetPipelines() -- confirm. + virtual Util::Span GetLibraries() const { return {}; } + +protected: + /// @internal Constructor. Prevents use of the new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IPipeline() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevents use of the delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own.
+ virtual ~IPipeline() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; + + IPipeline(const IPipeline&) = delete; + IPipeline& operator=(const IPipeline&) = delete; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h b/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h new file mode 100644 index 0000000000..50ae295d29 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h @@ -0,0 +1,645 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPlatform.h + * @brief Defines the Platform Abstraction Library (PAL) IPlatform interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palSysMemory.h" +#include "palMemTrackerImpl.h" +#include "palDestroyable.h" +#include "palDeveloperHooks.h" + +// DevDriver forward declarations. +namespace DevDriver +{ +class DevDriverServer; +namespace EventProtocol +{ +class EventServer; +} +class SettingsRpcService; +} + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 +namespace SettingsRpcService +{ +class SettingsService; +} +#endif + +#if PAL_BUILD_RDF +// GpuUtil forward declarations. +namespace GpuUtil +{ +class TraceSession; +} +#endif + +#if PAL_ENABLE_LOGGING +namespace Util +{ +struct DbgLoggerFileSettings; +} +#endif + +namespace Pal +{ + +// Forward declarations. +class IDevice; +class IScreen; +struct PalPlatformSettings; +enum class PalEvent : uint32; + +/// Maximum number of Devices possibly attached to a system. +constexpr uint32 MaxDevices = 16; +/// Maximum number of Screens possibly attached to a Device. +constexpr uint32 MaxScreensPerDevice = 6; +/// Maximum number of Screens possibly attached to a system. +constexpr uint32 MaxScreens = (MaxScreensPerDevice * MaxDevices); +constexpr uint32 MaxModePerScreen = 64; + +/// 32-bit PAL version identifier. +/// +/// Version number of the PAL library. Major version is bumped on every promotion from stg/pal to stg/pal_prm. Minor +/// version is bumped when a change is cherry-picked to stg/pal_prm. +/// +/// @see PlatformProperties::palVersion +struct Version +{ + uint16 major; ///< Major version number. 
+ uint16 minor; ///< Minor version number. +}; + +/// Union defining the DevDriver GPU ID layout. +/// This is specifically used by DevDriver across multiple tools/drivers and should not be changed. +union PciId +{ + struct + { + uint32 functionId : 8; ///< PCI function number in the system for this GPU. + uint32 deviceId : 8; ///< PCI device number in the system for this GPU. + uint32 busId : 8; ///< PCI bus number in the system for this GPU. + uint32 reserved : 8; ///< Reserved for future use. + }; + uint32 u32All; ///< Fields packed as 32-bit uint. +}; + +/// Reports capabilities and general properties of this instantiation of the PAL library. +/// +/// This covers any property that is platform-wide as opposed to being tied to a particular device in the system. +/// +/// @see IPlatform::GetProperties +struct PlatformProperties +{ + Version palVersion; ///< Version number of the PAL library. Note that this is distinct from the interface version. + /// It will be regularly updated as described in @ref Version. + union + { + struct + { + uint32 supportNonSwapChainPresents : 1; ///< If set, non-swapchain presents are supported. + uint32 supportBlockIfFlipping : 1; ///< If set, IQueue::Submit can protect against command buffers + /// that write to GPU memory queued for a flip present. + uint32 explicitPresentModes : 1; ///< If set, the PresentMode enums specified during direct and swap + /// chain presents explicitly determine the presentation method. + /// Additionally, the client must enumerate IScreens and use them + /// to explicitly manage fullscreen ownership. Otherwise, the + /// present modes are suggestions and fullscreen ownership is + /// managed internally by PAL. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + }; +}; + +/// The client that PAL may query a profile for.
The order is the same as SHARED_AP_AREA in the KMD escape interface. +enum class ApplicationProfileClient : uint32 +{ + Uninitialized = 0, + Dxx, + Udx, + Cfx, + Ogl, + User3D, + Ocl, + Mmd, + Pplib, + Dal, + Chill, + Spp, + Boost, + DeLag, + Ris, + TurboSync, + DutyCycleScaling, + ProBoost, + RisWindowed, + FreeMux, + FsrOvr, + Count +}; + +/// Describes a primary surface view. +/// +/// @see IPlatform::GetPrimaryLayout() +struct PrimaryViewInfo +{ + Rect rect; ///< Rectangle defining one portion of a primary surface layout. + uint32 numIndices; ///< The size of the gpuIndex array. + uint32 gpuIndex[MaxDevices]; ///< The devices in a linked adapter chain that can use this view. +}; + +/// Specifies output arguments for IPlatform::GetPrimaryLayout(), returning information about the layout of the primary +/// surface. +/// +/// @see IPlatform::GetPrimaryLayout() +struct GetPrimaryLayoutOutput +{ + uint32 numViews; ///< The number of views in the pViewInfoList array. + PrimaryViewInfo* pViewInfoList; ///< The primary surface is composed of these views. + union + { + struct + { + uint32 disablePartialCopy : 1; ///< If this flag is not set, the client can transfer specific views of + /// the primary surface to peer GPUs. Otherwise, the client must transfer the + /// whole primary surface to peer GPUs. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Specifies primary surface layout flags. +}; + +/// Specifies the TurboSync control mode. +enum class TurboSyncControlMode : uint32 +{ + Disable = 0, ///< Disable TurboSync. + Enable = 1, ///< Enable TurboSync. + UpdateAllocations = 2, ///< Update allocations only, without disabling or enabling TurboSync. + Register = 3, ///< Register the current platform as the TurboSync requested platform; this doesn't actually + /// activate TurboSync.
+ Count +}; + +constexpr uint32 TurboSyncMaxSurfaces = 2; ///< Specifies the maximum number of surfaces in a private TurboSync swapchain. + +/// Input argument for IPlatform::TurboSyncControl. TurboSync is a feature that enables an app to render at higher than +/// V-Sync frame rates while still being tearing-free. It creates a private swapchain and copies the application's back +/// buffer to the primary in this private swapchain when the application is flipping. KMD controls the flipping of the +/// private swapchain to the screen. +struct TurboSyncControlInput +{ + TurboSyncControlMode mode; ///< Specifies the TurboSync control mode. + uint32 vidPnSourceId; ///< The vidPnSourceId the call is targeted at. + + /// GpuMemory of the primaries in the private swapchain, per-GPU. This is indexed by the device indices enumerated by + /// the platform. PAL forwards the allocation handles (if the IGpuMemory ptr is not null) to KMD without validation. + const IGpuMemory* pPrimaryMemoryArray[MaxDevices][TurboSyncMaxSurfaces]; +}; + +/** +************************************************************************************************************************ +* @interface IPlatform +* @brief Interface representing a client-configurable context of the PAL platform. +* +* This is the root of all client interaction with PAL. Each IPlatform contains a set of the IDevices and IScreens +* found in the system. +* +* + Creation of IDevice and IScreen objects. +* + Installation of memory management callbacks. +* + Query application profiles from the system. +************************************************************************************************************************ +*/ +class IPlatform : public IDestroyable +{ +public: + /// Enumerates a list of available Devices. + /// + /// This function creates a set of @ref IDevice objects corresponding to the devices attached to the system. + /// CreatePlatform() must be called before this function is called.
+ /// + /// This function may be called multiple times during the lifetime of the PAL lib, in which case all previous + /// @ref IDevice and @ref IScreen objects are automatically destroyed. The client is responsible for + /// destroying all objects attached to the existing @ref IDevice objects before re-calling this function. + /// Re-enumerating Devices is required if ErrorDeviceLost is ever returned by PAL, as this may indicate a device + /// has been physically removed from the system. + /// + /// @note Before IPlatform::Destroy can be called, all devices returned by IPlatform::EnumerateDevices() must be + /// destroyed. + /// + /// @param [out] pDeviceCount Specifies the number of devices available in the system. This is the number of valid + /// entries in pDevices[]. Must not be null. + /// @param [out] pDevices Array to be populated with a device object pointer for each device available in the + /// system. The first *pDeviceCount entries are valid. Must not be null. The array is declared with MaxDevices entries. + /// + /// @returns Success if all Devices were successfully enumerated in pDevices[]. Otherwise, one of the following + /// error codes may be returned: + /// + ErrorInitializationFailed will be returned if PAL is unable to query the available Devices. + virtual Result EnumerateDevices( + uint32* pDeviceCount, + IDevice* pDevices[MaxDevices]) = 0; + + /// Returns the storage size of the object implementing IScreen. + /// + /// Use this to determine the size of each pStorage[] allocation passed to GetScreens(). + /// + /// @returns The storage size in bytes of the object implementing IScreen. + virtual size_t GetScreenObjectSize() const = 0; + + /// Retrieves the list of available screens. + /// + /// This function queries a set of @ref IScreen objects corresponding to the screens attached to the system. + /// CreatePlatform() and IPlatform::EnumerateDevices() must be called before this function is called. + /// + /// This function may be called multiple times during the lifetime of the PAL lib.
Each call returns a new + /// set of screen objects. + /// + /// @ingroup LibInit + /// + /// @param [out] pScreenCount Specifies the number of screens available in the system. This is the number of valid + /// entries in pScreens[] and pStorage[]. Must not be null. + /// @param [in] pStorage Array of caller-allocated storage for the screen objects. Each entry must be the size + /// returned by GetScreenObjectSize(). The caller must always pre-allocate MaxScreens entries; the array must + /// not be null, nor may any entry be null. + /// @param [out] pScreens Array to be populated with a screen pointer for each screen available in the system. + /// The first *pScreenCount entries are valid. Must not be null. + /// + /// @note pScreens[i] uses the storage from pStorage[i]. pStorage[i] is unused for i >= *pScreenCount. + /// + /// @returns Success if all screens were successfully retrieved in pScreens[]. Otherwise, one of the following + /// error codes may be returned: + /// + ErrorUnavailable if this was called prior to IPlatform::EnumerateDevices(). + virtual Result GetScreens( + uint32* pScreenCount, + void* pStorage[MaxScreens], + IScreen* pScreens[MaxScreens]) = 0; + + /// Queries a client-specified application profile in raw format. + /// + /// This function queries the kernel-mode driver to determine if there is a platform-wide profile for a specific + /// application that the client would like to honor. It is optional, and doesn't need to be called if the client + /// does not wish to support application profiles. + /// + /// As the format of the profile is client-specified, the profile will be returned in raw format and the client has the + /// responsibility to parse the profile. @see GpuUtil::ProfileIterator provides a basic capability to iterate all + /// properties in the raw data packet. The memory storing the raw data is managed by PAL. + /// + /// The pFilename string can be the EXE name, like "doom.exe", or the "Content Distribution Network" (CDN) ID, + /// like "SteamAppId:570".
You can use the function GpuUtil::QueryAppContentDistributionId() to get the CDN ID. + /// + /// @ingroup LibInit + /// + /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. + /// See GpuUtil::QueryAppContentDistributionId(). + /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. + /// @param [in] client Client name that KMD will query the profile for. + /// @param [out] pOut Will be filled with the application profile string if the profile exists and was + /// successfully queried. + /// + /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully + /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. + /// Otherwise, one of the following error codes may be returned: + /// + ErrorInvalidPointer will be returned if pFilename or pOut is null. + /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices + /// discovered. + virtual Result QueryRawApplicationProfile( + const wchar_t* pFilename, + const wchar_t* pPathname, + ApplicationProfileClient client, + const char** pOut) = 0; + + /// Enables UMD-side support for the SPP feature (Shader Profiling for Power). The gist of the initial version + /// of this feature is that we will profile important applications to determine which shaders are heavily memory + /// bound, then use these profiles to program the RLC to dynamically reduce engine clocks when running such shaders. + /// This should result in power savings with a limited perf impact. + /// + /// Each of these app profiles will include one or more tables specifying how to program the RLC. + /// There may be multiple tables in cases where we need different RLC programming based on user-controlled factors + /// that affect memory boundedness: resolution, MSAA rate, etc.
+ /// + /// @ingroup LibInit + /// + /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. + /// See GpuUtil::QueryAppContentDistributionId(). + /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. + /// + /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully + /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. + /// Otherwise, one of the following error codes may be returned: + /// + ErrorInvalidPointer will be returned if pFilename is null. + /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices + /// discovered. + virtual Result EnableSppProfile( + const wchar_t* pFilename, + const wchar_t* pPathname) = 0; + + /// Reports the properties of the platform. + /// + /// Returns the capabilities and general properties of this platform instantiation. + /// + /// @param [out] pProperties Capabilities and general properties of this platform instantiation (not tied to a + /// particular device). + /// + /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pProperties is null. + virtual Result GetProperties( + PlatformProperties* pProperties) = 0; + + /// Installs the callback into the specified platform. + /// + /// @param [in] pPlatform The platform to install the callback into. Must not be null; it is dereferenced without a check. + /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. + /// @param [in] pPrivateData Private data that is installed with the callback for use by the installer.
+ static void InstallDeveloperCb( + IPlatform* pPlatform, + Developer::Callback pfnDeveloperCb, + void* pPrivateData) + { pPlatform->InstallDeveloperCb(pfnDeveloperCb, pPrivateData); } + + /// Returns the currently enabled developer callback types. + /// + /// @returns The bitmask of currently enabled developer callback types. + virtual uint32 GetEnabledCallbackTypes() const = 0; + + /// Sets the currently enabled developer callback types. + /// + /// @param [in] enabledCallbackTypesMask A bitmask of client-requested developer callback types to enable. + virtual void SetEnabledCallbackTypes( + uint32 enabledCallbackTypesMask) = 0; + + /// Returns a pointer to the developer driver server object if developer mode is enabled on the system. + /// + /// @returns A valid DevDriver::DevDriverServer pointer if developer mode is enabled. If developer mode is not + /// enabled, nullptr will be returned. + virtual DevDriver::DevDriverServer* GetDevDriverServer() = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 + /// Will be replaced by GetSettingsRpcService(). + virtual SettingsRpcService::SettingsService* GetSettingsService() = 0; +#endif + + /// Client drivers can register their DevDriver-based settings components via SettingsRpcService. + /// + /// @returns A pointer to a SettingsRpcService object. Could be nullptr if developer driver mode is not enabled. + virtual DevDriver::SettingsRpcService* GetSettingsRpcService() = 0; + + /// Returns a pointer to the event server object. The event server will soon move out of the DevDriver + /// server. Hence the need to provide a separate interface to access the event server. + /// + /// @returns A valid EventServer pointer or nullptr if not valid. + virtual DevDriver::EventProtocol::EventServer* GetEventServer() = 0; + +#if PAL_BUILD_RDF + /// Returns a pointer to the current trace session if one was created during startup. + /// + /// @returns A valid TraceSession pointer if a session currently exists.
If a trace session was not created during + /// startup, nullptr will be returned. + virtual GpuUtil::TraceSession* GetTraceSession() = 0; + + /// Indicates a frame count increment to the frame trace controller. + /// The client driver is responsible for calling this method once per frame. + /// + /// @param [in] pQueue The queue on which a new frame has been detected. + virtual void UpdateFrameTraceController( + IQueue *pQueue) = 0; +#endif + + /// Gets the GPU ID for a given PAL device index. + /// + /// The GPU ID is determined from (BusID << 16) | (DeviceID << 8) | FunctionID. See @ref PciId. + /// + /// @param [in] gpuIndex The index of the GPU. + /// + /// @returns A GPU ID described above or UnknownGPUID if the device doesn't exist. + virtual PciId GetPciId(uint32 gpuIndex) = 0; + + /// Indicates whether tracing has been enabled. + /// + /// @returns True if tracing is enabled, false otherwise. + virtual bool IsTracingEnabled() const = 0; + + /// Indicates whether the driver has been signaled to enable crash analysis mode. + /// + /// @returns True if crash analysis is enabled, false otherwise. + virtual bool IsCrashAnalysisModeEnabled() const = 0; + + /// Indicates whether the driver has been signaled to enable Raytracing Shader Data Tokens. + /// + /// @returns True if Raytracing Shader Data Tokens is enabled, false otherwise. + virtual bool IsRaytracingShaderDataTokenRequested() const = 0; + + /// Returns a reference to the Platform settings structure. + /// + /// @returns A reference to a PalPlatformSettings structure. + virtual const PalPlatformSettings& PlatformSettings() const = 0; + + /// Gets the primary surface layout based upon the VidPnSource provided by the client. + /// + /// This function is used by the client to query the layout of the primary surface. The layout describes how the primary + /// surface is composed of a set of views. Each view provides the rectangle of the surface area and the GPUs + /// this surface area will be displayed on.
+ /// The client should make a first call passing pPrimaryLayoutOutput->pViewInfoList as NULL to query the number of views + /// this primary surface has. + /// Based on pPrimaryLayoutOutput->numViews, the client then allocates the buffer for pViewInfoList and + /// makes the escape call again to query the actual view information. + /// + /// @param [in] vidPnSourceId VidPnSource ID that's associated with a primary surface. + /// @param [in, out] pPrimaryLayoutOutput Primary surface layout output arguments. + /// + /// @returns Success if the display layout on the given vidPnSourceId was successfully queried. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if pPrimaryLayoutOutput is invalid. + /// + ErrorUnavailable if there is no implementation on the current platform. + /// + ErrorOutOfMemory if there is not enough system memory. + virtual Result GetPrimaryLayout( + uint32 vidPnSourceId, + GetPrimaryLayoutOutput* pPrimaryLayoutOutput) = 0; + + /// Calls the TurboSyncControl escape to control TurboSync on a specific vidPnSourceId. + /// + /// The function is called when clients intend to toggle TurboSync on a vidPnSourceId. The client should allocate + /// private swapchain primary surfaces that are compatible with the application swapchain primaries. When used to + /// activate TurboSync, the private primaries' handles need to be passed in the TurboSyncControlInput data. + /// + /// @param [in] turboSyncControlInput TurboSyncControl input arguments. See TurboSyncControlInput. + /// + /// @returns Success if the TurboSyncControl request is handled successfully. + virtual Result TurboSyncControl( + const TurboSyncControlInput& turboSyncControlInput) = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data.
+ void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Allocates memory using the platform's ForwardAllocator (via m_memTracker when PAL_MEMTRACK is set). + /// + /// @param [in] allocInfo @see Util::AllocInfo + /// + /// @returns Pointer to the allocated memory on success, nullptr on failure. + void* Alloc(const Util::AllocInfo& allocInfo) + { +#if PAL_MEMTRACK + return m_memTracker.Alloc(allocInfo); +#else + return m_allocator.Alloc(allocInfo); +#endif + } + + /// Frees memory using the platform's ForwardAllocator (via m_memTracker when PAL_MEMTRACK is set). + /// + /// @param [in] freeInfo @see Util::FreeInfo + void Free(const Util::FreeInfo& freeInfo) + { +#if PAL_MEMTRACK + m_memTracker.Free(freeInfo); +#else + m_allocator.Free(freeInfo); +#endif + } + + /// Logs a text string via the developer driver bus if it is currently connected. + /// + /// @param [in] level Log priority level associated with the message. + /// @param [in] categoryMask Log category mask that represents what category fields the message relates to. + /// @param [in] pFormat Format string for the log message. + /// @param [in] args Variable arguments that correspond to the format string.
+ void LogMessage(LogLevel level, + LogCategoryMask categoryMask, + const char* pFormat, + ...) + { + va_list args; + va_start(args, pFormat); + LogMessage(level, categoryMask, pFormat, args); + va_end(args); + } + + /// Logs an event using the DevDriver protocol. The default implementation does nothing. + /// + /// @param [in] eventId The type of event you want to log. + /// @param [in] pEventData A pointer to the struct corresponding to the event id. + /// @param [in] eventDataSize The size of the event data struct. + virtual void LogEvent( + PalEvent eventId, + const void* pEventData, + uint32 eventDataSize) {} + +#if PAL_ENABLE_LOGGING + /// Function to access the current settings of the file logger. + /// Clients can call this function to get file logger settings in order to configure + /// this logger at the time of its creation. + /// + /// @param [out] pSettings A struct into which the file logger settings are copied. + virtual void GetDbgLoggerFileSettings( + Util::DbgLoggerFileSettings* pSettings) = 0; +#endif + +protected: + /// @internal Constructor. Prevents use of the new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IPlatform( + const Util::AllocCallbacks& allocCb) + : +#if PAL_MEMTRACK + m_memTracker(&m_allocator), +#endif + m_allocator(allocCb), + m_pClientData(nullptr) { } + + /// @internal Destructor. Prevents use of the delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IPlatform() { } + + /// @internal Initialization common to all platforms; must be called in subclass overrides of this function. + /// Currently only handles initialization of the memory leak tracker.
+ virtual Result Init() + { +#if PAL_MEMTRACK + return m_memTracker.Init(); +#else + return Result::Success; +#endif + } + + /// Used by the static InstallDeveloperCb to install the event handler according to the derived platform. + /// + /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. + /// @param [in] pPrivateData Private data that is installed with the event handler for use by the installer. + virtual void InstallDeveloperCb( + Developer::Callback pfnDeveloperCb, + void* pPrivateData) = 0; + +#if PAL_MEMTRACK + /// @internal Memory leak tracker. Requires an allocator in order to perform the actual allocations. We can't + /// provide this platform as that allocator because that would result in a stack overflow. We must give it our forward allocator. + Util::MemTracker<Util::ForwardAllocator> m_memTracker; +#endif + + /// @internal Memory allocator. Calls to Alloc() and Free() are chained down to the allocator's counterparts. + Util::ForwardAllocator m_allocator; + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h new file mode 100644 index 0000000000..69e4de70c1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h @@ -0,0 +1,234 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueryPool.h + * @brief Defines the Platform Abstraction Library (PAL) IQueryPool interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +/// Specifies a category of GPU query pool. +enum class QueryPoolType : uint32 +{ + Occlusion = 0x0, ///< Occlusion query pool. Supports queries based on the Z test. + PipelineStats = 0x1, ///< Pipeline stats query pool. 
Supports queries based on statistics from the GPU's execution + /// such as a count of prims generated, shader invocations, etc. + StreamoutStats = 0x2, ///< Streamout query pool. Supports queries based on statistics from the GPU's execution + /// such as number of primitives written to SO buffer and storage needed. + Count, +}; + +/// Specifies what data a query slot must produce. Some query pool types support multiple query types. +enum class QueryType : uint32 +{ + Occlusion = 0x0, ///< The total passes recorded by the Z test. + BinaryOcclusion = 0x1, ///< One if there were one or more Z test passes, zero otherwise. + PipelineStats = 0x2, ///< The total statistics selected by the given pipeline stats query pool. + StreamoutStats = 0x3, ///< SO statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats1 = 0x4, ///< SO1 statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats2 = 0x5, ///< SO2 statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats3 = 0x6, ///< SO3 statistics tracked by CP/VGT including primitives written and storage needed. + Count, +}; + +/// Specifies which pipeline stats should be tracked by a pipeline stats query pool. +enum QueryPipelineStatsFlags : uint32 +{ + QueryPipelineStatsIaVertices = 0x1, ///< Input vertices. + QueryPipelineStatsIaPrimitives = 0x2, ///< Input primitives. + QueryPipelineStatsVsInvocations = 0x4, ///< Vertex shader invocations. + QueryPipelineStatsGsInvocations = 0x8, ///< Geometry shader invocations. + QueryPipelineStatsGsPrimitives = 0x10, ///< Geometry shader primitives. + QueryPipelineStatsCInvocations = 0x20, ///< Clipper invocations. + QueryPipelineStatsCPrimitives = 0x40, ///< Clipper primitives. + QueryPipelineStatsPsInvocations = 0x80, ///< Pixel shader invocations. + QueryPipelineStatsHsInvocations = 0x100, ///< Hull shader invocations. 
+ QueryPipelineStatsDsInvocations = 0x200, ///< Domain shader invocations. + QueryPipelineStatsCsInvocations = 0x400, ///< Compute shader invocations. + QueryPipelineStatsTsInvocations = 0x800, ///< Task shader invocations. + QueryPipelineStatsMsInvocations = 0x1000, ///< Mesh shader invocations. + QueryPipelineStatsMsPrimitives = 0x2000, ///< Mesh shader primitives. + QueryPipelineStatsAll = 0x3FFF ///< All of the above stats. +}; + +/// Specifies properties for @ref IQueryPool creation. Input structure to IDevice::CreateQueryPool(). +struct QueryPoolCreateInfo +{ + QueryPoolType queryPoolType; ///< Type of query pool to create (i.e., occlusion vs. pipeline stats). + uint32 numSlots; ///< Number of slots in the query pool. + uint32 enabledStats; ///< An ORed mask of stats flags specific to the query pool type. + /// @see QueryPipelineStatsFlags for PipelineStats query pools. + union + { + struct + { + /// If true, this query pool can have results retrieved using the CPU (using @ref IQueryPool::GetResults) + /// and can be reset using the CPU (using @ref IQueryPool::Reset). Otherwise, the client must use command + /// buffers to perform these operations (using @ref ICmdBuffer::CmdResetQueryPool and + /// @ref ICmdBuffer::CmdResolveQuery). + uint32 enableCpuAccess : 1; + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed together as a uint32. + } flags; ///< Flags controlling QueryPool behavior. +}; + +/// Controls operations that compute query results. +enum QueryResultFlags : uint32 +{ + QueryResultDefault = 0x0, ///< Default to 32-bit results with no waiting. + QueryResult64Bit = 0x1, ///< Store all results as 64-bit values. + QueryResultWait = 0x2, ///< Wait for the queries to finish when computing the results. 
+ QueryResultAvailability = 0x4, ///< If the results of a query are available at computation time a one will be + /// written as a separate value after the result value, if the results were not + /// available a zero will be written. + QueryResultPartial = 0x8, ///< If the final result of a query would be unavailable, then return a + /// result for that query between 0 and what the final result would be. + QueryResultAccumulate = 0x10, ///< Results are added to the values present in the destination, if availability + /// data is enabled it will be ANDed with the present availability data. + QueryResultPreferShaderPath = 0x20, ///< Prefer a shader resolve path over a command processor path. + QueryResultOnlyPrimNeeded = 0x40, ///< Select only primitives storage needed in Streamout query results + QueryResultAll = 0x7F ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/** + *********************************************************************************************************************** + * @interface IQueryPool + * @brief Represents a set of queries that can be used to retrieve detailed info about the GPU's execution of a + * particular range of a command buffer. + * + * Currently, only occlusion queries and pipeline statistic queries are supported. All queries in a pool are the same + * type. + * + * @see IDevice::CreateQueryPool() + *********************************************************************************************************************** + */ +class IQueryPool : public IGpuMemoryBindable +{ +public: + /// Retrieves query results from a query pool. + /// + /// Multiple consecutive query results can be retrieved with one call. + /// + /// @param [in] flags Flags that control the result data layout and how the results are retrieved. + /// @param [in] queryType Specifies what data the query slots must produce. + /// @param [in] startQuery First query pool slot to retrieve data for. 
+ /// @param [in] queryCount Number of query pool slots to retrieve data for. + /// @param [in] pMappedGpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, + /// this method will use Map/UnMap to access the data. + /// @param [in,out] pDataSize Input value specifies the available size in pData in bytes; output value reports the + /// number of bytes required to hold all result data. + /// @param [out] pData Location where the query results should be written. Can be null in order to query the + /// required size. The data returned depends on the query pool type and flags. All data + /// entries are either uint32 or uint64 integers. One or more type-specific entries will + /// be optionally followed by one entry for availability. The type-specific data is:
+ /// + QueryOcclusion: One entry to store the zPass count. + /// + QueryPipelineStats: One entry per statistic enabled in the create info. The stats + /// will be written in the appropriate order for each PAL client. + /// @param [in] stride Stride in bytes between subsequent query result data or zero to request tightly + /// packed result data. + /// + /// @returns Success if query results were successfully returned in pData, or NotReady if any of the requested query + /// slots does not yet have results available. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidValue if the range defined by startQuery and queryCount is not valid for this query pool. + /// + ErrorGpuMemoryNotBound if the query pool requires GPU memory but none is bound. + /// + ErrorInvalidMemorySize if pData is non-null and the value stored in pDataSize is too small. + virtual Result GetResults( + QueryResultFlags flags, + QueryType queryType, + uint32 startQuery, + uint32 queryCount, + const void* pMappedGpuAddr, + size_t* pDataSize, + void* pData, + size_t stride) = 0; + + /// Use CPU to reset the query pool slots. + /// + /// Supported for occlusion and video decode statistics query pools. + /// + /// @param [in] startQuery First query pool slot to reset. + /// @param [in] queryCount Number of query pool slots to reset. + /// @param [in] pMappedCpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, + /// this method will use Map/UnMap to access the data. + /// + /// @returns Success if the reset was successfully performed. + virtual Result Reset( + uint32 startQuery, + uint32 queryCount, + void* pMappedCpuAddr) = 0; + + /// Returns the distance, in bytes, between successive query slots in the bound GPU memory. + /// This method is only supported for @ref QueryPoolType::VideoDecodeStats + /// + /// @returns the distance, in bytes, between successive query slots in the bound GPU memory.
+ virtual gpusize GetQuerySlotStride() const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IQueryPool() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IQueryPool() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h new file mode 100644 index 0000000000..55467e7e6d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h @@ -0,0 +1,765 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueue.h + * @brief Defines the Platform Abstraction Library (PAL) IQueue interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include "palEvent.h" + +namespace Pal +{ + +// Forward declarations. 
+class ICmdBuffer; +class IFence; +class IGpuMemory; +class IImage; +class IPrivateScreen; +class IQueueSemaphore; +class IScreen; +class ISwapChain; +struct CmdBufInfo; +struct GpuMemSubAllocInfo; +struct GpuMemoryRef; +struct DoppRef; + +enum class VirtualGpuMemAccessMode : uint32; + +/// Specifies whether presents are windowed or fullscreen. This will determine whether the present is performed via a +/// BLT or flip. +enum class PresentMode : uint32 +{ + Unknown, + Windowed, + Fullscreen, + Count +}; + +/// Enumerates the possible overrides for the flip interval. +enum class FlipIntervalOverride : uint32 +{ + _None = 0, ///< No override. + Immediate = 1, ///< Zero frames of flip latency. + ImmediateAllowTearing = 2, ///< Same as Immediate, but allows tearing (no vsync). + One = 3, ///< One frame of flip latency. + Two = 4, ///< Two frames of flip latency. + Three = 5, ///< Three frames of flip latency. + Four = 6, ///< Four frames of flip latency. +}; + +/// Defines flags for describing which types of present modes are supported on a given queue. +enum PresentModeSupport : uint32 +{ + SupportWindowedPresent = 0x1, + SupportWindowedPriorBlitPresent = 0x2, + SupportFullscreenPresent = 0x4, +}; + +/// Defines submit-time bottlenecks which PAL can potentially optimize. +enum class SubmitOptMode : uint32 +{ + Default = 0, ///< PAL will enable optimizations when generally efficient. + Disabled = 1, ///< Disable all optimizations that could be detrimental in special cases. + MinKernelSubmits = 2, ///< Minimize the overhead of launching command buffers on the CPU and GPU. + MinGpuCmdOverhead = 3, ///< Minimize the overhead of reading command buffer commands on the GPU. + Count +}; + +/// Enumerates vcn instance affinity statuses +enum MmAffinityStatus : uint32 +{ + MmAffinityNotAllowed = 0, ///< The specific vcn instance can't be used. + MmAffinityAllowed = 1 ///< The specific vcn instance can be used. +}; + +/// Union describes all vcn instance affinity status. 
+union MmAffinity +{ + struct + { + uint32 vcn0Affinity : 2; ///< Affinity for instance vcn0 + uint32 vcn1Affinity : 2; ///< Affinity for instance vcn1 + uint32 reserved : 28; ///< Reserved (all 0) + }; + uint32 u32All; +}; + +/// Structure describing dump information for a command buffer. +struct CmdBufferDumpDesc +{ + EngineType engineType; ///< The engine type that this buffer is targeted for. + QueueType queueType; ///< The type of queue that this buffer is being created on. + SubEngineType subEngineType; ///< The ID of which sub-engine that this buffer is made for. + + uint32 cmdBufferIdx; ///< The index into the SubmitInfo ppCmdBuffers array that this + /// command buffer dump came from. + union + { + struct + { + uint8 isPreamble : 1; ///< Set if the buffer is an internal preamble command buffer. + uint8 isPostamble : 1; ///< Set if the buffer is an internal postamble command buffer. + uint8 reserved : 6; ///< Reserved for future use. + }; + uint8 u32All; ///< Flags packed as 8-bit uint. + } flags; + +}; + +/// Structure describing a command buffer chunk for use while dumping command buffers. +struct CmdBufferChunkDumpDesc +{ + uint32 id; ///< ID (number) of this command chunk within the command buffer. + const void* pCommands; ///< Pointer to the command data. + size_t size; ///< Size of valid data in bytes pointed to in pCommands. +}; + +/// Definition for command buffer dumping callback. +/// +/// @param [in] cmdBufferDesc Description of the command buffer. +/// @param [in] pChunks Pointer to an array of command buffer chunk descriptions. +/// @param [in] numChunks The number of chunks pointed to in pChunks. +typedef void (PAL_STDCALL* CmdDumpCallback)( + const CmdBufferDumpDesc& cmdBufferDesc, + const CmdBufferChunkDumpDesc* pChunks, + uint32 numChunks, + void* pUserData); + +/// Specifies properties for @ref IQueue creation. Input structure to IDevice::CreateQueue(). 
+struct QueueCreateInfo +{ + QueueType queueType; ///< Selects which type of queue to create. + EngineType engineType; ///< Selects which type of engine to create. + uint32 engineIndex; ///< Which instance of the specified engine type to query. For example, there + /// can be multiple compute queues, so this parameter distinguishes between them. + SubmitOptMode submitOptMode; ///< A hint telling PAL which submit-time bottlenecks should be optimized, if any. + QueuePriority priority; ///< A hint telling PAL to create queue with proper priority. + /// It is only supported if supportQueuePriority is set in DeviceProperties. + /// In Linux, if we don't have root privilege, the creation with above-Medium + /// priority will fail. Client should take the corresponding action like retry + /// with lower priority, if necessary. + struct + { + uint32 aqlQueue : 1; ///< Compute queue will process AQL packets and kernels + uint32 windowedPriorBlit : 1; ///< All windowed presents on this queue are notifications + /// that the client has manually done a blit present + uint32 tmzOnly : 1; ///< This queue allows only TMZ submissions. Required for + /// compute TMZ submits. + +#if PAL_AMDGPU_BUILD + uint32 enableGpuMemoryPriorities : 1; ///< Enables support for GPU memory priorities on this Queue. + /// This is optional because enabling the feature requires + /// a small amount of memory overhead per-Queue for + /// bookkeeping purposes. +#else + uint32 placeholder2 : 1; ///< Reserved field. Set to 0. +#endif + uint32 dispatchTunneling : 1; ///< This queue uses compute dispatch tunneling. + + uint32 forceWaitIdleOnRingResize : 1; ///< This queue needs to wait for idle before resizing the RingSet. + /// This is intended as a workaround for misbehaving applications. +#if defined(_WIN32) + uint32 nullRendering : 1; ///< Setting this bit makes this queue behave like IfhModeKmd. +#else + uint32 placeholder3 : 1; ///< Reserved field. Set to 0.
+#endif + uint32 reserved : 25; ///< Reserved for future use. + }; + + uint32 numReservedCu; ///< The number of reserved compute units for RT CU queue + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 persistentCeRamOffset; ///< Byte offset to the beginning of the region of CE RAM which this Queue should + /// preserve across consecutive submissions. Must be a multiple of 32. It is an + /// error to specify a nonzero value here if the Device does not support + /// @ref supportPersistentCeRam for the Engine this Queue will attach to. + uint32 persistentCeRamSize; ///< Amount of CE RAM space which this Queue should preserve across consecutive + /// submissions. Units are in DWORDs, and this must be a multiple of 8. It is an + /// error to specify a nonzero value here if the Device does not support + /// @ref supportPersistentCeRam for the Engine this Queue will attach to. +#endif + + uintptr_t aqlPacketList; ///< Location of the HIP runtime's info about this queue +}; + +/// Specifies the portion of @ref SubmitInfo that is specific to each sub-queue in a multi-queue object (@see +/// IDevice::CreateMultiQueue). Effectively, this enables specifying a different set of command buffers for each +/// queue that makes up a gang submission to a multi-queue object. +struct PerSubQueueSubmitInfo +{ + uint32 cmdBufferCount; ///< Number of command buffers to be submitted (can be 0 if this submit doesn't + /// involve work for the relevant queue). + ICmdBuffer*const* ppCmdBuffers; ///< Array of cmdBufferCount command buffers to be submitted. Command buffers + /// that are part of a ganged submit must guarantee the conditions required + /// for the optimizeExclusiveSubmit flag. + const CmdBufInfo* pCmdBufInfoList; ///< Null, or an array of cmdBufferCount structs providing additional + /// info about the command buffers being submitted. If non-null, + /// elements are ignored if their isValid flag is false.
+}; + +/// Specifies all information needed to execute a set of command buffers. Input structure to IQueue::Submit(). +/// +/// Some members of this structure are not supported on all platforms. The client must check the appropriate properties +/// structures to determine if the corresponding features are supported: +/// + pGpuMemoryRefs: Support is indicated by supportPerSubmitMemRefs in @ref DeviceProperties. +/// + ppBlockIfFlipping: Support is indicated by supportBlockIfFlipping in @ref PlatformProperties. If it is supported, +/// the client must not specify a blockIfFlippingCount greater than MaxBlockIfFlippingCount. +/// +/// @note If this queue is running in physical submission mode (due to hardware restrictions), the gpuMemRefCount and +/// pGpuMemoryRefs members of this structure are ignored because the command buffers themselves contain their own +/// GPU memory reference lists. +struct MultiSubmitInfo +{ + const PerSubQueueSubmitInfo* pPerSubQueueInfo;///< Specifies per-subqueue information for the submit. Typically + /// this is a pointer to a single entry specifying the command + /// buffers to be submitted on this queue. For gang submission on + /// a multi-queue, this should be an array with one entry per + /// sub-queue. The array size must be less than or equal to the + /// queueCount specified when the multi-queue was created and + /// the workload specified in each entry will be assigned to the + /// corresponding sub-queue. It is valid to have a cmdBufferCount + /// of 0 for sub-queues without work. Can be null if + /// perSubQueueInfoCount is 0. + uint32 perSubQueueInfoCount; ///< Number of PerSubQueueSubmitInfo entries to be submitted. Can be zero if + /// there is no work to submit. + uint32 gpuMemRefCount; ///< Number of GPU memory references for this submit. + const GpuMemoryRef* pGpuMemoryRefs; ///< Array of gpuMemRefCount GPU memory references. Can be null if + /// gpuMemRefCount is zero.
The GPU memory objects will be made + /// resident for the duration of this submit. + uint32 doppRefCount; ///< Number of DOPP desktop texture references for this submit. + const DoppRef* pDoppRefs; ///< Array of doppRefCount DOPP texture references. Can be null if + /// doppRefCount is zero. + uint32 externPhysMemCount; ///< Number of entries in ppExternPhysMem. + const IGpuMemory** ppExternPhysMem; ///< Array of external physical memory allocations to be initialized + /// as part of this submit. The first submit that references a + /// particular external physical memory allocation must include + /// that allocation in this list. Subsequent submits that reference + /// the same allocation should not include it in this list, as it + /// would trigger redundant GPU page table initialization. + uint32 blockIfFlippingCount; ///< Number of GPU memory objects to protect when flipped. + const IGpuMemory*const* ppBlockIfFlipping; ///< Array of blockIfFlippingCount GPU memory objects. Can be null if + /// blockIfFlippingCount is zero. The command buffers will not be + /// scheduled to the GPU while a fullscreen (flip) present is queued + /// for any of these GPU memory allocations. + uint32 fenceCount; ///< Number of fence objects to be signaled once the last command buffer + /// in this submission completes execution. + IFence** ppFences; ///< Array of fence objects. Can be null if fenceCount is zero. + CmdDumpCallback pfnCmdDumpCb; ///< Null, or a callback function to handle the dumping of the + /// command buffers used in this submit. + void* pUserData; ///< Client provided data to be passed to callback. + + uint32 stackSizeInDwords; ///< 0, or the max of stack frame size for indirect shaders of the + /// pipelines referenced in the command buffers of this submission. + /// The size is per native thread. So that the client will have to + /// multiply by 2 if a Wave64 shader that needs scratch is used. 
+ /// Note that the size will not shrink for the lifetime of the queue + /// once it is grown and only affects compute scratch ring. + const IGpuMemory* pFreeMuxMemory; ///< The gpu memory object of the private flip primary surface for the + /// FreeMux feature. +}; + +typedef MultiSubmitInfo SubmitInfo; + +/// The value of blockIfFlippingCount in @ref SubmitInfo cannot be greater than this value. +constexpr uint32 MaxBlockIfFlippingCount = 16; + +/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentDirect(). +struct PresentDirectInfo +{ + union + { + struct + { + uint32 fullscreenDoNotWait : 1; ///< Fail the present immediately if the present queue is full. + uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. + uint32 dstIsTypedBuffer : 1; ///< True if the destination is a typed buffer instead of an image. + uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not + /// execute a present if this is true but may update internal + /// tracking state. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Present flags. + + OsWindowHandle hWindow; ///< Native OS window handle that this image should be presented to. + PresentMode presentMode; ///< Chooses between windowed and fullscreen present. + uint32 presentInterval; ///< Must be an integer from 0 to 4. 0 indicates that the present should + /// occur immediately (may tear), and 1-4 indicates the present should + /// occur after 1 to 4 vertical syncs. Only valid for fullscreen presents. + union + { + IImage* pSrcImage; ///< Optional: The image to be presented. If null, the present will not + /// occur but PAL may still call into the OS on certain platforms that + /// expect it. + IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to be presented. 
If null, the present will not occur + /// but PAL may still call into the OS on certain platforms that expect it. + }; + union + { + IImage* pDstImage; ///< Optional: copy from the source image to this image. If null, PAL will + /// automatically copy into the appropriate platform-specific destination. + /// This is only supported for windowed mode presents. + IGpuMemory* pDstTypedBuffer; ///< The typed buffer to be presented. If null, the present will not occur + /// but PAL may still call into the OS on certain platforms that expect it. + }; + +}; + +/// Media stream counter information. +struct MscInfo +{ + uint64 targetMsc; ///< if the current MSC is less than targetMsc, the buffer swap + ///< will occur when the MSC value becomes equal to targetMsc + uint64 divisor; ///< Divisor + ///< the buffer swap will occur the next time the MSC value is + ///< incremented to a value such that MSC % divisor = remainder + ///< if the current MSC is greater than or equal to targetMsc + uint64 remainder; ///< Remainder +}; + +/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentSwapChain(). +struct PresentSwapChainInfo +{ + PresentMode presentMode; ///< Chooses between windowed and fullscreen present. + IImage* pSrcImage; ///< The image to be presented. + ISwapChain* pSwapChain; ///< The swap chain associated with the source image. + uint32 imageIndex; ///< The index of the source image within the swap chain. Ownership of this image + /// index will be released back to the swap chain if this call succeeds. + uint32 rectangleCount; ///< Number of valid rectangles in the pRectangles array. + uint32 syncInterval; ///< Applicable only when syncIntervalOverride is set + /// 0 - The presentation occurs immediately, there is no synchronization. + /// 1 through 4 - Synchronize presentation after the nth vertical blank. + const Rect* pRectangles; ///< Array of rectangles defining the regions which will be updated.
+ uint64 presentId; ///< PresentId functions as an identifier for present operations on a swapchain. + /// If this PresentId is non-zero, then the application can later use this value + /// to refer to that image presentation. A value of zero indicates that this + /// presentation has no associated presentId. A non-zero presentId must be greater + /// than any non-zero presentId passed previously by the application for the same + /// swapchain. + union + { + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 941 + uint32 notifyOnly : 1; ///< True if it is a notify-only present +#else + uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not + /// execute a present if this is true but may update internal + /// tracking state. +#endif + uint32 isTemporaryMono : 1; ///< True if WS Stereo is enabled, but 3D display mode turned off. + uint32 turboSyncEnabled : 1; ///< Whether TurboSync is enabled. + uint32 syncIntervalOverride : 1; ///< Override default syncInterval with the value in syncInterval + /// Supported only on Windows wsiPlatforms. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< PresentSwapChainInfo flags. +#if PAL_AMDGPU_BUILD + MscInfo mscInfo; ///< Media stream counter information +#endif +}; + +/// Specifies a mapping from a range of pages in a virtual GPU memory object to a range of pages in a real GPU memory +/// object. Input to IQueue::RemapVirtualMemoryPages(). +/// +/// When mapping pages of a virtual GPU memory object to a range of pages in a real GPU memory object on a remote GPU, +/// the client must point pRealGpuMem at a peer GPU memory object created on the input queue's device instead of the +/// actual real GPU memory object created on the remote device. This is required for two reasons: +/// 1. PAL can only view remote GPU memory using peer objects. +/// 2. 
PAL enforces a separation of state between different IDevice object families. +/// +/// virtualStartOffset and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. +/// realStartOffset must be aligned to the realMemAllocGranularity member of @ref DeviceProperties. +struct VirtualMemoryRemapRange +{ + IGpuMemory* pVirtualGpuMem; ///< Virtual GPU memory object whose mapping is being updated. + gpusize virtualStartOffset; ///< Start of the page range to be updated, in bytes. + IGpuMemory* pRealGpuMem; ///< Real GPU memory object the virtual range should point at. + gpusize realStartOffset; ///< Start of the page range in the real GPU memory object, in bytes. + gpusize size; ///< Size of the mapping range, in bytes. + VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages. + /// This parameter is ignored on some platforms. +}; + +/// Specifies a set of page mappings to copy between virtual GPU memory objects. The source and destination can be the +/// same memory object and the source and destination regions may overlap. Input to IQueue::CopyVirtualMemoryPageMappings(). +/// +/// srcStartOffset, dstStartOffset, and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. +struct VirtualMemoryCopyPageMappingsRange +{ + IGpuMemory* pSrcGpuMem; ///< Virtual GPU memory object whose mapping is being copied from. + gpusize srcStartOffset; ///< Start of the copy source range, in bytes. + IGpuMemory* pDstGpuMem; ///< Virtual GPU memory object whose mapping is being copied to. + gpusize dstStartOffset; ///< Start of the copy destination range, in bytes. + gpusize size; ///< Size of the mapping range, in bytes. +}; + +/// Specifies kernel-level information about a context. +struct KernelContextInfo +{ + union + { + struct + { + uint32 hasDebugVmid : 1; ///< True if the context has acquired the debug vmid.
+ uint32 hasHighPriorityVmid : 1; ///< True if the context has acquired the high priority vmid. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Context flags. + + uint64 contextIdentifier; ///< Kernel scheduler context identifier. +}; + +/** + *********************************************************************************************************************** + * @interface IQueue + * @brief Represents a queue of work for a particular GPU engine on a device. + * + * An IQueue object is a virtual representation of a hardware engine on the device. Multiple IQueue objects can be + * created and have work submitted on them in parallel. Work is submitted to a queue through @ref ICmdBuffer objects, + * and work can be synchronized between multiple queues using @ref IQueueSemaphore objects. + * + * @see IDevice::GetQueue() + *********************************************************************************************************************** + */ +class IQueue : public IDestroyable +{ +public: + /// Submits a group of root command buffers for execution on this queue. + /// + /// @param [in] submitInfo Specifies all command buffers to execute along with other residency and synchronization + /// information. See @ref SubmitInfo for additional, important documentation. + /// + /// @returns Success if the command buffer was successfully submitted. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if: + /// - any of the array inputs are null when their counts are non-zero. + /// - any members of non-null pointer arrays are null. + /// + ErrorTooManyMemoryReferences if the total number of memory references (device/queue global and + /// per-command buffer) is too large. + /// + ErrorInvalidValue if blockIfFlippingCount is too large. + /// + ErrorIncompleteCommandBuffer if any of the submitted command buffers are not properly constructed.
+ /// + ErrorIncompatibleQueue if any submitted command buffer does not match this queue's type (e.g., + /// universal, graphics, DMA). + virtual Result Submit( + const MultiSubmitInfo& submitInfo) = 0; + + /// Waits for all previous submissions on this queue to complete before control is returned to the caller. + /// + /// @returns Success if wait for submissions completed. Otherwise an error indicates reason for unsuccessful wait, + /// for example due to lost device. + virtual Result WaitIdle() = 0; + + /// Inserts a semaphore signal into the GPU queue. The semaphore will be signaled once all previously submitted + /// work on this queue has completed. + /// + /// @param [in] pQueueSemaphore Semaphore to signal. + /// @param [in] value Timeline semaphore point value to signal, ignored for non-timeline semaphores. + /// + /// @returns Success if the semaphore signal was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if the OS scheduler rejects the signal for unknown reasons. + virtual Result SignalQueueSemaphore( + IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; + + /// Inserts a semaphore wait into the GPU queue. The queue will be stalled until the specified semaphore is + /// signaled. + /// + /// @param [in] pQueueSemaphore Semaphore to wait on. + /// @param [in] value Timeline semaphore point value to wait on, ignored for non-timeline semaphores. + /// + /// @returns Success if the semaphore wait was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if the OS scheduler rejects the wait for unknown reasons. + virtual Result WaitQueueSemaphore( + IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; + +#if PAL_KMT_BUILD + /// Acquire the keyed mutex of shared GPU memory object (CPU sync) and then wait for the synchronization object of + /// the shared GPU memory object (GPU sync based on fence).
Note that the shared GPU memory object has to be + /// a D3D11 resource created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) + /// misc flag. + /// + /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are bound. + /// @param [in] key Key of keyed mutex to be acquired. + /// @param [in] timeout Timeout interval for acquiring the keyed mutex, in milliseconds. + /// + /// @returns Success if the keyed mutex has been successfully acquired and wait for the synchronization object + /// has been successfully scheduled. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if either the keyed mutex has not been successfully acquired or wait for the + /// synchronization object has not been successfully scheduled. + virtual Result KeyedMutexAcquireSync( + IGpuMemory* pGpuMemory, + uint64 key, + std::chrono::milliseconds timeout) = 0; + + /// Signal the synchronization object of shared GPU memory object with bumped fence value and then release the + /// keyed mutex of shared GPU memory object. Note that the shared GPU memory object has to be a D3D11 resource + /// created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) misc flag. + /// + /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are + /// bound. + /// @param [in] key Key of keyed mutex to be released. + /// + /// @returns Success if signal of the synchronization object has been successfully scheduled and the keyed mutex + /// has been successfully released. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if either signal of the synchronization object has not been successfully scheduled + /// or keyed mutex has not been successfully released.
+ virtual Result KeyedMutexReleaseSync( + IGpuMemory* pGpuMemory, + uint64 key) = 0; +#endif + + /// This function passes application information to KMD for application specific power optimizations. + /// Power configurations are restored to default when all application queues are destroyed. + /// + /// @param [in] pFileName Application executable name + /// @param [in] pPathName Path to the application + /// + /// @returns Success if the information is passed successfully. Otherwise, one of the following errors may be + /// returned: + /// + Unsupported if this function is not available on this OS or if the queue context is null. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result UpdateAppPowerProfile( + const wchar_t* pFileName, + const wchar_t* pPathName) = 0; + + /// Queues the specified image for presentation on the screen. This function directly queues the presentation + /// request based on the input parameters without special synchronization considerations like a swap chain present. + /// All previous work done on this queue will complete before the image is displayed. + /// + /// This function should never be called with a swap chain presentable image because it won't release ownership of + /// the presentable image index, eventually deadlocking the swap chain. + /// + /// Overall support for direct presents can be queried at platform creation time via supportNonSwapChainPresents + /// in @ref PlatformProperties. Support for particular present modes is specified via supportedDirectPresentModes + /// in @ref DeviceProperties. + /// + /// @note Any images specified in presentInfo must be made resident before calling this function. + /// + /// @param [in] presentInfo Specifies the source image and destination window for the present as well as other + /// properties. + /// + /// @returns Success if the present was successfully queued.
Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidValue if the flip interval is invalid. + /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image. + /// + ErrorInvalidFlags if the present flags don't match the capabilities of the image. + virtual Result PresentDirect( + const PresentDirectInfo& presentInfo) = 0; + + /// Queues the specified image for presentation on the screen. This function uses the provided swap chain to + /// determine exactly how the image should be presented (e.g., can the user see tearing). See @ref ISwapChain for + /// more information on swap chain presentation. All previous work done on this queue will complete before the + /// image is displayed, but future work may execute before the present is completed because swap chain present + /// execution may be asynchronous to the queue that initiated present. + /// + /// Assuming the presentInfo is valid, this function will always release ownership of the presentable image index + /// even if PAL encounters an error while executing the present. + /// + /// Queue support for swap chain presents is specified via supportsSwapChainPresents in @ref DeviceProperties. + /// Support for particular PresentModes is queried per SwapChainMode via IDevice::GetSwapChainInfo(). + /// + /// @note The source image specified in presentInfo must be made resident before calling this function. + /// + /// @param [in] presentInfo Specifies the source image, swap chain, and basic presentation information. + /// + /// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if the source image or swap chain are null. + /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image or if the image + /// index isn't valid within the swap chain.
+ virtual Result PresentSwapChain( + const PresentSwapChainInfo& presentInfo) = 0; + + /// Inserts a delay of a specified amount of time before processing more commands on this queue. + /// + /// Only available on timer queues. Useful in conjunction with queue semaphores to implement frame pacing. + /// + /// @param [in] delay Time, in milliseconds, to delay before processing more commands on this queue. + /// + /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if delay is less than 0. + virtual Result Delay( + Util::fmilliseconds delay) = 0; + + /// Inserts a delay of a specified amount of time on this queue after a vsync on a private display object. + /// + /// Only available on timer queues. Useful in conjunction with queue semaphores to implement pacing of GPU and CPU + /// operations for rendering and presentation in VR as this allows GPU commands of next frame to be sent early but + /// blocks GPU execution until after vsync. + /// + /// @param [in] delay Time, in microseconds, to delay before processing more commands on this queue. + /// @param [in] pScreen The private screen object that the vsync is occurring and the delay is waiting on. + /// + /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if delay is less than 0. + virtual Result DelayAfterVsync( + Util::fmicroseconds delay, + const IPrivateScreen* pScreen) = 0; + + /// Updates page mappings for virtual GPU memory allocations. + /// + /// @param [in] rangeCount Number of ranges to remap (i.e., size of the pRanges array). + /// @param [in] pRanges Defines the set of remappings from virtual GPU memory object pages to real GPU + /// memory object pages. + /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without + /// waiting for any previous rendering to finish first.
On platforms that don't support + /// this, the flag will be ignored. + /// @param [in] pFence Optional. Pointer to an IFence, which will be signaled after the VA remapping. + /// + /// @returns Success if the remappings were executed successfully. It is assumed that the following conditions are + /// met for the input to this function: + /// + rangeCount is not 0. + /// + The page ranges for all members of pRanges are valid. + /// + pRanges is not null. + /// + pVirtualGpuMem is not null for any member of pRanges. + /// + pRanges does not specify a real GPU memory object as a virtual GPU memory object or vice versa. + virtual Result RemapVirtualMemoryPages( + uint32 rangeCount, + const VirtualMemoryRemapRange* pRanges, + bool doNotWait, + IFence* pFence) = 0; + + /// Copies page mappings from one virtual GPU memory object to another. + /// + /// @param [in] rangeCount Number of ranges to copy (i.e., size of the pRanges array). + /// @param [in] pRanges Defines the set of page mappings to copy between virtual GPU memory objects. + /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without + /// waiting for any previous rendering to finish first. On platforms that don't support + /// this, the flag will be ignored. + /// + /// @returns Success if the mappings were copied successfully. It is assumed that the following conditions are + /// met for the input to this function: + /// + rangeCount is not 0. + /// + The page ranges for all members of pRanges are valid. + /// + pRanges is not null. + /// + pSrcGpuMem or pDstGpuMem is not null for any member of pRanges. + /// + pRanges does not specify a real GPU memory object as source or destination. + virtual Result CopyVirtualMemoryPageMappings( + uint32 rangeCount, + const VirtualMemoryCopyPageMappingsRange* pRanges, + bool doNotWait) = 0; + + /// Associates the provided Fence object with the last submission on this queue object.
The Fence can be used via + /// GetStatus() to get the status of the last Submit, however no event will be created/set for the Fence so + /// WaitForFences() should NOT be called on the fence after this association. + /// + /// @see IFence::GetStatus() + /// @see IFence::WaitForFences() + /// + /// @param [in] pFence Fence object to be associated with the last Submit on this queue + /// + /// @returns Success if the association was successful. ErrorUnavailable will be returned if there has not yet been + /// a Submit on this queue. + virtual Result AssociateFenceWithLastSubmit( + IFence* pFence) = 0; + + /// Set execution priority for the current queue, it allows to elevate execution priority of submitted command + /// buffers, but it has no effect on command buffers that have already been submitted for execution. Elevating + /// the queue priority to medium or high would allow to temporarily stall a low priority queue execution and execute + /// its work as soon as the low priority queue starts draining. + /// + /// @param [in] priority The priority level of the queue. + virtual void SetExecutionPriority( + QueuePriority priority) = 0; + + /// Returns a list of GPU memory allocations used by this queue. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. + /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this queue. If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed.
+ /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) = 0; + + /// Returns the QueueType for the queue + virtual QueueType Type() const = 0; + + /// Returns the EngineType for the queue + virtual EngineType GetEngineType() const = 0; + + /// Queries the kernel context info associated with this queue and copies it into pKernelContextInfo. + /// + /// Only supported on Windows platforms. + /// + /// @param [out] pKernelContextInfo Pointer to a KernelContextInfo struct to copy the information into. + /// @returns Success if the information is successfully copied into the output struct. + /// + ErrorInvalidPointer if pKernelContextInfo is nullptr. + /// + ErrorUnavailable if kernel context information is not available on the current platform. + virtual Result QueryKernelContextInfo(KernelContextInfo* pKernelContextInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IQueue() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Queues will be destroyed when the + /// associated device is destroyed. + virtual ~IQueue() { } + +private: + /// @internal Client data pointer.
This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h new file mode 100644 index 0000000000..87fef9f360 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h @@ -0,0 +1,275 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueueSemaphore.h + * @brief Defines the Platform Abstraction Library (PAL) IQueueSemaphore interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include <chrono> + +#if defined(_WIN32) +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Pal +{ + +// Forward declarations. +class IQueueSemaphore; + +/// Specifies properties for @ref IQueueSemaphore creation. Input structure to IDevice::CreateQueueSemaphore(). +struct QueueSemaphoreCreateInfo +{ + union + { + struct + { + /// This queue semaphore may be opened for use by a different device. + /// For DX12 native fence, the flag needs to be consistent with D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.Shared + /// given by DX runtime. + uint32 shareable : 1; + /// This queue semaphore can only be shared through Nt handle. + /// For DX12 native fence, the flag needs to be consistent with + /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NtSecuritySharing given by DX runtime. + uint32 sharedViaNtHandle : 1; + uint32 externalOpened : 1; ///< Semaphore was created by other APIs + /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload + /// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the + /// waiter until the specified payload value has been signaled. + /// For DX12 native fence, runtime determines initialCount. Therefore, timeline flag has to be set. + uint32 timeline : 1; + /// Do not signal the queue semaphore to max if the device is lost.
+ /// For DX12 native fence, the flag needs to be consistent with + /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NoSignalMaxValueOnTdr given by DX runtime. + uint32 noSignalOnDeviceLost : 1; + /// For native fence only. If it's 0x0, the native fence type is D3DDDI_NATIVEFENCE_TYPE_DEFAULT. + /// If it's 0x1, native fence type is D3DDDI_NATIVEFENCE_TYPE_INTRA_GPU. + /// For DX12, the value is determined by runtime. DXCP needs to set it by reading D3DDDI_NATIVEFENCEINFO. + uint32 gpuOnly : 1; + /// This queue semaphore will be a monitored fence if this flag is set, even if OS supports native fence. + uint32 forceUseMonitoredFence : 1; + uint32 reserved : 25; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Queue semaphore creation flags. + + uint32 maxCount; ///< The maximum signal count; once reached, further signals are dropped. Must be + /// non-zero and no more than maxSemaphoreCount in @ref DeviceProperties. For + /// example, a value of one would request a binary semaphore. + /// NOTE: maxCount does not apply to timeline semaphores. + + uint64 initialCount; ///< Initial value for timeline semaphores. (or) + /// Initial count value for counting semaphores. + /// Must not be larger than maxCount for counting semaphores. + /// For DX12 native fence, DXCP needs to pass InitialFenceValue from + /// D3DDDI_NATIVEFENCEINFO. + +}; + +/// Specifies parameters for opening a queue semaphore for use on another device. Input structure to +/// IDevice::OpenSharedQueueSemaphore(). +struct QueueSemaphoreOpenInfo +{ + /// Shared queue semaphore object from another device to be opened. + IQueueSemaphore* pSharedQueueSemaphore; +}; + +/// Specifies parameters for opening a queue semaphore created by other APIs such as D3D. +struct ExternalQueueSemaphoreOpenInfo +{ + union + { + struct + { + uint32 crossProcess : 1; ///< This semaphore is created in another process.
+ uint32 sharedViaNtHandle : 1; ///< The shared semaphore handle is an NT handle. + uint32 isReference : 1; ///< If set, then the opened semaphore will reference the same sync + ///< object in the kernel. Otherwise, the object is copied to the + ///< new Semaphore. + /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload + /// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the + /// waiter until the specified payload value has been signaled. + uint32 timeline : 1; + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External queue semaphore open flags. + + OsExternalHandle externalSemaphore; ///< External shared semaphore handle. + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 882 +#if defined(__unix__) && PAL_KMT_BUILD + uint64 syncFdSignalValue; ///< Signal timeline value when importing the state of a sync file +#endif +#endif +}; + +/// Specifies parameters for exporting a queue semaphore. Input structure to IQueueSemaphore::ExportExternalHandle(). +struct QueueSemaphoreExportInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the exported handle references the + ///< same sync object in the kernel. Otherwise, the object is copied + ///< to the new Semaphore. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External queue semaphore export flags. + +#if PAL_KMT_BUILD + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT + /// handle with a name, and then the handle can be acquired + /// via this name. + uint32 accessFlags; ///< Desired access rights of GPU memory.
+#if defined(__unix__) + uint64 syncFdWaitValue; ///< Wait timeline value when exporting the state of a sync file +#endif +#endif +}; + +/** + *********************************************************************************************************************** + * @interface IQueueSemaphore + * @brief Semaphore object used to synchronize GPU work performed by multiple, parallel queues. + * + * These semaphores are used by calling IQueue::SignalQueueSemaphore() and IQueue::WaitQueueSemaphore(). + * + * @see IDevice::CreateQueueSemaphore() + * @see IDevice::OpenSharedQueueSemaphore() + *********************************************************************************************************************** + */ +class IQueueSemaphore : public IDestroyable +{ +public: + /// An IQueue::WaitQueueSemaphore operation may need to be sent down to the OS after the corresponding + /// IQueue::SignalQueueSemaphore operation due to GPU scheduler limitations. This method checks if any queues have + /// batched-up commands waiting for a SignalQueueSemaphore operation to appear. + /// + /// @returns True if one or more queues have some number of commands batched-up waiting for other queues to signal + /// this semaphore. False otherwise. + virtual bool HasStalledQueues() = 0; + + /// Query timeline Semaphore payload + /// + /// @param [out] pValue returned payload from querying + /// + /// @returns Success if the timeline semaphore is queried successfully. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result QuerySemaphoreValue( + uint64* pValue) = 0; + + /// Wait on timeline Semaphore points, to be clarified, this is a CPU wait. + /// + /// @param [in] value Indicate which point to be waited. + /// @param [in] timeout the max waiting time, timeout is the timeout period in units of nanoseconds.
+ /// + /// @returns Success if the timeline semaphore point is waited successfully. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result WaitSemaphoreValue( + uint64 value, + std::chrono::nanoseconds timeout) = 0; + + /// Signal on timeline Semaphore points, to be clarified, this is a CPU signal. + /// + /// @param [in] value Indicate which point to be signaled. + /// + /// @returns Success if the timeline semaphore point is signaled successfully. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result SignalSemaphoreValue( + uint64 value) = 0; + +#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD + /// Returns an OS-specific handle which can be used to refer to this semaphore object across processes. This will + /// return a null or invalid handle if the object was not created with the external create flag set. + /// + /// @param [in] exportInfo Information describing how the Semaphore handle should be exported. + /// @note This function is only compiled when PAL_KMT_BUILD or PAL_AMDGPU_BUILD is enabled. + /// + /// @returns An OS-specific handle which can be used to access the semaphore object across processes. + virtual OsExternalHandle ExportExternalHandle( + const QueueSemaphoreExportInfo& exportInfo) const = 0; +#endif + +#if defined(_WIN32) + /// Returns an OS-specific handle which can be used by another device to access the semaphore object. + /// + /// @returns An OS-specific handle which can be used by another device to access the semaphore object. + virtual OsExternalHandle ExportKmtHandle() const = 0; +#endif + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object.
+ /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IQueueSemaphore() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IQueueSemaphore() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h b/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h new file mode 100644 index 0000000000..1bfd170c35 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h @@ -0,0 +1,251 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palShaderLibrary.h + * @brief Defines the Platform Abstraction Library (PAL) IShaderLibrary interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include "palStringView.h" +#include "palSpan.h" + +namespace Pal +{ + +struct GpuMemSubAllocInfo; + +/// Common flags controlling creation of shader libraries. +union LibraryCreateFlags +{ + struct + { + uint32 clientInternal : 1; ///< Internal library not created by the application.
+ uint32 isGraphics : 1; ///< Whether it is a graphics library + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies properties about an indirect function belonging to a @ref IShaderLibrary object. Part of the input +/// structure to IDevice::CreateShaderLibrary(). +struct ShaderLibraryFunctionInfo +{ + Util::StringView symbolName; ///< ELF Symbol name for the associated function. + gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during + /// library creation. +}; + +/// Specifies a shader sub type / ShaderKind. +enum class ShaderSubType : uint32 +{ + Unknown = 0, + Traversal, + RayGeneration, + Intersection, + AnyHit, + ClosestHit, + Miss, + Callable, + LaunchKernel, ///< Raytracing launch kernel + Count +}; + +/// Specifies properties for creation of a compute @ref IShaderLibrary object. Input structure to +/// IDevice::CreateShaderLibrary(). +struct ShaderLibraryCreateInfo +{ + LibraryCreateFlags flags; ///< Library creation flags + + const void* pCodeObject; ///< Pointer to code-object ELF binary implementing the Pipeline ABI interface. + /// The code-object ELF contains pre-compiled shaders, register values, and + /// additional metadata. + size_t codeObjectSize; ///< Size of code object in bytes. +}; + +/// Reports properties of a compiled library. +struct LibraryInfo +{ + PipelineHash internalLibraryHash; ///< 128-bit identifier extracted from this library's ELF binary, composed of + /// the state the compiler decided was appropriate to identify the compiled + /// library. The lower 64 bits are "stable"; the upper 64 bits are "unique". +}; + +/// Reports shader stats. Multiple bits set in the shader stage mask indicate that multiple shaders have been combined +/// due to HW support. The same information will be repeated for both the constituent shaders in this case. +struct ShaderLibStats +{ + ShaderHash shaderHash; ///< Shader hash.
+ CommonShaderStats common; ///< The shader compilation parameters for this shader. + /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableVgprs; + /// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableSgprs; + size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. + PipelineHash palInternalLibraryHash; ///< Internal hash of the shader compilation data used by PAL. + uint32 stackFrameSizeInBytes; ///< Shader function stack frame size + ShaderSubType shaderSubType; ///< ShaderSubType / Shader Kind + CompilerStackSizes cpsStackSizes; ///< Stack used in Continuation +}; + +/** + *********************************************************************************************************************** + * @interface IShaderLibrary + * @brief Object containing one or more shader functions stored in GPU memory. These shader functions are callable + * from the shaders contained within IPipeline objects. + * + * Before a pipeline which calls into this library is bound to a command buffer (using @ref ICmdBuffer::BindPipeline), + * the client must call @ref IPipeline::LinkWithLibraries() and specify this library in the list of linked libraries. + * Failure to comply with this requirement is an error and will result in undefined behavior.
+ * + * @see IDevice::CreateShaderLibrary() + * @see IPipeline::LinkWithLibraries() + *********************************************************************************************************************** + */ +class IShaderLibrary : public IDestroyable +{ +public: + /// Returns properties of this library and its corresponding shader functions. + /// + /// @returns Property structure describing this library. + virtual const LibraryInfo& GetInfo() const = 0; + + /// Returns a list of GPU memory allocations used by this library. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. + /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this library. If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. + /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. + virtual const void* GetResourceId() const = 0; + + /// Obtains the binary code object for this library. + /// + /// @param [in, out] pSize Represents the size of the library ELF. + /// + /// @param [out] pBuffer If non-null, the library ELF is written in the buffer.
If null, the size required + /// for the library ELF is given out in the location pSize. + /// + /// @returns Success if the library binary was fetched successfully. + /// + ErrorUnavailable if the library binary was not fetched successfully. + virtual Result GetCodeObject( + uint32* pSize, + void* pBuffer) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const { return m_pClientData; } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Obtains the compiled shader ISA code for the shader function specified. + /// + /// @param [in] shaderExportName The shader exported name + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required + /// for the shader ISA is given out in the location pSize. + /// + /// @returns Success if the shader ISA code was fetched successfully. + /// + ErrorUnavailable if the shader ISA code was not fetched successfully. + + virtual Result GetShaderFunctionCode( + Util::StringView shaderExportName, + size_t* pSize, + void* pBuffer) const = 0; + + /// Obtains the shader pre and post compilation stats/params for the specified shader. + /// + /// @param [in] shaderExportName The shader exported name + /// + /// @param [out] pShaderStats Pointer to the ShaderLibStats structure which will be filled with the shader stats for + /// the shader function named by shaderExportName. This cannot be nullptr.
+ /// @note The declaration below takes no getDisassemblySize argument. Older documentation described such a flag + /// (when true, the shader binary is disassembled and the size of the disassembly string is reported in + /// ShaderLibStats::isaSizeInBytes; otherwise 0 is reported). TODO(review): confirm how isaSizeInBytes is populated. + /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. + /// + ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error + /// occurred. + virtual Result GetShaderFunctionStats( + Util::StringView shaderExportName, + ShaderLibStats* pShaderStats) const = 0; + + /// Returns the function list owned by this shader library + /// + /// @returns A list of ShaderLibraryFunctionInfo. + virtual const Util::Span GetShaderLibFunctionInfos() const = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IShaderLibrary() : m_pClientData(nullptr) { } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IShaderLibrary() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object.
+ void* m_pClientData; + + IShaderLibrary(const IShaderLibrary&) = delete; + IShaderLibrary& operator=(const IShaderLibrary&) = delete; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h new file mode 100644 index 0000000000..a3f8e1b9a1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h @@ -0,0 +1,212 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "palGpaSession.h" +#include "palGpuUtil.h" +#include "palTraceSession.h" +#include "palVector.h" +#include "palHashSet.h" +#include "palMutex.h" + +namespace Pal +{ +class IPlatform; +class IDevice; +class IShaderLibrary; +} // namespace Pal + +namespace GpuUtil +{ +class GpaSession; +} // namespace GpuUtil + +namespace GpuUtil +{ + +namespace TraceChunk +{ + +/// "CodeObject" RDF chunk identifier & version +constexpr char CodeObjectChunkId[TextIdentifierSize] = "CodeObject"; +constexpr Pal::uint32 CodeObjectChunkVersion = 2; + +/// Header for the "CodeObject" RDF chunk +struct CodeObjectHeader +{ + Pal::uint32 pciId; ///< The ID of the GPU the trace was run on + Pal::ShaderHash codeObjectHash; ///< Hash of the Code Object binary +}; + +/// "COLoadEvent" RDF chunk identifier & version +constexpr char CodeObjectLoadEventChunkId[TextIdentifierSize] = "COLoadEvent"; +constexpr Pal::uint32 CodeObjectLoadEventChunkVersion = 3; + +/// Header for the "COLoadEvent" RDF chunk +struct CodeObjectLoadEventHeader +{ + Pal::uint32 count; ///< Number of load events in this chunk +}; + +/// Describes whether a Code Object was loaded into or unloaded from GPU memory. +enum class CodeObjectLoadEventType : Pal::uint32 +{ + LoadToGpuMemory = 0, ///< Code Object was loaded into GPU memory + UnloadFromGpuMemory = 1 ///< Code Object was unloaded from GPU memory +}; + +/// Describes one or more GPU load/unload(s) of a Code Object. Payload for "COLoadEvent" RDF chunk.
+struct CodeObjectLoadEvent +{ + Pal::uint32 pciId; ///< The ID of the GPU the trace was run on + CodeObjectLoadEventType eventType; ///< Type of loader event + Pal::uint64 baseAddress; ///< Base address where the Code Object was loaded + Pal::ShaderHash codeObjectHash; ///< Hash of the (un)loaded Code Object binary + Pal::uint64 timestamp; ///< CPU timestamp of this event being triggered +}; + +/// "PsoCorrelation" RDF chunk identifier & version +constexpr char PsoCorrelationChunkId[TextIdentifierSize] = "PsoCorrelation"; +constexpr Pal::uint32 PsoCorrelationChunkVersion = 3; + +/// Header for the "PsoCorrelation" RDF chunk +struct PsoCorrelationHeader +{ + Pal::uint32 count; ///< Number of PSO correlations in this chunk +}; + +/// Payload for the "PsoCorrelation" RDF chunks +struct PsoCorrelation +{ + Pal::uint32 pciId; ///< The ID of the GPU the trace was run on + Pal::uint64 apiPsoHash; ///< Hash of the API-level Pipeline State Object + Pal::PipelineHash internalPipelineHash; ///< Hash of all inputs to the pipeline compiler + char apiLevelObjectName[64]; ///< Debug object name (null-terminated) +}; + +/// "COCorrelation" RDF chunk identifier & version +constexpr char CodeObjectCorrelationChunkId[TextIdentifierSize] = "COCorrelation"; +constexpr uint32_t CodeObjectCorrelationChunkVersion = 4; + +/// Header for the "COCorrelation" RDF chunk +struct CodeObjectCorrelationHeader +{ + Pal::uint32 count; ///< Number of Code Object Correlations in this chunk +}; + +/// Payload for the "COCorrelation" RDF chunks +struct CodeObjectCorrelation +{ + Pal::PipelineHash internalPipelineHash; ///< Hash of all inputs to the pipeline compiler + Pal::ShaderHash codeObjectHash; ///< Hash of the Code Object binary in the CO Database + Pal::uint32 containsMetadata : 1; ///< 1 if the code object contains metadata, 0 otherwise + Pal::uint32 reserved : 31; ///< Bitflags reserved for future use +}; + +} // namespace TraceChunk + +/// CodeObject Trace Source name & version +constexpr char CodeObjectTraceSourceName[] = "codeobject"; +constexpr Pal::uint32 CodeObjectTraceSourceVersion =
3; + +// ===================================================================================================================== +class CodeObjectTraceSource : public ITraceSource +{ +public: + CodeObjectTraceSource(Pal::IPlatform* pPlatform); + ~CodeObjectTraceSource(); + + // ==== TraceSource Native Functions ========================================================================== // + Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); + + Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); + Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); + + Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + // ==== Base Class Overrides =================================================================================== // + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { } + + virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } +#else + virtual void OnTraceAccepted() override { } +#endif + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } + virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override { } +#endif + virtual void OnTraceFinished() override; + + virtual const char* GetName() const override { return CodeObjectTraceSourceName; } + virtual Pal::uint32 GetVersion() const override { return CodeObjectTraceSourceVersion; } + +private: + Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const 
RegisterPipelineInfo& clientInfo); + Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); + + Pal::Result AddCodeObjectLoadEvent( + const Pal::IShaderLibrary* pLibrary, + TraceChunk::CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent( + const Pal::IPipeline* pLibrary, + TraceChunk::CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent( + const ElfBinaryInfo& elfBinaryInfo, + TraceChunk::CodeObjectLoadEventType eventType); + + Pal::Result WriteCodeObjectChunks(); + Pal::Result WriteLoaderEventsChunk(); + Pal::Result WritePsoCorrelationChunk(); + Pal::Result WriteCoCorrelationChunk(); + + struct CodeObjectDatabaseRecord + { + Pal::uint32 recordSize; + Pal::ShaderHash codeObjectHash; + }; + + Pal::IPlatform* const m_pPlatform; + + Util::RWLock m_registerPipelineLock; + Util::Vector m_codeObjectRecords; + Util::Vector m_loadEventRecords; + Util::Vector m_psoCorrelationRecords; + Util::Vector m_coCorrelationRecords; + + // API hashes -> internal pipeline hash (-> child code object hashes) + Util::HashSet m_registeredApiHashes; + Util::HashSet m_registeredPipelines; + Util::HashSet m_registeredCoHashes; + +}; + +} // namespace GpuUtil + diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h new file mode 100644 index 0000000000..81aabf1122 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h @@ -0,0 +1,1201 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpaSession.h + * @brief PAL GPU utility GpaSession class. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palDeque.h" +#include "palDevice.h" +#include "palGpuUtil.h" +#include "palHashSet.h" +#include "palMutex.h" +#include "palPipeline.h" +#include "palVector.h" +#include "palPlatform.h" +#include "palSysMemory.h" +#include "palGpuMemory.h" +#include "palMemTrackerImpl.h" + +// Forward declarations. 
+namespace Pal +{ + class ICmdAllocator; + class ICmdBuffer; + class IDevice; + class IGpuEvent; + class IGpuMemory; + class IPerfExperiment; + class IQueue; + class IQueueSemaphore; + struct GlobalCounterLayout; + struct MultiSubmitInfo; + struct ThreadTraceLayout; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 + enum PipelineStageFlag : uint32; +#else + enum HwPipePoint : uint32; +#endif +} +struct SqttFileChunkCpuInfo; +struct SqttFileChunkAsicInfo; +struct SqttCodeObjectDatabaseRecord; + +struct GpuMemoryInfo; + +namespace GpuUtil +{ +// Sample id initialization value. +constexpr Pal::uint32 InvalidSampleId = 0xFFFFFFFF; + +/// The available states of GpaSession +enum class GpaSessionState : Pal::uint32 +{ + Reset = 0, + Building = 1, + Complete = 2, + Ready = 3, +}; + +/// The various ways you can change trace options after a trace has started. +enum class UpdateSampleTraceMode : Pal::uint32 +{ + MinimalToFullMask = 0, ///< Used to convert a minimal trace (needed for context in compute presents) to a full + /// trace according to the options in the active trace. Requires enableSampleUpdates. + /// Additionally, this must be called between BeginSample() and EndSample() and + /// queue timing must also be enabled on the GpaSession when this function is called. + StartInstructionTrace = 1, ///< Used to enable instruction-level trace globally at any time. Can be run without an + /// active sample. Useful for targeting specific parts of a frame. + StopInstructionTrace = 2, ///< Used to disable instruction-level trace globally at any time. Can be run without an + /// active sample. +}; + +/// Specifies basic type of sample to perform - either a normal set of "global" perf counters, or a trace consisting +/// of SQ thread trace and/or streaming performance counters. +enum class GpaSampleType : Pal::uint32 +{ + None = 0x0, ///< No profile will be done.
+ Cumulative = 0x1, ///< One 64-bit result will be returned per performance counter representing the cumulative delta + /// for that counter over the sample period. Cumulative samples must begin and end in the same + /// command buffer. + Trace = 0x2, ///< A GPU memory buffer will be filled with hw-specific SQ thread trace and/or streaming + /// performance counter data. Trace samples may span multiple command buffers. + Timing = 0x3, ///< Two 64-bit results will be recorded in beginTs and endTs to gather timestamp data. + Query = 0x4, ///< A set of 11 pipeline stats will be collected. + Count +}; + +/// Specifies a specific performance counter to be sampled with GpaSession::BeginSample() and GpaSession::EndSample(). +/// +/// This identifies a specific counter in a particular HW block instance, e.g., TCC instance 3 counter #19. It is up +/// to the client to know the meaning of a particular counter, e.g., TCC #19 is TCC_PERF_SEL_MISS on Fiji. Eventually, +/// PAL may want to support certain counters without the client needing HW-specific knowledge (i.e., select an enum +/// called L2MissRate from PAL rather than needing to know that counter is TCC #19 on Fiji), but GPA currently works in +/// this low-level mode with other drivers, and wants to keep the flexibility. +struct PerfCounterId +{ + Pal::GpuBlock block; ///< Which GPU block to reference (e.g., CB, DB, TCC). + Pal::uint32 instance; ///< Which instance of the specified GPU block to sample. E.g., Tahiti has 12 TCC blocks + /// (this number is returned per-block in the @ref Pal::GpuBlockPerfProperties structure). + /// There is no shortcut to get results for all instances of block in the whole chip, the + /// client must explicitly sample each instance and sum the results. + Pal::uint32 eventId; ///< Counter ID to sample. Note that the meaning of a particular eventId for a block can + /// change between chips. 
+ + union + { + struct + { + Pal::uint32 spm32Bit : 1; ///< For SPM counters, collect in 32bit instead of 16bit + Pal::uint32 reserved : 31; ///< Reserved for future use + }; + + Pal::uint32 u32All; ///< Union value for copying + + } flags; + + // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for + // the relevant blocks. It's recommended to zero them out when not in use. + union + { + struct + { + Pal::uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. + } df; + + struct + { + Pal::uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold. + Pal::uint8 eventThresholdEn; ///< Threshold enable (0 for disabled, 1 for threshold) + Pal::uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). + } umc; + + Pal::uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events + + Pal::uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds + /// 32 bits. + } subConfig; +}; + +/// Defines a set of flags for a particular gpa session. +union GpaSessionFlags +{ + struct + { + /// Enables timing of queue operations via Timed* functions. + Pal::uint32 enableQueueTiming : 1; + + /// Enables sample updates via the UpdateSampleTraceParams function. + Pal::uint32 enableSampleUpdates : 1; + + /// Indicates that the client will use the internal Timed*QueueSemaphore() functions for queue semaphore timing + /// data. When not set it indicates the client will provide ETW data via the ExternalTimed* functions. + Pal::uint32 useInternalQueueSemaphoreTiming : 1; + + /// Reserved for future use. + Pal::uint32 reserved : 29; + }; + + /// Flags packed as 32-bit uint. + Pal::uint32 u32All; +}; + +/// Specifies options that direct the gpa session behavior. +struct GpaSessionBeginInfo +{ + /// Gpa Session flags used to control behavior.
+ GpaSessionFlags flags; +}; + +/// Input structure for CmdBeginGpuProfilerSample. +/// +/// Defines a set of global performance counters and/or SQ thread trace data to be sampled. +struct GpaSampleConfig +{ + /// Selects what type of data should be gathered for this sample. This can either be _cumulative_ to gather + /// simple deltas for the specified set of perf counters over the sample period, or it can be _trace_ to generate + /// a blob of RGP-formatted data containing SQ thread trace and/or streaming performance monitor data. + GpaSampleType type; + + union + { + struct + { + Pal::uint32 sampleInternalOperations : 1; ///< Include BLTs and internal driver operations in the + /// results. + Pal::uint32 cacheFlushOnCounterCollection : 1; ///< Insert cache flush and invalidate events before and + /// after every sample. + Pal::uint32 sqShaderMask : 1; ///< If sqShaderMask is valid. + Pal::uint32 sqWgpShaderMask : 1; ///< If sqWgpShaderMask is valid. + Pal::uint32 reserved : 28; ///< Reserved for future use. + }; + Pal::uint32 u32All; ///< Bit flags packed as uint32. + } flags; ///< Bit flags controlling sample operation for all sample + /// types. + + Pal::PerfExperimentShaderFlags sqShaderMask; ///< Which shader stages are sampled by GpuBlock::Sq counters. + ///< Only used if flags.sqShaderMask is set to 1. + Pal::PerfExperimentShaderFlags sqWgpShaderMask; ///< Which shader stages are sampled by GpuBlock::SqWgp counters. + ///< Only used if flags.sqWgpShaderMask is set to 1. + + struct + { + /// Number of entries in pIds. + Pal::uint32 numCounters; + + /// List of performance counters to be gathered for a sample. If the sample type is _cumulative_ this will + /// result in "global" perf counters being sampled at the beginning of the sample period; if the sample type + /// is _trace_ this will result in SPM data being added to the sample's resulting RGP blob. + /// + /// Note that it is up to the client to respect the hardware counter limit per block. 
This can be + /// determined by the maxGlobalOnlyCounters, maxGlobalSharedCounters, maxSpmCounters, and instanceGroupSize + /// fields of @ref Pal::GpuBlockPerfProperties. + const PerfCounterId* pIds; + + /// Period for SPM sample collection in cycles. Only relevant for _trace_ samples. + Pal::uint32 spmTraceSampleInterval; + + /// Maximum amount of GPU memory in bytes this sample can allocate for SPM data. Only relevant for _trace_ + /// samples. + Pal::gpusize gpuMemoryLimit; + } perfCounters; ///< Performance counter selection (valid for both _cumulative_ and _trace_ samples). + + struct + { + /// Number of entries in pIds. + Pal::uint32 numCounters; + + /// Period for DF SPM sample collection in nanoseconds. + Pal::uint32 sampleInterval; + + /// Maximum amount of GPU memory in bytes this sample can allocate for DF SPM data. + Pal::gpusize gpuMemoryLimit; + + /// List of performance counters to be gathered for a df sample. This has to be separate from the + /// list of normal counters because it is a completely different mechanism for gathering data. + /// + /// Note that it is up to the client to respect the hardware counter limit per block. This can be + /// determined by the maxSpmCounters field of + /// @ref Pal::GpuBlockPerfProperties. + const PerfCounterId* pIds; + } dfSpmPerfCounters; + + struct + { + union + { + struct + { + Pal::uint32 enable : 1; ///< Include SQTT data in the trace. + Pal::uint32 supressInstructionTokens : 1; ///< Prevents capturing instruction-level SQTT tokens, + /// significantly reducing the amount of sample data. + Pal::uint32 stallMode : 2; ///< Describes behavior when buffer full + Pal::uint32 stallAllSimds : 1; ///< Stall all SIMDs for thread trace stall.
+ Pal::uint32 excludeNonDetailShaderData : 1; ///< Only emit shader tokens from the SIMD that have been + /// selected for detail instruction tracing +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + Pal::uint32 enableExecPopTokens : 1; ///< Output exec tokens +#else + Pal::uint32 placeholder2 : 1; +#endif + Pal::uint32 reserved : 25; ///< Reserved for future use. + }; + Pal::uint32 u32All; ///< Bit flags packed as uint32. + } flags; ///< Bit flags controlling SQTT samples. + Pal::uint32 seMask; ///< Mask that determines which specific SEs to run Thread trace on. + /// If 0, all SEs are enabled + Pal::uint32 seDetailedMask; ///< Mask that selects which specific SEs to reveal Thread trace detailed info. + /// If 0, all SEs will reveal detailed thread trace + Pal::gpusize gpuMemoryLimit; ///< Maximum amount of GPU memory in bytes this sample can allocate for the SQTT + /// buffer. If 0, allocate maximum size to prevent dropping tokens toward the + /// end of the sample. + Pal::uint32 tokenMask; ///< Mask indicating which SQTT tokens are requested for capture. If a tokenMask is + /// not provided, PAL will default to collecting all tokens or tokens except + /// instruction tokens if the supressInstructionTokens flag is set. Instruction + /// tokens will always be filtered out if supressInstructionTokens = true. + } sqtt; ///< SQ thread trace configuration (only valid for _trace_ samples). + + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 + Pal::PipelineStageFlag preSample; ///< The pipeline stage in the GPU pipeline where the begin timestamp should + /// take place. + Pal::PipelineStageFlag postSample; ///< The pipeline stage in the GPU pipeline where the end timestamp should + /// take place. +#else + Pal::HwPipePoint preSample; ///< The point in the GPU pipeline where the begin timestamp should take place. + Pal::HwPipePoint postSample; ///< The point in the GPU pipeline where the end timestamp should take place. 
+#endif + } timing; ///< Timestamp configuration. (only valid for timing samples) +}; + +/// Extra metadata about a command buffer submission +struct TimedSubmitInfo +{ + const Pal::uint64* pApiCmdBufIds; ///< Array of api specific command buffer ids + const Pal::uint32* pSqttCmdBufIds; ///< Array of sqtt command buffer ids + Pal::uint64 frameIndex; ///< The global frame index for the application. +}; + +/// Extra metadata about a queue semaphore operation +struct TimedQueueSemaphoreInfo +{ + Pal::uint64 semaphoreID; ///< Api specific id associated with a semaphore. +}; + +/// Extra metadata about a queue present operation +struct TimedQueuePresentInfo +{ + Pal::uint64 presentID; ///< Api specific id associated with a present. +}; + +/// Struct for storing information about gpu clock speeds. +struct GpuClocksSample +{ + Pal::uint32 gpuEngineClockSpeed; // Current speed of the gpu engine clock in MHz + Pal::uint32 gpuMemoryClockSpeed; // Current speed of the gpu memory clock in MHz +}; + +/// Struct for storing CPU-side allocations of Pal::IPerfExperiment's. +struct PerfExperimentMemory +{ + void* pMemory; // Memory allocated for an IPerfExperiment. + size_t memorySize; // Size of the memory allocated in pMemory. +}; + +/// Struct for supplying API-dependent information about pipelines. +struct RegisterPipelineInfo +{ + Pal::uint64 apiPsoHash; ///< Client-provided PSO hash. +}; + +/// Struct for supplying API-dependent information about libraries. +struct RegisterLibraryInfo +{ + Pal::uint64 apiHash; ///< Client-provided api hash. +}; + +/// Struct for supplying Elf binary. +struct ElfBinaryInfo +{ + const void* pBinary; ///< FAT Elf binary. + Pal::uint32 binarySize; ///< FAT Elf binary size. + Pal::IGpuMemory* pGpuMemory; ///< GPU Memory where the compiled ISA resides. + Pal::gpusize offset; ///< Offset inside GPU memory object + Pal::uint64 originalHash; ///< Original source/binary hash. + Pal::uint64 compiledHash; ///< Compiled binary hash. 
+}; + +/// Enumeration of RGP trace profiling modes +enum class TraceProfilingMode : Pal::uint32 +{ + Present = 0, ///< Present triggered capture + UserMarkers = 1, ///< Capture triggered by user marker + FrameNumber = 2, ///< Capture based on frame number + Tags = 3, ///< Tag based capture +}; + +/// Constant defines the maximum length for a user marker string. +static constexpr Pal::uint32 UserMarkerStringLength = 256; + +/// Defines data specific to each profiling mode used to capture an RGP trace. +union TraceProfilingModeData +{ + struct + { + char start[UserMarkerStringLength]; ///< User marker string used to start trace capture. + char end[UserMarkerStringLength]; ///< User marker string used to end trace capture. + } userMarkerData; + + struct + { + Pal::uint32 start; ///< Frame number used to start the trace. + Pal::uint32 end; ///< Frame number used to end the trace. + } frameNumberData; + + struct + { + Pal::uint64 start; ///< Tag used to start the trace. + Pal::uint64 end; ///< Tag used to end the trace. + } tagData; +}; + +/// Enumerates the different instruction level data modes for an RGP trace +enum class InstructionTraceMode : Pal::uint32 +{ + Disabled = 0, ///< Instruction level data was disabled for trace. + FullFrame = 1, ///< Instruction level data was enabled for the full trace. + ApiPso = 2, ///< Instruction level data was enabled only for a single API PSO. +}; + +/// Defines the data used to control enabling of instruction level data. +struct InstructionTraceModeData +{ + Pal::uint64 apiPsoHash; ///< Hash of the API PSO targeted for instruction level data. +}; + +/// Struct for supplying API specific information about an RGP trace +struct SampleTraceApiInfo +{ + TraceProfilingMode profilingMode; ///< Profiling mode used to trigger the trace. + TraceProfilingModeData profilingModeData; ///< Profiling mode specific data. + InstructionTraceMode instructionTraceMode; ///< Instruction trace mode for the trace. 
+ InstructionTraceModeData instructionTraceModeData; ///< Instruction trace mode data. +}; + +/// An enumeration of the API types. +enum class ApiType : Pal::uint32 +{ + DirectX12 = 0, ///< Represents DirectX12 API type. + Vulkan = 1, ///< Represents Vulkan API type. + Generic = 2, ///< Represents Generic API type. + OpenCl = 3, ///< Represents OpenCL API type. + Hip = 5, ///< Represents HIP API type. +}; + +/// Struct used for storing SQTT-specific trace information +struct SqttTraceInfo +{ + Pal::uint32 shaderEngine; ///< Shader engine index + Pal::uint32 computeUnit; ///< Compute unit index + Pal::uint32 sqttVersion; ///< SQTT version + Pal::uint64 bufferSize; ///< SQTT trace buffer size +}; + +/// Struct used for storing SPM-specific trace information +struct SpmTraceInfo +{ + Pal::uint32 numSpmCounters; ///< The number of SPM counters sampled in the trace + Pal::uint32 numTimestamps; ///< The number of timestamps that samples were taken + Pal::uint32 sampleFrequency; ///< The SPM counter sampling frequency +}; + +/// Struct used for storing QueueTimings-specific trace information +struct QueueTimingsTraceInfo +{ + Pal::uint32 numQueueInfoRecords; + Pal::uint32 numQueueEventRecords; + Pal::uint32 queueInfoTableSize; + Pal::uint32 queueEventTableSize; +}; + +/** +*********************************************************************************************************************** +* @class GpaSession +* @brief Helper class providing common driver functionality required by all PAL clients that support the GPUPerfAPI +* (GPA). Abstracts IPerfExperiment creation, memory management, completion confirmation, and results reporting +* at a level convenient for GPA. Each PAL client driver will need to publish an API extension exposing this +* support for use by GPA. +* +* A GpaSession is a container for a set of _samples_ of performance counter and/or SQ thread trace data. 
Its main +* purpose is to manage resources (IPerfExperiments and their backing system/GPU memory) in an efficient manner that is +* consistent with command buffer management in modern APIs. Consider GpaSession as a peer of DX12's command +* allocator or Vulkan's command pool objects. +* +* Basic flow of usage: +* - Newly created sessions are in the _reset_ state. +* - A session is moved from the _reset_ state to the _building_ state by calling Begin(). +* - Samples are added to a session by specifying desired data for each query and marking a begin and end location +* in ICmdBuffers as they are built. Internally required resources, like GPU memory where counters will be +* written, are allocated from internal pools managed by the session. +* - A session is moved from the _building_ state to the _complete_ state by calling End(). +* - The application will submit all command buffers referenced by the session. +* - The session is confirmed as _ready_, either using standard PAL fences to confirm all associated submissions have +* completed, or by polling IsReady() on the session. +* - Results for all samples in the session can be queried via GetResults(). +* - Reset() should be called once results have been gathered and before building a new session. Resources are +* retained by the session object for use in the newly built session. The session object must be destroyed in +* order to fully release all resources back to the system. +* +* Cumulative-type samples may not span multiple command buffers, as other apps could interfere with the counts and +* the final data doesn't have the time-based visibility needed to detect that this happened. +* +* @warning GpaSession is not thread safe. Performing samples in command buffers being built simultaneously by multiple +* threads should use multiple GpaSession objects.
+*********************************************************************************************************************** +*/ +class GpaSession +{ + typedef Pal::IPlatform GpaAllocator; +public: + typedef Util::Deque PerfExpMemDeque; + + /// Constructor. + GpaSession( + Pal::IPlatform* pPlatform, + Pal::IDevice* pDevice, + Pal::uint16 apiMajorVer, + Pal::uint16 apiMinorVer, + ApiType apiType, + Pal::uint16 rgpInstrumentationSpecVer = 0, + Pal::uint16 rgpInstrumentationApiVer = 0, + PerfExpMemDeque* pAvailablePerfExpMem = nullptr); + + ~GpaSession(); + + /// Copy constructor creates an empty copy of a session. + /// + /// Newly constructed session copies the GPU memory allocations and their layout from the source session, making + /// this a valid destination for a CopyResults command. This new object is effectively in the _complete_ state. + /// + /// The purpose of such objects is to handle sampling data from bundles or nested command buffers where the same + /// set of commands might be executed multiple times from a single root-level command buffer. The client should + /// note such cases, and create a copy of the bundle's session for each invocation, then call CopyResults() from + /// the original session into the copy after the invocation. + /// + /// @param [in] src Session to be copied. Must either be in the _complete_ or _ready_ state. + explicit GpaSession(const GpaSession& src); + + /// Initialize the newly constructed GPA session. + Pal::Result Init(); + + /// Registers a queue with the GpaSession that will be submitted to using TimedSubmit. This must be called on any + /// queues that are submitted to via the Timed* functions. For Timed* signal and wait queue semaphore events, a + /// valid queueContext will be required (queueContext not equal to 0). 
+ Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, + Pal::uint64 queueId, + Pal::uint64 queueContext); + + /// Unregisters a queue prior to object destruction, and ensure that associated resources are destroyed. Work can + /// no longer be submitted on the queue after this has been called. + Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); + + Pal::Result TimedSubmit(Pal::IQueue* pQueue, + const Pal::MultiSubmitInfo& submitInfo, + const TimedSubmitInfo& timedSubmitInfo); + + /// Executes a timed queue semaphore signal through the given queue. The HW time is measured when the queue semaphore + /// is signaled. + Pal::Result TimedSignalQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedSignalInfo, + Pal::uint64 value = 0); + + /// Executes a timed queue semaphore wait through the given queue. The HW time is measured when the queue semaphore + /// wait finishes. + Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedWaitInfo, + Pal::uint64 value = 0); + + /// Injects a timed queue present event. + Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, + const TimedQueuePresentInfo& timedPresentInfo); + + /// Injects a timed wait queue semaphore event using information supplied by an external source. + /// A valid queueContext (queueContext not equal to 0) is needed for this function. + Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedWaitInfo); + + /// Injects a timed signal queue semaphore event using information supplied by an external source. + /// A valid queueContext (queueContext not equal to 0) is needed for this function. 
+ Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSignalInfo); + + /// Queries the engine and memory clocks from DeviceProperties + Pal::Result SampleGpuClocks(GpuClocksSample* pGpuClocksSample) const; + + /// Samples the timing clocks if queue timing is enabled and adds a clock sample entry to the current session. + Pal::Result SampleTimingClocks(); + + /// Moves the session from the _reset_ state to the _building_ state. + /// + /// Invalid to call Begin() on a session that isn't in the _reset_ state. + /// + /// @param [in] info Information about the GPA session's desired behavior. + /// + /// @returns Success if the session was successfully moved to the _building_ state. Otherwise, possible errors + /// include: + /// + ErrorUnavailable if the session isn't currently in the _reset_ state. + Pal::Result Begin(const GpaSessionBeginInfo& info); + + /// Moves the session from the _building_ state to the _complete_ state. + /// + /// Invalid to call End() on a session that isn't in the _building_ state. The implementation _may_ insert GPU + /// commands into the specified pCmdBuf - in the case of a session that spans multiple command buffers, the + /// command buffer specified to End() _must_ be the last command buffer of the session that is submitted. + /// + /// @param [in] pCmdBuf Last (normally _only_) command buffer of the session. Can be used by implementation + /// to insert GPU commands required after all samples are inserted (e.g., to confirm session + /// completion). + /// + /// @returns Success if the session was successfully moved to the _complete_ state. Otherwise, possible errors + /// include: + /// + ErrorUnavailable if the session isn't currently in the _building_ state.
+ Pal::Result End(Pal::ICmdBuffer* pCmdBuf); + + /// Marks the beginning of a range of GPU operations to be measured and specifies what data should be recorded. + /// + /// It is possible the sample will not succeed due to internal memory allocation failure, etc. In those cases, + /// the session will be marked invalid and no sample commands will be inserted. Reporting of this error is + /// delayed until GetResults(). + /// + /// A note for GpuBlock::SqWgp + /// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity + /// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock + /// gating (CGCG) and power gating. PAL exposes this feature to clients. + /// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same + /// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. + /// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features + /// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. + /// The counter data is still reported per WGP (not aggregated for the whole SE). + /// + /// Check the following two documents for details: + /// + /// @param [in] pCmdBuf Command buffer to issue the begin sample commands. All operations performed + /// between executing the BeginSample() and EndSample() GPU commands will contribute to + /// the sample results. + /// @param [in] sampleConfig Describes what data should be sampled. + /// @param [out] pSampleId An ID corresponding to this sample. This ID should be recorded and passed back to + /// EndSample() when the sampled command buffer range is complete. This ID should also + /// be passed to GetResults() when the session is in the _ready_ state in order to get + /// the results of this sample.
+ /// + /// @returns Success if the update was successful. Unsupported if the sample config type is not supported. + /// Otherwise, possible errors include: + /// + ErrorInvalidPointer if pCmdBuf or pSampleId is nullptr. + Pal::Result BeginSample( + Pal::ICmdBuffer* pCmdBuf, + const GpaSampleConfig& sampleConfig, + Pal::uint32* pSampleId); + + /// Updates the trace parameters for a specific sample. + /// + /// @param [in] pCmdBuf Command buffer to issue the update commands. + /// @param [in] sampleId Identifies the sample to be updated, if required by the mode. This should be a value + /// returned by BeginSample(), and must correspond to a thread trace sample. + /// @param [in] updateMode The way the sample parameters should be set. Some modes have additional restrictions. + /// @see UpdateSampleTraceMode + /// + /// @returns Success if the update was successful. Otherwise, possible errors + /// include: + /// + ErrorInvalidPointer if pCmdBuf is nullptr. + /// + ErrorInvalidObjectType if a sample is required and the sample associated with sampleId is not a + /// trace sample. + Pal::Result UpdateSampleTraceParams( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId, + UpdateSampleTraceMode updateMode); + + /// Marks the end of a range of command buffer operations to be measured. + /// + /// @param [in] pCmdBuf Command buffer to issue the end sample commands. All operations performed between + /// executing the BeginSample() and EndSample() GPU commands will contribute to the sample + /// results. _Cumulative_ samples (i.e., global performance counter samples) must never span + /// multiple command buffers (EndSample() should be called in the same command buffer as + /// BeginSample()). + /// @param [in] sampleId Identifies the sample to be ended. This should be the value returned by BeginSample() + /// for the sample that is being ended. 
+ /// + /// @note BeginSample() must be called before EndSample() _and_ the GPU commands inserted by BeginSample() must be + /// executed before the command inserted by EndSample(). Since a session is a single-threaded object, this + /// will normally happen naturally. + void EndSample( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId); + + /// Copies the DF SPM trace buffer to the GpaSession result buffer + /// + /// @param [in] pCmdBuf Command buffer to issue the copy commands. + /// @param [in] sampleId Identifies the sample to be copied. + /// @note This must be called after a command buffer with the dfSpmTraceEnd CmdBufInfo flag + /// and with a separate command buffer. DF SPM traces are on a per command buffer granularity + /// because they are started and stopped by the KMD. + void CopyDfSpmTraceResults( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId); + + /// Provides API specific information about an RGP trace. + /// + /// @param [in] traceApiInfo Const reference to the struct of API specific information. + /// @param [in] sampleId Sample ID (returned by BeginSample) for the RGP trace type sample info is being + /// provided for. + void SetSampleTraceApiInfo( + const SampleTraceApiInfo& traceApiInfo, + Pal::uint32 sampleId) const; + + /// Reports if GPU execution of this session has completed and results are _ready_ for querying from the CPU via + /// GetResults(). + /// + /// @returns true if all samples in the session have completed GPU execution. + bool IsReady() const; + + /// Reports results of a particular sample. Only valid for sessions in the _ready_ state. + /// + /// Results will be formatted depending on the sample type: + /// + Cumulative: Results will be an array of uint64 values in the order of perf counter IDs specified by + /// BeginSample(). + /// + SqThreadTrace: Results will be a binary blob in the RGP file format. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). 
+ /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData, and *pSizeInBytes will be set to the amount of space written + /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required (should only be necessary when + /// getting RGP data). If non-null, the sample results will be written to this + /// location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + Pal::Result GetResults( + Pal::uint32 sampleId, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the SQTT results. Only valid for sessions in the _complete_ state. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). + /// @param [in] traceIndex The index of the trace to get. + /// @param [out] pTraceInfo Optional pointer to a structure which will be written with information about the trace. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData, and *pSizeInBytes will be set to the amount of space written + /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location.
+ /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + NotFound if the given index is not valid. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + /// + ErrorInvalidPointer if pSizeInBytes is nullptr. + Pal::Result GetSqttTraceData( + Pal::uint32 sampleId, + Pal::uint32 traceIndex, + SqttTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the SPM trace results of a particular sample. Only valid for 'Trace' type samples and sessions + /// in the _complete_ state. + /// + /// Results in the output buffer are a binary blob formatted according to the RGP specification. + /// The data layout of the populated output buffer is as follows: + /// - Timestamps array [size: "numTimestamps * sizeof(uint64)" bytes] + /// - SpmCounterInfo array [size: "numSpmCounters * sizeof(SpmCounterInfo)" bytes] + /// - SPM Counter Data matrix [size: "*pSizeInBytes - (timestamps array + SpmCounterInfo array size)" bytes] + /// + /// The SPM Counter Data matrix is laid out linearly in a row-major format. There are "numSpmCounters" rows and + /// "numTimestamps" columns. Each element in the matrix is either 16- or 32-bits, based on the "dataSize" field + /// of the corresponding "SpmCounterInfo" entry. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). + /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with trace metadata. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData.
+ /// If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + Pal::Result GetSpmTraceData( + Pal::uint32 sampleId, + SpmTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the Queue Timings data from the active GpaSession. + /// Only valid when the GpaSession had `enableQueueTiming` flag set. + /// + /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with metadata. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData. + /// If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session was not configured with `enableQueueTiming`. 
+ Pal::Result GetQueueTimingsData( + QueueTimingsTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Moves the session to the _reset_ state, marking all sessions resources as unused and available for reuse when + /// the session is re-built. + /// + /// @warning This function cannot be called when the session is queued for execution on the GPU. The client must + /// confirm this is not the case using IsReady(), fences, etc. + /// + /// @returns Success if the session was successfully moved to the _reset_ state. Otherwise, possible errors + /// include: + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result Reset(); + + /// Uses the GPU to copy results from a nested command buffer's session into a root-level command buffer's per- + /// invocation session data. + /// + /// This command will implicitly wait for the source session (as specified in the copy constructor) to be complete + /// then use the GPU to update this session's data. This allows the client to get accurate sample data in the + /// case where a nested command buffer is launched multiple times from the same root-level command buffer. + /// + /// The session remains in the _complete_ state after calling this, and the client should submit the commands + /// and verify their completion to move to the _ready_ state. + /// + /// @param pCmdBuf Command buffer where the session copy should be performed. + void CopyResults(Pal::ICmdBuffer* pCmdBuf); + + /// Register pipeline with GpaSession for obtaining shader dumps and load events in the RGP file. + /// + /// @param [in] pPipeline The PAL pipeline to be tracked. + /// @param [in] clientInfo API-dependent information for this pipeline to also be recorded. + /// + /// @returns Success if the pipeline has been registered with GpaSession successfully. + /// + AlreadyExists if a duplicate pipeline is provided. 
+ Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + + /// Unregister pipeline with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the PAL pipeline object. + /// + /// @param [in] pPipeline The PAL pipeline to be tracked. + /// + /// @returns Success if the pipeline has been unregistered with GpaSession successfully. + Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); + + /// Register library with GpaSession for obtaining shader dumps and load events in the RGP file. + /// + /// @param [in] pLibrary The PAL library to be tracked. + /// @param [in] clientInfo API-dependent information for this library to also be recorded. + /// + /// @returns Success if the library has been registered with GpaSession successfully. + /// + AlreadyExists if a duplicate library is provided. + Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); + + /// Unregister library with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the PAL library object. + /// + /// @param [in] pLibrary The PAL library to be tracked. + /// + /// @returns Success if the library has been unregistered with GpaSession successfully. + Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); + + /// Register ELF binary with GpaSession for obtaining kernel dumps and load events in the RGP file. + /// + /// @param [in] elfBinaryInfo Contains information about the Elf binary to be recorded. + /// + /// @returns Success if the Elf binary has been registered with GpaSession successfully. + Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + /// Unregister Elf binary with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the Elf binary. 
+ /// + /// @param [in] elfBinaryInfo Contains the elf binary info to be removed from tracking. + /// + /// @returns Success if the Elf binary has been unregistered with GpaSession successfully. + Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + /// Given a Pal device, validate a list of perf counters. + /// + /// @param [in] pDevice The given device. + /// @param [in] pCounters A list of perf counters. + /// @param [in] numCounters Number of perf counters in the list. + /// + /// @returns Success if counters are valid. + Pal::Result ValidatePerfCounters(Pal::IDevice* pDevice, + const PerfCounterId* pCounters, + const Pal::uint32 numCounters); + +private: + // Tracking structure for a single IGpuMemory allocation owned by a GpaSession. In particular, it + // tracks the associated CPU pointer since these allocations remain mapped for CPU access for their lifetime. + struct GpuMemoryInfo + { + Pal::IGpuMemory* pGpuMemory; + void* pCpuAddr; + }; + + // Event type for code object load events + enum class CodeObjectLoadEventType + { + LoadToGpuMemory = 0, + UnloadFromGpuMemory + }; + + // Represents all information to be contained in one SqttCodeObjectLoaderEventRecord + struct CodeObjectLoadEventRecord + { + CodeObjectLoadEventType eventType; + Pal::uint64 baseAddress; + Pal::ShaderHash codeObjectHash; + Pal::uint64 timestamp; + }; + + // Represents all information to be contained in one SqttPsoCorrelationRecord + struct PsoCorrelationRecord + { + Pal::uint64 apiPsoHash; + Pal::PipelineHash internalPipelineHash; + }; + + // Registers a single (non-archive) pipeline with the GpaSession. Returns AlreadyExists on duplicate PAL pipeline. + Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + + // Unregisters a single (non-archive) pipeline from the GpaSession.
+ Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); + + Pal::IDevice*const m_pDevice; // Device associated with this GpaSession. + Pal::DeviceProperties m_deviceProps; + Pal::SetClockModeOutput m_peakClockFrequency; // Output of query for stable peak, values in MHz + Pal::PerfExperimentProperties m_perfExperimentProps; + Pal::uint32 m_timestampAlignment; // Pre-calculated timestamp data alignment. + ApiType m_apiType; // API type, e.g. Vulkan, used in RGP dumps. + Pal::uint16 m_apiMajorVer; // API major version, used in RGP dumps. + Pal::uint16 m_apiMinorVer; // API minor version, used in RGP dumps. + Pal::uint16 m_instrumentationSpecVersion; // Spec version of RGP instrumentation. + Pal::uint16 m_instrumentationApiVersion; // Api version of RGP instrumentation. + + Pal::IGpuEvent* m_pGpuEvent; + GpaSessionState m_sessionState; + + const GpaSession* const m_pSrcSession; // source session for session created via copy c'tor + + // Tracks the current GPU memory object and offset being sub-allocated for AcquireGpuMem(). + GpuMemoryInfo m_curGartGpuMem; + Pal::gpusize m_curGartGpuMemOffset; + GpuMemoryInfo m_curLocalGpuMem; + Pal::gpusize m_curLocalGpuMemOffset; + GpuMemoryInfo m_curInvisGpuMem; + Pal::gpusize m_curInvisGpuMemOffset; + + // Locks for the local-invisible, gart and local memory subdivision (and their pools) + Util::Mutex m_gartGpuMemLock; + Util::Mutex m_localGpuMemLock; + Util::Mutex m_invisGpuMemLock; + + // Counts number of samples that are active in this GpaSession. + Pal::uint32 m_sampleCount; + + Pal::IPlatform*const m_pPlatform; // Platform associated with this GpaSession. + + // GartHeap / LocalHeap / InvisHeap GPU chunk pools.
+ Util::Deque m_availableGartGpuMem; + Util::Deque m_busyGartGpuMem; + Util::Deque m_availableLocalGpuMem; + Util::Deque m_busyLocalGpuMem; + Util::Deque m_availableInvisGpuMem; + Util::Deque m_busyInvisGpuMem; + + struct SampleItem; + class PerfSample; + class CounterSample; + class TraceSample; + class TimingSample; + class QuerySample; + + Util::Vector m_sampleItemArray; + PerfExpMemDeque* m_pAvailablePerfExpMem; + + // Unique pipelines registered with this GpaSession. + Util::HashSet m_registeredPipelines; + // Unique API PSOs registered with this GpaSession. + Util::HashSet m_registeredApiHashes; + + // List of cached pipeline code object records that will be copied to the final database at the end of a trace + Util::Deque m_codeObjectRecordsCache; + // List of pipeline code object records that were registered during a trace + Util::Deque m_curCodeObjectRecords; + + // List of cached code object load event records that will be copied to the final database at the end of a trace + Util::Deque m_codeObjectLoadEventRecordsCache; + // List of code object load event records that were registered during a trace + Util::Deque m_curCodeObjectLoadEventRecords; + + // List of cached PSO correlation records that will be copied to the final database at the end of a trace + Util::Deque m_psoCorrelationRecordsCache; + // List of PSO correlation records that were registered during a trace + Util::Deque m_curPsoCorrelationRecords; + + Util::RWLock m_registerPipelineLock; + + // Event type for timed queue events + enum class TimedQueueEventType : Pal::uint32 + { + Submit, + Signal, + Wait, + Present, + ExternalSignal, + ExternalWait + }; + + // Struct that contains information about a specific timed queue event. 
+ struct TimedQueueEventItem + { + TimedQueueEventType eventType; // Type of event + Pal::uint64 cpuTimestamp; // Time when the event was processed on the cpu + Pal::uint64 apiId; // The api specific id for the queue event + Pal::uint32 sqttCmdBufId; // The sqtt command buffer id value associated with a submit event + Pal::uint32 submitSubIndex; // The sub index of an event within a submission event. + Pal::uint32 queueIndex; // The index of the associated queue in the m_timedQueuesArray + Pal::uint64 frameIndex; // The index of the current frame being rendered + union + { + struct + { + GpuMemoryInfo memInfo[2]; // The gpu memory for the timestamps associated with the event + Pal::gpusize offsets[2]; // Memory offsets for the associated timestamp gpu memory + } gpuTimestamps; + + Pal::uint64 cpuCompletionTimestamp; // The time when the event completed on the cpu + }; + }; + + // Struct for keeping track of timed operation on a specific queue + struct TimedQueueState + { + Pal::IQueue* pQueue; // Pal Queue + Pal::uint64 queueId; // Api specific queue id + Pal::uint64 queueContext; // Api specific queue context + Pal::QueueType queueType; // Queue type + Pal::EngineType engineType; // Engine type + bool valid; // Used to track if the queue is valid + Util::Deque* pAvailableCmdBuffers; // List of available cmdbuffers + Util::Deque* pBusyCmdBuffers; // List of busy cmdbuffers + Pal::IFence* pFence; // Used to track queue + // operations + }; + + // Flags for the current session. 
+ GpaSessionFlags m_flags; + + // Array containing all of the queues registered for timing operations + Util::Vector m_timedQueuesArray; + Util::RWLock m_timedQueuesArrayLock; + + // List of timed queue events for the current session + Util::Vector m_queueEvents; + Util::Mutex m_queueEventsLock; + + // List of timestamp calibration samples + Util::Vector m_timestampCalibrations; + + // The most recent gpu clocks sample + GpuClocksSample m_lastGpuClocksSample; + + // Internal command allocator used for timing command buffers + Pal::ICmdAllocator* m_pCmdAllocator; + + // Finds the TimedQueueState associated with pQueue. + Pal::Result FindTimedQueue(Pal::IQueue* pQueue, + TimedQueueState** ppQueueState, + Pal::uint32* pQueueIndex); + + // Finds the TimedQueueState associated with queueContext. + Pal::Result FindTimedQueueByContext(Pal::uint64 queueContext, + TimedQueueState** ppQueueState, + Pal::uint32* pQueueIndex); + + /// Injects an external timed queue semaphore operation event + Pal::Result ExternalTimedQueueSemaphoreOperation(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSemaphoreInfo, + bool isSignalOperation); + + /// Converts a CPU timestamp to a GPU timestamp using a CalibratedTimestamps struct + Pal::uint64 ConvertCpuTimestampToGpuTimestamp(Pal::uint64 cpuTimestamp, + const Pal::CalibratedTimestamps& calibration) const; + + /// Extracts a GPU timestamp from a queue event + Pal::uint64 ExtractGpuTimestampFromQueueEvent(const TimedQueueEventItem& queueEvent) const; + + // Creates a new command buffer for use on pQueue + Pal::Result CreateCmdBufferForQueue(Pal::IQueue* pQueue, + Pal::ICmdBuffer** ppCmdBuffer); + + // Acquires a command buffer from the TimedQueueState's command buffer pool + Pal::Result AcquireTimedQueueCmdBuffer(TimedQueueState* pQueueState, + Pal::ICmdBuffer** ppCmdBuffer); + + // Recycles busy command buffers in pQueueState + Pal::Result 
RecycleTimedQueueCmdBuffers(TimedQueueState* pQueueState); + + // Preallocates a fixed number of command buffers for pQueueState and adds them to the command buffer pool + Pal::Result PreallocateTimedQueueCmdBuffers(TimedQueueState* pQueueState, + Pal::uint32 numCmdBuffers); + + // Resets all per session state in pQueueState + Pal::Result ResetTimedQueueState(TimedQueueState* pQueueState); + + // Destroys the memory and resources for pQueueState + void DestroyTimedQueueState(TimedQueueState* pQueueState); + + // Helper function to import one sample item from a source session to copy session. + Pal::Result ImportSampleItem(const SampleItem* pSrcSampleItem); + + // Acquires a range of queue-owned GPU memory for use by the next command buffer submission. + Pal::Result AcquireGpuMem( + Pal::gpusize size, + Pal::gpusize alignment, + Pal::GpuHeap heapType, + Pal::GpuMemMallPolicy mallPolicy, + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset); + + // Acquires a GpaSession-owned performance experiment based on the device's active perf counter requests. + Pal::Result AcquirePerfExperiment( + GpaSession::SampleItem* pSampleItem, + const GpaSampleConfig& sampleConfig, + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset, + GpuMemoryInfo* pSecondaryGpuMem, + Pal::gpusize* pSecondaryOffset, + Pal::gpusize* pHeapSize, + Pal::IPerfExperiment** ppExperiment); + + // Acquires a session-owned pipeline stats query. + Pal::Result AcquirePipeStatsQuery( + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset, + Pal::gpusize* pHeapSize, + Pal::IQueryPool** ppQuery); + + // Dump SQ thread trace data in rgp format + Pal::Result DumpRgpData(const GpaSampleConfig* pTraceConfig, + TraceSample* pTraceSample, + void* pRgpOutput, + size_t* pTraceSize) const; + + // Dumps the spm trace data in the buffer provided. + Pal::Result AppendSpmTraceData(TraceSample* pTraceSample, + size_t bufferSize, + void* pData, + Pal::gpusize* pSizeInBytes) const; + + // Dumps the df spm trace data in the buffer provided. 
+ Pal::Result AppendDfSpmTraceData(TraceSample* pTraceSample, + size_t bufferSize, + void* pData, + Pal::gpusize* pSizeInBytes) const; + + Pal::Result AddCodeObjectLoadEvent(const Pal::IPipeline* pPipeline, CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent(const Pal::IShaderLibrary* pLibrary, CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent(const ElfBinaryInfo& elfBinaryInfo, CodeObjectLoadEventType eventType); + + // Recycle used Gart rafts and put back to available pool + void RecycleGartGpuMem(); + + // Recycle used Local rafts and put back to available pool + void RecycleLocalGpuMem(); + + // Recycle used Invisible rafts and put back to available pool + void RecycleInvisGpuMem(); + + // Destroy and free one sample item and its sub-items. + void FreeSampleItem(GpaSession::SampleItem* pSampleItem); + + // Destroy and free the m_sampleItemArray and associated memory allocation + void FreeSampleItemArray(); + + // Destroy the sub-items in m_sampleItemArray but keep associated memory allocations. + void RecycleSampleItemArray(); + + // Helper function to destroy the GpuMemoryInfo object + void DestroyGpuMemoryInfo(GpuMemoryInfo* pGpuMemoryInfo); + + PAL_DISALLOW_DEFAULT_CTOR(GpaSession); + GpaSession& operator =(const GpaSession&); +}; +} // GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h new file mode 100644 index 0000000000..a201fcc8c2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h @@ -0,0 +1,141 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuUtil.h + * @brief Common include for the PAL GPU utility collection. Defines common types, macros, enums, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" + +// Forward declarations. +namespace Pal +{ + struct DeviceProperties; + class IImage; + class IGpuMemory; + struct ImageCopyRegion; + struct TypedBufferCopyRegion; + struct MemoryImageCopyRegion; +} + +/// Library-wide namespace encapsulating all PAL GPU utility entities. +namespace GpuUtil +{ + +/// Validate image copy region. 
+/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] src Src image. +/// @param [in] dst Dst image. +/// @param [in] region Copy region. +/// +/// @returns true if the image copy is supported by the specific engine, otherwise false. +extern bool ValidateImageCopyRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::IImage& src, + const Pal::IImage& dst, + const Pal::ImageCopyRegion& region); + +/// Validate typed buffer copy region. +/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] region Copy region. +/// +/// @returns true if the typed buffer copy is supported by the specific engine, otherwise false. +extern bool ValidateTypedBufferCopyRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::TypedBufferCopyRegion& region); + +/// Validate image-memory copy region. +/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] image The IImage object. +/// @param [in] memory The IGpuMemory object. +/// @param [in] region Copy region. +/// @returns true if the image-memory copy is supported by the specific engine, otherwise false. 
+extern bool ValidateMemoryImageRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::IImage& image, + const Pal::IGpuMemory& memory, + const Pal::MemoryImageCopyRegion& region); + +/// Generate a 64-bit uniqueId for a GPU memory allocation +/// +/// @param [in] isInterprocess Indicates this uniqueId is for an externally shareable GPU memory allocation +/// +/// @returns 64-bit uniqueId +extern Pal::uint64 GenerateGpuMemoryUniqueId( + bool isInterprocess); + +} // GpuUtil + +/** + *********************************************************************************************************************** + * @page GpuUtilOverview GPU Utility Collection + * + * In addition to the generic, OS-abstracted software utilities, PAL provides GPU-specific utilities in the @ref GpuUtil + * namespace. The PAL GPU Utility Collection relies on both PAL core and PAL Utility. They are also available for use by + * its clients. + * + * All available PAL GPU utilities are defined in the @ref GpuUtil namespace, and are briefly summarized below. See the + * Reference topics for more detailed information on specific classes, enums, etc. + * + * ### TextWriter + * The TextWriter GPU utility class provides a method for clients to write text directly to an image. This can be used + * for debugging purposes. PAL's internal DbgOverlay uses the TextWriter class to write information about the current + * FPS and total allocated GPU video memory usage. + * + * The TextWriter class is broken up into palTextWriter.h and palTextWriterImpl.h. The intention is that palTextWriter.h + * will be included from other header files that need a full TextWriter definition, while palTextWriterImpl.h will be + * included by .cpp files that actually interact with the TextWriter. This should keep build times down versus putting + * all implementations directly in palTextWriter.h. 
+ * + * Also included in the TextWriter is the TextWriterFont namespace, which defines the shader IL for drawing the text via + * a compute shader. It also defines the Font data, which is a packed binary that represents which pixels of a 10x16 + * rectangle to render. The font is monospaced. + * + * ### Helper Functions + * ValidateImageCopyRegion - Validate the image copy region, returns true if the image copy is supported by the specific + * engine, otherwise false. + * + * ValidateTypedBufferCopyRegion - Validate the typed buffer copy region, returns true if the typed buffer copy is + * supported by the specific engine, otherwise false. + * + * ValidateMemoryImageRegion - Validate the image-memory copy region, returns true if the image-memory copy is supported + * by the specific engine, otherwise false. + * + * Next: @ref Overview + *********************************************************************************************************************** + */ diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h new file mode 100644 index 0000000000..a9b95610db --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h @@ -0,0 +1,236 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "palGpuUtil.h" +#include "palTraceSession.h" +#include "palGpaSession.h" + +#include <atomic> + +struct SqttQueueEventRecord; +struct SqttQueueInfoRecord; + +namespace Pal +{ +class Platform; +} + +namespace GpuUtil +{ +namespace TraceChunk +{ + +/// "QueueInfo" RDF chunk identifier & version +constexpr char QueueInfoChunkId[TextIdentifierSize] = "QueueInfo"; +constexpr Pal::uint32 QueueInfoChunkVersion = 1; + +/// Enum describing logical queue types +enum class QueueType : Pal::uint8 +{ + Unknown = 0, + Universal = 1, + Compute = 2, + Dma = 3, + Encode = 4, + Decode = 5, + Security = 6, + VideoProcessor = 7 +}; + +/// Enum describing hardware engine types +enum class HwEngineType : Pal::uint8 +{ + Unknown = 0, + Universal = 1, + Compute = 2, + ExclusiveCompute = 3, + Dma = 4, + Decode = 5, + Encode = 6, + HighPriorityUniversal = 7, + HighPriorityGraphics = 8, + Security = 9, + Vpe = 10 +}; + +/// Structure describing a queue's properties +struct QueueInfo +{ + Pal::uint32 pciId; ///< The ID of the GPU queried + Pal::uint64 queueId; ///< API-specific queue ID + Pal::uint64 queueContext; ///< OS-level queue context value from Windows KMD to correlate with ETW data. + /// Only applicable to D3D on Windows; 0 otherwise. 
+ QueueType queueType; ///< The logical queue type + HwEngineType engineType; ///< The hardware engine that the queue is mapped to +}; + +// ------------------------------------------------------------------------------------------- // + +/// "QueueEvent" RDF chunk identifier & version +constexpr char QueueEventChunkId[TextIdentifierSize] = "QueueEvent"; +constexpr Pal::uint32 QueueEventChunkVersion = 1; + +/// The type of queue-level timings event +enum class QueueEventType : Pal::uint32 +{ + CmdBufSubmit = 0, + SignalSemaphore = 1, + WaitSemaphore = 2, + Present = 3 +}; + +/// Structure describing a queue-level timings event +struct QueueEvent +{ + Pal::uint32 pciId; ///< The ID of the GPU queried + Pal::uint64 queueId; ///< The API-specific queue ID which triggered the event + QueueEventType eventType; ///< The type of the queue-timing event + Pal::uint32 sqttCmdBufId; ///< [`CmdBufSubmit` only; 0 otherwise] + /// SQTT command buffer ID matching CmdBufStart user data marker + Pal::uint64 frameIndex; ///< [`CmdBufSubmit` & `Present` only; 0 otherwise] + /// Global frame index incremented for each "Present" call + Pal::uint32 submitSubIndex; ///< [`CmdBufSubmit` only; 0 otherwise] + /// Sub-index of event within submission. + /// When there is only one CmdBuffer per submission, `submitSubIndex` is 0. + /// When there are multiple command buffers per submission, `submitSubIndex` + /// is incremented by one for each command buffer within the submission. 
+ Pal::uint64 apiEventId; ///< [`CmdBufSubmit`] API-specific command buffer ID signaled + /// [`SignalSemaphore`] API-specific semaphore ID signaled + /// [`WaitSemaphore`] API-specific semaphore ID waited on + /// [`Present`] N/A (set to 0) + Pal::uint64 cpuTimestamp; ///< CPU start timestamp of when this event is triggered in clock cycle units + Pal::uint64 gpuTimestamp1; ///< [`CmdBufSubmit`] GPU timestamp when the HW execution of command buffer began + /// [`SignalSemaphore`] GPU timestamp when the HW signaled the queue semaphore + /// [`WaitSemaphore`] GPU timestamp when HW finished waiting on the semaphore + /// [`Present`] GPU timestamp when HW processed the Present call + /// + /// All timestamps are expressed in clock cycle units. + Pal::uint64 gpuTimestamp2; ///< [`CmdBufSubmit` only; 0 otherwise] + /// GPU timestamp when the HW execution of command buffer finished +}; + +} // namespace TraceChunk + +// QueueTimings Trace Source name & version +constexpr char QueueTimingsTraceSourceName[] = "queuetimings"; +constexpr Pal::uint32 QueueTimingsTraceSourceVersion = 2; + +// ===================================================================================================================== +// This trace source captures queue timings data through GPA session & produces "QueueInfo" and "QueueEvent" RDF chunks +class QueueTimingsTraceSource : public ITraceSource +{ +public: + explicit QueueTimingsTraceSource(Pal::IPlatform* pPlatform); + virtual ~QueueTimingsTraceSource(); + + // ==== TraceSource Native Functions ========================================================================== // + Pal::Result Init(Pal::IDevice* pDevice); + + Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, + Pal::uint64 queueId, + Pal::uint64 queueContext); + + Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); + + Pal::Result TimedSubmit(Pal::IQueue* pQueue, + const Pal::MultiSubmitInfo& submitInfo, + const TimedSubmitInfo& timedSubmitInfo); + + Pal::Result 
TimedSignalQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedSignalInfo, + Pal::uint64 value = 0); + + Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedWaitInfo, + Pal::uint64 value = 0); + + Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, + const TimedQueuePresentInfo& timedPresentInfo); + + Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedWaitInfo); + + Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSignalInfo); + + bool IsTimingInProgress() const; + + // ==== Base Class Overrides =================================================================================== // + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { }; + + virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override; +#else + virtual void OnTraceAccepted() override; +#endif + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { }; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override; + virtual void OnTraceEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override {}; +#else + virtual void OnTraceEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override; +#endif + virtual void OnTraceFinished() override; + + virtual const char* GetName() const override { return QueueTimingsTraceSourceName; } + virtual Pal::uint32 GetVersion() const override { return 
QueueTimingsTraceSourceVersion; } + +private: + void WriteQueueInfoChunks( + const SqttQueueInfoRecord* pQueueInfoRecords, + size_t numQueueInfoRecords); + + void WriteQueueEventChunks( + const SqttQueueInfoRecord* pQueueInfoRecords, + size_t numQueueInfoRecords, + const SqttQueueEventRecord* pQueueEventRecords, + size_t numQueueEventRecords); + + void ReportInternalError(const char* pErrorMsg, Pal::Result result); + + Pal::IPlatform* const m_pPlatform; // IPlatform owning the parent TraceSession + GpaSession* m_pGpaSession; // Handle to GpaSession object for tracking queue timings + bool m_traceIsHealthy; // Internal flag for tracking resource and state health + std::atomic<bool> m_timingInProgress; // Flag for tracking if queue timings operations are ongoing + +}; + +} // namespace GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h new file mode 100644 index 0000000000..ce27cf212b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h @@ -0,0 +1,150 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "palTraceSession.h" + +namespace Pal +{ +class IPlatform; +class IQueue; +class ICmdBuffer; +class Device; +} + +namespace GpuUtil +{ + +/// Supported render operations used to advance the trace +enum RenderOp : Pal::uint8 +{ + RenderOpDraw = (1u << 0), + RenderOpDispatch = (1u << 1) +}; + +/// Structure used to batch submit render operations on queue submission +/// This struct should have a `*Count` field for each @ref RenderOp enumeration above +struct RenderOpCounts +{ + Pal::uint32 drawCount; + Pal::uint32 dispatchCount; +}; + +constexpr Pal::uint32 RenderOpTraceControllerVersion = 4; +constexpr char RenderOpTraceControllerName[] = "renderop"; + +// ===================================================================================================================== +class RenderOpTraceController : public ITraceController +{ +public: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 + using RenderOp = GpuUtil::RenderOp; +#endif + RenderOpTraceController(Pal::IPlatform* pPlatform, Pal::IDevice* pDevice); + virtual ~RenderOpTraceController(); + + virtual const char* GetName() const override { return RenderOpTraceControllerName; } + virtual Pal::uint32 GetVersion() const override { return RenderOpTraceControllerVersion; } + + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override; + virtual Pal::Result 
OnTraceRequested() override; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) override; +#endif + virtual Pal::Result OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; + virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; + virtual Pal::Result OnEndPostambleGpuWork( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer** ppCmdBuffer) override; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 + void RecordRenderOp(Pal::IQueue* pQueue, RenderOp renderOp); +#endif + + void FinishTrace(); + + // Cancel the trace currently in progress. + virtual Pal::Result OnTraceCanceled() override; + + /// This function must be called by client drivers implementing the RenderOp controller. + /// On every queue submission, this function is called with the cumulative counts of render operations + /// recorded into that queue's command buffers. + /// Based on the controller's internal mask, set by the user during trace configuration, + /// the trace controller may advance its state. 
+ void RecordRenderOps(Pal::IQueue* pQueue, const RenderOpCounts& renderOpCounts); + +private: + /// Controls whether the trace proceeds on absolute render op counts or relative + enum class CaptureMode : Pal::uint8 + { + Relative = 0, ///< Relative to when the trace request is received + Absolute ///< Absolute render op index + }; + + Pal::Result AcceptTrace(); + Pal::Result BeginTrace(); + + Pal::Result SubmitBeginTraceGpuWork() const; + Pal::Result SubmitEndTraceGpuWork(); + Pal::Result SubmitEndPostambleGpuWork(); + + Pal::Result WaitForTraceEndGpuWorkCompletion() const; + Pal::Result CreateFence(Pal::IFence** ppFence) const; + Pal::Result CreateCommandBuffer(bool traceEnd, Pal::ICmdBuffer** ppCmdBuf) const; + Pal::Result CreateCmdAllocator(); + + void OnRenderOpUpdated(Pal::uint64 countRecorded); + void FreeResources(); + void AbortTrace(); + + Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceController + Pal::IDevice* m_pDevice; // Device associated with this TraceController + Pal::ICmdAllocator* m_pCmdAllocator; // Command allocator for the TraceController + + TraceSession* m_pTraceSession; // TraceSession owning this TraceController + Pal::uint64 m_supportedGpuMask; // Bit mask of GPU indices that are capable of participating in the trace + Pal::uint8 m_renderOpMask; // Bitmask of RenderOp modes, indicating which are accepted + CaptureMode m_captureMode; // Modality for determining the starting renderop index of the trace + Pal::uint64 m_renderOpCount; // The "global" count, incremented on every render op + Pal::uint64 m_prepStartRenderOp; // Relative or absolute render op number indicating trace begin + Pal::uint64 m_numPrepRenderOps; // Number of "warm-up" render ops before the starting render op + Pal::uint64 m_captureRenderOpCount; // Number of render ops to wait before ending the trace + Pal::uint64 m_renderOpTraceAccepted; // The render op count when the trace was accepted + + Util::Mutex m_renderOpLock; // Lock over RecordRenderOps/OnRenderOpUpdated (presumably -- TODO confirm against the .cpp) + 
Pal::IQueue* m_pQueue; // The queue being used to submit Begin/End GPU trace command buffers +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + Pal::ICmdBuffer* m_pCmdBufTracePrepare; // Command buffer for recording during the prep phase +#endif + Pal::ICmdBuffer* m_pCmdBufTraceBegin; // Command buffer to submit Trace Begin + Pal::ICmdBuffer* m_pCmdBufTraceEnd; // Command buffer to submit Trace End + Pal::ICmdBuffer* m_pCmdBufPostambleEnd; // Command buffer to submit Postamble End + Pal::IFence* m_pFenceTraceEnd; // Fence to wait for Trace End command buffer completion + Pal::IFence* m_pFencePostambleEnd; // Fence to wait for Postamble End command buffer completion +}; + +} // namespace GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h new file mode 100644 index 0000000000..b5cf7bda3b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h @@ -0,0 +1,737 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palTraceSession.h + * @brief PAL GPU utility TraceSession class. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palPlatform.h" +#include "palDeque.h" +#include "palDevice.h" +#include "palGpuUtil.h" +#include "palHashMap.h" +#include "palMutex.h" +#include "palPipeline.h" +#include "palSysMemory.h" +#include "palGpuMemory.h" +#include "palMemTrackerImpl.h" +#include "palVector.h" + +struct rdfStream; +struct rdfChunkFileWriter; + +namespace DevDriver +{ +class IStructuredWriter; +class IStructuredReader; +class StructuredValue; +} + +namespace GpuUtil +{ + +class ITraceController; +class ITraceSource; + +constexpr Pal::uint16 TextIdentifierSize = 16; + +/// Information required to create a new chunk of trace data in a TraceSession +/// +/// The data inside this structure is expected to be produced by trace source implementations. The specific fields +/// included within this structure are intended to support compatibility with the Radeon Data Format (RDF) spec. 
+struct TraceChunkInfo +{ + char id[TextIdentifierSize]; ///< Text identifier of the chunk + Pal::uint32 version; ///< Version number of the chunk + const void* pHeader; ///< [in] Pointer to a buffer that contains the header data for the chunk + Pal::int64 headerSize; ///< Size of the buffer pointed to by pHeader + const void* pData; ///< [in] Pointer to a buffer that contains the data for the chunk + Pal::int64 dataSize; ///< Size of the buffer pointed to by pData + bool enableCompression; ///< Indicates if the chunk's data should be compressed or not +}; + +/// The available states of TraceSession +enum class TraceSessionState : Pal::uint32 +{ + Ready = 0, ///< New trace ready to begin + Requested = 1, ///< A trace has been requested and is awaiting acceptance + Preparing = 2, ///< Trace has been accepted and is preparing resources before beginning + Running = 3, ///< Trace is in progress +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + Postamble = 4, ///< The detailed frame trace has ended but its data has not yet been written + /// into the session. Some trace sources may still collect data during this time. + PostambleWaiting = 5, ///< Waiting for Postamble to complete. + Completed = 6, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). + Count = 7 +#else + Waiting = 4, ///< Trace has ended, but data has not been written into the session + Completed = 5, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). + Count = 6 +#endif +}; + +/// Defines the type of payload. Currently only strings are supported but in the future can include JSON, structs, etc. 
+enum class TraceErrorPayload : Pal::uint32 +{ + None, ///< Should be set when there is no additional information to be sent with the error + ErrorString ///< Should be set when the error payload is string data +}; + +/// Chunk header for the error tracing chunk +struct TraceErrorHeader +{ + char chunkId[TextIdentifierSize]; ///< Text identifier of the failing chunk + Pal::uint32 chunkIndex; ///< Chunk index of the failing chunk + Pal::Result resultCode; ///< PAL Result code of the failure + TraceErrorPayload payloadType; ///< Type of error chunk payload +}; + +constexpr char ErrorChunkTextIdentifier[TextIdentifierSize] = "TraceError"; +constexpr Pal::uint32 ErrorTraceChunkVersion = 1; + +/** +*********************************************************************************************************************** +* @interface ITraceController +* @brief Interface that allows for control of a trace operation through TraceSession. +* +* Trace controllers are responsible for driving the high-level steps of a trace operation. 
Users of this interface are +* expected to create their own implementation of this interface, register it with a TraceSession, then call the +* following TraceSession functions to drive the trace process: +* +* TraceSession::AcceptTrace +* TraceSession::BeginTrace +* TraceSession::EndTrace +* TraceSession::EndPostamble +* TraceSession::FinishTrace +*********************************************************************************************************************** +*/ +class ITraceController +{ +public: + /// Returns the name of the controller + /// + /// @returns the name of the controller as a null terminated string + virtual const char* GetName() const = 0; + + /// Returns the version of the controller + /// + /// @returns the version of the controller as an unsigned integer value + virtual Pal::uint32 GetVersion() const = 0; + + /// Called by the associated session to update the current trace configuration + /// + /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; + + /// Called by the associated session to notify the controller that a trace has been requested and it can take + /// control of the TraceSession when desired. + virtual Pal::Result OnTraceRequested() = 0; + + /// Called by the associated session to notify the controller that a trace has been canceled and it can start + /// canceling the trace when ready. + virtual Pal::Result OnTraceCanceled() = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + /// Called by TraceSession to indicate that GPU work is required on the indicated GPU during the preparation phase. + /// The command buffer must be ready to record commands; however, the trace controller should not submit it + /// until the trace begins. 
+ /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: This command buffer should be submitted at the same time as the command buffer provided in + /// `OnBeginGpuWork`. They may be the same command buffer or separate; the goal is to allow trace sources + /// to frontload recording GPU work before the trace formally begins. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to record GPU work before a trace starts executing. + /// Note that this command buffer shouldn't be submitted until the trace begins. + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; +#endif + + /// Called by TraceSession to indicate that GPU work is required to begin a trace on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. 
+ /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; + + /// Called by TraceSession to indicate that GPU work is required to end a trace on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the trace + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; + + /// Called by TraceSession to indicate that GPU work is required to end the postamble on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. 
+ /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + /// NOTE(review): this method is declared unconditionally, whereas TraceSession::EndPostamble and + /// ITraceSource::OnPostambleEnd are only declared when PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - confirm + /// that the difference is intended. + virtual Pal::Result OnEndPostambleGpuWork( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer** ppCmdBuf) = 0; +}; + +/** +*********************************************************************************************************************** +* @interface ITraceSource +* @brief Interface that enables developers to emit arbitrary data chunks into a trace through TraceSession. +* +* Trace sources are used to implement any surrounding logic required to produce a trace data chunk. Users of this +* interface are expected to create their own implementation of this interface, register it with a TraceSession, then +* call TraceSession::WriteDataChunk during a trace operation whenever a data chunk should be produced. 
+*********************************************************************************************************************** +*/ +class ITraceSource +{ +public: + /// Called by the associated session to update the current trace configuration + /// + /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; + + /// Returns a bitmask that represents which GPUs are relevant to this trace source + /// + /// If the bit at index N is set, GPU N must execute work in order to produce trace data + virtual Pal::uint64 QueryGpuWorkMask() const = 0; + + /// Called by the associated session to notify the source that a new trace has been accepted + /// + /// The source may use this notification to do any preparation work that might be required before the trace begins. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + /// A command buffer is provided for the trace source to insert any work into. Note that the work will not be + /// submitted until the trace begins (at the same time as `OnTraceBegin`). This allows for frontloading of + /// expensive operations, such as the construction of a GpaSession sample, that would affect runtime speed + /// or behavior during trace execution. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to record any GPU work required during the + /// preparation phase of the trace. Not submitted until `OnTraceBegin`. + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; +#else + virtual void OnTraceAccepted() = 0; +#endif + + /// Called by the associated session to notify the source that it should begin a trace + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// begin a trace operation. 
+ /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. All GPUs that begin a trace are required to end it later. Sources are not expected + /// to handle cases where the begin/end function calls are mismatched during a trace operation. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; + + /// Called by the associated session to notify the source that it should end the current trace + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// end a trace operation. + /// + /// The command buffer associated with the OnTraceBegin function is not guaranteed to have finished GPU execution + /// when this function is called. The command buffer associated with this function is also not guaranteed to finish + /// execution until OnTraceFinished is called. + /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the trace + virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; + + /// Called by the associated session to notify the source that it should end the postamble + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// end its postamble operation. + /// + /// The command buffers associated with the OnTraceBegin and OnTraceEnd functions are not guaranteed to have + /// finished GPU execution when this function is called. 
The command buffer associated with this function is also + /// not guaranteed to finish execution until OnTraceFinished is called. + /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) = 0; +#endif + + /// Called by the associated session to notify the source that the current trace has finished + /// + /// When this function is called, all prior command buffers provided to the source during the trace operation have + /// finished execution. The source should use this function to collect any data generated by the GPU and emit it + /// via TraceSession::WriteDataChunk. + virtual void OnTraceFinished() = 0; + + /// Returns the name of the source + /// + /// @returns the name of the source as a null terminated string + virtual const char* GetName() const = 0; + + /// Returns the version of the source + /// + /// @returns the version of the source as an unsigned integer value + virtual Pal::uint32 GetVersion() const = 0; + + /// Whether multiple instances of the trace source are allowed + /// + /// @returns true if multiple instances of this trace source can co-exist in one session, false otherwise. + virtual bool AllowMultipleInstances() const { return false; } +}; + +/** +*********************************************************************************************************************** +* @class TraceSession +* @brief Helper class providing common driver functionality for collecting arbitrary data traces. +* +* Due to the global nature of the trace functionality, only one TraceSession is typically used at a time. 
+* An interface to acquire a session exists on IPlatform. Users who need to interact with an instance of this object +* should expect to acquire it there. +* +* @see IPlatform::GetTraceSession() +*********************************************************************************************************************** +*/ +class TraceSession final +{ +public: + /// Constructor. + /// + /// @param [in] pPlatform Platform associated with this TraceSession + TraceSession(Pal::IPlatform* pPlatform); + + /// Destructor + ~TraceSession(); + + /// Initialize the trace session before requesting a trace. + /// + /// @returns Success if initialization was successful, or ErrorUnknown upon failure. + Pal::Result Init(); + + /// Returns whether tracing has been formally enabled via UberTrace or not. + /// If 'true', this means that tool-side applications have requested this + /// TraceSession to capture traces. This has implications for PAL clients. + /// + /// @returns True if tracing has been enabled, and false otherwise. + bool IsTracingEnabled() const { return m_tracingEnabled; } + + /// Attempts to update the current trace configuration + /// + /// This function will only succeed if there is currently no trace in progress + /// + /// TODO: The JSON configuration interface will likely be replaced with driver settings in the future + /// + /// @param [in] pData Buffer that stores the Json-formatted configuration data + /// @param [in] dataSize Configuration data-size + /// + /// @returns Success if the trace configuration was successfully updated. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is currently in progress + /// + ErrorInvalidPointer pData is nullptr + /// + ErrorInvalidParameter pData is not valid json + Pal::Result UpdateTraceConfig(const void* pData, size_t dataSize); + + /// Attempts to request a new trace operation on the trace session. 
+ /// + /// Once a trace is successfully requested, it will become available for a registered trace controller to accept. + /// When a controller accepts the trace, it becomes responsible for managing the rest of the trace operation and + /// notifying the session upon trace completion. + /// + /// Since the session can only run a single trace at a time, this function will not succeed if another trace is + /// already requested or in progress. + /// + /// @returns Success if the trace operation was successfully requested. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if there is a trace in progress already and a new one cannot be started + Pal::Result RequestTrace(); + + /// Cancels a trace currently in progress. + /// + /// @returns Success if the trace was successfully canceled. + /// Otherwise, one of the following errors may be returned: + /// + NotReady if the trace is not ready to be canceled. + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result CancelTrace(); + + /// Cleans up the RDF chunk stream and makes it ready for a new trace again. + /// + /// @returns Success if the trace session and rdf streams were successfully cleaned up and returned to the + /// initialization state + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result CleanupChunkStream(); + + /// Attempts to consume any trace data stored within the trace session. + /// + /// This function will only successfully return trace data after a trace operation is completed on the session. + /// + /// TODO: This function should be replaced with one that uses a callback so we can avoid needing to store the trace + /// data into memory twice. 
+ /// + /// @param [out] pData (Optional) Destination buffer to copy the trace data into + /// If this parameter is nullptr, the size of the trace data in bytes will be + /// returned via pDataSize instead of consuming any trace data. + /// @param [in/out] pDataSize If pData is nullptr, then this parameter is used to return the trace data + /// size in bytes. + /// If pData is valid, this parameter represents the size of the buffer + /// pointed to by pData. + /// + /// @returns Success if the trace data was successfully consumed or the size of the trace data was returned. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library + /// + ErrorUnavailable if trace data is not available for collection at this time + /// + ErrorInvalidPointer if nullptr is passed as pDataSize + /// + ErrorInvalidMemorySize if *pDataSize indicates that pData is too small to contain the trace data + Pal::Result CollectTrace(void* pData, size_t* pDataSize); + + /// Attempts to register a trace controller + /// + /// Once registered, trace controllers can receive configuration updates from the session. + /// They may also manage the trace operation by calling AcceptTrace, BeginTrace, EndTrace, EndPostamble and FinishTrace. + /// + /// Trace controllers can only be registered when there is no trace in progress + /// + /// @param [in] pController The trace controller to register with the session + /// + /// @returns Success if the controller was successfully registered. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. 
+ /// + AlreadyExists if this controller has already been registered + /// + ErrorUnavailable if a trace is in progress + /// + ErrorInvalidPointer if nullptr is passed as pController + Pal::Result RegisterController(ITraceController* pController); + + /// Attempts to unregister a previously registered trace controller + /// + /// @param [in] pController The trace controller to unregister from the session + /// + /// @returns Success if the controller was successfully unregistered. + /// Otherwise, one of the following errors may be returned: + /// + NotFound if the provided controller was not previously registered + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is in progress + Pal::Result UnregisterController(ITraceController* pController); + + /// Attempts to register a trace source + /// + /// Once registered, trace sources can receive configuration updates from the session. + /// They may also emit data during trace operations by calling WriteDataChunk. + /// + /// Trace sources can only be registered when there is no trace in progress + /// + /// @param [in] pSource The trace source to register with the session + /// + /// @returns Success if the source was successfully registered. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + AlreadyExists if this source has already been registered + /// + ErrorUnavailable if a trace is in progress + /// + ErrorInvalidPointer if nullptr is passed as pSource + Pal::Result RegisterSource(ITraceSource* pSource); + + /// Attempts to unregister a previously registered trace source + /// + /// @param [in] pSource The trace source to unregister from the session + /// + /// @returns Success if the source was successfully unregistered. 
+ /// Otherwise, one of the following errors may be returned: + /// + NotFound if the provided source was not previously registered + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is in progress + Pal::Result UnregisterSource(ITraceSource* pSource); + + /// Attempts to accept a previously requested trace with the provided controller + /// + /// Once a trace is successfully accepted by a controller, that controller becomes responsible for managing the + /// rest of the trace operation. Also, once a requested trace is accepted by a controller, no other controllers + /// will be able to accept that trace. Accept is a "consuming" operation. + /// + /// @param [in] pController The trace controller to accept the trace with + /// @param [in] supportedGpuMask Bit mask of GPU indices that are capable of participating in the trace + /// + /// The GPU mask provided to this function is used to determine which GPUs will be involved in the trace. In order + /// to decide which GPUs require GPU work, the session creates a combined mask from all registered sources and + /// checks it against the mask provided by this function. Only GPUs that are present in both masks will be able to + /// submit GPU work during the trace. + /// + /// @returns Success if the trace was successfully accepted. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if no trace has been requested or a trace is currently in progress + /// + ErrorInvalidPointer if nullptr is passed as pController + Pal::Result AcceptTrace(ITraceController* pController, Pal::uint64 supportedGpuMask); + + /// Begins the trace that was previously accepted by the provided controller + /// + /// This function MUST be called after a successful call to AcceptTrace. 
When this function is called, the session + /// will communicate with all registered trace sources and instruct them to begin the trace operation. The provided + /// trace controller will be notified if any GPU work is required via ITraceController::OnBeginGpuWork. The command + /// buffers returned by OnBeginGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted BEFORE the command buffers + /// generated in response to the EndTrace call! + /// + /// In situations where multiple GPUs are present, the OnBeginGpuWork function will be called once per GPU index, + /// for all GPUs that are relevant for the current trace sources. + /// + /// @returns Success if the trace was successfully started. + /// Otherwise, the error generated by OnBeginGpuWork will be returned. + Pal::Result BeginTrace(); + + /// Ends the trace that was previously started by the provided controller + /// + /// This function MUST be called after BeginTrace. When this function is called, the session will communicate with + /// all registered trace sources and instruct them to end the trace operation. The provided + /// trace controller will be notified if any GPU work is required via ITraceController::OnEndGpuWork. The command + /// buffers returned by OnEndGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted AFTER the command buffers + /// generated in response to the previous BeginTrace call! The generated command buffers MUST also complete + /// execution on the GPU BEFORE FinishTrace is called! + /// + /// In situations where multiple GPUs are present, the OnEndGpuWork function will be called once per GPU index + /// for all GPUs that are relevant for the current trace sources. 
+ /// + /// The Trace Session will enter Postamble phase after EndTrace is called. + /// + /// @returns Success if the trace was successfully ended. + /// Otherwise, the error generated by OnEndGpuWork will be returned. + Pal::Result EndTrace(); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + /// Ends the postamble phase, which typically runs until the detailed trace data is available. + /// This function MUST be called after EndTrace. When this function is called, the session will communicate with + /// all registered trace sources and notify them of the end of the postamble phase. The provided trace controller + /// will be notified if any GPU work is required via ITraceController::OnEndPostambleGpuWork. The command + /// buffers returned by OnEndPostambleGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted AFTER the command buffers + /// generated in response to the previous EndTrace call! The generated command buffers MUST also complete + /// execution on the GPU BEFORE FinishTrace is called! + /// + /// In situations where multiple GPUs are present, the OnEndPostambleGpuWork function will be called once per GPU index + /// for all GPUs that are relevant for the current trace sources. + /// + /// @returns Success if the postamble was successfully ended. + /// Otherwise, the error generated by OnEndPostambleGpuWork will be returned. + Pal::Result EndPostamble(); +#endif + + /// Notifies the session that the trace operation started by the provided controller has finished. + /// + /// This function MUST be called after EndPostamble. When this function is called, the session will communicate with + /// all registered trace sources and notify them that all GPU work is complete. This notification is typically + /// used by sources to retrieve data produced by the GPU and write it into the session's trace data. 
+ void FinishTrace(); + + /// Writes a chunk of trace data into the session. + /// + /// Trace sources are expected to call this function whenever they produce a new data chunk that should be added + /// into the session's trace data. + /// + /// This function may ONLY be called AFTER the BeginTrace function returns and BEFORE the FinishTrace call returns! + /// + /// @param [in] pSource The trace source that generated the provided data chunk + /// @param [in] info Information about the provided chunk that will be written into the trace data + /// + /// @returns Success if the incoming data chunk was successfully written/appended into the current data stream. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library + Pal::Result WriteDataChunk(ITraceSource* pSource, const TraceChunkInfo& info); + + /// Returns the current TraceSession state + /// + /// @returns Enum value of the current TraceSessionState + TraceSessionState GetTraceSessionState() const + { + return m_sessionState; + } + + /// Sets the TraceSession state based on external operations + /// + /// @param [in] sessionState TraceSessionState value to be assigned as the current state + void SetTraceSessionState(TraceSessionState sessionState) + { + m_sessionState = sessionState; + } + + /// Returns the current active controller + /// + /// @returns Pointer to the current active controller driving the TraceSession + ITraceController* GetActiveController() const + { + return m_pActiveController; + } + + /// Reports an error encountered during an active trace by inserting a "TraceError" chunk to the trace stream + /// + /// If, during a trace or the construction of an RDF chunk, an error is encountered and a chunk that was + /// expected to be written can no longer be, this function may be called to insert an error chunk in place + /// of the expected chunk. 
+ /// + /// @param [in] chunkId Text identifier of the failed RDF chunk + /// @param [in] pPayload Pointer to the data sent for the error + /// If the payloadType is a string, the string must be null-terminated + /// @param [in] payloadSize Size of the data in the payload + /// @param [in] payloadType Type of payload data represented by `pPayload` + /// @param [in] errorResult The PAL result code of the encountered error + /// + /// @returns Success if the error chunk was written successfully + Pal::Result ReportError( + const char chunkId[TextIdentifierSize], + const void* pPayload, + Pal::uint64 payloadSize, + TraceErrorPayload payloadType, + Pal::Result errorResult); + + /// Explicitly activates this TraceSession for managing traces. + /// + /// This should be called during Platform Init in response to a tool-side request to enable UberTrace tracing. + /// This signals that an active connection has been made to tool-side applications and that profiling via + /// PAL Trace should be prioritized in client drivers. + void EnableTracing() + { + m_tracingEnabled = true; + } + + /// Returns a pointer to a byte array containing the trace configuration. + /// + /// @param [out] pTraceConfigSize Sets *pTraceConfigSize to the number of bytes in the trace config + /// + /// @returns A pointer to the trace configuration data + const void* GetTraceConfig(size_t* pTraceConfigSize) const + { + PAL_ASSERT(pTraceConfigSize != nullptr); + (*pTraceConfigSize) = m_configDataSize; + return m_pConfigData; + } + + /// Indicates if a cancel-trace signal has been received and that a cancelation is in progress. + /// + /// @return true if a cancelation is in progress. 
+ bool IsCancelingTrace() const { return m_cancelingTrace; } + +private: + typedef Pal::IPlatform TraceAllocator; + + Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceSession + DevDriver::IStructuredReader* m_pReader; // Stores the current JSON-based config of the TraceSession + + // RW Locks for trace sources, controllers, and RDF streams + Util::RWLock m_registerTraceSourceLock; + Util::RWLock m_registerTraceControllerLock; + Util::RWLock m_chunkAppendLock; + + // Trace sources registered with this TraceSession. + using TraceSourcesVec = Util::Vector; + TraceSourcesVec m_registeredTraceSources; + + // TraceSources and corresponding configs + typedef Util::HashMap TraceSourcesConfigMap; + TraceSourcesConfigMap m_traceSourcesConfigs; + + // Unique trace controllers registered with this TraceSession. + typedef Util::HashMap TraceControllersMap; + TraceControllersMap m_registeredTraceControllers; + + ITraceController* m_pActiveController; // The controller currently driving the TraceSession. + // We can have only one active controller at a time. + TraceSessionState m_sessionState; // Current state of the TraceSession + rdfChunkFileWriter* m_pChunkFileWriter; // Helper struct that manages creating chunk file streams + // and writing data chunks + rdfStream* m_pCurrentStream; // Active RDF stream for writing chunks + Pal::int32 m_currentChunkIndex; // The current chunk index of the RDF stream + bool m_tracingEnabled; // Flag indicating UberTrace tracing is enabled tool-side + void* m_pConfigData; // Buffer containing the cached trace configuration + size_t m_configDataSize; // Size of the cached trace config buffer + bool m_cancelingTrace; // Indicates that a cancel signal has been received and trace cancelation + // is in progress. 
+}; +} // GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h b/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h new file mode 100644 index 0000000000..cded304a46 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h @@ -0,0 +1,368 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palAssert.h + * @brief PAL utility collection assert macros. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palDbgPrint.h" +#include "palUtil.h" + +#if defined(_WIN32) +#include +/// OS-independent macro to force a break into the debugger. +#define PAL_DEBUG_BREAK() [[unlikely]] __debugbreak() +#else +#include +/// OS-independent macro to force a break into the debugger. +/// NOTE(review): unlike the _WIN32 definition, this expansion already ends with ';' - confirm that is intended. +#define PAL_DEBUG_BREAK() [[unlikely]] raise(SIGTRAP); +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +/// Macro to direct static code analysis to assume the specified expression will always be true. +/// Purpose is to suppress warnings from MSVC's /analyze setting. +/// Only pertains to static code analysis. Does not impact compile optimization. Not the same as C++23's [[assume]]. +# define PAL_ANALYSIS_ASSUME(_expr) __analysis_assume(_expr) +#else +/// This macro is only useful on MSVC builds. It has no meaning for other builds. +# define PAL_ANALYSIS_ASSUME(_expr) ((void)0) +#endif + +namespace Util +{ + +/// A helper function to check the size-in-bits of a 'reserved' member in a bitfield. +/// This is intended for use with static_asserts to ensure things don't go out-of-sync. +/// +/// @param [in] expectedTotalBitWidth Number of bits expected in the whole type +/// @param [in] expectedReservedBits Number of bits in the 'reserved' field +/// +/// @return true if the bit lengths of the type T match the values in the args. +/// true if the compiler lacks support to do this at compile time. +/// +/// @note This may not work properly with old compilers, but this is meant for linting anyhow. 
+template <typename T> +constexpr bool CheckReservedBits( + uint32 expectedTotalBitWidth, + uint32 expectedReservedBits) +{ + bool match = false; + + // Fail if the whole size is different + if (sizeof(T) * 8 == expectedTotalBitWidth) + { + // Get the width of the reserved field by detecting when it stops filling bits + T sample = {}; + uint64 mask = 0; + uint32 reservedBits = 0; + do + { + sample = {}; + mask = (mask << 1) | 1; + reservedBits++; + sample.reserved = mask; + } while ((sample.reserved == mask) && (reservedBits < sizeof(T) * 8)); + // If the loop stopped on a mismatch, the count is one past the size of the field. If it stopped because the + // count reached the width of T while the mask still fit, the field spans all of T and the count is exact. + const uint32 fieldBits = (sample.reserved == mask) ? reservedBits : (reservedBits - 1); + match = (fieldBits == expectedReservedBits); + } + return match; +} + +/// A helper function to check that a series of static numeric values are sequential. +/// This is intended for use with static_asserts to ensure things don't go out-of-sync. +/// +/// @param [in] args Array of numeric values to check +/// @param [in] interval Expected interval between each (default 1, 4 is also common for field offsets) +/// +/// @return true if all the values are sequential +/// true if the compiler lacks support to do this at compile time. +/// +/// @note This may not work properly with old compilers, but this is meant for linting anyhow. +template <typename T, size_t N> +constexpr bool CheckSequential( + const T (&args)[N], + T interval = 1) +{ + bool isSequential = true; + // Unsigned index compared as (i + 1) < N: no signed/unsigned mismatch and no reliance on N - 1. + for (size_t i = 0; (i + 1) < N; i++) + { + if ((args[i] + interval) != args[i + 1]) + { + isSequential = false; + break; + } + } + return isSequential; +} + +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) + +/// Specifies how severe a triggered assert (or alert) is. +/// +/// Both asserts and alerts can print out a debug string and break into the debugger. Asserts are to be used to verify +/// the known, assumed state of the program at any time.
Alerts are to be used to notify the developer of a _possible_, +/// but unexpected condition such as memory allocation failure, an OS call failure, or an application behavior that is +/// known to be slow. +enum AssertCategory : uint32 +{ + AssertCatAssert = 0, + AssertCatAlert, + AssertCatCount +}; + +/// Enables/disables the specified assert category. +/// +/// Probably controlled by a setting and set during initialization. +/// +/// @param [in] category Assert category to enable/disable (asserts or alerts). +/// @param [in] enable True to enable the specified assert category, false to disable it. +extern void EnableAssertMode( + AssertCategory category, + bool enable); + +/// Returns true if the specified assert category is enabled and false otherwise. +/// +/// @param [in] category Assert category to check +extern bool IsAssertCategoryEnabled( + AssertCategory category); + +#endif + +} // namespace Util + +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) +/// Prints an error message with the specified reason via the debug print system. A debug break will also be triggered +/// if they're currently enabled for asserts. +/// +/// @note This version of assert inlines an 'int 3' every time it is used so that each occurrence can be zapped +/// independently. This macro cannot be used in assignment operations. +#define PAL_TRIGGER_ASSERT(_pFormat, ...) [[unlikely]] \ +do { \ + PAL_DPERROR(_pFormat, ##__VA_ARGS__); \ + if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAssert)) \ + { \ + PAL_DEBUG_BREAK(); \ + } \ +} while (false) + +/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the +/// specified reason. +/// +/// @note This assert should not be used in constant evaluated contexts (e.g., constexpr functions). 
+// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// PAL_TRIGGER_ASSERT(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So: +// 1. we ignore PAL coding standards and don't use a compound statement; +// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ASSERT already has one. +#define PAL_ASSERT_MSG(_expr, _pReasonFmt, ...) \ +do { \ + const bool _expr_eval = static_cast<bool>(_expr); \ + if (_expr_eval == false) \ + PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ < 6) + +// Function to circumvent gcc 5.x inability to use lambdas in unevaluated constant expression contexts. +constexpr void PalTriggerAssertImpl( + const char* pFormat, + const char* pExpr, + const char* pFile, + int line, + const char* pFunc) +{ + // pExpr is always not nullptr, as it's supposed to be a preprocessor string, but it does convince gcc + // to compile PalTriggerAssertImpl() as potentially constexpr + pExpr != nullptr ? + [&] + { + Util::DbgPrintf( + Util::DbgPrintCatErrorMsg, + Util::DbgPrintStyleDefault, + pFormat, + pExpr, + pFile, + line, + pFunc); + if (Util::IsAssertCategoryEnabled(Util::AssertCatAssert)) + { + PAL_DEBUG_BREAK(); + } + return 0; + }() + : 0; +} + +// gcc 5.4 implementation of PAL_CONSTEXPR_ASSERT_MSG that ignores the additional reason for the assertion +// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// PalTriggerAssertImpl(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...)
\ +do { \ + const bool _expr_eval = static_cast(_expr); \ + if (_expr_eval == false) [[unlikely]] \ + PalTriggerAssertImpl("Assertion failed: %s (%s:%d:%s)", #_expr, __FILE__, __LINE__, __func__); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#else + +/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the +/// specified reason. +/// +/// @note This assert should be used in constant evaluated contexts (e.g., constexpr functions). +/// @note This assert uses an immediately-invoked function expression in the form of an internal lambda to signal a +/// failed assert. Since PAL_TRIGGER_ASSERT is not constexpr, an _expr that evaluates to false will fail to +/// compile the function operator of the lambda. +// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// [&] { PAL_TRIGGER_ASSERT(...); }(); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...) \ +do { \ + const bool _expr_eval = static_cast(_expr); \ + if (_expr_eval == false) [[unlikely]] \ + [&] { PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); }(); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#endif + +/// Calls the PAL_ASSERT_MSG macro with a generic reason string +#define PAL_ASSERT(_expr) PAL_ASSERT_MSG(_expr, "%s", "Unknown") + +/// Calls the PAL_CONSTEXPR_ASSERT_MSG macro with a generic reason string +#define PAL_CONSTEXPR_ASSERT(_expr) PAL_CONSTEXPR_ASSERT_MSG(_expr, "%s", "Unknown") + +#if DEBUG +/// Debug build only PAL assert, the typical usage is when make an assertion on a debug-only variables. +/// The only difference than PAL assert is it's empty in release mode. 
+#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) \ +do { \ + PAL_ASSERT(_expr); \ +} while (false) +#else +#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) +#endif + +/// If the expression evaluates to true, then a warning message with the specified reason will be printed via the +/// debug print system. A debug break will also be triggered if they're currently enabled for alerts. +/// +/// @note This is the opposite polarity of asserts. The assert macro _asserts_ that the specified condition is true. +/// While the alert macro _alerts_ the developer if the specified condition is true. +/// +/// This macro should be used in places where an assert is inappropriate because an error condition is _possible_, but +/// not typically expected. For example, asserting that an OS call succeeded should be avoided since there cannot be an +/// assumption that it will succeed. Nonetheless, a developer may want to be alerted immediately and dropped into the +/// debugger when such a failure occurs. +#define PAL_TRIGGER_ALERT(_pFormat, ...) [[unlikely]] \ +do { \ + PAL_DPWARN(_pFormat, ##__VA_ARGS__); \ + if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAlert)) \ + { \ + PAL_DEBUG_BREAK(); \ + } \ +} while (false) + +// +// This previously said: +// if (_expr) [[unlikely]] +// { +// PAL_TRIGGER_ASSERT(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So: +// 1. we ignore PAL coding standards and don't use a compound statement; +// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ALERT already has one. +#define PAL_ALERT_MSG(_expr, _pReasonFmt, ...)
\ +do { \ + if (_expr) \ + PAL_TRIGGER_ALERT("Alert triggered: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ +} while (false) + +/// Calls the PAL_ALERT_MSG macro with a generic reason string +#define PAL_ALERT(_expr) PAL_ALERT_MSG(_expr, "%s", "Unknown") + +/// Convenience macro that asserts if something has never been tested. +#define PAL_NOT_TESTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code Not Tested! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NOT_TESTED() PAL_NOT_TESTED_MSG("%s", "Unknown") + +/// Convenience macro that asserts if something has not been implemented. +#define PAL_NOT_IMPLEMENTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Not Implemented! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NOT_IMPLEMENTED() PAL_NOT_IMPLEMENTED_MSG("%s", "Unknown") + +/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. +#define PAL_NEVER_CALLED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code should never be called! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NEVER_CALLED() PAL_NEVER_CALLED_MSG("%s", "Unknown") + +/// Convenience macro that always asserts. Expect this to be used instead of PAL_ASSERT(false). +#define PAL_ASSERT_ALWAYS_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Unconditional Assert | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_ASSERT_ALWAYS() PAL_ASSERT_ALWAYS_MSG("%s", "Unknown") + +/// Convenience macro that always alerts. Expect this to be used instead of PAL_ALERT(true). +#define PAL_ALERT_ALWAYS_MSG(_pReasonFmt, ...) PAL_TRIGGER_ALERT("Unconditional Alert | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_ALERT_ALWAYS() PAL_ALERT_ALWAYS_MSG("%s", "Unknown") + +#else + +#define PAL_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_CONSTEXPR_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_ASSERT_MSG(_expr, ...) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, ...) 
PAL_ANALYSIS_ASSUME(_expr) +#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) +#define PAL_ALERT(_expr) ((void)0) +#define PAL_ALERT_MSG(_expr, ...) ((void)0) +#define PAL_NOT_TESTED() [[unlikely]] ((void)0) +#define PAL_NOT_TESTED_MSG(...) [[unlikely]] ((void)0) +#define PAL_NOT_IMPLEMENTED() [[unlikely]] ((void)0) +#define PAL_NOT_IMPLEMENTED_MSG(...) [[unlikely]] ((void)0) +#define PAL_NEVER_CALLED() [[unlikely]] ((void)0) +#define PAL_NEVER_CALLED_MSG(...) [[unlikely]] ((void)0) +#define PAL_ASSERT_ALWAYS() [[unlikely]] ((void)0) +#define PAL_ASSERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) +#define PAL_ALERT_ALWAYS() [[unlikely]] ((void)0) +#define PAL_ALERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) + +#endif + diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h b/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h new file mode 100644 index 0000000000..765a6b70e7 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h @@ -0,0 +1,212 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palAutoBuffer.h + * @brief PAL utility collection AutoBuffer class definition. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palSpan.h" +#include "palSysMemory.h" + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Safe version of C99's variable-length arrays. + * + * The general idea is that this class encapsulates a variable-length array where we expect the size required to not + * exceed the 'defaultCapacity' template parameter most of the time. In those "normal" cases, this buffer will + * reference a static array of size 'defaultCapacity', but if the constructor's parameter exceeds defaultCapacity, then + * a dynamic array will be allocated from the heap to satisfy the space requirements. The destructor will clean-up any + * dynamic allocation made by the constructor. 
+ * + * This class violates several PAL coding conventions, but for good reason: + * + * - We have overloaded the [] (array-element-accessor) operator to make using this class just like using a regular + * array, which it semantically represents. + * - In order to return array elements by-reference instead of by-value, we need to use C++ references in the + * overloaded operators because this is required by C++. + * + * This class __does not__ clear the contents of the static or dynamic arrays, for performance reasons. If a client + * needs the buffer to be cleared, it must do the memset itself. (However, if 'Item' is a class type rather than + * plain-old-data, the default c'tor will be invoked.) + *********************************************************************************************************************** + */ +template<typename Item, size_t defaultCapacity, typename Allocator> +class AutoBuffer +{ +public: + /// Constructor. + /// + /// The object is initialized to use the static array of items if the required capacity is less than or equal to the + /// default capacity. Otherwise, a larger array is allocated on the heap. + /// + /// @param [in] requiredCapacity Number of items actually required (unknown until runtime). + /// @param [in] pAllocator The allocator that will allocate memory if required. + AutoBuffer( + size_t requiredCapacity, + Allocator*const pAllocator) + : + m_capacity(requiredCapacity), + m_pBuffer(reinterpret_cast<Item*>(m_localBuffer)), + m_pAllocator(pAllocator) + { + if (requiredCapacity > defaultCapacity) + { + // Create dynamically allocated array, by allocating memory and constructing its objects. + // On failure, to avoid subtle bugs from misuse, AutoBuffer will be in a zombie state with zero capacity. + m_pBuffer = PAL_NEW_ARRAY(Item, requiredCapacity, pAllocator, AllocInternalTemp); + if (m_pBuffer == nullptr) + { + m_capacity = 0; + } + } + else if (!std::is_trivial<Item>::value) + { + // Explicitly construct all objects of non-trivial type in the local buffer.
+ for (size_t idx = 0; idx < m_capacity; ++idx) + { + PAL_PLACEMENT_NEW(m_pBuffer + idx) Item(); + } + } + } + + /// Destructor. + /// + /// Cleans up the dynamically allocated buffer if we allocated one. + ~AutoBuffer() + { + if (m_pBuffer != reinterpret_cast<Item*>(m_localBuffer)) + { + // Destroy dynamically allocated array, by destroying its objects and freeing memory. + PAL_SAFE_DELETE_ARRAY(m_pBuffer, m_pAllocator); + } + else if (!std::is_trivial<Item>::value) + { + // Explicitly destroy all objects of non-trivial type from the local buffer. + for (size_t idx = 0; idx < m_capacity; ++idx) + { + m_pBuffer[idx].~Item(); + } + } + } + + /// Getter for the capacity of the buffer. + /// + /// Clients can use this function to determine if the constructor's allocation succeeded. + /// + /// @returns Number of Items the buffer holds. Should match the requiredCapacity parameter passed to the constructor + /// unless a dynamic memory allocation failed, in which case it is zero. + constexpr size_t Capacity() const noexcept { return m_capacity; } + + /// Getter for the size of this buffer, in bytes. + constexpr size_t SizeBytes() const noexcept { return (sizeof(Item) * m_capacity); } + + /// Accessor for the nth element of this buffer. + const Item& operator[](size_t n) const + { + PAL_ASSERT(n < m_capacity); + return m_pBuffer[n]; + } + + /// Non-const accessor for the nth element of this buffer. + Item& operator[](size_t n) + { + PAL_ASSERT(n < m_capacity); + return m_pBuffer[n]; + } + + ///@{ + /// Implicitly gets the current contents of the buffer as a Span. + /// + /// @returns The contents of the buffer as a Span; same as Span<Item>(Data(), Capacity()). + operator Span<Item>() { return Span<Item>(Data(), Capacity()); } + operator Span<const Item>() const { return Span<const Item>(Data(), Capacity()); } + ///@} + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + Capacity()). + /// + /// @returns Pointer to the underlying data storage for read & write access.
+ /// The returned pointer contains address of the first element. + constexpr Item* Data() noexcept { return m_pBuffer; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + Capacity()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @returns Pointer to the underlying data storage for read only access. + /// The returned pointer contains address of the first element. + constexpr const Item* Data() const noexcept { return m_pBuffer; } + + ///@{ + /// @internal Satisfies concept `range_expression`, using Item* as `iterator` and 64-bit size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as range-based-for-loops. + using value_type = Item; + using reference = Item&; + using const_reference = const Item&; + using iterator = Item*; + using const_iterator = const Item*; + using difference_type = ptrdiff_t; + using size_type = size_t; + + constexpr iterator begin() noexcept { return Data(); } + constexpr iterator end() noexcept { return Data() + Capacity(); } + constexpr const_iterator begin() const noexcept { return Data(); } + constexpr const_iterator end() const noexcept { return Data() + Capacity(); } + constexpr const_iterator cbegin() const noexcept { return Data(); } + constexpr const_iterator cend() const noexcept { return Data() + Capacity(); } + [[nodiscard]] constexpr bool empty() const noexcept { return Capacity() == 0; } + constexpr size_type size() const noexcept { return Capacity(); } + ///@} + +private: + // This is a POD-type that exactly fits one Item value (same size and alignment as Item, but no constructor). + using ValueStorage = typename std::aligned_storage<sizeof(Item), alignof(Item)>::type; + + // Capacity of this buffer (in Items). + size_t m_capacity; + + // Buffer pointer this object uses to access the buffer's elements: if the required capacity exceeds the default + // capacity, this points to a dynamic array of Items.
Otherwise, this points to m_localBuffer. + Item* m_pBuffer; + + // Static array providing storage for Items which we expect most objects of this type to end up using. + ValueStorage m_localBuffer[defaultCapacity]; + + // Allocator for this AutoBuffer. + Allocator*const m_pAllocator; + + PAL_DISALLOW_DEFAULT_CTOR(AutoBuffer); + PAL_DISALLOW_COPY_AND_ASSIGN(AutoBuffer); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h new file mode 100644 index 0000000000..3f3a9b8628 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h @@ -0,0 +1,202 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palBuddyAllocator.h + * @brief PAL utility BuddyAllocator class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include "palHashSet.h" +#include "palHashMap.h" +#include "palMutex.h" + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Buddy Allocator + * + * Responsible for managing small GPU memory requests by allocating a large base allocation and dividing it into + * appropriately sized suballocation blocks. + *********************************************************************************************************************** + */ +template <typename Allocator> +class BuddyAllocator +{ +public: + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + /// @param [in] baseAllocSize The size of the base allocation this buddy allocator suballocates. + /// @param [in] minAllocSize The size of the smallest block this buddy allocator can allocate. + BuddyAllocator( + Allocator* pAllocator, + gpusize baseAllocSize, + gpusize minAllocSize); + ~BuddyAllocator(); + + /// Initializes the buddy allocator. + /// + /// @returns Success if the buddy allocator has been successfully initialized. + Result Init(); + + /// Suballocates a block from the base allocation that this buddy allocator manages. Expects @ref ClaimGpuMemory to + /// be called directly before it.
If a memory manager with multiple buddyAllocators is used, use pattern should + /// be: Iterate through buddyAllocators calling ClaimGpuMemory, if one returns @ref Success break out of the loop, + /// then call Allocate on that buddyAllocator. If none return @ref Success, then a new buddyAllocator needs to be + /// created. The purpose of splitting up buddyAllocator selection and Allocation is to reduce lock contention in + /// multithreaded memory managers. + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// @param [out] pOffset The offset the suballocated block starts within the base allocation. + /// + /// @returns Success if the allocation succeeded, @ref ErrorOutOfMemory if there isn't enough system memory to + /// fulfill the request, or @ref ErrorOutOfGpuMemory if there isn't a large enough block free in the + /// base allocation to fulfill the request. + /// + /// @warning Unless @ref ClaimGpuMemory is called before every single call, the results of @ref Allocate will + /// be invalid. If @ref ClaimGpuMemory returns @ref Success, then @ref ErrorOutOfGpuMemory will never be + /// returned. + Result Allocate( + gpusize size, + gpusize alignment, + gpusize* pOffset); + + /// Frees a previously allocated suballocation. + /// + /// @param [in] offset The offset the suballocated block starts within the base allocation. + /// @param [in] size Optional parameter specifying the size of the original allocation. + /// @param [in] alignment Optional parameter specifying the alignment of the original allocation. + void Free( + gpusize offset, + gpusize size = 0, + gpusize alignment = 0); + + /// Tells whether the base allocation is completely free. If the returned value is true then the caller is safe + /// to deallocate the base allocation. 
+ bool IsEmpty() const + { + return (m_numSuballocations == 0); + } + + /// Returns the size of the largest allocation that can be suballocated with this buddy allocator. + gpusize MaximumAllocationSize() const; + + /// Claims (doesn't allocate) some memory, used to quickly determine if a pool of memory has available memory. + /// Doesn't affect internal state unless Result::Success is returned + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// + /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, + /// @ref ErrorOutOfGpuMemory if there is not enough memory + /// + /// @warning Unless this is called to test available memory before every call to Allocate, then the results will not + /// be valid. + Result ClaimGpuMemory( + gpusize size, + gpusize alignment); + + /// Checks if @ref ClaimGpuMemory can actually claim memory, can be used to find the best fit pool. This function + /// does NOT acquire a lock on the structures ClaimGpuMemory uses, and does NOT claim or allocate the memory. + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// @param [out] pKval The highest kval that will need to be split will be stored here.
+ /// + /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, + /// @ref ErrorOutOfGpuMemory if there is not enough memory + /// + Result CheckIfOpenMemory( + gpusize size, + gpusize alignment, + uint32* pKval); + +private: + typedef Util::HashSet<gpusize, Allocator> FreeSet; + typedef Util::HashMap<gpusize, uint32, Allocator> UsedMap; + + Result GetNextFreeBlock( + uint32 kval, + gpusize* pOffset); + + Result FreeBlock(gpusize offset); + + static constexpr gpusize KvalToSize(uint32 kVal) { return (1ull << kVal); } + + static uint32 SizeToKval(gpusize size) { return Log2(size); } + + Allocator* const m_pAllocator; + + const uint32 m_baseAllocKval; + const uint32 m_minKval; + + // Array of hashSets of blocks that are free at each level + FreeSet* m_pFreeBlockSets; + + // Hashmap of blocks that are used, key=offset, value=level (kval) + UsedMap* m_pUsedBlockMap; + // List of the free memory at each level + uint32* m_pNumFreeList; + // The highest Kval that has at least 1 free block (used in ClaimGpuMemory) + uint32 m_highestFreeKval; + + uint32 m_numSuballocations; + + // mutex on altering the numFreeList + Util::Mutex m_numFreeMutex; + // mutex on the used block map + Util::Mutex m_usedBlockMapMutex; + // array of mutexes, one for each freeBlockSet + Util::Mutex* m_pFreeSetMutexes; + // mutex on the freeing. Serialize freeing blocks and don't allow allocating blocks while one is freeing. Based on + // testing, applications typically don't try to free and allocate memory at the same time, and almost all of the + // memory freeing is done at the end of the application. + Util::RWLock m_freeLock; + + // Set to true if ClaimGpuMemory is ever called on this buddyAllocator.
This signals to free to not merge blocks + // if m_pNumFreeList[kval - m_minKval] = 0 + bool m_usedClaim; + + // HashSet and HashMap utility functions + Result InsertToFreeSet(gpusize offset, uint32 kval); + bool GetKvalUsed(gpusize offset, uint32* pKval); + Result SetKvalUsed(gpusize offset, uint32 kval); + Result PopFromFreeSet(gpusize* pOffset, uint32 kval); + bool IsOffsetFree(gpusize offset, uint32 kval); + Result RemoveOffsetFromFreeSet(gpusize offset, uint32 kval); + Result RemoveOffsetFromUsedMap(gpusize offset); + + PAL_DISALLOW_COPY_AND_ASSIGN(BuddyAllocator); + PAL_DISALLOW_DEFAULT_CTOR(BuddyAllocator); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h new file mode 100644 index 0000000000..18f4839c89 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h @@ -0,0 +1,607 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palBuddyAllocatorImpl.h + * @brief PAL utility BuddyAllocator class implementation. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palBuddyAllocator.h" +#include "palHashMapImpl.h" +#include "palHashSetImpl.h" +#include "palInlineFuncs.h" +#include "palSysMemory.h" + +namespace Util +{ + +// ===================================================================================================================== +// Records the allocator and the log2 of the base/minimum sizes; all dynamic state is created later by Init(). +template <typename Allocator> +BuddyAllocator<Allocator>::BuddyAllocator( + Allocator* pAllocator, + gpusize baseAllocSize, + gpusize minAllocSize) + : + m_pAllocator(pAllocator), + m_baseAllocKval(SizeToKval(baseAllocSize)), + m_minKval(SizeToKval(minAllocSize)), + m_pFreeBlockSets(nullptr), + m_pUsedBlockMap(nullptr), + m_pNumFreeList(nullptr), + m_highestFreeKval(0), + m_numSuballocations(0), + m_pFreeSetMutexes(nullptr), + m_usedClaim(false) +{ + // Allocator must be non-null + PAL_ASSERT(m_pAllocator != nullptr); + + // Base allocation size must be POT + PAL_ASSERT(KvalToSize(m_baseAllocKval) == baseAllocSize); + + // Minimum allocation size must be POT + PAL_ASSERT(KvalToSize(m_minKval) == minAllocSize); +} + +// ===================================================================================================================== +template <typename Allocator> +BuddyAllocator<Allocator>::~BuddyAllocator() +{ + // lock this here to ensure no other thread was doing anything with the buddyAllocator when the
destructor is called + RWLockAuto freeLock(&m_freeLock); + if (m_pFreeBlockSets != nullptr) + { + const uint32 numKvals = m_baseAllocKval - m_minKval; + for (uint32 i = 0; i < numKvals; ++i) + { + // Call the destructor + m_pFreeBlockSets[i].~HashSet(); + } + + // Free the block list array + PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); + } + if (m_pUsedBlockMap != nullptr) + { + PAL_SAFE_DELETE(m_pUsedBlockMap, m_pAllocator); + } + if (m_pNumFreeList != nullptr) + { + PAL_SAFE_DELETE_ARRAY(m_pNumFreeList, m_pAllocator); + } + if (m_pFreeSetMutexes != nullptr) + { + PAL_SAFE_DELETE_ARRAY(m_pFreeSetMutexes, m_pAllocator); + } +} + +// ===================================================================================================================== +// Gets maximum allocation size supported by this buddy allocator. +template +gpusize BuddyAllocator::MaximumAllocationSize() const +{ + // NOTE: Report one less than our base allocation k-value because there's no sense in suballocating a memory + // request which is larger than half a chunk + return KvalToSize(m_baseAllocKval - 1); +} + +// ===================================================================================================================== +// Initializes the buddy allocator. +template +Result BuddyAllocator::Init() +{ + PAL_ASSERT(m_pFreeBlockSets == nullptr); + PAL_ASSERT(m_pUsedBlockMap == nullptr); + PAL_ASSERT(m_pNumFreeList == nullptr); + PAL_ASSERT(m_pFreeSetMutexes == nullptr); + + // start out with success and take it away if something fails. + Result result = Result::Success; + + const uint32 numKvals = m_baseAllocKval - m_minKval; + + // one hashSet per kval + m_pFreeBlockSets = static_cast(PAL_MALLOC(sizeof(FreeSet) * numKvals, + m_pAllocator, + AllocInternal)); + // Initialize the hashSets. 
+ if (m_pFreeBlockSets != nullptr) + { + for (uint32 i = 0; i < numKvals; ++i) + { + // max number of entries at a level is: 2^distFromTop + const uint32 maxEntriesKval = 1 << (m_baseAllocKval - (i + m_minKval)); + // 32 is a suitable max, however its the higher kvals won't even need 32 buckets. + const uint32 bucketsNeeded = Min(maxEntriesKval / (PAL_CACHE_LINE_BYTES) + 1, 32u); + + PAL_PLACEMENT_NEW(&m_pFreeBlockSets[i]) FreeSet(bucketsNeeded, m_pAllocator); + result = m_pFreeBlockSets[i].Init(); + + // if we failed the Init of the hashSet, delete the ones we did create, and free the array. This avoids + // having to keep track of the hashSets we did initialize in the destructor by just destroying it here. + if (result != Result::Success) + { + for (uint32 j = 0; j <= i; j++) + { + m_pFreeBlockSets[j].~HashSet(); + } + PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); + break; + } + } + } + else + { + result = Result::ErrorOutOfMemory; + } + + if (result == Result::Success) + { + m_pNumFreeList = static_cast(PAL_NEW_ARRAY(uint32, numKvals, m_pAllocator, AllocInternal)); + if (m_pNumFreeList == nullptr) + { + result = Result::ErrorOutOfMemory; + } + } + + if (result == Result::Success) + { + m_pFreeSetMutexes = static_cast(PAL_NEW_ARRAY(Mutex, numKvals, m_pAllocator, AllocInternal)); + if (m_pFreeSetMutexes == nullptr) + { + result = Result::ErrorOutOfMemory; + } + } + + const uint32 maxUsedEntries = 1 << (m_baseAllocKval - m_minKval); + const uint32 usedBucketsNeeded = maxUsedEntries / (PAL_CACHE_LINE_BYTES * 8) + 1; + + if (result == Result::Success) + { + // one hashMap for getting the kval a used block is at + m_pUsedBlockMap = static_cast(PAL_NEW(UsedMap, m_pAllocator, AllocInternal) + (usedBucketsNeeded, m_pAllocator)); + if (m_pUsedBlockMap != nullptr) + { + result = m_pUsedBlockMap->Init(); + } + else + { + result = Result::ErrorOutOfMemory; + } + } + + // if we successfully allocated all the memory we need, create the first two free blocks. 
+ if (result == Result::Success) + { + memset(m_pNumFreeList, 0, sizeof(uint32) * numKvals); + // We need to create the first two largest-size blocks and add them to the last block list + const uint32 blockKval = (m_baseAllocKval - 1); + const gpusize blockSize = KvalToSize(blockKval); + FreeSet* pTopFreeSet = &m_pFreeBlockSets[blockKval - m_minKval]; + + // mark both of these as free blocks + result = pTopFreeSet->Insert(0); + if (result == Result::Success) + { + // even though this will never be reached, to pass the asserts, this needs to be + // as this kval + result = m_pUsedBlockMap->Insert(0, blockKval + 1); + } + if (result == Result::Success) + { + result = pTopFreeSet->Insert(blockSize); + } + m_pNumFreeList[blockKval - m_minKval] = 2; + m_highestFreeKval = blockKval; + } + PAL_ALERT(result != Result::Success); + return result; +} + +// ===================================================================================================================== +// Suballocates a block from the base allocation that this buddy allocator manages. If no free space is found then an +// appropriate error is returned. +// In order for m_pNumFreeList bookkeeping to be correct, ClaimGpuMemory MUST be called directly before this call to +// Allocate. The buddyAllocator will still work without this, but the results of ClaimGpuMemory will not be correct. +// unless it is called before every call to Allocate. 
+template
+Result BuddyAllocator::Allocate(
+    gpusize  size,
+    gpusize  alignment,
+    gpusize* pOffset)
+{
+    // NOTE(review): in this copy the template parameter lists and other <...> arguments (after "template",
+    // "BuddyAllocator" and "RWLockAuto") appear to be missing. They are left exactly as found; restore them from the
+    // upstream header before compiling.
+    PAL_ASSERT(m_pFreeBlockSets != nullptr);
+    PAL_ASSERT(m_pUsedBlockMap != nullptr);
+    PAL_ASSERT(m_pNumFreeList != nullptr);
+    PAL_ASSERT(m_pFreeSetMutexes != nullptr);
+    PAL_ASSERT(pOffset != nullptr);
+    PAL_ASSERT(size <= MaximumAllocationSize());
+
+    // Pad the requested allocation size to the nearest POT of the size and alignment
+    const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval);
+
+    RWLockAuto freeLock(&m_freeLock);
+    Result result = GetNextFreeBlock(kval, pOffset);
+    // Record the level the block was handed out at so that Free() can find it again.
+    if (result == Result::Success)
+    {
+        result = SetKvalUsed(*pOffset, kval);
+    }
+
+    if (result == Result::Success)
+    {
+        // Increment the number of suballocations this buddy allocator manages
+        AtomicIncrement(&m_numSuballocations);
+    }
+    return result;
+}
+
+// =====================================================================================================================
+// Gets the next free block by recursively dividing larger blocks until a suitable sized block is created.
+// On success *pOffset holds the offset of a block of level kval. Returns ErrorOutOfGpuMemory when neither this level
+// nor any level above it has a free block.
+template
+Result BuddyAllocator::GetNextFreeBlock(
+    uint32   kval,
+    gpusize* pOffset)
+{
+    Result result = Result::ErrorOutOfGpuMemory;
+    if (kval < m_baseAllocKval)
+    {
+        // this lock can not get any more fine grained
+        MutexAuto freeSetLock(&(m_pFreeSetMutexes[kval - m_minKval]));
+        result = PopFromFreeSet(pOffset, kval);
+
+        if (result == Result::ErrorOutOfGpuMemory)
+        { // we didn't find a block at this kval, search the next level up
+            result = GetNextFreeBlock(kval + 1, pOffset);
+
+            if (result == Result::Success)
+            {
+                // The block from the level above is split in two: the lower half is returned through pOffset and the
+                // upper half (our buddy) becomes a free block of this level.
+                gpusize buddyOffset = *pOffset + KvalToSize(kval);
+                result = InsertToFreeSet(buddyOffset, kval);
+                PAL_ASSERT(result == Result::Success);
+            }
+        }
+        else
+        {
+            // only two valid options are ErrorOutOfGpuMemory and Success, other result means the hashing failed.
+            PAL_ASSERT(result == Result::Success);
+        }
+    }
+    PAL_ALERT_MSG(result != Result::Success,
+                  "This should only fail if ClaimGpuMemory() is not called before this call to Allocate().");
+    return result;
+}
+
+// =====================================================================================================================
+// Frees the memory at the given offset; if its buddy is also free, merges the two and recursively calls this again.
+// This doesn't need any internal locks because Free acquires an exclusive lock on the entire allocator (freeLock), and
+// the lock on the m_pNumFreeList. These locks could potentially be more fine grained, however freeing and allocating
+// don't typically happen at the same time, and Freeing is already much faster than allocating.
+// Returns ErrorInvalidValue when the offset is not a tracked block (for example a double free), otherwise the result
+// of the hash container updates.
+template
+Result BuddyAllocator::FreeBlock(
+    gpusize offset)
+{
+    Result result   = Result::ErrorUnknown;
+    uint32 usedKval = 0;
+    const bool offsetUsed = GetKvalUsed(offset, &usedKval);
+    PAL_ASSERT(offsetUsed);
+    PAL_ASSERT(usedKval >= m_minKval && usedKval < m_baseAllocKval);
+
+    // The asserts above may be compiled out. Without this check an unknown offset would leave usedKval unset and the
+    // code below would index the per-kval arrays with it.
+    if ((offsetUsed == false) || (usedKval < m_minKval) || (usedKval >= m_baseAllocKval))
+    {
+        return Result::ErrorInvalidValue;
+    }
+
+    // A block and its buddy differ only in the bit that corresponds to the block size.
+    gpusize buddyOffset = offset ^ KvalToSize(usedKval);
+    // The merged block of the level above starts at the lower of the two offsets.
+    gpusize offsetUp = Min(offset, buddyOffset);
+
+    // we don't want merge if we are on the top level. We also don't want to merge if a call to claim was made that
+    // claimed the buddy we are about to free.
+    if (IsOffsetFree(buddyOffset, usedKval) && (usedKval < m_baseAllocKval - 1) &&
+        ((m_pNumFreeList[usedKval - m_minKval] > 0) || (m_usedClaim == false)))
+    { // We can combine the two blocks and mark the one in the level above as free
+      // And do this recursively
+        result = RemoveOffsetFromFreeSet(buddyOffset, usedKval);
+        if (result == Result::Success)
+        {
+            // even though the block is going to be freed, need to set the kval as used
+            // so that on the recursive call it will be found and freed again.
+            PAL_ASSERT_MSG((m_pNumFreeList[usedKval - m_minKval] != 0) || (m_usedClaim == false),
+                           "This should only fail if ClaimGpuMemory() is not called before this call to Allocate().");
+            m_pNumFreeList[usedKval - m_minKval] -= 1;
+            result = SetKvalUsed(offsetUp, usedKval + 1);
+        }
+        // if this offset isn't the one that will be set as free in the next level up, we just need to remove it.
+        if ((result == Result::Success) && (offset != offsetUp))
+        {
+            result = RemoveOffsetFromUsedMap(offset);
+        }
+        if (result == Result::Success)
+        {
+            result = FreeBlock(offsetUp);
+        }
+    }
+    else
+    { // We mark this block as free in this level
+        result = InsertToFreeSet(offset, usedKval);
+
+        if (result == Result::Success)
+        {
+            m_pNumFreeList[usedKval - m_minKval] += 1;
+            m_highestFreeKval = Util::Max(usedKval, m_highestFreeKval);
+            if (offsetUp == offset)
+            { // if on the same offset as level up, move where the used block is
+                result = SetKvalUsed(offsetUp, usedKval + 1);
+            }
+            else
+            { // if at the top of this offset, remove it from the used map
+                result = RemoveOffsetFromUsedMap(offset);
+            }
+        }
+    }
+    return result;
+}
+// =====================================================================================================================
+// Frees a suballocated block making it available for future re-use.
+template
+void BuddyAllocator::Free(
+    gpusize offset,
+    gpusize size,
+    gpusize alignment)
+{
+    // NOTE(review): in this copy the template parameter lists and other <...> arguments (after "template",
+    // "BuddyAllocator" and "RWLockAuto") appear to be missing. They are left exactly as found; restore them from the
+    // upstream header before compiling.
+
+    // Lock order: the allocator-wide lock first, then the free-count mutex.
+    RWLockAuto allocatorLock(&m_freeLock);
+    MutexAuto  freeCountLock(&m_numFreeMutex);
+
+    // All of these are created by Init().
+    PAL_ASSERT(m_pFreeBlockSets != nullptr);
+    PAL_ASSERT(m_pUsedBlockMap != nullptr);
+    PAL_ASSERT(m_pNumFreeList != nullptr);
+    PAL_ASSERT(m_pFreeSetMutexes != nullptr);
+
+    // size and alignment are not consulted here; FreeBlock() looks the block's level up in the used-block map.
+    const Result freeResult = FreeBlock(offset);
+
+    // A failure means something went wrong with the allocation scheme itself.
+    PAL_ASSERT(freeResult == Result::Success);
+
+    // One suballocation fewer is outstanding now.
+    AtomicDecrement(&m_numSuballocations);
+}
+
+// =====================================================================================================================
+// Reserves, in the m_pNumFreeList counters only, the block that the following call to Allocate will consume.
+// Returns ErrorOutOfGpuMemory if no free block exists at or above the level this request needs, otherwise Success.
+template
+Result BuddyAllocator::ClaimGpuMemory(
+    gpusize size,
+    gpusize alignment)
+{
+    // Raised on the very first claim; FreeBlock() checks this flag before it merges buddies.
+    m_usedClaim = true;
+
+    PAL_ASSERT(m_pNumFreeList != nullptr);
+    // Level that fits the request: the power-of-two pad of the larger of size and alignment, never below the minimum.
+    uint32 level = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval);
+    PAL_ASSERT((level >= m_minKval) && (level < m_baseAllocKval));
+
+    Result claimResult = Result::ErrorOutOfGpuMemory;
+
+    // The comparison is made once without the mutex so that a hopeless request never takes (or makes others wait on)
+    // the lock, and once more after the mutex is held.
+    if (level <= m_highestFreeKval)
+    {
+        MutexAuto freeCountLock(&m_numFreeMutex);
+        if (level <= m_highestFreeKval)
+        {
+            PAL_ASSERT(m_pNumFreeList[m_highestFreeKval - m_minKval] != 0);
+            claimResult = Result::Success;
+
+            // Every empty level we pass on the way up gains one block: the buddy that splitting will leave behind.
+            for (; m_pNumFreeList[level - m_minKval] == 0; ++level)
+            {
+                m_pNumFreeList[level - m_minKval]++;
+            }
+
+            PAL_ASSERT(level <= m_highestFreeKval);
+            PAL_ASSERT_MSG(m_pNumFreeList[level - m_minKval] > 0,
+                           "This should only fail if ClaimGpuMemory() is not called before every call to Allocate().");
+            // The level we stopped at gives up the block that is used directly or split for the levels below.
+            m_pNumFreeList[level - m_minKval]--;
+
+            // Lower the cached highest free level until it points at a level that still has a block.
+            PAL_ASSERT(m_highestFreeKval >= m_minKval);
+            while (m_pNumFreeList[m_highestFreeKval - m_minKval] == 0)
+            {
+                --m_highestFreeKval;
+                if (m_highestFreeKval < m_minKval)
+                {
+                    // there is no space left anywhere in this buddyAllocator
+                    break;
+                }
+            }
+        }
+    }
+    return claimResult;
+}
+
+// =====================================================================================================================
+// Used to search through pools before claiming memory to find the one that will fragment the least. pKval will hold
+// the highest level that needs to be split up for this pool, so the pool with the lowest value will be best. Can NOT
+// guarantee the memory will still be available by the time this thread calls ClaimGpuMemory.
+template +Result BuddyAllocator::CheckIfOpenMemory( + gpusize size, + gpusize alignment, + uint32* pKval) +{ + PAL_ASSERT(m_pNumFreeList != nullptr); + // Pad the requested allocation size to the nearest POT of the size and alignment + const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); + PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); + + Result result = Result::ErrorOutOfGpuMemory; + if ((kval <= m_highestFreeKval)) + { + result = Result::Success; + if (pKval != nullptr) + { + uint32 topKval = kval; + for (; topKval < m_baseAllocKval; topKval++) + { + if (m_pNumFreeList[topKval - m_minKval] != 0) + { + *pKval = topKval; + break; + } + } + } + } + return result; +} + +// Hashset helper functions. +// ===================================================================================================================== +template +Result BuddyAllocator::InsertToFreeSet( + gpusize offset, + uint32 kval) +{ + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + PAL_ASSERT(pFreeSet->Contains(offset) == false); + Result result = pFreeSet->Insert(offset); + + return result; +} + +// ===================================================================================================================== +template +bool BuddyAllocator::GetKvalUsed( + gpusize offset, + uint32* pKval) +{ + bool isUsed; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + uint32* usedKval = m_pUsedBlockMap->FindKey(offset); + if (usedKval == nullptr) + { + isUsed = false; + } + else + { + isUsed = true; + if (pKval != nullptr) + { + *pKval = *usedKval; + } + } + return isUsed; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::SetKvalUsed( + gpusize offset, + uint32 kval) +{ + uint32* pKval; + bool existed; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + Result result = m_pUsedBlockMap->FindAllocate(offset, &existed, &pKval); + if (result == 
Result::Success) + { + *pKval = kval; + } + PAL_ASSERT(result == Result::Success); + return result; +} + +// ===================================================================================================================== +// If there are free blocks at this level, removes one, if not, returns Result::ErrorOutOfGpuMemory +template +Result BuddyAllocator::PopFromFreeSet( + gpusize* pOffset, + uint32 kval) +{ + Result result = Result::ErrorUnknown; + + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + PAL_ASSERT(pFreeSet != nullptr); + + auto freeSetIt = pFreeSet->Begin(); + if (freeSetIt.Get() != nullptr) + { + *pOffset = freeSetIt.Get()->key; + bool eraseRes = pFreeSet->Erase(*pOffset); + if (eraseRes) + { + result = Result::Success; + } + else + { + // we got the offset from the iterator, no reason for it to fail. + PAL_ASSERT_ALWAYS(); + } + } + else + { + result = Result::ErrorOutOfGpuMemory; + } + return result; +} + +// ===================================================================================================================== +template +bool BuddyAllocator::IsOffsetFree( + gpusize offset, + uint32 kval) +{ + bool isIn = m_pFreeBlockSets[kval - m_minKval].Contains(offset); + + return isIn; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::RemoveOffsetFromFreeSet( + gpusize offset, + uint32 kval) +{ + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + bool eraseRes = pFreeSet->Erase(offset); + return (eraseRes) ? 
Result::Success : Result::ErrorInvalidValue; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::RemoveOffsetFromUsedMap( + gpusize offset) +{ + Result result = Result::Success; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + bool removeRes = m_pUsedBlockMap->Erase(offset); + + if (removeRes == false) + { + result = Result::ErrorInvalidValue; + } + return result; +} +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h b/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h new file mode 100644 index 0000000000..791569cdc3 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h @@ -0,0 +1,364 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDbgPrint.h + * @brief Defines PAL utility collection debug print functionality. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include + +#if PAL_ENABLE_LOGGING +#include "palDbgLogHelper.h" +#endif + +namespace Util +{ +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) +// Forward declarations. +class File; +enum FileAccessMode : uint32; + +/// Specifies the category of a debug print. +/// +/// Driver developer can enable/disable each category separately through settings. +enum DbgPrintCategory : uint32 +{ + DbgPrintCatInfoMsg = 0, ///< Generic informational messages. + DbgPrintCatWarnMsg, ///< Warning messages. + DbgPrintCatErrorMsg, ///< Error messages. + DbgPrintCatScMsg, ///< Generic shader compiler messages. + DbgPrintCatEventPrintMsg, ///< System event messages. + DbgPrintCatEventPrintCallbackMsg, ///< System event messages via callback. + DbgPrintCatMsgFile, ///< All Messages via file. + DbgPrintCatCount +}; + +/// Specifies the debug print mode: disabled, print to debugger, or print to file. +enum class DbgPrintMode : uint32 +{ + Disable, ///< Debug print is ignored. + Print, ///< Debug print is routed to the debug window or stdout. + File, ///< Debug print is routed to a file. 
+ PrintCallback, ///< Debug print is routed to the print callback only +}; + +/// Flags specifying style controls for a debug print. +enum DbgPrintStyle : uint32 +{ + DbgPrintStyleDefault = 0x0, ///< Normal mode: has a prefix and a CR-LF. + DbgPrintStyleNoPrefix = 0x1, ///< Skip the prefix. + DbgPrintStyleNoCrLf = 0x2, ///< Skip the CR-LF. + DbgPrintStyleNoPrefixNoCrLf = 0x3, ///< Skip both the prefix and the CR-LF. +}; + +/// Definition for debug print callback. +/// +/// @param [in] pUserData User data that is installed with the callback for use by the installer. +/// @param [in] category Debug print category that the message belongs to. +/// @param [in] pText Text data to be printed by the callback. +typedef void (PAL_STDCALL *DbgPrintCallbackFunc)( + void* pUserdata, + DbgPrintCategory category, + const char* pText); + +/// Debug print callback struct that bundles the callback function and its userdata pointer. +struct DbgPrintCallback +{ + DbgPrintCallbackFunc pCallbackFunc; + void* pUserdata; +}; + +/// Generic debug printf function to be used when the caller wishes to specify the output category and style. Clients +/// should use the PAL_DPF macro instead of calling this function directly. +/// +/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). +/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). +/// @param [in] pFormat Printf-style format string. +extern void DbgPrintf( + DbgPrintCategory category, + DbgPrintStyle style, + const char* pFormat, + ...); + +/// Generic printf function to be used when the caller wishes to specify the output category and style, and has +/// pre-started the variable arg list (va_list argument instead of ...). +/// +/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). +/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList Variable argument list. 
+extern void DbgVPrintf( + DbgPrintCategory category, + DbgPrintStyle style, + const char* pFormat, + va_list argList); + +#endif + +#if PAL_ENABLE_PRINTS_ASSERTS +/// Sets the debug print mode (output to debugger, write to file, or disabled) for the specified category of messages. +/// +/// Probably controlled by a setting and set during initialization. +/// +/// @param [in] category Message category to control (e.g., CS dumps, SC output, etc.). +/// @param [in] mode New mode to be used for this message category (print to file, etc.). +extern void SetDbgPrintMode( + DbgPrintCategory category, + DbgPrintMode mode); + +/// Opens a file that resides in the selected log directory. +/// +/// This function exists in all build configurations. +/// +/// @param [in,out] pFile File object to represent the opened file. +/// @param [in] pFilename Filename to open. +/// @param [in] flags ORed mask of FileAccessMode values specifying how this file will be accessed. +/// +/// @returns Success if successful, otherwise an appropriate error. +extern Result OpenLogFile( + File* pFile, + const char* pFilename, + uint32 flags); + +/// Sets the global debug print callback. +/// +/// @param [in] callback Debug print callback struct that contains the callback function and a userdata pointer +extern void SetDbgPrintCallback( + const DbgPrintCallback& callback); + +#endif + +// Forward declarations. +template class StringView; + +/// Logs a text string via client callback when provided. +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pLogCbInfo +/// parameter to CreatePlatform() will be passed back to the client on every log callback. +/// @param [in] level Log priority level associated with the message. +/// @param [in] categoryMask Log category mask that represents what category fields the message relates to. +/// @param [in] pFormat Format string for the log message. 
+/// @param [in] args Variable arguments that correspond to the format string. +typedef void (PAL_STDCALL *LogCallbackFunc)( + void* pClientData, + uint32 level, + uint64 categoryMask, + const char* pFormat, + va_list args); + +/// Specifies client-provided logging callbacks. Used as a parameter to Pal::CreatePlatform(). +/// +/// @ingroup LibInit +struct LogCallbackInfo +{ + void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to + /// every @ref LogCallbackFunc call made by PAL. + LogCallbackFunc pfnLogCb; ///< Debug print logging callback. @see LogCallbackFunc. +}; + +/// Compiler-specific wrapper of the standard snprintf implementation. +/// +/// @param [out] pOutput Output string. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// +/// @returns The resultant length of the formatted string. +extern int32 Snprintf( + char* pOutput, + size_t bufSize, + const char* pFormat, + ...); + +/// Compiler-specific wrapper of the standard vsnprintf implementation. +/// +/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList variable argument list. +/// +/// @returns The resultant length of the formatted string. +extern int32 Vsnprintf( + char* pOutput, + size_t bufSize, + const char* pFormat, + va_list argList); + +/// Compiler-specific wrapper of the standard snprintf implementation. +/// +/// @param [out] pOutput Output string. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// +/// @returns The resultant length of the formatted string. 
+extern int32 Snprintf( + wchar_t* pOutput, + size_t bufSize, + const wchar_t* pFormat, + ...); + +/// Compiler-specific wrapper of the standard vsnprintf implementation. +/// +/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList variable argument list. +/// +/// @returns The resultant length of the formatted string. +extern int32 Vsnprintf( + wchar_t* pOutput, + size_t bufSize, + const wchar_t* pFormat, + va_list argList); + +/// Copy an arbitrary string into the provided buffer, encoding as necessary to avoid characters that are illegal +/// in filenames (assuming the more restrictive Windows rules, even on non-Windows OSs). +/// +/// Any byte that would be illegal is encoded as % then two hex digits, like in a URL. +/// +/// @param [out] pOutput Output string. +/// @param bufSize Available space in pOutput. +/// @param [in] input Input string +/// @param allowSpace Allow (do not % encode) space +/// @param allowDirSeparator Allow (do not % encode) / and \ characters +/// +/// @returns Works like C++ standard snprintf: +/// - If the provided buffer is big enough, it returns the number of bytes written, excluding the +/// terminating \0. +/// - If the provided buffer is not big enough, then the result string is truncated to fit, and the +/// function returns the number of bytes that would have been written if the buffer had been long +/// enough, excluding the terminating \0. +/// - Passing 0 buffer length is allowed as a special case of that, and nullptr pOutput is then allowed. +extern size_t EncodeAsFilename( + char* pOutput, + size_t bufSize, + const StringView& input, + bool allowSpace, + bool allowDirSeparator); + +/// Generate a log filename. +/// +/// @param [inout] pFilenameBuffer Buffer to hold the filename. 
+/// @param maxSize Max size of the pFilenameBuffer. +/// @param nextPos The next write position. +/// @param [in] pExt The filename extension. +/// @param logDuplicate Log duplicate objects. +extern void GenLogFilename( + char* pFilenameBuffer, + size_t maxSize, + size_t nextPos, + const char* const pExt, + bool logDuplicate); + +} // Util + +/// PAL_ENABLE_LOGGING enables the new logging code. At this time, both the current and new logging +/// code will be active for development purposes if both macros are enabled. NOTE(review): the PAL_DP* macros below expand to bare { } blocks, not do { } while (0). +#if (PAL_ENABLE_PRINTS_ASSERTS && PAL_ENABLE_LOGGING) +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro (DbgPrintf and DbgLog). +#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro (DbgPrintf and DbgLog). +#define PAL_DPWARN(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro (DbgPrintf and DbgLog). +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#elif PAL_ENABLE_PRINTS_ASSERTS +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro (DbgPrintf only).
+#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro (DbgPrintf only). +#define PAL_DPWARN(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro (DbgPrintf only). +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#elif PAL_ENABLE_LOGGING +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro (DbgLog only). +#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro (DbgLog only). +#define PAL_DPWARN(_pFormat, ...) \ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro (DbgLog only). +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#else +/// Debug printf macro (compiled out). +#define PAL_DPF(...) ((void)0) +/// Debug info printf macro (compiled out). +#define PAL_DPINFO(...) ((void)0) +/// Debug warning printf macro (compiled out). +#define PAL_DPWARN(...) ((void)0) +/// Debug error printf macro (compiled out). +#define PAL_DPERROR(...)
((void)0) +#endif + diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h b/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h new file mode 100644 index 0000000000..8115e052dc --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h @@ -0,0 +1,319 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDeque.h + * @brief PAL utility collection Deque and DequeIterator class declarations.
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palSysMemory.h" + +namespace Util +{ + +// Forward declarations. +template class Deque; + +/// @internal Private structure used by Deque and its iterators to store chunks of data elements. +struct DequeBlockHeader +{ + DequeBlockHeader* pPrev; ///< Pointer to the previous block. + DequeBlockHeader* pNext; ///< Pointer to the next block. + void* pStart; ///< Pointer to the first element in this block. + void* pEnd; ///< Pointer to the end of this block; ~Deque() treats it as one past the last element. +}; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a Deque collection. + * + * Allows traversal of all elements in a Deque going either forwards or backwards. If you traverse off either end of + * the deque, then you must create a new iterator by calling either the Deque's Begin() or End() method. + *********************************************************************************************************************** + */ +template +class DequeIterator +{ +public: + /// Trivial destructor. + ~DequeIterator() { } + + /// Returns a pointer to the current element. Will return null if we've gone past the end. + T* Get() const { return m_pCurrent; } + + /// Advances the iterator to the next position (move forward). + void Next(); + + /// Advances the iterator to the previous position (move backward). + void Prev(); + + /// Check if the element the iterator references is valid. + bool IsValid() const { return m_pCurrent != nullptr; } + +private: + DequeIterator(const Deque* pDeque, DequeBlockHeader* pHeader, T* pCurrent); + + const Deque*const m_pDeque; // The Deque we're iterating over. + const DequeBlockHeader* m_pCurrentHeader; // The block we're iterating over.
+ T* m_pCurrent; // Pointer to the current element. Null if we've gone past the + // end. + + PAL_DISALLOW_DEFAULT_CTOR(DequeIterator); + + // Although this is a transgression of coding standards, it means that Deque does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class Deque; +}; + +/** + *********************************************************************************************************************** + * @brief Simple templated deque container - a double-ended queue. + * + * This is meant for storing elements of an arbitrary (but uniform) type. Operations which this class supports are: + * + * - Insertion from the front and back. + * - Deletion from the front and back. + * - Forwards and reverse iteration. + * + * @warning This class is not thread-safe for push, pop, or iteration! + * + * @note This class is only designed to work with native types and POD-style structures. If it is needed to have a Deque + * of complex objects with nontrivial destructors, copy constructors or assign operators, then a specialized + * implementation of CleanupElement() will need to be explicitly defined. + *********************************************************************************************************************** + */ +template +class Deque +{ +public: + /// Constructor. numElementsPerBlock sets the block granularity used when a new block is allocated (default 256). + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + Deque(Allocator*const pAllocator, size_t numElementsPerBlock = 256); + ~Deque(); + + /// Returns the number of elements in the deque. + size_t NumElements() const { return m_numElements; } + + /// Returns an iterator pointing to the first element in the deque. + /// + /// @returns An iterator pointing at the front end of the deque. + DequeIterator Begin() const { return DequeIterator(this, m_pFrontHeader, m_pFront); } + + /// Returns an iterator pointing to the last element in the deque.
+ /// + /// This is somewhat different from std::deque::end(), which returns an iterator to the theoretical object _past_ the + /// end of the deque. + /// + /// @returns An iterator pointing at the back end of the deque. + DequeIterator End() const { return DequeIterator(this, m_pBackHeader, m_pBack); } + + ///@{ + /// Returns the element at the location specified. + /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns A reference to the element at the location specified by index. + T& At(uint32 index); + + const T& At(uint32 index) const; + + T& operator[](uint32 index); + const T& operator[](uint32 index) const; + ///@} + + /// Returns the object at the front of the deque. + /// + /// @warning This will cause an access violation if called on an empty deque! + /// + /// @returns Reference to the item stored at the front end of the deque. + T& Front() const + { + PAL_ASSERT(m_numElements != 0); + return *m_pFront; + } + + /// Returns the object at the tail of the deque. + /// + /// @warning This will cause an access violation if called on an empty deque! + /// + /// @returns Reference to the item stored at the back end of the deque. + T& Back() const + { + PAL_ASSERT(m_numElements != 0); + return *m_pBack; + } + + /// Pushes a copy of the specified item onto the front of the deque. + /// + /// @param [in] data Item to be added to the front of the deque. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + Result PushFront(const T& data); + + /// Emplaces a newly constructed item onto the front of the deque. + /// + /// @param [in] args arguments used to construct the new item.
+ /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + template + Result EmplaceFront(Args&&... args); + + /// Pushes a copy of the specified item onto the back of the deque. + /// + /// @param [in] data Item to be added to the back of the deque. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + Result PushBack(const T& data); + + /// Emplaces a newly constructed item onto the back of the deque. + /// + /// @param [in] args arguments used to construct the new item. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + template + Result EmplaceBack(Args&&... args); + + /// Pops the first item off the front of the deque, returning the popped value. + /// + /// @param [out] pOut Item popped off the front of the deque. + /// + /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque + /// is empty. + Result PopFront(T* pOut); + + /// Pops the last item off the back of the deque, returning the popped value. + /// + /// @param [out] pOut Item popped off the back of the deque. + /// + /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque + /// is empty. + Result PopBack(T* pOut); + +private: + Result AllocateFront(T**); + Result AllocateBack(T**); + DequeBlockHeader* AllocateNewBlock(); + void FreeUnusedBlock(DequeBlockHeader* pHeader); + + // A helper function to avoid duplication in const and non-const versions of At().
+ T& InternalAt(uint32 index) const; + + size_t m_numElements; // Number of elements + const size_t m_numElementsPerBlock; // Block granularity when we need to alloc a new one + + DequeBlockHeader* m_pFrontHeader; // First block of data elements, null for empty deques. + DequeBlockHeader* m_pBackHeader; // Last block of data elements, null for empty deques. + + T* m_pFront; // First data element, null for empty deques. + T* m_pBack; // Last data element, null for empty deques. + + DequeBlockHeader* m_pLazyFreeHeader; // Cached pointer to the most-recently freed block. + + Allocator*const m_pAllocator; // Pointer to the allocator for this deque. + + PAL_DISALLOW_COPY_AND_ASSIGN(Deque); + + // Although this is a transgression of coding standards, it prevents DequeIterator requiring a public constructor; + // constructing a 'bare' DequeIterator (i.e. without calling Deque::Begin() or Deque::End()) can never be a legal operation, so + // this means that these two classes are much safer to use. + friend class DequeIterator; +}; + +// ===================================================================================================================== +template +Deque::Deque( + Allocator*const pAllocator, + size_t numElementsPerBlock) + : + m_numElements(0), + m_numElementsPerBlock(numElementsPerBlock), + m_pFrontHeader(nullptr), + m_pBackHeader(nullptr), + m_pFront(nullptr), + m_pBack(nullptr), + m_pLazyFreeHeader(nullptr), + m_pAllocator(pAllocator) +{ +} + +// ===================================================================================================================== +// Frees all of the blocks this object allocated over its lifetime. +template +Deque::~Deque() +{ + if (!std::is_trivial::value) + { + while (m_pFrontHeader != nullptr) + { + // Explicitly destroy the removed value since it's non-trivial and advance. + // We must destroy all of them in the current block before freeing it.
+ m_pFront->~T(); + ++m_pFront; + --m_numElements; + + if ((m_pFront == m_pFrontHeader->pEnd) || (m_numElements == 0)) + { + // The front block is now empty (ran past its end, or no elements remain). Free it and advance to the next block. + DequeBlockHeader* pBlockToFree = m_pFrontHeader; + m_pFrontHeader = m_pFrontHeader->pNext; + PAL_SAFE_FREE(pBlockToFree, m_pAllocator); + + if (m_pFrontHeader != nullptr) + { + // Fixup to the new block. + m_pFront = static_cast(m_pFrontHeader->pStart); + } + } + } + } + else + { + // Elements are trivial so skip iterating through elements and free each block. + while (m_pFrontHeader != nullptr) + { + DequeBlockHeader* pBlockToFree = m_pFrontHeader; + m_pFrontHeader = m_pFrontHeader->pNext; + PAL_SAFE_FREE(pBlockToFree, m_pAllocator); + } + } + + if (m_pLazyFreeHeader != nullptr) + { + PAL_SAFE_FREE(m_pLazyFreeHeader, m_pAllocator); + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h b/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h new file mode 100644 index 0000000000..d57b3626c1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h @@ -0,0 +1,141 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palEvent.h + * @brief PAL utility collection Event class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palTime.h" +#include "palUtil.h" + +namespace Util +{ + +/// Specifies the creation flags for an Event. +struct EventCreateFlags +{ + union + { + struct + { + uint32 manualReset : 1; ///< If true, the event is created as manual reset. + uint32 initiallySignaled : 1; ///< If true, the event is created in signaled state. +#if defined(_WIN32) + uint32 canBeInherited : 1; ///< If true, the event can be inherited by a child process; this is + /// Windows-specific. + uint32 reserved : 29; ///< Reserved for future use. +#else + uint32 semaphore : 1; ///< If true, provide semaphore-like semantics for reads from the file + /// descriptor. + uint32 nonBlocking : 1; ///< If true, set the O_NONBLOCK file status flag on the new file descriptor. + uint32 closeOnExecute : 1; ///< If true, set the close-on-exec flag for the new file descriptor. + uint32 reserved : 27; ///< Reserved for future use. +#endif + }; + uint32 u32All; ///< Flags packed as 32-bit uint.
+ }; +}; + +/** + *********************************************************************************************************************** + * @brief Synchronization primitive that can either be in the _set_ or _reset_ state. + * + * Threads can call Wait() to block waiting for an Event object to be _set_. This is useful for fine-grain + * synchronization between threads. + * + * Event objects start out in the _reset_ state. + *********************************************************************************************************************** + */ +class Event +{ +public: + Event(); + ~Event(); + /// Initializes the event object. Clients must call this before using the Event object. + /// + /// @param flags Event creation flags. + /// @param pName Specifies the event's name (Windows-specific); Windows uses this name to + /// uniquely identify fence objects across processes. + /// @returns Success if the event was successfully initialized, otherwise an appropriate error code. + Result Init( + const EventCreateFlags& flags +#if defined(_WIN32) + , + const wchar_t* pName = nullptr +#endif + ); + /// Changes the event state to _set_. + /// + /// @returns Success unless the Event has not been initialized yet (@ref ErrorUnavailable) or an unexpected internal + /// error occurred when calling the OS (ErrorUnknown). + Result Set() const; + + /// Changes the event state to _reset_. + /// + /// @returns Success unless the Event has not been initialized yet (ErrorUnavailable) or an unexpected + /// internal error occurred when calling the OS (ErrorUnknown). + Result Reset() const; + + /// Waits for the event to enter the _set_ state before returning control to the caller. The event will change to + /// the _reset_ state if manualReset was false on initialization. + /// + /// @param [in] timeout Max time to wait, in seconds. If zero, this call will poll the event without blocking.
+ /// + /// @returns Success if the wait completed successfully or Timeout if the wait did not complete but the operation + /// timed out. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue will be returned if the timeout is negative. + /// + ErrorUnknown may be returned if an unexpected internal error occurs when calling the OS. + Result Wait(fseconds timeout) const; + +#if defined(_WIN32) + /// On Windows, a handle to an OS event primitive is a HANDLE, which is just a void*. + typedef void* EventHandle; +#else + /// On Linux, a handle to an OS event primitive is a file descriptor, which is just an int. + typedef int32 EventHandle; +#endif + + /// Returns a handle to the actual OS event primitive associated with this object. + EventHandle GetHandle() const { return m_hEvent; } + + /// Open event handle. + Result Open(EventHandle handle, bool isReference); + + /// Constant EventHandle value which represents an invalid event object. + static const EventHandle InvalidEvent; + +private: + EventHandle m_hEvent; // OS-specific event handle. + bool m_isReference; // If true, the event is a global sharing object handle (not a duplicate) which is + // imported from external, so it can't be closed in the current destructor, and can only + // be closed by the creator. + + PAL_DISALLOW_COPY_AND_ASSIGN(Event); +}; +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palFile.h b/shared/amdgpu-windows-interop/pal/inc/util/palFile.h new file mode 100644 index 0000000000..219042feb1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palFile.h @@ -0,0 +1,300 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFile.h + * @brief PAL utility collection File class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +// pal +#include "palUtil.h" +#include "palInlineFuncs.h" + +// stl +#include +#include + +#if defined(_WIN32) +/// Path separator macro for wide string literal concatenation. +#define PAL_PATH_SEPW L"\\" +/// Path separator macro for narrow string literal concatenation. +#define PAL_PATH_SEP "\\" +#else +/// Path separator macro for wide string literal concatenation. +#define PAL_PATH_SEPW L"/" +/// Path separator macro for narrow string literal concatenation.
+#define PAL_PATH_SEP "/" +#endif + +namespace Util +{ +#if defined(_WIN32) +/// Wide-character form of the platform's preferred path separator. +static constexpr wchar_t PathSepW = L'\\'; +/// Narrow-character form of the platform's preferred path separator. +static constexpr char PathSep = '\\'; +#else +/// Wide-character form of the platform's preferred path separator. +static constexpr wchar_t PathSepW = L'/'; +/// Narrow-character form of the platform's preferred path separator. +static constexpr char PathSep = '/'; +#endif + +static constexpr uint32 MaxPathStrLen = 512; +static constexpr uint32 MaxFileNameStrLen = 256; + +/// Enumerates access modes that may be required on an opened file. +/// Can be bitwise ORed together to specify multiple simultaneous modes. +enum FileAccessMode : uint32 +{ + FileAccessRead = 0x1, ///< Read access. + FileAccessWrite = 0x2, ///< Write access. + FileAccessAppend = 0x4, ///< Append access. + FileAccessBinary = 0x8, ///< Binary access. + FileAccessNoDiscard = 0x10, ///< Don't discard existing file. + FileAccessShared = 0x20, ///< Require shared file access (simultaneous reading/writing by more than one process). +}; + +/** + *********************************************************************************************************************** + * @brief Exposes simple file I/O functionality by encapsulating standard C runtime file I/O functions like fopen, + * fwrite, etc. + *********************************************************************************************************************** + */ +class File +{ +public: + // Platform-agnostic 64-bit stat structure. + struct Stat + { + uint64 size; // Size of the file in bytes. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 922 + uint64 ctime; // Time of creation of the file (not valid on FAT). + uint64 atime; // Time of last access to the file (not valid on FAT). + uint64 mtime; // Time of last modification to the file.
+#else + std::chrono::system_clock::time_point ctime; // Time of creation of the file (not valid on FAT). + std::chrono::system_clock::time_point atime; // Time of last access to the file (not valid on FAT). + std::chrono::system_clock::time_point mtime; // Time of last modification to the file. +#endif + uint32 nlink; // Number of hard links (always 1 on FAT on Windows). + uint32 mode; // Bitmask for the file-mode information. + uint32 dev; // Drive number of the disk containing the file. + + union + { + struct + { + uint32 isDir : 1; + uint32 isRegular : 1; + uint32 reserved : 30; + }; + uint32 u32All; + } flags; + + // Common stat members omitted from this structure: + // uid, gid, and ino because they are not used on Windows + // rdev because it's a duplicate of dev + }; + + // Where in the file to start seeking from. + enum class SeekPosition : int32 + { + // start of the file + Start = SEEK_SET, + // current file pointer position + Current = SEEK_CUR, + // end of the file + End = SEEK_END + }; + + File() : m_pFileHandle(nullptr), m_ownsHandle(false) {} + + /// Closes the file if it is still open. + ~File() { Close(); } + + /// Opens a file stream for read, write or append access. + /// + /// @param [in] pFilename Name of file to open. + /// @param [in] accessFlags Bitmask of FileAccessMode values indicating the usage of the file. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Open(const char* pFilename, uint32 accessFlags); + + /// Borrows an externally opened C runtime file handle for use by a File object. + /// + /// The caller is still responsible for closing this handle after the File object is destroyed. + /// + /// @param [in] pFile Externally opened C runtime file handle to borrow. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result FromNative(std::FILE* pFile); + + /// Closes the file handle. + void Close(); + + /// Writes a stream of bytes to the file.
+ /// + /// @param [in] pBuffer Byte stream to be written to the file. + /// @param [in] bufferSize Number of bytes to write. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Write(const void* pBuffer, size_t bufferSize); + + /// Reads a stream of bytes from the file. + /// + /// @param [out] pBuffer Buffer to be written with data read from file. + /// @param [in] bufferSize Size of the output buffer. + /// @param [out] pBytesRead Number of bytes actually read (can be null). + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Read(void* pBuffer, size_t bufferSize, size_t* pBytesRead); + + /// Reads a single line of bytes from the file. + /// + /// @param [out] pBuffer Buffer to be written with data read from file. + /// @param [in] bufferSize Size of the output buffer. + /// @param [out] pBytesRead Number of bytes actually read (can be null). + /// + /// @returns Success if successful, otherwise an appropriate error. + Result ReadLine(void* pBuffer, size_t bufferSize, size_t* pBytesRead); + + /// Prints a formatted string to the file. + /// + /// @param [in] pFormatStr Printf-style format string. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Printf(const char* pFormatStr, ...) const; + + /// Prints a formatted string to the file using an explicit va_list. + /// + /// @param [in] pFormatStr Printf-style format string. + /// @param [in] argList Variable argument list. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result VPrintf(const char* pFormatStr, va_list argList); + + /// Flushes pending I/O to the file. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Flush() const; + + /// Sets the file position to the beginning of the file. + void Rewind(); + + /// Sets the position indicator to a new position.
+ /// + /// @param offset Number of bytes to offset. + /// @param pos File position to seek from. + void Seek(int64 offset, SeekPosition pos); + + /// Sets the position indicator to a new position relative to the beginning of the file. + /// + /// @param offset Number of bytes to offset. + void Seek(size_t offset) { Seek(offset, SeekPosition::Start); } + + /// Sets the position indicator to a new position relative to the end of the file. + /// + /// @param offset Number of bytes to offset backwards from the end (the value is negated before seeking). + void Rseek(size_t offset) { Seek(-static_cast(offset), SeekPosition::End); } + + /// Sets the file position to the end of the file. + void FastForward() { Rseek(0); } + + /// Returns true if the file is presently open. + bool IsOpen() const { return (m_pFileHandle != nullptr); } + + /// Gets the size of the file contents in bytes. + /// + /// @param [in] pFilename Name of the file to check. + /// + /// @returns Size of the file in bytes, or std::numeric_limits::max() on failure. + static size_t GetFileSize(const char* pFilename); + + /// Checks if a file with the specified name exists. + /// + /// @param [in] pFilename Name of the file to check. + /// + /// @returns True if the specified file exists. + static bool Exists(const char* pFilename); + + /// Platform-agnostic 64-bit stat() function. + /// + /// @param [in] pFilename Name of the file to check. + /// @param [out] pStatus The status of that file, if it exists. + /// + /// @returns Success if the structure was retrieved, error otherwise. + static Result GetStat(const char* pFilename, Stat* pStatus); + + /// Removes/erases a file, if it exists. + /// + /// @param [in] pFilename Name of file to remove. + /// + /// @returns Success if successful, otherwise an appropriate error. + static Result Remove(const char* pFilename); + + /// Reads a file into memory. + /// + /// @param [in] pFilename Name of the file to read. + /// @param [out] pData Buffer where the file contents are written to.
+ /// @param [in] dataSize Size of the buffer in bytes. + /// @param [out] pBytesRead Number of bytes successfully read into the input buffer (can be null). + /// @param [in] binary True for binary mode, false for text. Defaults to binary. + /// + /// @returns Success if successful, otherwise an appropriate error. + /// + /// @note The input buffer must be large enough to hold the file's contents. If the buffer is larger than the file, + /// then the region of the buffer beyond the file size is _not_ modified by this function. It is the caller's + /// responsibility to _not_ read uninitialized portions of the supplied buffer after this call returns. + /// + /// @note In binary mode, the number of bytes read is equal to the file size in bytes upon a successful return. + /// In text mode, newline conversion is performed on Windows, in which case the number of bytes read may not equal + /// the file size in bytes. + /// + /// @note In text mode, should the caller treat the resulting data as a C string, it is the caller's responsibility + /// to null-terminate the buffer. + static Result ReadFile( + const char* pFilename, + void* pData, + size_t dataSize, + size_t* pBytesRead = nullptr, + bool binary = true); + + /// Gets the handle associated with this file. + /// + /// @returns A pointer to the file handle. + const std::FILE* GetHandle() const { return m_pFileHandle; } + +private: + std::FILE* m_pFileHandle; + bool m_ownsHandle; // This object owns the file handle and will close it on destruction.
+ + PAL_DISALLOW_COPY_AND_ASSIGN(File); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h new file mode 100644 index 0000000000..c3b481cac9 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h @@ -0,0 +1,529 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashBase.h + * @brief PAL utility collection shared structures and class declarations used by the HashMap and HashSet containers. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palSysMemory.h" + +namespace Util +{ + +/// Entry at the end of a group allocation which contains the size and a pointer to the next group. +template +struct GroupFooter +{ + Entry* pNextGroup; + uint32 numEntries; +}; + +// Forward declarations. +template class HashBase; + +/// Default hash functor. +/// +/// Just directly returns bits 31-6 of the key's first dword. This is a decent hash if the key is a pointer. +template +struct DefaultHashFunc +{ + /// Shifts the key to the right and use the resulting bits as a uint hash. + /// + /// @param [in] pVoidKey Pointer to the key to be hashed. If the key is a pointer, which is the best use case for + /// this hash function, then this is really a pointer to a pointer. + /// @param [in] keyLen Ignored. + /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; + + static constexpr uint32 ShiftNum = 6; ///< Right shift bit number + + /// Makes sure the hashing result always contain at least minNumBits bits. + void Init(uint32 minNumBits) const + { + PAL_ASSERT((Min(sizeof(Key), sizeof(uint32)) * 8) >= (minNumBits + ShiftNum)); + PAL_ALERT_MSG(sizeof(Key) > sizeof(void*), "Usage of DefaultHashFunc for non-pointer types!"); + } +}; + +/// Jenkins hash functor. +/// +/// Compute hash value according to the Jenkins algorithm. 
A description of the algorithm is found here: +/// http://burtleburtle.net/bob/hash/doobs.html +/// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +/// code any way you wish, private, educational, or commercial. It's free. +/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm +/// Use for hash table lookup, or anything where one collision in 2^^32 is +/// acceptable. Do NOT use for cryptographic purposes. +template +struct JenkinsHashFunc +{ + /// Hashes the specified key value via the Jenkins hash algorithm. + /// + /// @param [in] pVoidKey Pointer to the key to be hashed. + /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. + /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; + + /// No init job. Defined to be compatible with default hash func. + void Init(uint32) const { } +}; + +/// Jenkins hash functor for C-style strings. +/// +/// Compute hash value according to the Jenkins algorithm. A description of the algorithm is found here: +/// http://burtleburtle.net/bob/hash/doobs.html +/// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +/// code any way you wish, private, educational, or commercial. It's free. +/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm +/// Use for hash table lookup, or anything where one collision in 2^^32 is +/// acceptable. Do NOT use for cryptographic purposes. +/// +/// @note This hash function is for char* keys only, since the regular JenkinsHashFunc will attempt to do a hash on the +/// address of the pointer, as opposed to the actual string. +template +struct StringJenkinsHashFunc : JenkinsHashFunc +{ + /// Hashes the specified C-style string key via the Jenkins hash algorithm. + /// + /// @param [in] pVoidKey Pointer to the key string (i.e., this is a char**) to be hashed. + /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. Should always be sizeof(char*). 
+ /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; +}; + +/// Generic compare functor for types with arbitrary size. +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct DefaultEqualFunc +{ + /// Returns true if key1 and key2 are equal (have identical memory contents). + bool operator()(const Key& key1, const Key& key2) const + { + return (memcmp(&key1, &key2, sizeof(Key)) == 0); + } +}; + +/// String compare functor for use with C-style strings. memcmp doesn't work well for strings, so this uses strcmp. +template +struct StringEqualFunc +{ + /// Returns true if the strings in key1 and key2 are equal. + bool operator()(const Key& key1, const Key& key2) const; +}; + +/** + *********************************************************************************************************************** + * @brief Fixed-size, growable, and lazy-free memory pool allocator. + * + * Memory is divided into blocks and stored in a fixed-sized structure array. Each block is made of fixed-sized groups. + * Blocks grow exponentially; that is, each block has twice as many groups as the previous one. + * + * @warning This class is not thread-safe! + *********************************************************************************************************************** + */ +template +class HashAllocator +{ +public: + /// Constructor. + /// + /// @param [in] groupSize Fixed allocation size. Allocate() will only be able to create allocations of this size. + /// @param [in] alignment Required alignment of the allocation in bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. + HashAllocator(size_t groupSize, uint32 alignment, Allocator*const pAllocator); + + ~HashAllocator(); + + /// Allocates a new block of memory. 
+ /// + /// Takes no size parameter; the size of the allocation is fixed to the groupSize parameter specified in the constructor. + /// + /// @returns A pointer to the allocated memory, or null if the allocation failed. + void* Allocate(); + + /// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. + void Reset(); + + /// Allocates memory using allocator callbacks. + /// + /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define an + /// Alloc(const AllocInfo&) function. + /// + /// @param [in] allocInfo Structure containing information about memory allocation. + /// + /// @returns Pointer to memory allocated. + void* Alloc( + const AllocInfo& allocInfo) + { return m_pAllocator->Alloc(allocInfo); } + + /// Frees memory using allocator callbacks. + /// + /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define a + /// Free(const FreeInfo&) function. + /// + /// @param [in] freeInfo Structure containing information about memory needing to be freed. + void Free( + const FreeInfo& freeInfo) + { return m_pAllocator->Free(freeInfo); } + + /// Returns true if the allocator is in the 'reset' state and has no active allocations. + /// + /// @note There may still be reused but clean allocations present. + bool IsClean() const { return m_curBlock == -1; } + +private: + struct MemBlock + { + void* pMemory; // Pointer to the memory allocated for this block. + uint32 numGroups; // Number of groups in the block. + uint32 curGroup; // Current group index to be allocated. + }; + + // For the i-th block, it will hold Pow(2,i) groups, the whole array could have 4G groups. + static constexpr int32 NumBlocks = 32; // Number of blocks. + + MemBlock m_blocks[NumBlocks]; // Memory blocks holding exponentially growing memory. + const size_t m_groupSize; // Fixed-group-size for each group in one block. + uint32 m_alignment; // Required alignment of the allocation in bytes. 
+ int32 m_curBlock; // Current block index memory is being allocated from. -1 indicates the + // allocator is freshly constructed or has just been Reset(), i.e. no block is in use. + Allocator*const m_pAllocator; // Allocator for this hash allocation function. +}; + +// ===================================================================================================================== +template +HashAllocator::HashAllocator( + size_t groupSize, // Fixed allocation size. Allocate() will always create allocations of this size. + uint32 alignment, // Required alignment of the allocation in bytes. + Allocator*const pAllocator) // Allocator for this hash allocation function. + : + m_groupSize(groupSize), + m_alignment(alignment), + m_curBlock(-1), + m_pAllocator(pAllocator) +{ + for (int32 i = 0; i < NumBlocks; i++) + { + m_blocks[i].pMemory = nullptr; + m_blocks[i].curGroup = 0; + m_blocks[i].numGroups = (1u << i); // Unsigned literal: i reaches 31, which would shift into a signed int's sign bit. + } +} + +// ===================================================================================================================== +template +HashAllocator::~HashAllocator() +{ + for (int32 i = 0; i < NumBlocks; i++) + { + if (m_blocks[i].pMemory == nullptr) + { + break; + } + else + { + PAL_SAFE_FREE(m_blocks[i].pMemory, m_pAllocator); + } + } +} + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a Hash container. + * + * Backward iterating is not supported since there is no "footer" or "header" for a hash container. + *********************************************************************************************************************** + */ +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +class HashIterator +{ +public: + /// Convenience typedef for the associated container for this templated iterator. 
+ typedef HashBase Container; + + ~HashIterator() { } + + /// Returns a pointer to current entry. Will return null if the iterator has been advanced off the end of the + /// container. + Entry* Get() const { return m_pCurrentEntry; } + + /// Advances the iterator to the next position (move forward). + void Next(); + + /// Resets the iterator to its starting point. + void Reset(); + +private: + HashIterator(const Container* pHashContainer, uint32 startBucket); + + const Container* const m_pContainer; // Hash container that we're iterating over. + const uint32 m_startBucket; // Bucket where we start iterating. + uint32 m_currentBucket; // Current bucket we're iterating. + Entry* m_pCurrentGroup; // Current group we're iterating (belongs to the current bucket). + Entry* m_pCurrentEntry; // Current entry we're at now (belongs to the current group). + uint32 m_indexInGroup; // Index of current entry in the group. + PAL_DISALLOW_DEFAULT_CTOR(HashIterator); + + // Although this is a transgression of coding standards, it means that Container does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class HashBase; +}; + +/** + *********************************************************************************************************************** + * @brief Templated base class for HashMap and HashSet, supporting the ability to store, find, and remove entries. + * + * The hash container has a fixed number of buckets. These buckets contain a growable number of entry groups. Each + * entry group contains a fixed number of entries and a pointer to the next entry group in the bucket. + * + * The following restrictions are made in order to tune it to the desired usage: + * + * - The item must be significantly smaller than a cache line. + * - The key must be POD-style type. 
+ * + * This class aims to be very efficient when looking up the key and storing small attached items is the primary concern. + * It's therefore not desired to have the key associated with a pointer to the attached data, because the attached data + * may be of similar or smaller size than the pointer anyway, it would also introduce much unnecessary memory + * management, and it would imply a minimum of two cache misses in the typical lookup case. + * + * The idea is that these entry groups can be exactly the size of a cache line, so an entry group can be scanned with + * only a single cache miss. This extends the load factor that the hash-map can manage before performance begins to + * degrade. For the very small items that we expect, this should be a significant advantage; we expect one cache miss + * pretty much always, so packing the items together would not be a significant gain, and the cost in memory usage is + * (relatively) small. + * + * The initial hash container will use up about (buckets * GroupSize) bytes. + *********************************************************************************************************************** + */ +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +class HashBase +{ +public: + /// Convenience typedef for iterators of this templated HashBase. + typedef HashIterator Iterator; + + /// Initializes the hash container. This no longer needs to be called by a client of this API; instead + /// subclasses call InitAndFindBucket() instead of FindBucket() in any method that might insert a + /// new entry. + /// + /// @returns @ref Success if the initialization completed successfully, or ErrorOutOfMemory if the operation failed + /// due to an internal failure to allocate system memory. + Result Init(); + + /// Returns number of entries in the container. 
+ uint32 GetNumEntries() const { return m_numEntries; } + + /// Returns an iterator pointing to the first entry. + Iterator Begin() const; + + /// Empty the hash container. + void Reset(); + +protected: + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator The allocator that will allocate memory if required. + explicit HashBase(uint32 numBuckets, Allocator*const pAllocator); + virtual ~HashBase() { PAL_SAFE_FREE(m_pMemory, &m_allocator); } + + /// @internal Ensures that the hash table has been allocated, then finds the bucket that matches + /// the specified key + /// + /// @param [in] key Key to find matching bucket for. + /// + /// @returns Pointer to the bucket corresponding to the specified key. + Entry* InitAndFindBucket(const Key& key); + + /// @internal Finds the bucket that matches the specified key. A subclass should use this only if it + /// is searching for an entry. If it might want to insert a new entry, it should use InitAndFindBucket() + /// instead. + /// + /// @param [in] key Key to find matching bucket for. + /// + /// @returns Pointer to the bucket corresponding to the specified key. + Entry* FindBucket(const Key& key) const; + + /// @internal Returns pointer to the next group of the specified group. + /// + /// @param [in] pGroup Current group to find next group for. + /// + /// @returns Pointer to the next group. + static Entry* GetNextGroup(Entry* pGroup); + + /// @internal Helper function which returns a pointer to the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// + /// @returns Pointer to the footer + static GroupFooter* GetGroupFooter(Entry* pGroup); + + /// @internal Helper function which returns the number of entries from the footer + /// + /// @param [in] pGroup Group which contains the footer we want. 
+ /// + /// @returns The number of entries from the footer + static uint32 GetGroupFooterNumEntries(Entry* pGroup); + + /// @internal Helper function which sets the number of entries in the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// @param [in] numEntries Number of entries that should be set. + static void SetGroupFooterNumEntries(Entry* pGroup, uint32 numEntries); + + /// @internal Helper function which returns the next group from the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// + /// @returns Pointer to the next group from the footer + static Entry* GetGroupFooterNextGroup(Entry* pGroup); + + /// @internal Helper function which sets the pointer to the next group in the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// @param [in] pNextGroup The next group that should be written into the footer. + static void SetGroupFooterNextGroup(Entry* pGroup, Entry* pNextGroup); + + /// @internal Allocates a new group if the footer of the specified group is null. + /// + /// @param [in] pGroup Current group to allocate a next group for. + /// + /// @returns Pointer to the next group. + Entry* AllocateNextGroup(Entry* pGroup); + + const HashFunc m_hashFunc; ///< @internal Hash functor object. + const EqualFunc m_equalFunc; ///< @internal Key compare function object. + AllocFunc m_allocator; ///< @internal Allocator object. + + uint32 m_numBuckets; ///< @internal Buckets in the hash table; Padded to power of 2. + uint32 m_numEntries; ///< @internal Entries in the table. + size_t m_memorySize; ///< @internal Memory allocation size for m_pMemory. + void* m_pMemory; ///< @internal Base address as allocated (before alignment). + + static constexpr size_t EntrySize = sizeof(Entry); ///< @internal Size (in bytes) of a single entry. + + /// Size (in bytes) of the footer space of a group linking to next group. 
+ static constexpr size_t GroupFooterSize = sizeof(GroupFooter); + + /// Number of entries in a single group. + static constexpr uint32 EntriesInGroup = ((GroupSize - GroupFooterSize) / EntrySize); + + // There must be at least one entry in each group. + static_assert((EntriesInGroup >= 1), "Hash container entry is too big."); + +private: + PAL_DISALLOW_DEFAULT_CTOR(HashBase); + PAL_DISALLOW_COPY_AND_ASSIGN(HashBase); + + // Although this is a transgression of coding standards, it prevents HashIterator requiring a public constructor; + // constructing a 'bare' HashIterator (i.e. without calling HashSet::GetIterator) can never be a legal operation, so + // this means that these two classes are much safer to use. + friend class HashIterator; +}; + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashIterator::HashIterator( + const Container* pContainer, ///< [retained] The hash container to iterate over + uint32 startBucket) ///< The beginning bucket + : + m_pContainer(pContainer), + m_startBucket(startBucket), + m_currentBucket(m_startBucket), + m_indexInGroup(0) +{ + if (m_startBucket < m_pContainer->m_numBuckets) + { + m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_startBucket * GroupSize)); + } + else + { + m_pCurrentGroup = nullptr; + } + + m_pCurrentEntry = m_pCurrentGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashBase::HashBase( + uint32 numBuckets, + Allocator*const pAllocator) + : + m_hashFunc(), + m_equalFunc(), + m_allocator(GroupSize, alignof(Entry), pAllocator), + 
m_numBuckets((numBuckets > 0) ? Pow2Pad(numBuckets) : 1), // We always need at least one bucket. + m_numEntries(0), + m_memorySize(m_numBuckets * GroupSize), + m_pMemory(nullptr) +{ +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h new file mode 100644 index 0000000000..395186a5aa --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h @@ -0,0 +1,650 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashBaseImpl.h + * @brief PAL utility collection shared class implementations used by the HashMap and HashSet containers. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +// ===================================================================================================================== +// Default hash function implementation. Simply shift the key to the right and use the resulting bits as the hash. +template +uint32 DefaultHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + // We need this union to do a type conversion from a Key type to a uint for the hash function. This ensures + // that our data won't change when casting and that we don't have to guess which _cast<> operation is the most + // appropriate for each data type for which this template class may be instantiated. + union KeyUint32 + { + Key key; + uint32 uint; + } hash = { }; + + // Get the raw bits. + const Key* pKey = static_cast(pVoidKey); + hash.key = *pKey; + + // Discard the low bits. + return (hash.uint >> ShiftNum); +} + +// ===================================================================================================================== +// Hashes the specified key value with the Jenkins hash algorithm. Implementation based on the algorithm description +// found here: http://burtleburtle.net/bob/hash/doobs.html. +// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +// code any way you wish, private, educational, or commercial. It's free. 
+// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm +// Use for hash table lookup, or anything where one collision in 2^^32 is +// acceptable. Do NOT use for cryptographic purposes. +template +uint32 JenkinsHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + // Mixing table. + static const uint8 MixTable[256] = + { + 251, 175, 119, 215, 81, 14, 79, 191, 103, 49, 181, 143, 186, 157, 0, 232, + 31, 32, 55, 60, 152, 58, 17, 237, 174, 70, 160, 144, 220, 90, 57, 223, + 59, 3, 18, 140, 111, 166, 203, 196, 134, 243, 124, 95, 222, 179, 197, 65, + 180, 48, 36, 15, 107, 46, 233, 130, 165, 30, 123, 161, 209, 23, 97, 16, + 40, 91, 219, 61, 100, 10, 210, 109, 250, 127, 22, 138, 29, 108, 244, 67, + 207, 9, 178, 204, 74, 98, 126, 249, 167, 116, 34, 77, 193, 200, 121, 5, + 20, 113, 71, 35, 128, 13, 182, 94, 25, 226, 227, 199, 75, 27, 41, 245, + 230, 224, 43, 225, 177, 26, 155, 150, 212, 142, 218, 115, 241, 73, 88, 105, + 39, 114, 62, 255, 192, 201, 145, 214, 168, 158, 221, 148, 154, 122, 12, 84, + 82, 163, 44, 139, 228, 236, 205, 242, 217, 11, 187, 146, 159, 64, 86, 239, + 195, 42, 106, 198, 118, 112, 184, 172, 87, 2, 173, 117, 176, 229, 247, 253, + 137, 185, 99, 164, 102, 147, 45, 66, 231, 52, 141, 211, 194, 206, 246, 238, + 56, 110, 78, 248, 63, 240, 189, 93, 92, 51, 53, 183, 19, 171, 72, 50, + 33, 104, 101, 69, 8, 252, 83, 120, 76, 135, 85, 54, 202, 125, 188, 213, + 96, 235, 136, 208, 162, 129, 190, 132, 156, 38, 47, 1, 7, 254, 24, 4, + 216, 131, 89, 21, 28, 133, 37, 153, 149, 80, 170, 68, 6, 169, 234, 151 + }; + + const uint8* pKey = static_cast(pVoidKey); + + uint32 a = 0x9e3779b9; // The golden ratio; an arbitrary value. + uint32 b = a; + uint32 c = MixTable[pKey[0]]; // Arbitrary seed value. Note: pKey[0] is read here regardless of keyLen. + uint32 len = keyLen; + + // Handle most of the key. 
+ while (len >= 12) + { + a = a + (pKey[0] + (static_cast(pKey[1]) << 8) + + (static_cast(pKey[2]) << 16) + + (static_cast(pKey[3]) << 24)); + b = b + (pKey[4] + (static_cast(pKey[5]) << 8) + + (static_cast(pKey[6]) << 16) + + (static_cast(pKey[7]) << 24)); + c = c + (pKey[8] + (static_cast(pKey[9]) << 8) + + (static_cast(pKey[10]) << 16) + + (static_cast(pKey[11]) << 24)); + + a = a - b; a = a - c; a = a ^ (c >> 13); + b = b - c; b = b - a; b = b ^ (a << 8); + c = c - a; c = c - b; c = c ^ (b >> 13); + a = a - b; a = a - c; a = a ^ (c >> 12); + b = b - c; b = b - a; b = b ^ (a << 16); + c = c - a; c = c - b; c = c ^ (b >> 5); + a = a - b; a = a - c; a = a ^ (c >> 3); + b = b - c; b = b - a; b = b ^ (a << 10); + c = c - a; c = c - b; c = c ^ (b >> 15); + + pKey = pKey + 12; + len = len - 12; + } + + // Handle last 11 bytes. + c = c + keyLen; + switch (len) + { + case 11: c = c + (static_cast(pKey[10]) << 24); [[fallthrough]]; + case 10: c = c + (static_cast(pKey[9]) << 16); [[fallthrough]]; + case 9: c = c + (static_cast(pKey[8]) << 8); [[fallthrough]]; + // the first byte of c is reserved for the length + case 8: b = b + (static_cast(pKey[7]) << 24); [[fallthrough]]; + case 7: b = b + (static_cast(pKey[6]) << 16); [[fallthrough]]; + case 6: b = b + (static_cast(pKey[5]) << 8); [[fallthrough]]; + case 5: b = b + pKey[4]; [[fallthrough]]; + case 4: a = a + (static_cast(pKey[3]) << 24); [[fallthrough]]; + case 3: a = a + (static_cast(pKey[2]) << 16); [[fallthrough]]; + case 2: a = a + (static_cast(pKey[1]) << 8); [[fallthrough]]; + case 1: a = a + pKey[0]; + // case 0: nothing left to add + } + + a = a - b; a = a - c; a = a ^ (c >> 13); + b = b - c; b = b - a; b = b ^ (a << 8); + c = c - a; c = c - b; c = c ^ (b >> 13); + a = a - b; a = a - c; a = a ^ (c >> 12); + b = b - c; b = b - a; b = b ^ (a << 16); + c = c - a; c = c - b; c = c ^ (b >> 5); + a = a - b; a = a - c; a = a ^ (c >> 3); + b = b - c; b = b - a; b = b ^ (a << 10); + c = c - a; c = c - b; c = c ^ (b >> 
15); + + return c; +} + +// ===================================================================================================================== +// Hashes the specified C-style string key with the Jenkins hash algorithm. +template +uint32 StringJenkinsHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + const Key* pKey = static_cast(pVoidKey); + const Key key = *pKey; + keyLen = static_cast(strlen(key)); + + return JenkinsHashFunc::operator()(key, keyLen); +} + +// ===================================================================================================================== +// Returns true if the strings in key1 and key2 are the same. +template +bool StringEqualFunc::operator()( + const Key& key1, + const Key& key2 + ) const +{ + bool ret = false; + + // Can't do strcmp on null. + if ((key1 != nullptr) && (key2 != nullptr)) + { + ret = (strcmp(key1, key2) == 0); + } + else if ((key1 == nullptr) && (key2 == nullptr)) + { + ret = true; + } + + return ret; +} + +// ===================================================================================================================== +// Allocates a new block of memory. +template +void* HashAllocator::Allocate() +{ + void* pMemory = nullptr; + + // Leave pBlock null if this is the first allocation made with this object. + MemBlock* pBlock = (m_curBlock >= 0) ? &m_blocks[m_curBlock] : nullptr; + + // If current block is used up (or we haven't allocated one yet), go to next. + if ((pBlock == nullptr) || (pBlock->curGroup >= pBlock->numGroups)) + { + // Only advance to the next block if the current one had memory allocated to it (which implies that it's + // full). 
+ int32 nextBlock = m_curBlock; // Signed like m_curBlock, whose -1 sentinel means "no block in use yet". + + if ((pBlock == nullptr) || (pBlock->pMemory != nullptr)) + { + nextBlock++; + } + + PAL_ASSERT(nextBlock < NumBlocks); + + pBlock = &m_blocks[nextBlock]; + + PAL_ASSERT(pBlock->curGroup == 0); + + // Allocate memory if needed (note that this may rarely fail) + if (pBlock->pMemory == nullptr) + { + // Here we allocate another chunk of memory from outside, that we can later distribute internally + // to whichever bucket needs another group linked to it. + PAL_DPWARN("HashAllocator allocating more external memory, enough to hold %u Groups. " + "Consider increasing the GroupSize(%llu) in order to fit more Entries " + "in a Group.", + pBlock->numGroups, static_cast(m_groupSize)); + + pBlock->pMemory = PAL_CALLOC_ALIGNED(pBlock->numGroups * m_groupSize, m_alignment, + m_pAllocator, AllocInternal); + } + + // If we successfully allocated memory (or the block already had some), make it current + if (pBlock->pMemory != nullptr) + { + m_curBlock = nextBlock; + } + } + + // pBlock->pMemory is still null only if the allocation above failed; in that case null is returned. + if (pBlock->pMemory != nullptr) + { + pMemory = VoidPtrInc(pBlock->pMemory, ((pBlock->curGroup++) * m_groupSize)); + } + + return pMemory; +} + +// ===================================================================================================================== +// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. +template +void HashAllocator::Reset() +{ + for (int32 i = 0; i <= m_curBlock; ++i) + { + PAL_ASSERT(m_blocks[i].pMemory != nullptr); + memset(m_blocks[i].pMemory, 0, m_blocks[i].numGroups * m_groupSize); + + m_blocks[i].curGroup = 0; + } + + m_curBlock = -1; +} + +// ===================================================================================================================== +// Proceeds to the next entry, null if to the end. 
+template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashIterator::Next() +{ + if (m_pCurrentEntry != nullptr) + { + PAL_ASSERT(m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup]); + + Entry* pNextGroup = Container::GetNextGroup(m_pCurrentGroup); + + // We're in the middle of a group: more valid entries follow the current one, so just step forward. + uint32 numEntries = m_pContainer->GetGroupFooterNumEntries(m_pCurrentGroup); + if ((m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup - 1]) && + (m_indexInGroup + 1 < numEntries)) + { + m_pCurrentEntry++; + m_indexInGroup++; + } + // We're at the last valid entry of a group. + // The next chained group may already be empty, so check the next group's footer->numEntries before + // jumping to it. If that count is 0 the group holds no valid entries, and we must move on to the + // next bucket directly (the else branch) to avoid returning an invalid entry. + else if ((pNextGroup != nullptr) && + (m_indexInGroup == numEntries - 1) && + (reinterpret_cast*>(&pNextGroup[Container::EntriesInGroup])->numEntries > 0)) + { + m_pCurrentGroup = pNextGroup; + m_pCurrentEntry = pNextGroup; + m_indexInGroup = 0; + } + // The current bucket is done, step to the next. 
+ else + { + do + { + m_currentBucket = (m_currentBucket + 1) % m_pContainer->m_numBuckets; + + pNextGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_currentBucket * GroupSize)); + + numEntries = m_pContainer->GetGroupFooterNumEntries(pNextGroup); + if (numEntries > 0) + { + m_indexInGroup = 0; + break; + } + } while(m_currentBucket != m_startBucket); + + if (m_currentBucket != m_startBucket) + { + m_pCurrentGroup = pNextGroup; + m_pCurrentEntry = pNextGroup; + m_indexInGroup = 0; + } + else + { + m_pCurrentEntry = nullptr; + } + } + } +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashIterator::Reset() +{ + m_currentBucket = m_startBucket; + m_indexInGroup = 0; + + if (m_startBucket < m_pContainer->m_numBuckets) + { + m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_startBucket * GroupSize)); + } + else + { + m_pCurrentGroup = nullptr; + } + + m_pCurrentEntry = m_pCurrentGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashBase::Init() +{ + // Each bucket's address must be aligned as Entry required. + PAL_ASSERT(IsPow2Aligned(GroupSize, alignof(Entry))); + + // Since (m_numBuckets - 1) will mask the hashing result, the hash func should make sure the hashing result always + // contain enough effective bits. + m_hashFunc.Init(Log2(m_numBuckets)); + + // Allocate the hash table. Zero out the memory to mark all entries invalid, since a key of 0 is invalid. 
+ m_pMemory = PAL_CALLOC_ALIGNED(m_memorySize, alignof(Entry), &m_allocator, AllocInternal); + + PAL_ALERT(m_pMemory == nullptr); + + return (m_pMemory != nullptr) ? Result::Success : Result::ErrorOutOfMemory; +} + +// ===================================================================================================================== +// Returns an iterator pointing to the first entry. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashIterator +HashBase::Begin() const +{ + uint32 bucket = 0; + + if (m_numEntries != 0) + { + PAL_ASSERT(m_pMemory != nullptr); + for (;bucket < m_numBuckets; ++bucket) + { + Entry* pEntry = static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)); + const uint32 numEntries = GetGroupFooterNumEntries(pEntry); + if (numEntries > 0) + { + break; + } + } + } + else + { + // If the backing memory does not exist we should return a null Iterator. + // This can be done by setting the start bucket such that it is off the end of the bucket list. + bucket = m_numBuckets; + } + + return Iterator(this, bucket); +} + +// ===================================================================================================================== +// Empty the hash table. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::Reset() +{ + if ((m_pMemory != nullptr) && ((m_numEntries != 0) || (m_allocator.IsClean() == false))) + { + // Re-zero out the hash table. + // We can skip this if: + // - m_numEntries is 0, then each group's numEntries and entry data is already reset. 
+ // - the allocator is clean, then each group's chain pointer must be nullptr (nowhere to point to) + memset(m_pMemory, 0, m_memorySize); + } + + m_numEntries = 0; + + m_allocator.Reset(); +} + +// ===================================================================================================================== +// Ensures that the hash table has been allocated, then returns pointer to start group of the bucket +// corresponding to the specified key. A return of nullptr means out of memory. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::InitAndFindBucket( + const Key& key + ) +{ + if (m_pMemory == nullptr) + { + Init(); + } + return FindBucket(key); +} + +// ===================================================================================================================== +// Returns pointer to start group of the bucket corresponding to the specified key. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::FindBucket( + const Key& key + ) const +{ + const uint32 bucket = m_hashFunc(&key, sizeof(key)) & (m_numBuckets - 1); + return (m_pMemory != nullptr) ? static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)) : nullptr; +} + +// ===================================================================================================================== +// Returns pointer to the next group of the specified group.
+template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::GetNextGroup( + Entry* pGroup) +{ + // Footer of a group stores the pointer to the next group + return HashBase::GetGroupFooterNextGroup(pGroup); +} + +// ===================================================================================================================== +// Allocates a new group if the footer of the specified group is null. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::AllocateNextGroup( + Entry* pGroup) +{ + // Footer of a group stores the pointer to the next group. + Entry* pNextGroup = GetGroupFooterNextGroup(pGroup); + + if (pNextGroup == nullptr) + { + // This warning is useful in order to tune hash maps, but probably doesn't need to be enabled for anyone not + // actively tuning. We're not asking for more memory here, just assigning more of the memory chunk we have + // already asked for to a bucket. Each bucket starts with a group that it can place hash hits in. + // when the group for that bucket fills up, we call this function and link another group for the same bucket, + // in the form of a linked list, onto that. + // This is expected to happen a bit, as hash distributions aren't perfect. But if this happens too many times, + // you're really searching a linked list, not a hash map, which is much slower. + // It's at that point you need this warning: to help balance out the number of buckets and group sizes + // to better fit your use case. + //PAL_DPWARN("HashBase needs to allocate more internal memory after inserting %u entries. 
" + // "Consider increasing the NumBuckets(%u) or GroupSize(%llu) in order to " + // "fit more Entries In a Group(%u).", + // m_numEntries, m_numBuckets, GroupSize, EntriesInGroup); + + // We allocate the next entry group if it does not exist. + pNextGroup = static_cast(m_allocator.Allocate()); + SetGroupFooterNextGroup(pGroup, pNextGroup); + } + + PAL_ASSERT(pNextGroup != nullptr); + + return pNextGroup; +} + +// ===================================================================================================================== +// Return a pointer to the group footer. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +GroupFooter* HashBase::GetGroupFooter( + Entry* pGroup) +{ + return reinterpret_cast*>(&pGroup[EntriesInGroup]); +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +uint32 HashBase::GetGroupFooterNumEntries( + Entry* pGroup) +{ + const uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, numEntries)); + uint32 numEntries; + memcpy(&numEntries, pNumEntries, sizeof(numEntries)); + return numEntries; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::SetGroupFooterNumEntries( + Entry* pGroup, uint32 numEntries) +{ + uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, numEntries)); + memcpy(pNumEntries, &numEntries, sizeof(numEntries)); +} + +// 
===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::GetGroupFooterNextGroup( + Entry* pGroup) +{ + Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, pNextGroup)); + Entry* pNextGroup; + memcpy(&pNextGroup, ppNextGroup, sizeof(pNextGroup)); + return pNextGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::SetGroupFooterNextGroup( + Entry* pGroup, Entry* pNextGroup) +{ + Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, pNextGroup)); + memcpy(ppNextGroup, &pNextGroup, sizeof(pNextGroup)); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h new file mode 100644 index 0000000000..e5ea46ddb5 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h @@ -0,0 +1,143 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashMap.h + * @brief PAL utility collection HashMap class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +/// Encapsulates one key/value pair in a hash map. +template +struct HashMapEntry +{ + Key key; ///< Hash map entry key. + Value value; ///< Hash map entry value. +}; + +/** + *********************************************************************************************************************** + * @brief Templated hash map container. 
+ * + * This container is meant for storing elements of an arbitrary (but uniform) key/value type. Supported operations: + * + * - Searching + * - Insertion + * - Deletion + * - Iteration + * + * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: + * + * - DefaultHashFunc: Good choice when the key is a pointer. + * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. + * - StringJenkinsHashFunc: Good choice when the key is a C-style string. + * + * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: + * + * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. + * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. + * + * @warning This class is not thread-safe for Insert, FindAllocate, Erase, or iteration! + * @warning Init() must be called before using this container. Begin() and Reset() can be safely called before + * initialization and Begin() will always return an iterator that points to null. + * + * For more details please refer to @ref HashBase. + *********************************************************************************************************************** + */ +template class HashFunc = DefaultHashFunc, + template class EqualFunc = DefaultEqualFunc, + typename AllocFunc = HashAllocator, + size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> +class HashMap : public HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> +{ +public: + /// Convenience typedef for a templated entry of this hash map. + typedef HashMapEntry Entry; + + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. 
+ explicit HashMap(uint32 numBuckets, Allocator*const pAllocator): Base::HashBase(numBuckets, pAllocator) { } + virtual ~HashMap() { } + + /// Finds a given entry; if no entry was found, allocate it. + /// + /// @param [in] key Key to search for. + /// @param [out] pExisted True if an entry for the specified key existed before this call was made. False indicates + /// that a new entry was allocated as a result of this call. + /// @param [out] ppValue Readable/writeable value in the hash map corresponding to the specified key. + /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result FindAllocate(const Key& key, bool* pExisted, Value** ppValue); + + /// Gets a pointer to the value that matches the specified key. + /// + /// @param [in] key Key to search for. + /// + /// @returns A pointer to the value that matches the specified key or null if an entry for the key does not exist. + Value* FindKey(const Key& key) const; + + /// Inserts a key/value pair entry if the key doesn't already exist in the hash map. + /// + /// @warning No action will be taken if an entry matching this key already exists, even if the specified value + /// differs from the current value stored in the entry matching the specified key. + /// + /// @param [in] key Key of the new entry to insert. + /// @param [in] value Value of the new entry to insert. + /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result Insert(const Key& key, const Value& value); + + /// Removes an entry that matches the specified key. + /// + /// @param [in] key Key of the entry to erase. + /// + /// @returns True if the erase completed successfully, false if an entry for this key did not exist. 
+ bool Erase(const Key& key); + +private: + // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when + // accessing members of HashBase. + typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; + + PAL_DISALLOW_DEFAULT_CTOR(HashMap); + PAL_DISALLOW_COPY_AND_ASSIGN(HashMap); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h new file mode 100644 index 0000000000..36905ae1ac --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h @@ -0,0 +1,250 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashMapImpl.h + * @brief PAL utility collection HashMap class implementation. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBaseImpl.h" +#include "palHashMap.h" + +namespace Util +{ + +// ===================================================================================================================== +// Gets a pointer to the value that matches the key. If the key is not present, a pointer to empty space for the value +// is returned. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashMap::FindAllocate( + const Key& key, // Key to search for. + bool* pExisted, // [out] True if a matching key was found. + Value** ppValue) // [out] Pointer to the value entry of the hash map's entry for the specified key. +{ + PAL_ASSERT(pExisted != nullptr); + PAL_ASSERT(ppValue != nullptr); + + Result result = Result::ErrorOutOfMemory; + + // Get the bucket base address.... + Entry* pGroup = this->InitAndFindBucket(key); + + *pExisted = false; + *ppValue = nullptr; + + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group. + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + *pExisted = true; + break; + } + } + + // We've reached the end of the allocated buckets and the entry was not found. + // Allocate this entry for the key. 
+ if ((pMatchingEntry == nullptr) && (i < Base::EntriesInGroup)) + { + pGroup[i].key = key; + pMatchingEntry = &(pGroup[i]); + this->m_numEntries++; + this->SetGroupFooterNumEntries(pGroup, numEntries + 1); + } + + if (pMatchingEntry != nullptr) + { + *ppValue = &(pMatchingEntry->value); + result = Result::Success; + break; + } + + // Chain to the next entry group. + pGroup = this->AllocateNextGroup(pGroup); + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Gets a pointer to the value that matches the key. Returns null if no entry is present matching the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Value* HashMap::FindKey( + const Key& key + ) const +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + break; + } + } + + if ((pMatchingEntry != nullptr) || (i < Base::EntriesInGroup)) + { + break; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + return (pMatchingEntry != nullptr) ? &(pMatchingEntry->value) : nullptr; +} + +// ===================================================================================================================== +// Inserts a key/value pair entry if it doesn't already exist. 
+template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashMap::Insert( + const Key& key, + const Value& value) +{ + bool existed = true; + Value* pValue = nullptr; + + Result result = FindAllocate(key, &existed, &pValue); + + // Add the new value if it did not exist already. If FindAllocate returns Success, pValue != nullptr. + if ((result == Result::Success) && (existed == false)) + { + *pValue = value; + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Removes an entry with the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashMap::Erase( + const Key& key) +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + + Entry* pFoundEntry = nullptr; + Entry* pLastEntry = nullptr; + Entry* pLastEntryGroup = nullptr; + + // Find the entry to delete + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search each group + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key) == true) + { + // We shouldn't find the same key twice. + PAL_ASSERT(pFoundEntry == nullptr); + + pFoundEntry = &(pGroup[i]); + } + + // keep track of last entry of all groups in bucket + pLastEntry = &(pGroup[i]); + pLastEntryGroup = pGroup; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + // Copy the last entry's data into the entry that we are removing and invalidate the last entry as it now appears + // earlier in the list. This also handles the case where the entry to be removed is the last entry. 
+ if (pFoundEntry != nullptr) + { + PAL_ASSERT(pLastEntry != nullptr); + + pFoundEntry->key = pLastEntry->key; + pFoundEntry->value = pLastEntry->value; + + memset(pLastEntry, 0, sizeof(Entry)); + + PAL_ASSERT(this->m_numEntries > 0); + this->m_numEntries--; + const uint32 numEntries = this->GetGroupFooterNumEntries(pLastEntryGroup); + this->SetGroupFooterNumEntries(pLastEntryGroup, numEntries - 1); + } + + return (pFoundEntry != nullptr); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h new file mode 100644 index 0000000000..9876cd30e0 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h @@ -0,0 +1,144 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashSet.h + * @brief PAL utility collection HashSet class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +/// Encapsulates one entry of a hash set. +template +struct HashSetEntry +{ + Key key; ///< Hash set entry key. +}; + +/** + *********************************************************************************************************************** + * @brief Templated hash set container. + * + * This is meant for storing elements of an arbitrary (but uniform) key type. Supported operations: + * + * - Searching + * - Insertion + * - Deletion + * - Iteration + * + * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: + * + * - DefaultHashFunc: Good choice when the key is a pointer. + * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. + * - StringJenkinsHashFunc: Good choice when the key is a C-style string. + * + * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: + * + * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. + * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. + * + * @warning This class is not thread-safe for Insert, Erase, or iteration! + * @warning Init() must be called before using this container. 
Begin() and Reset() can be safely called before + * initialization and Begin() will always return an iterator that points to null. + * + * For more details please refer to @ref HashBase. + *********************************************************************************************************************** + */ +template class HashFunc = DefaultHashFunc, + template class EqualFunc = DefaultEqualFunc, + typename AllocFunc = HashAllocator, + size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> +class HashSet : public HashBase, + Allocator, + HashFunc, + EqualFunc, + AllocFunc, + GroupSize> +{ +public: + /// Convenience typedef for a templated entry of this hash set. + typedef HashSetEntry Entry; + + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. + explicit HashSet(uint32 numBuckets, Allocator*const pAllocator) : Base::HashBase(numBuckets, pAllocator) {} + virtual ~HashSet() { } + + /// Finds a given entry; if no entry was found, allocate it. + /// + /// @param [in] ppKey Key to search for. + /// @param [out] pExisted True if an entry for the specified key existed before this call was made. + /// False indicates that a new entry was allocated as a result of this call. + /// + /// @returns @ref Success if the operation completed successfully + /// @ref ErrorOutOfMemory if the operation failed because an internal memory allocation failed. + Result FindAllocate(Key** ppKey, bool* pExisted); + + /// Returns true if the specified key exists in the set. + /// + /// @param [in] key Key to search for. + /// + /// @returns True if the specified key exists in the set. + bool Contains(const Key& key) const; + + /// Inserts an entry. + /// + /// No action will be taken if an entry matching this key already exists in the set. 
+ /// + /// @param [in] key New entry to insert. + /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result Insert(const Key& key); + + /// Removes an entry that matches the specified key. + /// + /// @param [in] key Key of the entry to erase. + /// + /// @returns True if the erase completed successfully, false if an entry for this key did not exist. + bool Erase(const Key& key); + +private: + // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when + // accessing members of HashBase. + typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; + + PAL_DISALLOW_DEFAULT_CTOR(HashSet); + PAL_DISALLOW_COPY_AND_ASSIGN(HashSet); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h new file mode 100644 index 0000000000..87966886bb --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h @@ -0,0 +1,231 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashSetImpl.h + * @brief PAL utility collection HashSet class implementation. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBaseImpl.h" +#include "palHashSet.h" + +namespace Util +{ + +// ===================================================================================================================== +// Inserts a key if it doesn't already exist. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashSet::Insert( + const Key& key) +{ + Key* pKey = const_cast(&key); + bool existed; + const Result result = FindAllocate(&pKey, &existed); + if (existed == false) + { + *pKey = key; + } + return result; +} + +// ===================================================================================================================== +// Finds a given entry; if no entry was found, allocate it. 
+template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashSet::FindAllocate( + Key** ppKey, + bool* pExisted) +{ + PAL_ASSERT(ppKey != nullptr); + PAL_ASSERT(pExisted != nullptr); + + Result result = Result::ErrorOutOfMemory; + + // Get the bucket base address. + Entry* pGroup = this->InitAndFindBucket(**ppKey); + + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + // Search this entry group. + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, **ppKey)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + *pExisted = true; + break; + } + } + + if ((pMatchingEntry == nullptr) && (i < Base::EntriesInGroup)) + { + // We've reached the end of the bucket and the entry was not found. Allocate this entry for the key. + *pExisted = false; + *ppKey = &pGroup[i].key; + pMatchingEntry = &(pGroup[i]); + this->m_numEntries++; + this->SetGroupFooterNumEntries(pGroup, numEntries + 1); + } + + if (pMatchingEntry != nullptr) + { + result = Result::Success; + break; + } + + // Chain to the next entry group. + pGroup = this->AllocateNextGroup(pGroup); + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Searches for the specified key to see if it exists. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashSet::Contains( + const Key& key + ) const +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group. 
+ uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + break; + } + } + + if ((pMatchingEntry != nullptr) || (i < Base::EntriesInGroup)) + { + break; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + return (pMatchingEntry != nullptr); +} + +// ===================================================================================================================== +// Removes an entry with the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashSet::Erase( + const Key& key) +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + + Entry* pFoundEntry = nullptr; + Entry* pLastEntry = nullptr; + + Entry* pLastEntryGroup = nullptr; + + // Find the entry to delete. + while ((pGroup != nullptr)) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key) == true) + { + // We shouldn't find the same key twice. + PAL_ASSERT(pFoundEntry == nullptr); + + pFoundEntry = &(pGroup[i]); + } + + // keep track of last entry of all groups in bucket + pLastEntry = &(pGroup[i]); + pLastEntryGroup = pGroup; + } + + // Chain to the next entry group + pGroup = this->GetNextGroup(pGroup); + } + + // Copy the last entry's data into the entry that we are removing and invalidate the last entry as it now appears + // earlier in the list. This also handles the case where the entry to be removed is the last entry. 
+ if (pFoundEntry != nullptr) + { + PAL_ASSERT(pLastEntry != nullptr); + + pFoundEntry->key = pLastEntry->key; + memset(pLastEntry, 0, sizeof(Entry)); + + PAL_ASSERT(this->m_numEntries > 0); + this->m_numEntries--; + const uint32 numEntries = this->GetGroupFooterNumEntries(pLastEntryGroup); + this->SetGroupFooterNumEntries(pLastEntryGroup, numEntries - 1); + } + + return (pFoundEntry != nullptr); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h b/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h new file mode 100644 index 0000000000..1040e8108c --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h @@ -0,0 +1,1565 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palInlineFuncs.h + * @brief PAL utility collection inline functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palStringUtil.h" +#include +#include +#include +#include +#include +#include +#include + +namespace Util +{ + +/// Describes a value type, primarily used for loading settings values. +enum class ValueType : uint32 +{ + Boolean, ///< Boolean type. + Int8, ///< 8-bit integer type. + Uint8, ///< 8-bit unsigned integer type. + Int16, ///< 16-bit integer type. + Uint16, ///< 16-bit unsigned integer type. + Int32, ///< 32-bit integer type. + Uint32, ///< 32-bit unsigned integer type. + Int64, ///< 64-bit integer type. + Uint64, ///< 64-bit unsigned integer type. + Float, ///< Floating point type. + Str, ///< String type. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 905 + Int = Int32, ///< Signed integer type. + Uint = Uint32, ///< Unsigned integer type. +#endif +}; + +/// Determines the length of an array at compile-time. +/// +/// @returns The length of the array. +template +constexpr size_t ArrayLen( + const T (&array)[N]) ///< The array of arbitrary type T. +{ + return N; +} + +/// Determines the 32-bit length of an array at compile-time. +/// +/// @returns The length of the array. +template +constexpr uint32 ArrayLen32( + const T (&array)[N]) ///< The array of arbitrary type T. 
+{ + return N; +} + +/// Increments a const pointer by nBytes by first casting it to a const uint8*. +/// +/// @returns Incremented pointer. +constexpr const void* VoidPtrInc( + const void* p, ///< [in] Pointer to be incremented. + size_t numBytes) ///< Number of bytes to increment the pointer by. +{ + return (static_cast(p) + numBytes); +} + +/// Increments a pointer by nBytes by first casting it to a uint8*. +/// +/// @returns Incremented pointer. +constexpr void* VoidPtrInc( + void* p, ///< [in] Pointer to be incremented. + size_t numBytes) ///< Number of bytes to increment the pointer by. +{ + return (static_cast(p) + numBytes); +} + +/// Decrements a const pointer by nBytes by first casting it to a const uint8*. +/// +/// @returns Decremented pointer. +constexpr const void* VoidPtrDec( + const void* p, ///< [in] Pointer to be decremented. + size_t numBytes) ///< Number of bytes to decrement the pointer by. +{ + return (static_cast(p) - numBytes); +} + +/// Decrements a pointer by nBytes by first casting it to a uint8*. +/// +/// @returns Decremented pointer. +constexpr void* VoidPtrDec( + void* p, ///< [in] Pointer to be decremented. + size_t numBytes) ///< Number of bytes to decrement the pointer by. +{ + return (static_cast(p) - numBytes); +} + +/// Finds the number of bytes between two pointers by first casting them to uint8*. +/// +/// This function expects the first pointer to not be smaller than the second. +/// +/// @returns Number of bytes between the two pointers. +constexpr size_t VoidPtrDiff( + const void* p1, ///< [in] First pointer (higher address). + const void* p2) ///< [in] Second pointer (lower address). +{ + PAL_CONSTEXPR_ASSERT(p1 >= p2); + return (static_cast(p1) - static_cast(p2)); +} + +/// Returns the high 32 bits of a 64-bit integer. +/// +/// @returns Returns the high 32 bits of a 64-bit integer. +constexpr uint32 HighPart( + uint64 value) ///< 64-bit input value. 
+{ + return (value & 0xFFFFFFFF00000000) >> 32; +} + +/// Returns the low 32 bits of a 64-bit integer. +/// +/// @returns Returns the low 32 bits of a 64-bit integer. +constexpr uint32 LowPart( + uint64 value) ///< 64-bit input value. +{ + return (value & 0x00000000FFFFFFFF); +} + +/// Returns the high 32 bits of a 64-bit integer as a 64-bit integer. +/// +/// @returns Returns the high 32 bits of a 64-bit integer as a 64-bit integer +/// without shifting +constexpr uint64 HighPart64( + uint64 value) ///< 64-bit input value. +{ + return (value & 0xFFFFFFFF00000000); +} + +/// Combines the low and high 32 bits of a 64-bit integer. +/// +/// @returns Returns the 64-bit integer. +constexpr uint64 Uint64CombineParts( + uint32 lowPart, + uint32 highPart) +{ + return (uint64(highPart) << 32) | uint64(lowPart); +} + +/// Returns a larger value from repeating a single byte +constexpr uint32 ReplicateByteAcrossDword( + uint8 value) ///< 8-bit input value. +{ + return (value | (value << 8) | (value << 16) | (value << 24)); +} + +/// Returns a larger value from repeating a single byte +constexpr uint64 ReplicateByteAcrossQword( + uint8 value) ///< 8-bit input value. +{ + return ((static_cast(ReplicateByteAcrossDword(value)) << 32) | ReplicateByteAcrossDword(value)); +} + +/// Combines four characters into a uint32-based four-character-code "string". There's no null terminator so it's not a +/// real c-string, it just looks like there's a string if you view the uint in a hex editor or memcmp against a string. +/// +/// For example, FourCC('A', 'B', 'C', 'D') turns into 0x44434241. 'A' is 0x41 and it ends up in the first byte. +/// This function assumes we're running on a little endian platform (PAL only supports little-endian platforms). +/// +/// @returns Returns a uin32 four-character-code made from the given chars. +constexpr uint32 FourCc( + char c1, ///< The 1st character (lowest byte). + char c2, ///< The 2nd character. + char c3, ///< The 3rd character. 
+ char c4) ///< The 4th character (highest byte). +{ + return (uint32(c4) << 24) | (uint32(c3) << 16) | (uint32(c2) << 8) | uint32(c1); +} + +/// Returns a bitfield from within some value. +/// +/// @returns Returns a bitfield from within some value. +template +constexpr T BitExtract( + T value, ///< Extract a bitfield from here. + uint32 firstBit, ///< The zero-based index of the first bit to extract. + uint32 lastBit) ///< The zero-based index of the last bit to extract. +{ + return (value >> firstBit) & ((1 << (lastBit - firstBit + 1)) - 1); +} + +/// Determines if any of the bits set in "test" are also set in "src". +/// +/// @returns True if any bits in "test" are set in "src", false otherwise. +constexpr bool TestAnyFlagSet( + uint32 src, ///< Source pattern. + uint32 test) ///< Test pattern. +{ + return ((src & test) != 0); +} + +/// Determines if all of the bits set in "test" are also set in "src". +/// +/// @returns True if all bits set in "test" are also set in "src", false otherwise. +constexpr bool TestAllFlagsSet( + uint32 src, ///< Source pattern. + uint32 test) ///< Test pattern. +{ + return ((src & test) == test); +} + +/// Determines if any of the bits set in "test" are also set in "src". +/// +/// @returns True if any bits in "test" are set in "src", false otherwise. +constexpr bool TestAnyFlagSet64( + uint64 src, ///< Source pattern. + uint64 test) ///< Test pattern. +{ + return ((src & test) != 0); +} + +/// Determines if all of the bits set in "test" are also set in "src". +/// +/// @returns True if all bits set in "test" are also set in "src", false otherwise. +constexpr bool TestAllFlagsSet64( + uint64 src, ///< Source pattern. + uint64 test) ///< Test pattern. +{ + return ((src & test) == test); +} + +/// Tests if a single bit in a bitfield is set. +/// +/// @param [in] bitfield Bitfield being tested +/// @param [in] bit Bit index to test +/// +/// @returns True if the flag is set. 
+template +constexpr bool BitfieldIsSet( + const T bitfield, + uint32 bit) +{ + PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); + return (bitfield & (static_cast(1) << bit)); +} + +/// Sets a single bit in a bitfield to one. +/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the bit to set +template +void BitfieldSetBit( + T &bitfield, + uint32 bit) +{ + PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); + bitfield |= (static_cast(1) << bit); +} + +///@{ +/// Counts the number of one bits (population count) in an unsigned integer using some bitwise magic explained in the +/// Software Optimization Guide for AMD64 Processors. +/// +/// @param [in] value The value need to be counted. +/// +/// @returns Number of one bits in the input +template +constexpr uint32 CountSetBits( + T value) +{ + uint32 x = static_cast(value); + + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> ((sizeof(uint32) - 1) << 3); + + return x; +} + +constexpr uint32 CountSetBits( + uint64 value) +{ + uint64 x = value; + + x = x - ((x >> 1) & 0x5555555555555555ull); + x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); + x = (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> ((sizeof(uint64) - 1) << 3); + + return static_cast(x); +} +///@} + +/// Update a subfield of a bitfield. +/// +/// @param [in] bitFieldToUpdate Bitfield being updated +/// @param [in] updateValue Source value to update +/// @param [in] updateMask Bitmask to update +/// +/// @returns True if the flag is set. +template +void BitfieldUpdateSubfield( + T* pBitFieldToUpdate, + const T updateValue, + const T updateMask) +{ + *pBitFieldToUpdate = ((*pBitFieldToUpdate) & ~updateMask) | + (updateValue & updateMask); +} + +/// Tests if a single bit in a "wide bitfield" is set. 
A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being tested +/// @param [in] bit Index of the flag to test +/// +/// @returns True if the flag is set. +template +constexpr bool WideBitfieldIsSet( + const T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + return (0 != (bitfield[index] & mask)); +} + +/// Checks if any bit is set in a wide bitfield. A "wide bitfield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Wide bitfield to count. +/// +/// @returns True if the wide bitfield is non-zero; false otherwise. +template +bool WideBitfieldIsAnyBitSet( + const T(&bitfield)[N]) +{ + bool isBitSet = false; + for (uint32 i = 0; i < N; i++) + { + isBitSet |= (bitfield[i] != 0); + } + + return isBitSet; +} + +/// Sets a single bit in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the flag to set +template +void WideBitfieldSetBit( + T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + bitfield[index] |= mask; +} + +/// Clears a single bit in a "wide bitfield" to zero. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. 
+/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the flag to set +template +void WideBitfieldClearBit( + T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + bitfield[index] &= ~mask; +} + +/// Sets consecutive bits in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] startingBit Index of the first flag to set +/// @param [in] numBits Count of consecutive flags to set +template +void WideBitfieldSetRange( + T (&bitfield)[N], + uint32 startingBit, + uint32 numBits) +{ + constexpr uint32 SizeInBits = (sizeof(T) << 3); + + PAL_ASSERT((startingBit + numBits) <= (SizeInBits * N)); + + uint32 index = (startingBit / SizeInBits); + + startingBit &= (SizeInBits - 1); + + while (numBits > 0) + { + const uint32 maxNumBits = SizeInBits - startingBit; + const uint32 curNumBits = (maxNumBits < numBits) ? maxNumBits : numBits; + const T bitMask = (curNumBits == SizeInBits) ? -1 : ((static_cast(1) << curNumBits) - 1); + + bitfield[index++] |= (bitMask << startingBit); + + startingBit = 0; + numBits -= curNumBits; + } +} + +/// XORs all of the bits in two "wide bitfields". A "wide bifield" is a bitfield which spans an array of integers +/// because there are more flags than bits in one integer. +/// +/// @param [in] bitfield1 Reference to the first bitfield. +/// @param [in] bitfield2 Reference to the second bitfield. +/// @param [out] pOut Result of (bitfield1 ^ bitfield2) +template +void WideBitfieldXorBits( + const T (&bitfield1)[N], + const T (&bitfield2)[N], + T* pOut) +{ + for (uint32 i = 0; i < N; i++) + { + pOut[i] = (bitfield1[i] ^ bitfield2[i]); + } +} + +/// ANDs all of the bits in two "wide bitfields". 
A "wide bifield" is a bitfield which spans an array of integers +/// because there are more flags than bits in one integer. +/// +/// @param [in] bitfield1 Reference to the first bitfield. +/// @param [in] bitfield2 Reference to the second bitfield. +/// @param [out] pOut Result of (bitfield1 & bitfield2) +template +void WideBitfieldAndBits( + const T (&bitfield1)[N], + const T (&bitfield2)[N], + T* pOut) +{ + for (uint32 i = 0; i < N; i++) + { + pOut[i] = (bitfield1[i] & bitfield2[i]); + } +} + +/// Counts the number of one bits (population count) in a wide bitfield. A "wide bitfield" is a bitfield which spans +/// an array of integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Wide bitfield to count. +/// +/// @returns Number of one bits in the input +template +uint32 WideBitfieldCountSetBits( + const T(&bitfield)[N]) +{ + uint32 count = 0; + for (uint32 i = 0; i < N; i++) + { + count += CountSetBits(bitfield[i]); + } + + return count; +} + +/// Unsets the least-significant '1' bit in the given number. +/// Usually used in conjunction with BitMaskScanForward +/// +/// @param [in] value The value to be modified +/// +/// @returns A copy of value with the lowest '1' bit unset. +template +T UnsetLeastBit( + T val) +{ + static_assert(std::is_unsigned::value, "Must use unsigned ints here"); + return val & (val - 1); +} + +/// Scans the specified bit-mask for the least-significant '1' bit. +/// +/// @returns True if the input was nonzero; false otherwise. +template +bool BitMaskScanForward( + uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. + T mask) ///< Bit-mask to scan. +{ + // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key + // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. 
+ bool result = false; + + if (mask != 0) + { +#if defined(_WIN64) && defined(_M_X64) + *pIndex = (sizeof(T) > 4) ? static_cast(::_tzcnt_u64(mask)) : (::_tzcnt_u32(static_cast(mask))); +#elif defined(_WIN64) + auto*const pOut = reinterpret_cast(pIndex); + (sizeof(T) > 4) ? (::_BitScanForward64(pOut, mask)) : (::_BitScanForward(pOut, static_cast(mask))); +#elif defined(_WIN32) && defined(_M_IX86) + const uint32 lowPart = LowPart(mask); + uint32 index = (::_tzcnt_u32(((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); + *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; +#elif defined(_WIN32) + unsigned long index; + const uint32 lowPart = LowPart(mask); + (::_BitScanForward(&index, ((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); + *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; +#elif defined(__GNUC__) + *pIndex = (sizeof(T) > 4) ? __builtin_ctzll(mask) : __builtin_ctz(static_cast(mask)); +#else + uint32 index = 0; + for (; ((mask & 0x1) == 0); mask >>= 1, ++index); + *pIndex = index; +#endif + + result = true; + } + return result; +} + +/// Scans the specified bit-mask for the most-significant '1' bit. +/// +/// @returns True if the input was nonzero; false otherwise. +template +bool BitMaskScanReverse( + uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. + T mask) ///< Bit-mask to scan. +{ + // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key + // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. + bool result = false; + + if (mask != 0) + { +#if defined(_WIN64) + auto*const pOut = reinterpret_cast(pIndex); + (sizeof(T) > 4) ? (::_BitScanReverse64(pOut, mask)) : (::_BitScanReverse(pOut, static_cast(mask))); +#elif defined(_WIN32) + unsigned long index; + const uint32 highPart = (sizeof(T) > 4) ? 
HighPart(mask) : 0; + (::_BitScanReverse(&index, ((sizeof(T) > 4) && (highPart != 0)) ? highPart : LowPart(mask))); + *pIndex = ((sizeof(T) > 4) && (highPart != 0)) ? (index + 32u) : index; +#elif defined(__GNUC__) + *pIndex = (sizeof(T) > 4) ? (63u - __builtin_clzll(mask)) : (31u - __builtin_clz(static_cast(mask))); +#else + uint32 index = 31u; + for (; (((mask >> index) & 0x1) == 0); --index); + *pIndex = index; +#endif + + result = true; + } + return result; +} + +/// Scans the specified wide bit-mask for the least-significant '1' bit. +/// +/// @returns True if input was nonzero; false otherwise. +template +bool WideBitMaskScanForward( + uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. + const T (&mask)[N]) ///< Bit-mask to scan. +{ + uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); + + // Check to see if the wide bitmask has some bits set. + uint32 index = 0; + while ((mask[index] == 0) && (++index < N)); + bool result = (index < N); + + while (result == true) + { + result = BitMaskScanForward(pIndex, mask[maskIndex]); + + if (result == false) + { + ++maskIndex; + result = (maskIndex < N); + } + else + { + (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); + break; + } + } + + return result; +} + +/// Scans the specified wide bit-mask for the most-significant '1' bit. +/// +/// @returns True if input was nonzero; false otherwise. +template +bool WideBitMaskScanReverse( + uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. + const T (&mask)[N]) ///< Bit-mask to scan. +{ + uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); + + // Check to see if the wide bitmask has some bits set. 
+ uint32 index = N - 1; + while ((mask[index] == 0) && (--index > 0)); + bool result = (mask[index] != 0); + + while (result == true) + { + result = BitMaskScanReverse(pIndex, mask[maskIndex]); + + if (result == false) + { + const uint32 oldIndex = maskIndex--; + result = (oldIndex != 0); + } + else + { + (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); + break; + } + } + + return result; +} + +/// Generates a bitmask. +/// +/// @param [in] numBits Number of bits to set (starting at 0) +/// +/// @returns Bitmask in storage of type T with bits [0:numBits-1] set. +template +constexpr T BitfieldGenMask( + T numBits) +{ + PAL_CONSTEXPR_ASSERT(numBits <= (sizeof(T) * 8)); + + const T mask = (numBits < (sizeof(T) * 8)) ? ((static_cast(1) << (numBits)) - static_cast(1)) : static_cast(-1); + return mask; +} + +/// Determines if a value is a power of two. +/// +/// @returns True if it is a power of two, false otherwise. +constexpr bool IsPowerOfTwo( + uint64 value) ///< Value to check. +{ + return (value == 0) ? false : ((value & (value - 1)) == 0); +} + +/// Determines if 'value' is at least aligned to the specified power-of-2 alignment. +/// +/// @returns True if aligned, false otherwise. +constexpr bool IsPow2Aligned( + uint64 value, ///< Value to check. + uint64 alignment) ///< Desired alignment. +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return ((value & (alignment - 1)) == 0); +} + +/// Determines if 'ptr' is at least aligned to the specified power-of-2 alignment. +/// +/// @returns True if aligned, false otherwise. +inline bool VoidPtrIsPow2Aligned( + const void* ptr, ///< Pointer to check. + uint64 alignment) ///< Desired alignment. +{ + PAL_ASSERT(IsPowerOfTwo(alignment)); + return ((reinterpret_cast(ptr) & (alignment - 1)) == 0); +} + +/// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 +/// alignments are supported by this function. +/// +/// @returns Aligned value. 
+template +constexpr T Pow2Align( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); +} + +/// Rounds the specified uint 'value' up to the nearest power of 2 +/// +/// @param [in] value The value to pad. +/// +/// @returns Power of 2 padded value. +template +T Pow2Pad( + T value) +{ + T ret = value; + + if ((value & (value - 1)) != 0) + { + uint32 lastBitIndex = 0; + BitMaskScanReverse(&lastBitIndex, value); + ret = (static_cast(0x2) << lastBitIndex); + } + + return ret; +} + +/// Computes the base-2 logarithm of an unsigned integer. +/// +/// If the given integer is not a power of 2, this function will not provide an exact answer. +/// +/// @param [in] u Value to compute the logarithm of. +/// +/// @returns log_2(u) +template +uint32 Log2( + T u) +{ + uint32 logValue = 0; + return BitMaskScanReverse(&logValue, u) ? logValue : 0; +} + +/// Computes the base-2 logarithm of an unsigned 64-bit integer based on ceiling +/// +/// If the given integer is not a power of 2, this function will not provide an exact answer. +/// +/// @returns ceilLog_2(u) +template +uint32 CeilLog2( + T u) ///< Value to compute the ceil logarithm of. +{ + const uint32 logValue = Log2(u); + return ((static_cast(0x1ul) << logValue) < u) ? (logValue + 1) : logValue; +} + +/// Implements an alternative version of integer division in which the quotient is always rounded up instead of down. +/// +/// @returns The rounded quotient. +template +constexpr T RoundUpQuotient( + T dividend, ///< Value to divide. + T divisor) ///< Value to divide by. +{ + return ((dividend + (divisor - 1)) / divisor); +} + +/// Rounds up the specified integer to the nearest multiple of the specified alignment value. +/// +/// @returns Rounded value. +template +constexpr T RoundUpToMultiple( + T operand, ///< Value to be aligned. 
+ T alignment) ///< Alignment desired. +{ + return (((operand + (alignment - 1)) / alignment) * alignment); +} + +/// Rounds down the specified integer to the nearest multiple of the specified alignment value. +/// +/// @returns Rounded value. +template +constexpr T RoundDownToMultiple( + T operand, ///< Value to be aligned. + T alignment) ///< Alignment desired. +{ + return ((operand / alignment) * alignment); +} + +/// Rounds the specified 'value' down to the nearest value meeting the specified 'alignment'. Only power of 2 +/// alignments are supported by this function. +/// +/// @returns Rounded value. +template +constexpr T Pow2AlignDown( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return (value & ~(alignment - 1)); +} + +/// Determines the maximum of two numbers. +/// +/// @returns The larger of the two inputs. +template +constexpr T Max( + T value1, ///< First value to check. + T value2) ///< Second value to check. +{ + return ((value1 > value2) ? value1 : value2); +} + +/// Determines the maximum of N numbers. +/// +/// @returns The largest of all the inputs. +template +constexpr T Max( + T value1, ///< First value to check. + T value2, ///< Second value to check. + Ts... values) ///< Additional values to check. +{ + return Max(((value1 > value2) ? value1 : value2), values...); +} + +/// Determines the minimum of two numbers. +/// +/// @returns The smaller of the two inputs. +template +constexpr T Min( + T value1, ///< First value to check. + T value2) ///< Second value to check. +{ + return ((value1 < value2) ? value1 : value2); +} + +/// Determines the minimum of N numbers. +/// +/// @returns The smallest of all the inputs. +template +constexpr T Min( + T value1, ///< First value to check. + T value2, ///< Second value to check. + Ts... values) ///< Additional values to check. +{ + return Min(((value1 < value2) ? 
value1 : value2), values...); +} + +/// Clamps the input number so that it falls in-between the lower and upper bounds (inclusive). +/// +/// @returns Clamped input number. +template +constexpr T Clamp( + T input, ///< Input number to clamp. + T lowBound, ///< Lower-bound to clamp to. + T highBound) ///< Upper-bound to clamp to. +{ + return ((input <= lowBound) ? lowBound : + (input >= highBound) ? highBound : input); +} + +/// Determines if the input is within the range specified (inclusive). +/// +/// @returns True if within range, False otherwise. +template +constexpr bool InRange( + T input, ///< Input number to range check. + T lowBound, ///< Low bound of the range to check (inclusive). + T highBound) ///< High bound of the range to check (inclusive). +{ + return (lowBound <= input) && (input <= highBound); +} + +/// Converts a byte value to the equivalent number of DWORDs (uint32) rounded up. I.e., 3 bytes will return 1 dword. +/// +/// @returns Number of dwords necessary to cover numBytes. +constexpr uint32 NumBytesToNumDwords( + uint32 numBytes) ///< Byte count to convert. +{ + return Pow2Align(numBytes, static_cast(sizeof(uint32))) / sizeof(uint32); +} + +/// Compare two strings ignoring case +inline int Strcasecmp( + const char* pSrc, ///< [in] The source string to be compared. + const char* pDst) ///< [in] The dest string to compare. +{ + PAL_ASSERT(pSrc != nullptr); + PAL_ASSERT(pDst != nullptr); + +#if defined(_WIN32) + return _stricmp(pDst, pSrc); +#else + return strcasecmp(pDst, pSrc); +#endif +} + +/// Performs a safe strcpy by requiring the destination buffer size. +inline void Strncpy( + char* pDst, ///< [out] Destination string. + const char* pSrc, ///< [in] Source string to be copied into destination. + size_t dstSize) ///< Size of the destination buffer in bytes. 
+{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + PAL_ALERT(strlen(pSrc) >= dstSize); + + if (dstSize > 0) + { +#if defined(_WIN32) + // Clamp the copy to the size of the dst buffer (1 char reserved for the null terminator). + strncpy_s(pDst, dstSize, pSrc, _TRUNCATE); +#else + strncpy(pDst, pSrc, (dstSize - 1)); + pDst[dstSize - 1] = '\0'; +#endif + } +} + +/// Simple wrapper for wcscpy_s or wcsncpy, which are available on Windows and Linux, respectively. +inline void Wcsncpy( + wchar_t* pDst, ///< [out] Destination string. + const wchar_t* pSrc, ///< [in] Source string to copy. + size_t dstSize) ///< Length of the destination buffer, in wchar_t's. +{ +#if defined(_WIN32) + wcscpy_s(pDst, dstSize, pSrc); +#else + wcsncpy(pDst, pSrc, (dstSize - 1)); + pDst[dstSize - 1] = L'\0'; +#endif +} + +// Wrapper for wcscat or wcscat_s which provides a safe version of wcscat +inline void Wcscat( + wchar_t* pDst, + const wchar_t* pSrc, + size_t dstSize) +{ +#if defined(_WIN32) + wcsncat_s(pDst, dstSize, pSrc, _TRUNCATE); +#else + const size_t dstLen = std::wcslen(pDst); + wcsncat(pDst, pSrc, (dstSize - dstLen - 1)); + pDst[dstSize - 1] = L'\0'; +#endif +} + +/// Simple wrapper for strncat or strncat_s which provides a safe version of strncat. +inline void Strncat( + char* pDst, ///< [in,out] Destination string. + size_t sizeDst, ///< Length of the destination string, including the null terminator. + const char* pSrc) ///< [in] Source string. +{ + PAL_ASSERT((pDst != nullptr) && (pSrc != nullptr)); + +#if defined(_WIN32) + // MS compilers provide strncat_s, which will truncate the copy to prevent buffer overruns and always guarantee that + // pDst is null-terminated. + strncat_s(pDst, sizeDst, pSrc, _TRUNCATE); +#else + // Compute the length of the destination string to prevent buffer overruns. 
+ const size_t dstLength = strlen(pDst); + strncat(pDst, pSrc, (sizeDst - dstLength - 1)); +#endif +} + +/// Simple wrapper for strtok_s or strtok_r which provides a safe version of strtok. +inline char* Strtok( + char* str, ///< [in] Token string. + const char* delim, ///< [in] Token delimit. + char** buf) ///< [in,out] Buffer to store the rest of the string. +{ + PAL_ASSERT((delim != nullptr) && (buf != nullptr)); + + char* pToken = nullptr; + +#if defined(_WIN32) + pToken = strtok_s(str, delim, buf); +#else + pToken = strtok_r(str, delim, buf); +#endif + + return pToken; +} + +/// Rounds the specified pointer up to the nearest value meeting the specified 'alignment'. Only power of 2 alignments +/// are supported by this function. +/// +/// @returns Aligned pointer. +inline void* VoidPtrAlign( + void* ptr, ///< Pointer to align. + size_t alignment) ///< Desired alignment. +{ + // This function only works for POW2 alignment + PAL_ASSERT(IsPowerOfTwo(alignment)); + + return reinterpret_cast( + (reinterpret_cast(ptr) + (alignment - 1)) & ~(alignment - 1)); +} + +/// Converts a raw string value to the correct data type. +inline void StringToValueType( + const char* pStrValue, ///< [in] Setting value in string form. + ValueType type, ///< Data type of the value being converted. + size_t valueSize, ///< Size of pValue buffer. + void* pValue) ///< [out] Converted setting value buffer. +{ + switch (type) + { + case ValueType::Boolean: + *(static_cast(pValue)) = ((atoi(pStrValue)) ? 
true : false); + break; + case ValueType::Int8: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint8: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int16: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint16: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int32: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint32: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int64: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint64: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Float: + *(static_cast(pValue)) = static_cast(atof(pStrValue)); + break; + case ValueType::Str: + Strncpy(static_cast(pValue), pStrValue, valueSize); + break; + } +} + +/// Converts a raw string value to the correct data type, returning 'true' if parsed correctly. +/// When not parsed correctly, the value will be unchanged. +/// +/// @note: A string that is truncated returns false. +/// @note: If the destination type is integer, the string is parsed as either int64 or uint64, and the parsed value is +/// clamped to fit the range of the destination type. +[[nodiscard]] inline bool StringToValueTypeChecked( + const char* pStrValue, ///< [in] Setting value in string form. + ValueType type, ///< Data type of the value being converted. + size_t valueSize, ///< Size of pValue buffer. + void* pValue) ///< [out] Converted setting value buffer. +{ + auto CheckTrailingCharacters = [](char* pChar, const char* pEnd) -> bool { + while ((pChar < pEnd) && isspace(*pChar)) + { + // ignore trailing whitespace. 
strtoX handles leading whitespace + pChar++; + } + return (pChar == pEnd); + }; + + const size_t len = strlen(pStrValue); + const char* pTerminator = pStrValue + len; + char* pEndptr = nullptr; + bool valid = false; + + switch (type) + { + case ValueType::Boolean: + { + bool value = (strtol(pStrValue, &pEndptr, 0) != 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Int8: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 value = Clamp(parsedValue, + int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint8: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int16: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 value = Clamp(parsedValue, + int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint16: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int32: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 value = Clamp(parsedValue, + 
int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint32: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int64: + { + const int64 value = strtoll(pStrValue, &pEndptr, 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Uint64: + { + const uint64 value = strtoull(pStrValue, &pEndptr, 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Float: + { + float value = static_cast(strtof(pStrValue, &pEndptr)); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Str: + if (len + 1 <= valueSize) + { + valid = true; + Strncpy(static_cast(pValue), pStrValue, valueSize); + } + break; + } + return valid; +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// +/// @returns 32-bit hash generated from the provided string. +template +constexpr uint32 HashString( + const Char* pStr, ///< [in] String to be hashed. + size_t strSize) ///< Size of the input string. 
+{ + PAL_CONSTEXPR_ASSERT((pStr != nullptr) && (strSize > 0)); + + constexpr uint32 FnvPrime = 16777619u; + constexpr uint32 FnvOffset = 2166136261u; + + uint32 hash = FnvOffset; + + for (size_t i = 0; i < strSize; i++) + { + Char c = pStr[i]; + for (uint32 j = 0; j < sizeof(Char); ++j) + { + hash ^= uint8(c); + hash *= FnvPrime; + c = (c >> 8); + } + } + + return hash; +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// +/// @returns 32-bit hash generated from the provided string. +template +constexpr uint32 HashString( + const Char* pString) +{ + return HashString(pString, StringLength(pString)); +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// Same as HashString() except consteval enforces that this can only be called at compile-time. +/// +/// @returns 32-bit hash generated from the provided string. +template +#if defined(__cpp_consteval) +consteval +#else +constexpr +#endif +uint32 CompileTimeHashString( + const Char* pString) +{ + return HashString(pString); +} + +/// Indicates that an object may be moved from. +/// Can be understood as preparation for possible move operation. +/// +/// @warning Do not read object after it has been moved from! +/// +/// @param [in] object Universal reference to an object that may be moved from. +/// +/// @returns Rvalue reference to the parameter object. +template +constexpr typename std::remove_reference::type&& Move(T&& object) +{ + // Cast universal reference to rvalue reference. + return static_cast::type&&>(object); +} + +/// Exchanges values between two variables. +/// +/// @param [in] left First variable used in swap operation. +/// @param [in] right Second variable used in swap operation. +template +constexpr void Swap(T& left, T& right) +{ + T tmp = Move(left); + left = Move(right); + right = Move(tmp); +} + +/// Convenient alias for C style arrays. 
+template +using Array = Element[Size]; + +/// Prevent swapping arrays because of the cost of this operation. +template +void Swap(Array& a, Array& b); + +/// Compacts an array by moving all empty slots to the end of the array. +/// +---+---+---+---+---+---+---+---+---+---+ +/// Input: | A | | C | D | | E | | A | X | J | +/// +---+---+---+---+---+---+---+---+---+---+ +/// +---+---+---+---+---+---+---+---+---+---+ +/// Output: | A | C | D | E | A | X | J | | | | +/// +---+---+---+---+---+---+---+---+---+---+ +template +void PackArray(Array& array, const Element& emptySlot) +{ + int lastOccupiedSlot = -1; + + for (size_t i = 0; i < Size; ++i) + { + if (array[i] != emptySlot) + { + Swap(array[i], array[lastOccupiedSlot + 1]); + ++lastOccupiedSlot; + } + } +} + +/// Performs a safe mbstowcs by requiring the destination buffer size. +inline void Mbstowcs( + wchar_t* pDst, ///< [out] dst string + const char* pSrc, ///< [in] src string + size_t dstSizeInWords) ///< size of the destination buffer in words +{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + + bool result = false; + // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) +#if defined(_WIN32) + size_t bytesConverted = 0; + errno_t retCode = mbstowcs_s(&bytesConverted, pDst, dstSizeInWords, pSrc, (dstSizeInWords - 1)); + + result = (retCode != 0) ? false : true; +#else + size_t retCode = mbstowcs(pDst, pSrc, dstSizeInWords); + + result = (retCode == static_cast(-1)) ? false : true; + + if (retCode == dstSizeInWords) + { + // Alert the user when the string has been truncated. + PAL_ALERT_ALWAYS(); + + // NULL terminate the string. + pDst[dstSizeInWords - 1] = '\0'; + } +#endif + + if (result == false) + { + // A non-convertible character was encountered or the string was truncated on the mbstowcs_s or + // ConvertCharStringToUtf16 code paths. 
+ PAL_ALERT_ALWAYS(); + pDst[0] = '\0'; + } +} + +/// Performs a safe wcstombs by requiring the destination buffer size. +inline void Wcstombs( + char* pDst, ///< [out] dst string + const wchar_t* pSrc, ///< [in] src string + size_t dstSizeInBytes) ///< size of the destination buffer in bytes +{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + + bool result = false; + // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) +#if defined(_WIN32) + size_t bytesConverted = 0; + errno_t retCode = wcstombs_s(&bytesConverted, pDst, dstSizeInBytes, pSrc, (dstSizeInBytes - 1)); + + result = (retCode != 0) ? false : true; +#else + size_t retCode = wcstombs(pDst, pSrc, (dstSizeInBytes - 1)); + + result = (retCode == static_cast(-1)) ? false : true; +#endif + + if (result == false) + { + // A non-convertible character was encountered. + PAL_ASSERT_ALWAYS(); + pDst[0] = '\0'; + } + + if (wcslen(pSrc) >= dstSizeInBytes) + { + // Assert to alert the user when the string has been truncated. + PAL_ASSERT_ALWAYS(); + + // NULL terminate the string. + pDst[dstSizeInBytes - 1] = '\0'; + } +} + +/// Computes the Greatest Common Divisor of two numbers +/// +/// @returns The GCD of the two inputs. 
+template <typename T1, typename T2>
+inline typename std::common_type<T1, T2>::type Gcd(
+    T1 value1,
+    T2 value2)
+{
+    static_assert((std::is_integral<T1>::value == true) &&
+                  (std::is_integral<T2>::value == true),
+                  "GCD requires integral types");
+
+    static_assert((std::is_unsigned<T1>::value == true) &&
+                  (std::is_unsigned<T2>::value == true),
+                  "GCD requires unsigned types");
+
+    static_assert((std::is_same<T1, bool>::value == false) &&
+                  (std::is_same<T2, bool>::value == false),
+                  "GCD requires nonboolean types");
+
+    using T = typename std::common_type<T1, T2>::type;
+
+    // Unsigned working type at least as wide as both inputs.  Narrow unsigned types promote to (signed) int under
+    // std::common_type, hence the make_unsigned.  Working in one type keeps the swap below from truncating when
+    // T1 and T2 have different widths.
+    using Wide = typename std::make_unsigned<T>::type;
+
+    T ret = 0u;
+
+    if (value1 == 0u)
+    {
+        // gcd(0, x) == x
+        ret = static_cast<T>(value2);
+    }
+    else if (value2 == 0u)
+    {
+        // gcd(x, 0) == x
+        ret = static_cast<T>(value1);
+    }
+    else
+    {
+        Wide a = static_cast<Wide>(value1);
+        Wide b = static_cast<Wide>(value2);
+
+        uint32 aTrailingZeros = 0u;
+        BitMaskScanForward(&aTrailingZeros, a);
+        uint32 bTrailingZeros = 0u;
+        BitMaskScanForward(&bTrailingZeros, b);
+
+        // The power of two shared by both inputs is factored out here and restored on the result.
+        const uint32 shift = Min(aTrailingZeros, bTrailingZeros);
+        a >>= aTrailingZeros;
+        b >>= shift;
+
+        do
+        {
+            // Strip the remaining factors of two from b; a is odd on every iteration.
+            BitMaskScanForward(&bTrailingZeros, b);
+            b >>= bTrailingZeros;
+
+            // Keep a <= b so the subtraction below cannot wrap.
+            if (a > b)
+            {
+                const Wide tmp = a;
+                a = b;
+                b = tmp;
+            }
+
+            b -= a;
+        }
+        while (b != 0);
+
+        ret = static_cast<T>(a << shift);
+    }
+
+    return ret;
+}
+
+/// Computes the Greatest Common Divisor of N numbers
+///
+/// @returns The GCD of the all inputs.
+template <typename T1, typename T2, typename... Ts>
+inline typename std::common_type<T1, typename std::common_type<T2, Ts...>::type>::type Gcd(
+    T1    value1,
+    T2    value2,
+    Ts... values)
+{
+    return Gcd(Gcd(value1, value2), values...);
+}
+
+/// Computes the Least Common Multiple of two numbers
+///
+/// @returns The LCM of the two inputs.
+template +constexpr typename std::common_type::type Lcm( + T1 value1, + T2 value2) +{ + static_assert((std::is_integral::value == true) && + (std::is_integral::value == true), + "LCM requires integral types"); + + static_assert((std::is_unsigned::value == true) && + (std::is_unsigned::value == true), + "LCM requires unsigned types"); + + static_assert((std::is_same::value == false) && + (std::is_same::value == false), + "LCM requires nonboolean types"); + + using T = typename std::common_type::type; + + return (value1 != 0u) && (value2 != 0u) ? static_cast((value1 / Gcd(value1, value2)) * value2) : 0u; +} + +/// Computes the Least Common Multiple of N numbers +/// +/// @returns The LCM of all the inputs. +template +constexpr typename std::common_type::type>::type Lcm( + T1 value1, + T2 value2, + Ts... values) +{ + return Lcm(Lcm(value1, value2), values...); +} + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 +/// Returns the length of a wchar_t based string. +/// +/// @returns The length of the given string in wide characters +inline size_t Wcslen( + const wchar_t* pWideStr) +{ + return wcslen(pWideStr); +} + +/// Performs a reverse string find of wide character wc. +/// +/// @returns The matching character at the end of the string or nullptr if not found. +inline wchar_t* Wcsrchr(wchar_t *pStr, wchar_t wc) +{ + return wcsrchr(pStr, wc); +} +#endif + +/// Compile-time function to report if two values from unrelated strong enums are equivalent. This is useful for +/// static asserts ensuring it is safe to cast an enum without a conversion lookup table. +template +inline constexpr bool EnumSameVal( + T1 lhs, + T2 rhs) +{ + return (static_cast(lhs) == static_cast(rhs)); +} + +/// Comparison function for Sort() below. +template int PAL_CDECL SortComparisonFunc( + const void* pLhs, + const void* pRhs) +{ + return int(*static_cast(pRhs) < *static_cast(pLhs)) - + int(*static_cast(pLhs) < *static_cast(pRhs)); +} + +/// In-place sort of an array. 
Uses C library qsort, so is probably a non-order-preserving quicksort. +/// Sorts the array given by the random iterator range [pStart,pEnd). +/// The element type (the type you get by dereferencing RandomIt) must have an operator<. +template void Sort( + RandomIt pStart, + RandomIt pEnd) +{ + using ElementTy = typename std::iterator_traits::value_type; + qsort(&pStart[0], pEnd - pStart, sizeof(ElementTy), SortComparisonFunc); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h b/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h new file mode 100644 index 0000000000..c441ecda3b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h @@ -0,0 +1,293 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palIntrusiveList.h + * @brief PAL utility collection IntrusiveList and IntrusiveListIterator class declarations. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" + +namespace Util +{ + +// Forward declarations. +template class IntrusiveList; +template class IntrusiveListIterator; + +/** + *********************************************************************************************************************** + * @brief Encapsulates one node of an intrusive double-linked-list. + * + * A node is associated with one data pointer at construction. The data pointer cannot be changed and must be non-null. + * + * Note that InList() allows intrusive list users to verify if a given value has been stored in a list without iterating + * over the list provided that each node object has been designated for a particular list. + *********************************************************************************************************************** + */ +template +class IntrusiveListNode +{ +public: + /// @param [in,out] pData Address of the data element which contains this intrusive node. + explicit IntrusiveListNode(T* pData); + + /// Returns true if this node is present in an intrusive list. 
+ bool InList() const; + +private: + // This special constructor is provided for IntrusiveList's sentinel node which must have a null data pointer. + IntrusiveListNode(); + + T*const m_pData; // The data object that contains this node. + IntrusiveListNode* m_pPrev; // Previous node in the list or null if this node is not in a list. + IntrusiveListNode* m_pNext; // Next node in the list or null if this node is not in a list. + + PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveListNode); + + // Although this is a transgression of coding standards, it prevents IntrusiveListNode from requiring public + // accessor functions. The added encapsulation this provides is worthwhile. + friend class IntrusiveList; + friend class IntrusiveListIterator; +}; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a List collection. + * + * Allows traversal of all elements in a List going either forwards or backwards. + *********************************************************************************************************************** + */ +template +class IntrusiveListIterator +{ +public: + ~IntrusiveListIterator() { } + + /// Returns true unless the iterator has advanced past the end of the list. + bool IsValid() const { return m_pCurrent != m_pSentinel; } + + /// Returns a pointer to the current element. Returns null if the iterator is invalid. + T* Get() const { return m_pCurrent->m_pData; } + + /// Advances the iterator to the previous position (move backward). + void Prev() { m_pCurrent = m_pCurrent->m_pPrev; } + + /// Advances the iterator to the next position (move forward). + void Next() { m_pCurrent = m_pCurrent->m_pNext; } + + /// Moves the iterator back to the start of the list. 
+ void Restart() { m_pCurrent = m_pSentinel->m_pNext; } + +private: + IntrusiveListIterator(const IntrusiveListNode*const pSentinel, IntrusiveListNode* pStart); + + const IntrusiveListNode*const m_pSentinel; // We need the sentinel to locate the list boundaries. + IntrusiveListNode* m_pCurrent; // Pointer to the current node. + + PAL_DISALLOW_DEFAULT_CTOR(IntrusiveListIterator); + + // Although this is a transgression of coding standards, it means that List does not need to have a public interface + // specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class IntrusiveList; +}; + +/** + *********************************************************************************************************************** + * @brief Templated, doubly-linked, intrusive, list container. + * + * This is meant for storing non-null pointers to elements of an arbitrary type using externally managed nodes. + * Operations which this class supports are: + * + * - Insertion at any point + * - Deletion at any point + * - Forwards and reverse iteration + * + * @warning This class is not thread-safe. + *********************************************************************************************************************** + */ +template +class IntrusiveList +{ +public: + /// A convenient shorthand for IntrusiveListNode. + typedef IntrusiveListNode Node; + + /// A convenient shorthand for IntrusiveListIterator. + typedef IntrusiveListIterator Iter; + + IntrusiveList(); + + /// Returns the number of elements in the list, not counting the sentinel. + size_t NumElements() const { return m_numElements; } + + /// Returns true if the list is empty. + bool IsEmpty() const { return m_sentinel.m_pNext == &m_sentinel; } + + /// Returns an iterator pointing to the first element in the list. If the list is empty, the iterator starts out + /// pointing at the permanent sentinel node. + /// + /// @returns An iterator pointing at the front end of the list. 
+ Iter Begin() const { return Iter(&m_sentinel, m_sentinel.m_pNext); } + + /// Returns an iterator pointing to the last element in the list. If the list is empty, the iterator starts out + /// pointing at the permanent sentinel node. + /// + /// @returns An iterator pointing at the back end of the list. + Iter End() const { return Iter(&m_sentinel, m_sentinel.m_pPrev); } + + /// Returns the data pointer at the front of the list. + /// + /// @returns The data pointer at the front of the list or null if the list is empty. + T* Front() const { return m_sentinel.m_pNext->m_pData; } + + /// Returns the data pointer at the back of the list. + /// + /// @returns The data pointer at the back of the list or null if the list is empty. + T* Back() const { return m_sentinel.m_pPrev->m_pData; } + + /// Pushes the specified node onto the front of the list. + /// + /// @param [in] pNode Externally-owned list node to link into the list. + void PushFront(Node* pNode) { InsertBefore(m_sentinel.m_pNext, pNode); } + + /// Pushes the specified node onto the back of the list. + /// + /// @param [in] pNode Externally-owned list node to link into the list. + void PushBack(Node* pNode) { InsertBefore(&m_sentinel, pNode); } + + /// Pushes the contents of pSource onto the front of this list. The ordering of pSource is preserved, meaning that + /// the front of pSource will be the new front of this list. Note that pSource will be left entirely empty. + /// + /// It is illegal to call this function with an empty pSource. + /// + /// @param [in] pSource The contents of pSource will be pushed in-order onto the front of this list. + void PushFrontList(IntrusiveList* pSource); + + /// Pushes the contents of pSource onto the back of this list. The ordering of pSource is preserved, meaning that + /// the end of pSource will be the new end of this list. Note that pSource will be left entirely empty. + /// + /// It is illegal to call this function with an empty pSource. 
+ /// + /// @param [in] pSource The contents of pSource will be pushed in-order onto the back of this list. + void PushBackList(IntrusiveList* pSource); + + /// Inserts the specified node before a particular node in a list. + /// + /// If the iterator has advanced off the end of the list (i.e., the iterator is invalid), the added node will be the + /// new tail node. + /// + /// @param [in] iter Identifies a node where the insertion should take place. The iterator will point to the same + /// spot in the list after insertion. + /// @param [in] pNode Externally-owned list node to link into the list. + void InsertBefore(const Iter& iter, Node* pNode); + + /// Removes the node at the specified position from the list. + /// + /// It is illegal to call this function with an iterator that has already advanced off the end of the list. + /// + /// @param [in,out] pIter Iterator identifying the node to be removed. After the node is removed, this iterator + /// will be advanced to the next node. If this call removes the final remaining node in the + /// list then the iterator will point at the sentinel and will be invalid. + void Erase(Iter* pIter); + + /// Removes the node at the specified position from the list. It is illegal to call this function with a Node that + /// not in this list. + /// + /// @param [in] pNode Node to be removed. + void Erase(Node* pNode); + + /// Removes all nodes from the list. + void EraseAll(); + + /// Truncates the list without touching the elements + void InvalidateList() + { + m_sentinel.m_pNext = &m_sentinel; + m_sentinel.m_pPrev = &m_sentinel; + m_numElements = 0; + } + +private: + void InsertBefore(Node* pBeforeMe, Node* pNode); + void Unlink(Node* pNode); + + Node m_sentinel; // Ties the head to the tail and signifies the boundary of the list. + size_t m_numElements; // Number of elements. 
+ + PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveList); +}; + +// ===================================================================================================================== +// This is the public node constructor; it must be given a non-null data pointer. +template +IntrusiveListNode::IntrusiveListNode( + T* pData) + : + m_pData(pData), + m_pPrev(nullptr), + m_pNext(nullptr) +{ + PAL_ASSERT(pData != nullptr); +} + +// ===================================================================================================================== +// This is the private node constructor which is used exclusively for sentinel nodes. +template +IntrusiveListNode::IntrusiveListNode() + : + m_pData(nullptr), + m_pPrev(nullptr), + m_pNext(nullptr) +{ +} + +// ===================================================================================================================== +// Returns true if this node is present in an intrusive list. +template +bool IntrusiveListNode::InList() const +{ + // The node pointers should always be null or non-null together. + PAL_DEBUG_BUILD_ONLY_ASSERT((m_pPrev == nullptr) == (m_pNext == nullptr)); + + return (m_pNext != nullptr); +} + +// ===================================================================================================================== +template +IntrusiveListIterator::IntrusiveListIterator( + const IntrusiveListNode*const pSentinel, + IntrusiveListNode* pStart) + : + m_pSentinel(pSentinel), + m_pCurrent(pStart) +{ +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h b/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h new file mode 100644 index 0000000000..052c4e0f66 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h @@ -0,0 +1,346 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file palLinearAllocator.h
+ * @brief PAL utility allocator LinearAllocator class.
+ ***********************************************************************************************************************
+ */
+
+#pragma once
+
+#include "palIntrusiveList.h"
+#include "palSysMemory.h"
+
+namespace Util
+{
+
+/**
+ ***********************************************************************************************************************
+ * @brief A linear allocator that allocates virtual memory.
+ *
+ * To improve performance, a linear allocator can be used in performance-critical areas to avoid unnecessary heap
+ * allocations.
The VirtualLinearAllocator will instead reserve a specified amount of virtual address space and will + * incrementally back it with real memory as necessary. + * + * As clients reach a steady state, allocations from this allocator will become "free," essentially just costing a + * pointer increment. + * + * This allocator can be used with any of the memory management macros. @see Allocators for more information about the + * Allocation pattern. + *********************************************************************************************************************** + */ +class VirtualLinearAllocator +{ +public: + /// Constructor. + /// + /// @param [in] size Maximum size, in bytes, of virtual memory that this allocator should reserve. + /// Does not need to be aligned to page size. + VirtualLinearAllocator(size_t size) : + m_pStart(nullptr), + m_pCurrent(nullptr), + m_size(size), + m_pageSize(0) {} + + /// Destructor. + virtual ~VirtualLinearAllocator() + { + if (m_pStart != nullptr) + { + // Free all of the pages. + Result result = VirtualRelease(m_pStart, m_size); + PAL_ASSERT(result == Result::_Success); + } + } + + /// Initializes the linear allocator by reserving the requested number of pages. + /// + /// @returns Result::Success if memory reservation and committing of the first page is successful. + Result Init() + { + m_pageSize = VirtualPageSize(); + m_size = Pow2Align(m_size, m_pageSize); + + Result result = VirtualReserve(m_size, &m_pStart); + + if (result == Result::_Success) + { + result = VirtualCommit(m_pStart, m_pageSize); + } + + if (result == Result::_Success) + { + m_pCurrent = m_pStart; + m_pCommittedToPage = VoidPtrInc(m_pCurrent, m_pageSize); + } + + return result; + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. 
+    void* Alloc(const AllocInfo& allocInfo)
+    {
+        void* pResult = nullptr;
+
+        void*const   pAlignedCurrent = VoidPtrAlign(m_pCurrent, allocInfo.alignment);
+        const size_t paddingBytes    = VoidPtrDiff(pAlignedCurrent, m_pCurrent);
+        const size_t remainingBytes  = Remaining();
+
+        // The request only fits if the alignment padding *and* the payload fit in the remaining reservation.
+        // Checking the payload alone would let an aligned allocation run past the end of the reserved range.
+        // The comparison is written as a subtraction so that it cannot overflow for very large requests.
+        if ((paddingBytes <= remainingBytes) && (allocInfo.bytes <= (remainingBytes - paddingBytes)))
+        {
+            void*const pNextCurrent = VoidPtrInc(pAlignedCurrent, allocInfo.bytes);
+            void*const pAlignedEnd  = VoidPtrAlign(pNextCurrent, m_pageSize);
+
+            Result result = Result::_Success;
+
+            if (pAlignedEnd > m_pCommittedToPage)
+            {
+                // Back the additional pages touched by this allocation with real memory.
+                const size_t commitBytes = VoidPtrDiff(pAlignedEnd, m_pCommittedToPage);
+
+                result = VirtualCommit(m_pCommittedToPage, commitBytes);
+
+                if (result == Result::_Success)
+                {
+                    m_pCommittedToPage = pAlignedEnd;
+                }
+            }
+
+            // Only advance the cursor once the memory is known to be committed; otherwise nullptr is returned
+            // and the allocator state is left untouched.
+            if (result == Result::_Success)
+            {
+                m_pCurrent = pNextCurrent;
+                pResult    = pAlignedCurrent;
+            }
+        }
+
+        return pResult;
+    }
+
+    /// Frees a block of memory.  This is a no-op: individual blocks are never returned, memory is only reclaimed by
+    /// Rewind() or by destroying the allocator.
+    ///
+    /// @param [in] freeInfo Contains information about the requested free.
+    void Free(const FreeInfo& freeInfo) {}
+
+    /// Rewinds the current pointer to the specified location to reuse already allocated memory.
+    ///
+    /// @param pStart   Where to reset the m_pCurrent to.
+    /// @param decommit If true, pages that are rewound are freed/decommitted.
+    void Rewind(void* pStart, bool decommit)
+    {
+        PAL_ASSERT((m_pStart <= pStart) && (pStart <= m_pCurrent));
+
+        if (pStart != m_pCurrent)
+        {
+            if (decommit)
+            {
+                // Decommit begins at the first page boundary strictly above pStart, so the page that pStart points
+                // into always stays committed.
+                void* pStartPage      = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize);
+                void* pCurrentPage    = VoidPtrAlign(m_pCurrent, m_pageSize);
+                const size_t numPages = VoidPtrDiff(pCurrentPage, pStartPage) / m_pageSize;
+
+                if (numPages > 0)
+                {
+                    Result result = VirtualDecommit(pStartPage, m_pageSize * numPages);
+                    PAL_ASSERT(result == Result::_Success);
+
+                    m_pCommittedToPage = pStartPage;
+                }
+            }
+#if DEBUG
+            else
+            {
+                // Debug builds poison the rewound whole pages so that stale reads are easy to spot.
+                void* pStartPage       = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize);
+                void* pCurrentPage     = VoidPtrAlign(m_pCurrent, m_pageSize);
+                const size_t numDwords = VoidPtrDiff(pCurrentPage, pStartPage) / sizeof(uint32);
+                uint32* pNewCurrent    = static_cast<uint32*>(pStartPage);
+
+                for (size_t dword = 0; dword < numDwords; dword++)
+                {
+                    pNewCurrent[dword] = 0xDEADBEEF;
+                }
+            }
+#endif
+
+            m_pCurrent = pStart;
+        }
+    }
+
+    /// Returns the current pointer to backing memory.
+    ///
+    /// @returns Current pointer to backing memory.
+    void* Current() const { return m_pCurrent; }
+
+    /// Returns the starting pointer to backing memory.
+    ///
+    /// @returns Pointer to the start of backing memory.
+    void* Start() const { return m_pStart; }
+
+    /// Returns the number of bytes that have been allocated.
+    ///
+    /// @returns Number of bytes allocated through this allocator.
+    size_t BytesAllocated() const { return VoidPtrDiff(m_pCurrent, m_pStart); }
+
+    /// Compute remaining unallocated space in the allocator; once this space is exhausted allocations will fail.
+    ///
+    /// @returns The size of the remaining unallocated space in bytes.
+    size_t Remaining() const { return m_size - VoidPtrDiff(m_pCurrent, m_pStart); }
+
+private:
+    void* m_pStart;           ///< Pointer to where the backing allocation starts.
+    void* m_pCurrent;         ///< Pointer to the current position of backing memory.
+ void* m_pCommittedToPage; ///< Pointer to the end of the last committed page. + + size_t m_size; ///< Size of the allocation. + size_t m_pageSize; ///< OS' defined page size. + + PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocator); + PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocator); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the LinearAllocator classes. + * + * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything allocated the object is allocated on the stack and when it goes out of scope + * will be properly "rewound" by the allocator. See the below example. + * + * + * { + * [Current pointer = 0x10] + * LinearAllocatorAuto allocator(pPtrToAllocator); + * Allocations occur ... + * [Current pointer = 0x80] + * } + * [Current pointer rewinds = 0x10] + *********************************************************************************************************************** + */ +template +class LinearAllocatorAuto +{ +public: + /// Tracks the current start pointer. + /// + /// @param pAllocator The allocator to wrap. + /// @param decommit Whether to decommit any pages of memory allocated when this goes out of scope. 
+ LinearAllocatorAuto(LinearAllocator* pAllocator, bool decommit) + : + m_pAllocator(pAllocator), +#if PAL_MEMTRACK + m_memTracker(pAllocator), +#endif + m_pStart(nullptr), + m_decommit(decommit) + { + PAL_ASSERT(pAllocator != nullptr); + m_pStart = m_pAllocator->Current(); + +#if PAL_MEMTRACK + Result result = m_memTracker.Init(); + PAL_ASSERT(result == Result::_Success); +#endif + } + + /// Rewinds any allocations made when this goes out of scope. + ~LinearAllocatorAuto() + { + m_pAllocator->Rewind(m_pStart, m_decommit); + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + void* pMemory = nullptr; +#if PAL_MEMTRACK + pMemory = m_memTracker.Alloc(allocInfo); +#else + pMemory = m_pAllocator->Alloc(allocInfo); +#endif + + return pMemory; + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { +#if PAL_MEMTRACK + m_memTracker.Free(freeInfo); +#else + m_pAllocator->Free(freeInfo); +#endif + } + +private: + LinearAllocator*const m_pAllocator; ///< The LinearAllocator which this object wraps. + +#if PAL_MEMTRACK + MemTracker m_memTracker; ///< Memory tracker for this LinearAllocatorAuto. +#endif + + void* m_pStart; ///< Where the LinearAllocator started when wrapped by this. + const bool m_decommit; ///< Whether to decommit any pages of memory allocated on destruction. + + PAL_DISALLOW_DEFAULT_CTOR(LinearAllocatorAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(LinearAllocatorAuto); +}; + +/** + *********************************************************************************************************************** + * @brief A simple extension of VirtualLinearAllocator that contains an IntrusiveListNode pointing at itself. 
+ *        This makes it very easy to create and manage IntrusiveLists of VirtualLinearAllocators.
+ ***********************************************************************************************************************
+ */
+class VirtualLinearAllocatorWithNode : public VirtualLinearAllocator
+{
+public:
+    /// Constructor.  Forwards the size to VirtualLinearAllocator and binds the embedded list node to this object.
+    ///
+    /// @param [in] size Maximum size, in bytes, of virtual memory that the base allocator should reserve.
+    VirtualLinearAllocatorWithNode(size_t size) : VirtualLinearAllocator(size), m_node(this) {}
+
+    /// Destructor.
+    virtual ~VirtualLinearAllocatorWithNode() {}
+
+    /// Gets this linear allocator's associated IntrusiveListNode.
+    ///
+    /// NOTE(review): the node type is spelled without a template argument here; presumably the argument was lost when
+    /// this patch text was extracted - confirm against the upstream header.
+    ///
+    /// @returns Pointer to this allocator's associated IntrusiveListNode.
+    IntrusiveListNode* GetNode() { return &m_node; }
+
+private:
+    IntrusiveListNode m_node; ///< List node constructed with a pointer to this allocator.
+
+    PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocatorWithNode);
+    PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocatorWithNode);
+};
+
+} // Util
diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMath.h b/shared/amdgpu-windows-interop/pal/inc/util/palMath.h
new file mode 100644
index 0000000000..0c8937bda6
--- /dev/null
+++ b/shared/amdgpu-windows-interop/pal/inc/util/palMath.h
@@ -0,0 +1,226 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+/**
+ ***********************************************************************************************************************
+ * @file  palMath.h
+ * @brief PAL utility collection function/constant declarations for the Math sub-namespace.
+ ***********************************************************************************************************************
+ */
+
+#pragma once
+
+#include "palSysMemory.h"
+
+#include <cstring>
+#include <limits>
+
+namespace Util
+{
+
+/// Util sub-namespace defining several useful math routines and constants.
+namespace Math
+{
+
+/// Exponent mask of a single-precision IEEE float.
+constexpr uint32 FloatExponentMask = 0x7F800000;
+/// Exponent bias of a single-precision IEEE float.
+constexpr uint32 FloatExponentBias = 127;
+/// Number of bits in the mantissa of a single-precision IEEE float.
+constexpr uint32 FloatNumMantissaBits = 23;
+/// Mantissa mask of a single-precision IEEE float.
+constexpr uint32 FloatMantissaMask = 0x007FFFFF;
+/// Sign bit mask of a single precision IEEE float.
+constexpr uint32 FloatSignBitMask = 0x80000000;
+/// Mask of all non-sign bits of a single-precision IEEE float.
+constexpr uint32 FloatMaskOutSignBit = 0x7FFFFFFF;
+/// Minimum number of float bits in a normalized IEE float.
+constexpr uint32 MinNormalizedFloatBits = 0x00800000;
+
+/// Positive one.
+constexpr float FloatOne = 1.0f;
+/// Negative one.
+constexpr float FloatNegOne = -1.0f;
+/// Zero.
+constexpr float FloatZero = 0.0f;
+/// Positive infinity.
+constexpr float FloatInfinity = std::numeric_limits<float>::infinity();
+
+/// Fraction structure.
+struct Fraction
+{
+    uint32 num; ///< Numerator
+    uint32 den; ///< Denominator
+};
+
+/// Returns the bits of a floating point value as an unsigned integer.
+///
+/// The bytes are copied with memcpy instead of being read through a casted pointer: reading a float object through a
+/// uint32 lvalue violates the strict-aliasing rule and is undefined behavior.
+///
+/// @param [in] f Floating point value whose bit pattern is wanted.
+///
+/// @returns The 32-bit pattern of f.
+inline uint32 FloatToBits(float f)
+{
+    static_assert(sizeof(uint32) == sizeof(float), "float is expected to be 32 bits wide.");
+
+    uint32 bits = 0;
+    std::memcpy(&bits, &f, sizeof(bits));
+
+    return bits;
+}
+
+/// Assigns the bits contained in an unsigned integer to the float pointer location
+///
+/// @param [out] f Float that receives the bit pattern.  Must not be null.
+/// @param [in]  u Bit pattern to store.
+inline void SetBitsToFloat(float* f, uint32 u)
+{
+    static_assert(sizeof(uint32) == sizeof(float), "float is expected to be 32 bits wide.");
+
+    // memcpy rather than a write through a casted pointer, for the same aliasing reason as FloatToBits().
+    std::memcpy(f, &u, sizeof(u));
+}
+
+/// Returns true if the specified float is denormalized.
+extern bool IsDenorm(float f);
+/// Returns true if the specified float is +/- infinity.
+extern bool IsInf(float f);
+/// Returns true if the specified float is a NaN.
+extern bool IsNaN(float f);
+
+/// Determines if a floating-point number is either +/-Infinity or NaN.
+inline bool IsInfOrNaN(float f)
+{
+    return (IsInf(f) || IsNaN(f));
+}
+
+/// @brief Converts a floating point number to a signed fixed point number with the given integer and fractional bits.
+///
+/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If the
+/// intBits is zero, the fracBits is assumed to include 1 sign bit, otherwise the sign bit is assumed to be part of the
+/// intBits. A typical use for enableRounding would be when converting SNORM/UNORM values to fixed point.
+///
+/// @param [in] f              Floating point value to convert.
+/// @param [in] intBits        Number of integer bits (including the sign bit) in the fixed point output.
+/// @param [in] fracBits       Number of fractional bits in the fixed point output.
+/// @param [in] enableRounding Round before conversion.
+///
+/// @returns Fixed point number in a uint32.
+extern uint32 FloatToSFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false);
+
+/// @brief Converts a floating point number to an unsigned fixed point number with the given integer and
+///        fractional bits.
+///
+/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. A typical use
+/// for enableRounding would be when converting SNORM/UNORM values to fixed point.
+///
+/// NOTE(review): the [-1.0, 1.0] range and the "including the sign bit" wording below look copied from the signed
+/// variant; UFixedToFloat documents [0, 1.0] for the unsigned case - confirm against the implementation.
+///
+/// @param [in] f              Floating point value to convert.
+/// @param [in] intBits        Number of integer bits (including the sign bit) in the fixed point output.
+/// @param [in] fracBits       Number of fractional bits in the fixed point output.
+/// @param [in] enableRounding Round before conversion.
+///
+/// @returns Fixed point number in a uint32.
+extern uint32 FloatToUFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false);
+
+/// @brief Converts a signed fixed point number with the given integer and fractional bits to a floating point number.
+///
+/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If intBits
+/// is 0, fracBits is assumed to have 1 bit for the sign, otherwise the sign bit is assumed to be part of the integer
+/// bits.
+///
+/// @param [in] fixedPtNum Fixed point number to convert.
+/// @param [in] intBits    Number of integer bits (including the sign bit).
+/// @param [in] fracBits   Number of fractional bits.
+///
+/// @returns Converted floating point number.
+extern float SFixedToFloat(int32 fixedPtNum, uint32 intBits, uint32 fracBits);
+
+/// @brief Converts an unsigned fixed point number with the given integer and fractional bits to a floating point
+///        number.
+///
+/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [0, 1.0].
+///
+/// @param [in] fixedPtNum Fixed point number to convert.
+/// @param [in] intBits    Number of integer bits (including the sign bit).
+/// @param [in] fracBits   Number of fractional bits.
+///
+/// @returns Converted floating point number.
+extern float UFixedToFloat(uint32 fixedPtNum, uint32 intBits, uint32 fracBits);
+
+/// Converts a 32-bit IEEE floating point number to a 16-bit signed floating point number.
+extern uint32 Float32ToFloat16(float f);
+
+/// Converts a 32-bit IEEE floating point number to an 11-bit signed floating point number.
+extern uint32 Float32ToFloat11(float f);
+
+/// Converts a 32-bit IEEE floating point number to a 10-bit signed floating point number.
+extern uint32 Float32ToFloat10(float f);
+
+/// Converts a 32-bit IEEE floating-point number to a 10-bit unsigned floating-point number.
+extern uint32 Float32ToFloat10_6e4(float f);
+
+/// Converts a 10-bit unsigned floating point number (the inverse of Float32ToFloat10_6e4) to a 32-bit IEEE floating
+/// point number.
+extern float Float10_6e4ToFloat32(uint32 fBits);
+
+/// Converts a 32-bit IEEE floating point number to a N-bit signed floating point number.
+extern uint32 Float32ToNumBits(float float32, uint32 numBits);
+
+/// Converts a 16-bit signed floating point number to a 32-bit IEEE floating point number.
+extern float Float16ToFloat32(uint32 fBits);
+
+/// Converts an 11-bit signed floating point number to a 32-bit IEEE floating point number.
+extern float Float11ToFloat32(uint32 fBits);
+
+/// Converts a 10-bit signed floating point number to a 32-bit IEEE floating point number.
+extern float Float10ToFloat32(uint32 fBits);
+
+/// Converts an N-bit signed floating point number to a 32-bit IEEE floating point number.
+extern float FloatNumBitsToFloat32(uint32 input, uint32 numBits);
+
+/// Converts a 32-bit IEEE floating point number to a fraction.
+extern Fraction Float32ToFraction(float float32);
+
+/// Returns the square root of the specified value.
+extern float Sqrt(float f);
+
+/// Returns the result of an exponent operation (base^exponent).
+extern float Pow(float base, float exponent);
+
+/// Returns the unsigned integer absolute value.
+extern uint32 Absu(int32 number); + +/// Return sign-preserved zero if input is denorm, otherwise input value +extern float FlushDenormToZero(float input); + +/// Return value in 1.7 signed magnitude format. Valid input range is (-127, 127) +extern uint8 IntToSignedMagnitude(int8 input); + +/// @brief Performs unsigned fixed-point rounding operation. +/// +/// @param [in] value Fixed point number to convert in Qm.f format. +/// @param [in] n Number of fractional bits. +/// +/// @returns rounded fixed point number in Q0 format (unsigned integer). +constexpr uint32 UFixedRoundToUint32(uint32 value, uint8 n) +{ + PAL_CONSTEXPR_ASSERT((0 < n) && (n < 31)); + return ((value + (((1 << n) >> 1))) >> n); +} + +/// @brief Performs signed fixed-point rounding operation. +/// +/// @param [in] value Fixed point number to convert in Qm.f format. +/// @param [in] n Number of fractional bits. +/// +/// @returns rounded fixed point number in Q0 format (signed integer). +constexpr int32 SFixedRoundToInt32(int32 value, uint8 n) +{ + PAL_CONSTEXPR_ASSERT((0 < n) && (n < 30)); + return ((value + (((1 << n) >> 1))) >> n); +} + +} // Math +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h b/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h new file mode 100644 index 0000000000..db4df424ba --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h @@ -0,0 +1,158 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMemTracker.h + * @brief PAL utility collection MemTracker class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#if PAL_MEMTRACK + +#include "palIntrusiveList.h" +#include "palMutex.h" + +namespace Util +{ + +// Forward declarations +struct AllocInfo; +struct FreeInfo; +struct MemTrackerElem; +enum SystemAllocType : uint32; + +/// @internal +/// +/// An alloc-less list used by the MemTracker to keep track of all allocations. 
+typedef IntrusiveList MemTrackerList; + +/// @internal +/// +/// Specifies whether a particular memory block was allocated with PAL_MALLOC/PAL_CALLOC, PAL_NEW, or PAL_NEW_ARRAY. +/// Used to verify correct matching with PAL_FREE, PAL_DELETE, and PAL_DELETE_ARRAY. +enum class MemBlkType : uint32 +{ + Malloc = 0, + New, + NewArray, +}; + +/// @internal +/// +/// Internal structure used by MemTracker to store information on each allocation. +struct MemTrackerElem +{ + size_t size; ///< Size of allocation request. + MemBlkType blockType; ///< Memory block type (malloc, new, new array). + const char* pFilename; ///< File that requested allocation. + uint32 lineNumber; ///< Line number that requested allocation. + void* pClientMem; ///< Starting "client usable" data address. + void* pOrigMem; ///< Original address of the allocation returned from our underlying allocator. + size_t allocNum; ///< The number of the memory allocation. 1 based. + MemTrackerList* pList; ///< The list this struct is in. It helps check which MemTracker owns this struct. +}; + +/** + *********************************************************************************************************************** + * @brief Class responsible for tracking allocations and frees to notify the developer of memory leaks. + * + * Tracking is enabled/disabled via the PAL_MEMTRACK define. + *********************************************************************************************************************** + */ +template +class MemTracker +{ +public: + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + MemTracker(Allocator*const pAllocator); + ~MemTracker(); + + /// Performs any non-safe initialization that cannot be done in the constructor. + /// + /// @returns Result::Success if initialization is successful, otherwise an appropriate error. + Result Init(); + + /// Allocates a block of memory and track it using the memory tracker. 
+ /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc( + const AllocInfo& allocInfo); + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free( + const FreeInfo& freeInfo); + +private: + void* AddMemElement( + void* pMem, + size_t bytes, + size_t align, + MemBlkType blockType, + const char* pFilename, + uint32 lineNumber); + + void* RemoveMemElement(void* pMem, MemBlkType blockType); + + void MemoryReport(); + void FreeLeakedMemory(); + + // Sentinel patterns used to detect memory underrun. + static constexpr uint32 UnderrunSentinel = 0xDEADBEEF; + // Sentinel patterns used to detect memory overrun. + static constexpr uint32 OverrunSentinel = 0xCAFEBABE; + + // Size of markers for underruns/overruns. Setting this to 0 disables this feature. + static constexpr size_t MarkerSizeUints = PAL_CACHE_LINE_BYTES / sizeof(uint32); + + // Size of underrun/overrun markers in bytes. + static constexpr size_t MarkerSizeBytes = MarkerSizeUints * sizeof(uint32); + + MemTrackerList m_trackerList; // The list of active allocations. + Mutex m_mutex; // Serializes access to list of active allocations. + + const size_t m_markerSizeUints; // Member variable copy of MarkerSizeUints. Only used to prevent compiler + // warnings when MarkerSizeUints is 0. + const size_t m_markerSizeBytes; // Member variable copy of MarkerSizeBytes. Only used to prevent compiler + // warnings when MarkerSizeBytes is 0. + + Allocator*const m_pAllocator; // Allocator for performing the actual allocations. + + size_t m_nextAllocNum; // The allocation number that the next allocated block will receive. + const size_t m_breakOnAllocNum; // The allocation number to trigger a debug break on. 
+ + PAL_DISALLOW_COPY_AND_ASSIGN(MemTracker); +}; + +} // Util + +#endif diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h new file mode 100644 index 0000000000..cb10bfd629 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h @@ -0,0 +1,331 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMemTrackerImpl.h + * @brief PAL utility collection MemTracker class implementations. + *********************************************************************************************************************** + */ + +#pragma once + +#if PAL_MEMTRACK + +#include "palIntrusiveListImpl.h" +#include "palMemTracker.h" +#include "palSysMemory.h" + +#include + +namespace Util +{ + +/// Table to convert a blockType to a string. Used by the logging routines. +constexpr const char* MemBlkTypeStr[] = +{ + "Malloc", ///< MemBlkType::Malloc + "New", ///< MemBlkType::New + "NewArray", ///< MemBlkType::NewArray +}; + +// ===================================================================================================================== +template +MemTracker::MemTracker( + Allocator*const pAllocator) + : + m_markerSizeUints(MarkerSizeUints), + m_markerSizeBytes(MarkerSizeBytes), + m_pAllocator(pAllocator), + m_nextAllocNum(1), + m_breakOnAllocNum(0) +{ +} + +// ===================================================================================================================== +template +MemTracker::~MemTracker() +{ + // Clean-up leaked memory if needed + if (m_trackerList.IsEmpty() == false) + { + // If the list isn't empty, we have a leak. The leak could either be caused by an internal PAL leak, + // a client leak, or even the application not destroying API objects. + PAL_ALERT_ALWAYS(); + + // Dump out a list of unfreed blocks. 
+ MemoryReport(); + + FreeLeakedMemory(); + } +} + +// ===================================================================================================================== +template +Result MemTracker::Init() +{ + return Result::Success; +} + +// ===================================================================================================================== +// Adds the newly allocated memory block to the list of blocks for tracking. +// +// The tracking information includes things like filename, line numbers, and type of block. Also, given a pointer, +// adds the Underrun/Overrun markers to the memory allocated, and return a pointer to the actual client usable memory. +// +// See MemTracker::Alloc() which is used to allocate memory that is being tracked. +template +void* MemTracker::AddMemElement( + void* pMem, // [in,out] Original pointer allocated by MemTracker::Alloc. + size_t bytes, // Client requested allocation size in bytes. + size_t align, // The max of the client-requested alignment or the internal alignment, in bytes. + MemBlkType blockType, // Block type based on calling allocation routine. + const char* pFilename, // Client filename that is requesting the memory. + uint32 lineNumber) // Line number in client file that is requesting the memory. +{ + // Our internal data is all relative to the client pointer so find that first. See Alloc for more details. 
+ // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + constexpr size_t InternalSize = sizeof(MemTrackerList::Node) + sizeof(MemTrackerElem); + + void*const pClientMem = VoidPtrAlign(VoidPtrInc(pMem, m_markerSizeBytes + InternalSize), align); + uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); + uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(bytes, sizeof(uint32)))); + + auto*const pNewElement = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); + void*const pNewNodeMem = VoidPtrDec(pNewElement, sizeof(MemTrackerList::Node)); + auto*const pNewNode = PAL_PLACEMENT_NEW(pNewNodeMem) MemTrackerList::Node(pNewElement); + + // Mark the memory with the underrun/overrun marker. + for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) + { + *pUnderrun++ = UnderrunSentinel; + *pOverrun++ = OverrunSentinel; + } + + pNewElement->size = bytes; + pNewElement->pFilename = pFilename; + pNewElement->lineNumber = lineNumber; + pNewElement->blockType = blockType; + pNewElement->pClientMem = pClientMem; + pNewElement->pOrigMem = pMem; + pNewElement->pList = &m_trackerList; + + MutexAuto lock(&m_mutex); + + // Trigger an assert if we're about to allocate the break-on-allocation number. + if (m_nextAllocNum == m_breakOnAllocNum) + { + PAL_ASSERT_ALWAYS(); + } + + pNewElement->allocNum = m_nextAllocNum; + ++m_nextAllocNum; + + m_trackerList.PushFront(pNewNode); + + return pClientMem; +} + +// ===================================================================================================================== +// Removes an allocated block from the list of blocks used for tracking. +// +// The routine checks for invalid frees (and duplicate frees). Also, the routine is able to detect mismatched alloc/free +// usage based on the blockType. 
The routine is called with the pointer to the client usable memory and returns the +// pointer to the allocated memory. +// +// See MemTracker::Free() which is used to free memory that is being tracked. +template +void* MemTracker::RemoveMemElement( + void* pClientMem, // Pointer to client usable memory. + MemBlkType blockType) // Block type based on calling deallocation routine. +{ + void* pOrigPtr = nullptr; + + // Recall that this is our internal memory layout. See Alloc for more details. + // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); + auto*const pCurrent = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); + auto*const pCurrentNode = static_cast(VoidPtrDec(pCurrent, sizeof(MemTrackerList::Node))); + uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(pCurrent->size, sizeof(uint32)))); + + // We should not be trying to free something twice or trying to free something which has not been allocated + // by this MemTracker. We can verify both of these things by checking that the tracker's pList is equal to the + // MemTracker's list. + if (pCurrent->pList != &m_trackerList) + { + // A free was attempted on an unrecognized pointer. + PAL_DPERROR("Invalid Free Attempted with ptr = : (%#x)", pClientMem); + } + else if (pCurrent->blockType != blockType) + { + // We have a mismatch in the alloc/free pair, e.g. PAL_NEW with PAL_FREE etc. return early here without freeing + // the memory so it shows up as a leak. + PAL_DPERROR("Trying to Free %s as %s.", + MemBlkTypeStr[static_cast(pCurrent->blockType)], + MemBlkTypeStr[static_cast(blockType)]); + } + else + { + // We should check for memory corruption due to overflow or underflow before continuing because any underflow + // might indicate that our internal state is corrupted. This could lead to a crash in the code below. 
+ for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) + { + PAL_ASSERT(*pUnderrun++ == UnderrunSentinel); + PAL_ASSERT(*pOverrun++ == OverrunSentinel); + } + + // Remove our tracker from the list and set its pList to null to detect a double-free in the future. + MutexAuto lock(&m_mutex); + + m_trackerList.Erase(pCurrentNode); + + pCurrent->pList = nullptr; + pOrigPtr = pCurrent->pOrigMem; + } + + // Return a pointer to the actual allocated block. + return pOrigPtr; +} + +// ===================================================================================================================== +// Allocates a block of memory and tracks it using the memory tracker. +template +void* MemTracker::Alloc( + const AllocInfo& allocInfo) +{ + // Allocating zero bytes of memory results in undefined behavior. + PAL_ASSERT(allocInfo.bytes > 0); + + void* pMem = nullptr; + + // We want to allocate extra memory from the caller's allocator, in this layout: + // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + // Here's why we need each of those sections: + // 1. align1 is zero or more bytes needed to align the client allocation and our internal data. + // 2. The MemTrackerList::Node object, which is used to link this allocation into m_trackerList. + // 3. The MemTrackerElem struct contains bookkeeping data we need to report memory errors. + // 4. The underflow and overflow trackers detect out of bounds writes. They are optional. + // 5. The client allocation, which is actually returned to the caller. + // 6. align2 is zero or more bytes needed to DWORD-align the overflow tracker.
+ constexpr size_t InternalAlignment = Max(alignof(MemTrackerList::Node), alignof(MemTrackerElem)); + const size_t paddedAlignBytes = Max(allocInfo.alignment, InternalAlignment); + const size_t paddedSizeBytes = (paddedAlignBytes + // 1 + sizeof(MemTrackerList::Node) + // 2 + sizeof(MemTrackerElem) + // 3 + m_markerSizeBytes + // 4.a + Pow2Align(allocInfo.bytes, sizeof(uint32)) + // 5 & 6 + m_markerSizeBytes); // 4.b + + const AllocInfo memTrackerInfo(paddedSizeBytes, paddedAlignBytes, allocInfo.zeroMem, allocInfo.allocType, + allocInfo.blockType, allocInfo.pFilename, allocInfo.lineNumber); + + pMem = m_pAllocator->Alloc(memTrackerInfo); + + if (pMem != nullptr) + { + // Don't bother adding a failed allocation to the Memtrack list. + pMem = AddMemElement(pMem, + allocInfo.bytes, + paddedAlignBytes, + allocInfo.blockType, + allocInfo.pFilename, + allocInfo.lineNumber); + } + + return pMem; +} + +// ===================================================================================================================== +// Frees a block of memory. The routine is called with the pointer to the client usable memory. +// +// See MemTracker::RemoveMemElement() which is used to validate the free. +template +void MemTracker::Free( + const FreeInfo& freeInfo) +{ + // Don't want to call RemoveMemElement if the ptr is null. + if (freeInfo.pClientMem != nullptr) + { + void* pMem = RemoveMemElement(freeInfo.pClientMem, freeInfo.blockType); + + // If this free call is valid (RemoveMemElement doesn't return nullptr), release the memory. + if (pMem != nullptr) + { + m_pAllocator->Free(FreeInfo(pMem, freeInfo.blockType)); + } + } +} + +// ===================================================================================================================== +// Frees all memory that has not been explicitly freed (in other words, memory that has leaked). This function is only +// expected to be called when the memory tracker is being destroyed. 
+template +void MemTracker::FreeLeakedMemory() +{ + for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); ) + { + MemTrackerElem*const pCurrent = iter.Get(); + + // Free will release the memory for tracking and the actual element. This will invalidate our list iterator + // unless we advance the iterator first. + iter.Next(); + + Free(FreeInfo(pCurrent->pClientMem, pCurrent->blockType)); + } +} + +// ===================================================================================================================== +// Outputs information about leaked memory by traversing the memory tracker list. +template +void MemTracker::MemoryReport() +{ + // When this env var is set to non-zero, don't report leaks. + // Useful for crashing apps that don't give us a chance to clean up. + const char* pToggle = getenv("AMDPAL_NO_LEAK_REPORT"); + + if ((pToggle == nullptr) || (atoi(pToggle) == 0)) + { + PAL_DPWARN("================ List of Leaked Blocks ================"); + + for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); iter.Next()) + { + MemTrackerElem*const pCurrent = iter.Get(); + + PAL_DPWARN( + "ClientMem = 0x%p, AllocSize = %8d, MemBlkType = %s, File = %-15s, LineNumber = %8d, AllocNum = %8d", + pCurrent->pClientMem, + pCurrent->size, + MemBlkTypeStr[static_cast(pCurrent->blockType)], + pCurrent->pFilename, + pCurrent->lineNumber, + pCurrent->allocNum); + } + + PAL_DPWARN("================ End of List ==========================="); + } +} + +} // Util + +#endif diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h b/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h new file mode 100644 index 0000000000..147d86c234 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h @@ -0,0 +1,398 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMutex.h + * @brief PAL utility collection Mutex and MutexAuto class declarations. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" + +#if defined(_WIN32) +// WIN32_NO_STATUS prevents winnt.h from re-defining NTSTATUS macros that cause build warnings if +// ntstatus.h is also included. 
+#define WIN32_NO_STATUS +#include +#undef WIN32_NO_STATUS +#else +#include +#include + +#endif + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Platform-agnostic mutex primitive. + *********************************************************************************************************************** + */ +class Mutex +{ +public: +#if defined(_WIN32) + /// Defines MutexData as a Windows CRITICAL_SECTION + typedef CRITICAL_SECTION MutexData; + Mutex() noexcept : m_osMutex {} { InitializeCriticalSection(&m_osMutex); } + ~Mutex() { DeleteCriticalSection(&m_osMutex); }; +#else + /// Defines MutexData as a unix pthread_mutex_t + typedef pthread_mutex_t MutexData; + Mutex() noexcept : m_osMutex {} { pthread_mutex_init(&m_osMutex, nullptr); } + ~Mutex() { pthread_mutex_destroy(&m_osMutex); }; +#endif + + /// Enters the critical section if it is not contended. If it is contended, wait for the critical section to become + /// available, then enter it. + void Lock(); + + /// Enters the critical section if it is not contended. Does not wait for the critical section to become available + /// if it is contended. + /// + /// @returns True if the critical section was entered, false otherwise. + bool TryLock(); + + /// Leaves the critical section. + void Unlock(); + + /// Returns the OS specific mutex data. + MutexData* GetMutexData() { return &m_osMutex; } + +private: + MutexData m_osMutex; ///< Opaque structure to the OS-specific Mutex data + + PAL_DISALLOW_COPY_AND_ASSIGN(Mutex); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the Mutex class. 
+ * + * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope + * will be protected from access by multiple threads. See the below example. + * + * [Code not protected] + * { + * [Code not protected] + * MutexAuto lock(pPtrToMutex); + * [Code is protected] + * } + * [Code not protected] + *********************************************************************************************************************** + */ +class MutexAuto +{ +public: + /// Locks the given Mutex. + explicit MutexAuto(Mutex* pMutex) : m_pMutex(pMutex) + { + PAL_ASSERT(m_pMutex != nullptr); + m_pMutex->Lock(); + } + + /// Unlocks the Mutex we locked in the constructor. + ~MutexAuto() + { + m_pMutex->Unlock(); + } + +private: + Mutex* const m_pMutex; ///< The Mutex which this object wraps. + + PAL_DISALLOW_DEFAULT_CTOR(MutexAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(MutexAuto); +}; + +/** + *********************************************************************************************************************** + * @brief Platform-agnostic rw lock primitive. 
+ *********************************************************************************************************************** + */ +class RWLock +{ +public: +#if defined(_WIN32) + /// Defines RWLockData as a Windows SRWLOCK + typedef SRWLOCK RWLockData; + RWLock() noexcept : m_osRWLock {} { InitializeSRWLock(&m_osRWLock); } + ~RWLock() noexcept { /* No Win32 destroy function */ }; +#else + /// Defines RWLockData as a unix pthread_rwlock_t + typedef pthread_rwlock_t RWLockData; + /// @note pthread_rwlock_init will not fail as called + RWLock() noexcept : m_osRWLock {} { pthread_rwlock_init(&m_osRWLock, nullptr); } + ~RWLock() noexcept { pthread_rwlock_destroy(&m_osRWLock); }; +#endif + + /// Enumerates the lock type of RWLockAuto + enum LockType + { + ReadOnly = 0, ///< Lock in readonly mode, in other words shared mode. + ReadWrite ///< Lock in readwrite mode, in other words exclusive mode. + }; + + /// Acquires a rw lock in shared mode if it is not contended in exclusive mode. + /// If it is contended, waits for the rw lock to become available, then enters it. + void LockForRead(); + + /// Acquires a rw lock in exclusive mode if it is not contended. + /// If it is contended, waits for the rw lock to become available, then enters it. + void LockForWrite(); + + /// Tries to acquire a rw lock in shared mode if it is not contended in exclusive mode. + /// Does not wait for the rw lock to become available. + /// @returns True if the rw lock was acquired, false otherwise. + bool TryLockForRead(); + + /// Tries to acquire a rw lock in exclusive mode if it is not contended. + /// Does not wait for the rw lock to become available. + /// @returns True if the rw lock was acquired, false otherwise. + bool TryLockForWrite(); + + /// Releases the rw lock which was previously acquired in shared mode. + void UnlockForRead(); + + /// Releases the rw lock which was previously acquired in exclusive mode. + void UnlockForWrite(); + + /// Returns the OS specific RWLOCK data.
+ RWLockData* GetRWLockData() { return &m_osRWLock; } + +private: + RWLockData m_osRWLock; ///< Opaque structure to the OS-specific RWLock data + + PAL_DISALLOW_COPY_AND_ASSIGN(RWLock); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the RWLock class. + * + * The RAII paradigm allows the rw lock to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope + * will be protected from access by multiple threads. See the below example. + * + * [Code not protected] + * { + * [Code not protected] + * RWLockAuto<type> lock(pPtrToRWLock); + * [Code is protected] + * } + * [Code not protected] + *********************************************************************************************************************** + */ +template +class RWLockAuto +{ +public: + /// Locks the given RWLock, in shared mode when type is RWLock::ReadOnly and in exclusive mode otherwise. + explicit RWLockAuto(RWLock* pRWLock) : m_pRWLock(pRWLock) + { + PAL_ASSERT(m_pRWLock != nullptr); + if (type == RWLock::ReadOnly) + { + m_pRWLock->LockForRead(); + } + else + { + m_pRWLock->LockForWrite(); + } + } + + /// Unlocks the RWLock we locked in the constructor. + ~RWLockAuto() + { + if (type == RWLock::ReadOnly) + { + m_pRWLock->UnlockForRead(); + } + else + { + m_pRWLock->UnlockForWrite(); + } + } + +private: + RWLock* const m_pRWLock; ///< The RWLock which this object wraps. + + PAL_DISALLOW_DEFAULT_CTOR(RWLockAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(RWLockAuto); +}; + +/// Yields the current thread to another thread in the ready state (if available).
+extern void YieldThread(); + +/// Atomic write of 64-bit unsigned integer, using a relaxed memory ordering policy. +/// If you need to synchronize more than just pTarget, you may need a new function. +/// This function does not return a value. +/// +/// @param [out] pTarget Pointer to the value to be written. +/// @param [in] newValue Value to store in *pTarget. +extern void AtomicWriteRelaxed64(volatile uint64* pTarget, uint64 newValue); + +/// Atomic read of 64-bit unsigned integer, using a relaxed memory ordering policy. +/// If you need to synchronize more than just pTarget, you may need a new function. +/// +/// @param [in] pTarget Pointer to the value to be read. +/// +/// @returns The value read from *pTarget. +extern uint64 AtomicReadRelaxed64(const volatile uint64* pTarget); + +/// Atomically increments the specified 32-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be incremented. +/// +/// @returns Result of the increment operation. +extern uint32 AtomicIncrement(volatile uint32* pValue); + +/// Atomically increments the specified 64-bit unsigned integer. +/// +/// @param [in,out] pAddend Pointer to the value to be incremented. +/// +/// @returns Result of the increment operation. +extern uint64 AtomicIncrement64(volatile uint64* pAddend); + +/// Atomically decrements the specified 32-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be decremented. +/// +/// @returns Result of the decrement operation. +extern uint32 AtomicDecrement(volatile uint32* pValue); + +/// Atomically decrements the specified 64-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be decremented. +/// +/// @returns Result of the decrement operation. NOTE(review): declared to return uint32 although the operand is 64-bit; confirm this is intended. +extern uint32 AtomicDecrement64(volatile uint64* pValue); + +/// Performs an atomic compare and swap operation on two 32-bit unsigned integers. This operation compares *pTarget +/// with oldValue and replaces it with newValue if they match. If the values don't match, no action is taken.
+/// The original value of *pTarget is returned as a result. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. +/// @param [in] oldValue Value to compare *pTarget to. +/// @param [in] newValue Value to replace *pTarget with if *pTarget matches oldValue. +/// +/// @returns Previous value at *pTarget. +extern uint32 AtomicCompareAndSwap(volatile uint32* pTarget, uint32 oldValue, uint32 newValue); + +/// Atomically exchanges a pair of 32-bit unsigned integers. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. +/// @param [in] value New value to be stored in *pTarget. +/// +/// @returns Previous value at *pTarget. +extern uint32 AtomicExchange(volatile uint32* pTarget, uint32 value); + +/// Atomically exchanges a pair of 64-bit unsigned integers. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. +/// @param [in] value New value to be stored in *pTarget. +/// +/// @returns Previous value at *pTarget. +extern uint64 AtomicExchange64(volatile uint64* pTarget, uint64 value); + +/// Atomically exchanges a pair of pointers. +/// +/// @param [in,out] ppTarget Pointer to the address to exchange. The function sets the address pointed to by *ppTarget +/// to pValue. +/// @param [in] pValue New pointer to be stored in *ppTarget. +/// +/// @returns Previous value at *ppTarget. +extern void* AtomicExchangePointer(void*volatile* ppTarget, void* pValue); + +/// Performs an atomic compare and swap operation on a pair of pointers. This operation compares *ppTarget +/// with pOldValue and replaces it with pNewValue if they match. If the values don't match, no action is taken. +/// The original value of *ppTarget is returned as a result. +/// +/// @param [in,out] ppTarget Pointer to the destination value of the operation. + +/// @param [in] pOldValue Old pointer to compare *ppTarget to. +/// @param [in] pNewValue New pointer to replace *ppTarget with if *ppTarget matches pOldValue. 
+/// +/// @returns Previous value at *ppTarget. +extern void* AtomicCompareExchangePointer(void*volatile* ppTarget, void* pOldValue, void* pNewValue); + +/// Atomically add a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pAddend Pointer to the value to be modified. +/// @param [in] value Value to add to *pAddend. +/// +/// @returns Result of the add operation. +extern uint32 AtomicAdd(volatile uint32* pAddend, uint32 value); + +/// Atomically add a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pAddend Pointer to the value to be modified. +/// @param [in] value Value to add to *pAddend. +/// +/// @returns Result of the add operation. +extern uint64 AtomicAdd64(volatile uint64* pAddend, uint64 value); + +/// Atomically OR a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to OR to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint32 AtomicOr(volatile uint32* pTarget, uint32 value); + +/// Atomically OR a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to OR to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint64 AtomicOr64(volatile uint64* pTarget, uint64 value); + +/// Atomically AND a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to AND to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint32 AtomicAnd(volatile uint32* pTarget, uint32 value); + +/// Atomically AND a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to AND to *pTarget. +/// +/// @returns The original value of *pTarget. 
+extern uint64 AtomicAnd64(volatile uint64* pTarget, uint64 value); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h b/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h new file mode 100644 index 0000000000..8bd28fa554 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h @@ -0,0 +1,447 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palSpan.h +* @brief PAL utility collection Span class declaration. 
+*********************************************************************************************************************** +*/ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" +#include "palSysMemory.h" +#include "palInlineFuncs.h" +#include + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Span container + * + * Span is an array with a length, where the data is not owned by the Span object. It is similar to C++20 std::span, + * but only the dynamic extent variant. It is similar to LLVM MutableArrayRef and ArrayRef. A Span is intended to + * be passed around by value. + * + *********************************************************************************************************************** + */ +template +class Span +{ +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span. + constexpr Span() : m_pData(nullptr), m_numElements(0) {} + + /// Constructor from pointer and length + /// + /// @param [in] data Pointer to the start of the array + /// @param numElements Number of elements in the array + constexpr Span(T* pData, size_t numElements) : m_pData(pData), m_numElements(numElements) {} + + /// Copy constructor + /// + /// @param [in] src Other Span to copy from + constexpr Span(const Span& src) : m_pData(src.m_pData), m_numElements(src.m_numElements) {} + + /// Constructor from C++ array + /// + /// @param [in] src C++ array + template constexpr Span(T(& src)[NumElements]) : m_pData(&src[0]), m_numElements(NumElements) {} + + /// Constructor from single element + /// + /// @param [in] src Single element + constexpr Span(T& src) : m_pData(&src), m_numElements(1) {} + + /// Implicitly convert a Span to its const-element equivalent. 
+ /// + /// @returns The same span, but with const element type + constexpr operator Span() const { return Span(m_pData, m_numElements); } + + /// Assignment operator + /// + /// @param [in] src Other Span to copy from + constexpr Span& operator=(const Span& src) { + m_pData = src.m_pData; + m_numElements = src.m_numElements; + return *this; + } + + ///@{ + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + constexpr T& At(size_t index) const + { + PAL_CONSTEXPR_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + constexpr T& operator[](size_t index) const noexcept { return At(index); } + ///@} + + /// Returns the data at the front of the vector. + /// + /// @returns The data at the front of the vector. + constexpr T& Front() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return *m_pData; + } + + /// Returns the data at the back of the vector. + /// + /// @returns The data at the back of the vector. + constexpr T& Back() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return *(m_pData + (m_numElements - 1)); + } + + /// Returns an iterator to the first element of the vector. + /// + /// @returns An iterator to first element of the vector. + constexpr T* Begin() const { return m_pData; } + + /// Returns an iterator beyond the last element of the vector. (NOT at the last element like Util::Vector::End()!) + /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns VectorIterator An iterator to last element of the vector. + constexpr T* End() const { return m_pData + m_numElements; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. 
+ /// For an empty span, the returned pointer may or may not be a null pointer. + constexpr T* Data() const { return m_pData; } + + /// Returns the extent of the span. + /// + /// @returns An unsigned integer equal to the number of elements currently present in the span. + constexpr size_t NumElements() const { return m_numElements; } + + /// Returns the size in bytes the Span represents. + /// + /// @returns An unsigned integer equal to the size in bytes the entire span represents. + constexpr size_t SizeInBytes() const { return ElementSize() * m_numElements; } + + /// Returns true if the number of elements present in the vector is equal to zero. + /// + /// @returns True if the span is empty. + constexpr bool IsEmpty() const { return (m_numElements == 0); } + + /// Returns a "subspan", a view over a subset range of the elements. + /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + constexpr Span Subspan( + size_t offset, + size_t count) const + { + PAL_CONSTEXPR_ASSERT((offset <= NumElements()) + && ((count == size_t(-1)) || (count <= NumElements() - offset))); + if (count == size_t(-1)) + { + count = NumElements() - offset; + } + return Span(Data() + offset, count); + } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. 
+ /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + constexpr Span DropFront( + size_t count = 1) const + { + Span retVal; + if (count < NumElements()) + { + retVal = Subspan(count, size_t(-1)); + } + return retVal; + } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + constexpr Span DropBack( + size_t count = 1) const + { + Span retVal; + if (count < NumElements()) + { + retVal = Subspan(0, NumElements() - count); + } + return retVal; + } + + ///@{ + /// @internal Satisfies concept `range_expression`, using T* as `iterator` and 32-bit size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as `range for`. + /// These should not be called directly as they do not adhere to PAL coding standards. + using value_type = T; + using reference = T&; + using iterator = T*; + using difference_type = size_t; + using size_type = size_t; + + constexpr iterator begin() const noexcept { return m_pData; } + constexpr iterator end() const noexcept { return (m_pData + m_numElements); } + constexpr bool empty() const noexcept { return IsEmpty(); } + constexpr size_type size() const noexcept { return m_numElements; } + ///@} + +protected: + template, char, U>> + static constexpr size_t ElementSize() { return sizeof(R); } + + template + using IfConst = std::enable_if_t == Condition>; + + template + using IfPtr = std::enable_if_t == Condition>; + + T* m_pData; // Pointer to the current data. + size_t m_numElements; // Number of elements present. +}; + +/// ==================================================================================================================== +/// Span template specialization for const void byte buffers. 
+template<> +class Span : public Span +{ +using Byte = const char; +using Base = Span; + +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span. + constexpr Span() : Base() {} + + /// Template constructor from any pointer and length + /// + /// @param [in] pData Pointer to the start of the buffer or array + /// @param numElements Number of bytes or elements in the buffer or array + template + Span(const T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) {} + + /// Template copy constructor + /// + /// @param [in] src Other Span to copy from + template + Span(const Span& src) : Span(src.Data(), src.NumElements()) {} + + /// Template constructor from any C++ array + /// + /// @param [in] src C++ array + template + Span(const T(& src)[NumElements]) : Span(&src[0], NumElements) {} + + /// Constructor from any single element + /// + /// @param [in] src Single element + template> + Span(const T& src) : Span(&src, 1) {} + + /// Templated conversion of this typeless Span to a typed Subspan + /// + /// @returns A subspan with typed elements and NumElements truncated down to the nearest sizeof(type) + template + operator Span() const + { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } + + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + template + const T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } + + /// Returns pointer to the underlying buffer serving as data storage. + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. + /// For an empty span, the returned pointer may or may not be a null pointer.
+ template + constexpr const T* Data() const { return reinterpret_cast(Base::Data()); } + + /// Returns a "subspan", a view over a subset range of the elements. + /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + template + Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + template + Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + template + Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } +}; + +/// ==================================================================================================================== +/// Span template specialization for mutable void byte buffers. +template<> +class Span : public Span +{ +using Byte = char; +using Base = Span; + +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span. 
+ constexpr Span() : Base() {} + + /// Template constructor from any pointer and length + /// + /// @param [in] pData Pointer to the start of the buffer or array + /// @param numElements Number of bytes or elements in the buffer or array + template> + Span(T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) { } + + /// Template copy constructor + /// + /// @param [in] src Other Span to copy from + template> + Span(const Span& src) : Span(src.Data(), src.NumElements()) {} + + /// Template constructor from any C++ array + /// + /// @param [in] src C++ array + template> + Span(T(& src)[NumElements]) : Span(&src[0], NumElements) {} + + /// Constructor from any single element + /// + /// @param [in] src Single element + template, typename = IfPtr> + Span(T& src) : Span(&src, 1) {} + + /// Templated conversion of this typeless Span to a typed Subspan + /// + /// @returns A subspan with typed elements and NumElements truncated down to the nearest sizeof(type) + template + operator Span() const { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } + + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + template + T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } + + /// Returns pointer to the underlying buffer serving as data storage + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. + /// For an empty span, the returned pointer may or may not be a null pointer. 
+ template + constexpr T* Data() const { return reinterpret_cast(Base::Data()); } + + /// Implicitly convert this void Span to its const void equivalent + /// + /// @returns The same span, but of const void type + operator Span() const { return Span(Data(), NumElements()); } + + /// Returns a "subspan", a view over a subset range of the elements. + /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + template + Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + template + Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. 
+ /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + template + Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h new file mode 100644 index 0000000000..fb6ef9b694 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h @@ -0,0 +1,188 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palStringUtil.h + * @brief PAL String utility collection functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +#include "palUtil.h" + +namespace Util +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 +/// Returns the length of a wchar_t based string. +/// +/// @param [in] wide string to query +/// +/// @returns The length of the given string in wide characters +extern size_t PalWcslen( + const wchar_t* pWideStr); + +/// Performs a reverse string find of wide character wc. +/// +/// @param [in] wide string to scan +/// @param [in] wide character to find +/// +/// @returns The matching character at the end of the string or nullptr if not found. +extern wchar_t* PalWcsrchr( + wchar_t *pStr, + wchar_t wc); +#endif + +/// When the -fshort-wchar compiler option is specified, wchar_t is 16 bits, but mbstowcs still treats the dest +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in words +/// +/// @returns Returns whether or not the conversion was successful. +extern bool ConvertCharStringToUtf16( + wchar_t* pDst, + const char* pSrc, + size_t dstSizeInWords); + +/// When the -fshort-wchar compiler option is specified, wchar_t is 16 bits, but wcstombs still treats the src +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in bytes +/// +/// @returns Returns whether or not the conversion was successful. 
+extern bool ConvertUtf16StringToUtf8( + char* pDst, + const wchar_t* pSrc, + size_t dstSizeInBytes); + +/// Convert wchar_t string to UTF-8 string. Works whether wchar_t is 16 or 32 bits. +/// If wchar_t is 16 bits, this decodes UTF-16. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in bytes +/// +/// @returns Returns whether or not the conversion was successful. +bool ConvertWcharStringToUtf8(char* pDst, const wchar_t* pSrc, size_t dstSizeInBytes); + +/// When the -fshort-wchar compiler option is specified, wchar_t is 16 bits, but wcsncpy still treats its arguments +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] pDst Destination string. +/// @param [in] pSrc Source string to copy. +/// @param [in] dstSize Length of the destination buffer, in wchar_t's. +extern void CopyUtf16String( + wchar_t* pDst, + const wchar_t* pSrc, + size_t dstSize); + +/// A shared helper function which takes an arbitrary blob of data and formats it into a human readable "memory view" +/// string. This is intended to be used by logging code. +/// +/// Imagine your input buffer is: { 0xef, 0xbe, 0xad, 0xde, 0x78, 0x56, 0x34, 0x12, 0xab }, then the string looks like +/// this with a blockSize of 4: "0xdeadbeef 0x12345678 0xab". So the block size determines how many bytes are combined +/// into one "0x" character block. The whole block is effectively cast into an integer of that size and printed in big +/// endian. Trailing bytes are printed without being size-extended. If a block won't fit at the end of the string it +/// is skipped (update your buffer pointer and call again to continue). +/// +/// The return value is the number of bytes consumed from pBuffer. The idea is that you can loop until the full size +/// is consumed, printing a new line for each call. +/// +/// @param [out] pDst The caller-provided destination string. +/// @param [in] dstSize The length of pDst in bytes. 
+/// @param [in] pBuffer The arbitrary data blob to turn into a string. +/// @param [in] bufferSize The length of pBuffer in bytes. +/// @param [in] blockSize How many bytes to combine into one hexadecimal big endian string. +/// +/// @returns The number of bytes from pBuffer that were formatted into pDst. +extern size_t BytesToStr( + char* pDst, + size_t dstSize, + const void* pBuffer, + size_t bufferSize, + size_t blockSize); + +/// Returns the length of the string. +/// +/// @returns String length. +constexpr uint32 StringLength( + const char* pString) +{ + // TODO: On C++23 we can replace this with consteval-if. + // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. +#if defined(__cpp_lib_is_constant_evaluated) + if (std::is_constant_evaluated()) +#else + if (__builtin_is_constant_evaluated()) +#endif + { + uint32 length = 0; + while (pString[length] != '\0') + { + length++; + } + return length; + } + else + { + return uint32(std::strlen(pString)); + } +} +/// Returns the length of the string. +/// +/// @returns String length. +constexpr uint32 StringLength( + const wchar_t* pString) +{ + // TODO: On C++23 we can replace this with consteval-if. + // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. 
+#if defined(__cpp_lib_is_constant_evaluated) + if (std::is_constant_evaluated()) +#else + if (__builtin_is_constant_evaluated()) +#endif + { + uint32 length = 0; + while (pString[length] != L'\0') + { + length++; + } + return length; + } + else + { + return uint32(std::wcslen(pString)); + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h b/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h new file mode 100644 index 0000000000..34057ebd60 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h @@ -0,0 +1,228 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palStringView.h +* @brief PAL utility collection string view declaration. +*********************************************************************************************************************** +*/ + +#pragma once + +#include "palAssert.h" +#include "palInlineFuncs.h" +#include "palStringUtil.h" +#include "palUtil.h" +#include + +namespace Util +{ + +/** +*********************************************************************************************************************** +* @brief String view. +* +* A StringView is a templated view over a constant contiguous sequence of characters. +* +* @warning The string view assumes that its lifetime does not extend past that of the pointed-to character sequence. +*********************************************************************************************************************** +*/ +template +class StringView +{ + static_assert((std::is_same::value || std::is_same::value), + "StringView type T must be either char or wchar_t."); +public: + /// Constructs an empty StringView. + constexpr StringView() + : + m_pData{}, + m_length{} + {} + + constexpr StringView( + const CharT* s, + uint32 count) + : + m_pData{s}, + m_length{count} + { + PAL_CONSTEXPR_ASSERT((s != nullptr) || (count == 0)); + } + + StringView( + const CharT* s) + : + StringView() + { + if (s != nullptr) + { + m_length = StringLength(s); + m_pData = s; + } + } + + constexpr StringView(std::nullptr_t) = delete; + + ///@{ + /// Returns the element at the location specified. + /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. 
+ /// + /// @returns The element at location specified by index by reference + constexpr const CharT& At(uint32 index) const + { + PAL_CONSTEXPR_ASSERT(index < Length()); + return m_pData[index]; + } + + constexpr const CharT& operator[](uint32 index) const { return At(index); } + ///@} + + /// Returns pointer to the underlying string serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + Length()), + /// even if the view does not point to any data storage (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from a view that does not point to a data storage will cause + /// an access violation! + /// + /// @returns Pointer to the underlying data storage for read access. + /// For a view to a valid data storage, the returned pointer contains address of the first element. + /// For a view without a valid data storage, the returned pointer will be a @c nullptr. + constexpr const CharT* Data() const noexcept { return m_pData; } + + /// Returns the data at the front of the view. + /// + /// @warning Calling this function on an empty view will cause an access violation! + /// + /// @returns The data at the front of the view. + constexpr const CharT& Front() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return m_pData[0]; + } + + /// Returns the data at the back of the view. + /// + /// @warning Calling this function on an empty view will cause an access violation! + /// + /// @returns The data at the back of the view. + constexpr const CharT& Back() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return m_pData[Length() - 1]; + } + + /// Returns the length of the string. + /// + /// @returns An unsigned integer equal to the length of the string. + constexpr uint32 Length() const { return m_length; } + + /// Returns true if the number of characters the view points to is equal to zero. 
+ /// + /// @returns True if the view points to an empty or non-existing data storage. + constexpr bool IsEmpty() const { return (m_length == 0); } + + ///@{ + /// @internal Satisfies concept `range_expression`, using CharT* as `iterator`. + /// + /// @note - These are a convenience intended to be used by C++ language features such as `range-based for`. + /// These should not be called directly as they do not adhere to PAL coding standards. + using const_iterator = const CharT*; + + constexpr const_iterator begin() const noexcept { return m_pData; } + constexpr const_iterator end() const noexcept { return m_pData + Length(); } + ///@} + +private: + const CharT* m_pData; + uint32 m_length; +}; + +// ===================================================================================================================== +template +constexpr bool operator==( + StringView x, + StringView y) +{ + bool equal = (x.Length() == y.Length()); + if (equal) + { + if (x.Data() != y.Data()) + { + // they are not pointing to the same storage, so we need to compare the contents + for (uint32 index = 0; equal && (index < x.Length()); ++index) + { + equal = (x[index] == y[index]); + } + } + } + return equal; +} + +// ===================================================================================================================== +template +constexpr bool operator!=(StringView x, StringView y) { return (x == y) == false; } + +// ===================================================================================================================== +template +bool operator<( + StringView x, + StringView y) +{ + const uint32 minLength = Min(x.Length(), y.Length()); + int compare = strncmp(x.Data(), y.Data(), minLength); + if (compare == 0) + { + // strings are equal up to minLength, so check which is shorter + compare = int(x.Length()) - int(y.Length()); + } + + return compare < 0; +} + +// 
===================================================================================================================== +template +bool operator<=(StringView x, StringView y) { return (y < x) == false; } + +// ===================================================================================================================== +template +bool operator>(StringView x, StringView y) { return y < x; } + +// ===================================================================================================================== +template +bool operator>=(StringView x, StringView y) { return (x < y) == false; } + +/// Specialization of @ref HashString(const char*,size_t) for @ref StringView. +template +constexpr uint32 HashString( + StringView sv) +{ + return HashString(sv.Data(), sv.Length()); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h b/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h new file mode 100644 index 0000000000..c43ef0069e --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h @@ -0,0 +1,820 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palSysMemory.h + * @brief PAL utility collection system memory management macros. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palInlineFuncs.h" +#include "palMemTracker.h" +#include +#include + +// Forward declarations +namespace Util { struct AllocInfo; } +namespace Util { struct FreeInfo; } +namespace Util { enum SystemAllocType : uint32; } + +#if !defined(__GNUC__) || (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 8)) +/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. +#define PAL_DEFAULT_MEM_ALIGN alignof(std::max_align_t) +#else +// GCC versions prior to 4.9 break C++11 compatibility by putting max_align_t in the global namespace. +/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. +#define PAL_DEFAULT_MEM_ALIGN alignof(::max_align_t) +#endif + +namespace Util +{ + +/// Informs that @p p is aligned to at least @p Alignment. 
+template +constexpr T* AssumeAligned(T* p) +{ + return static_cast(__builtin_assume_aligned(p, Alignment)); +} + +} // Util + +#if PAL_MEMTRACK + +/// @internal Malloc allocation method with extra memory leak tracking arguments. +#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType, _memBlkType, __FILE__, __LINE__)) + +/// @internal Calloc allocation method with extra memory leak tracking arguments. +#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType, _memBlkType, __FILE__, __LINE__)) + +/// @internal Free method with extra memory leak tracking arguments. +#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ + _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)), _memBlkType)) + +#else + +/// @internal Malloc method not wrapped with memory leak tracking. +#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType)) + +/// @internal Calloc method not wrapped with memory leak tracking. +#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType)) + +/// @internal Free method not wrapped with memory leak tracking. +#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ + _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)))) + +#endif + +/// Allocates heap memory in place of malloc(). +/// +/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. +/// The client is also free to use this macro in order to take advantage of PAL's memory leak tracking. 
+#define PAL_MALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ + PAL_MALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) + +/// Same as @ref PAL_MALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. +#define PAL_MALLOC(_size, _allocator, _allocType) \ + PAL_MALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) + +/// Allocates zero-initialized heap memory in place of calloc(). See @ref PAL_MALLOC_ALIGNED. +#define PAL_CALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ + PAL_CALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) + +/// Same as @ref PAL_CALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. +#define PAL_CALLOC(_size, _allocator, _allocType) \ + PAL_CALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) + +/// Frees heap memory allocated with the @ref PAL_MALLOC* or @ref PAL_CALLOC* macros. +#define PAL_FREE(_ptr, _allocator) PAL_FREE_BASE((_ptr), (_allocator), ::Util::MemBlkType::Malloc) + +/// Safe free macro. Pointer is set to null after the free. +#define PAL_SAFE_FREE(_ptr, _allocator) { PAL_FREE((_ptr), (_allocator)); (_ptr) = nullptr; } + +/// @internal +/// +/// This type only exists to force a unique override for placement new. We need to override placement new in order to +/// call the constructor in the PAL_NEW and PAL_NEW_ARRAY implementations, but we do not want to overload global +/// placement new or include \ since either could interfere with the client. Adding a dummy parameter allows us +/// to define a PAL-only placement new implementation. +namespace Util +{ +struct Dummy +{ + explicit Dummy() { } ///< Explicit default constructor prevents this from being instantiated via unqualified "{}". +}; +} + +/// @internal +/// +/// PAL-internal placement new override. 
The Dummy is used to ensure there won't be a conflict if a client tries to +/// override global placement new. +/// +/// @param [in] size Size of the memory allocation. +/// @param [in] pObjMem Memory where object will be constructed. +/// @param [in] dummy Unused. +extern void* PAL_CDECL operator new( + size_t size, + void* pObjMem, + Util::Dummy dummy) noexcept; + +/// @internal +/// +/// Silences compiler warnings about not having a matching delete for the placement new override above. Will never be +/// called. +/// +/// @param [in] pObj Unused. +/// @param [in] pObjMem Unused. +/// @param [in] dummy Unused. +extern void PAL_CDECL operator delete( + void* pObj, + void* pObjMem, + Util::Dummy dummy) noexcept; + +/// Placement new macro. +#define PAL_PLACEMENT_NEW(_ptr) new((_ptr), ::Util::Dummy{}) + +/// Allocates heap memory and calls constructor for an object of the specified type. +/// +/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. +/// The client is also free to use this macro. +/// +/// Instead of calling "MyClass* pMyClass = new MyClass(arg1, arg2)", call +/// "MyClass* pMyClass = PAL_NEW(MyClass, pAllocator, AllocInternal)(arg1, arg2)". +#define PAL_NEW(_className, _allocator, _allocType) \ + PAL_PLACEMENT_NEW( \ + PAL_MALLOC_BASE(sizeof(_className), alignof(_className), (_allocator), (_allocType), ::Util::MemBlkType::New)) \ + _className + +/// Calls destructor and frees heap memory for the object allocated with PAL_NEW*. +#define PAL_DELETE(_ptr, _allocator) \ +{ \ + /* we want to evaluate the expression (_allocator) before calling + the destructor because the destructor might have side effects */ \ + auto _allocator_ = (_allocator); \ + ::Util::Destructor(_ptr); \ + PAL_FREE_BASE((_ptr), _allocator_, ::Util::MemBlkType::New); \ +} + +/// Calls destructor and frees heap memory for "this". Use this macro to delete an object without a public destructor. 
+#define PAL_DELETE_THIS(_className, _allocator) \ +{ \ + /* we want to evaluate the expression (_allocator) before calling + the destructor because the destructor might have side effects */ \ + auto _allocator_ = (_allocator); \ + this->~_className(); \ + PAL_FREE_BASE(this, _allocator_, ::Util::MemBlkType::New); \ +} + +/// Safe delete macro. Pointer is set to null after the delete. +#define PAL_SAFE_DELETE(_ptr, _allocator) { PAL_DELETE(_ptr, _allocator); (_ptr) = nullptr; } + +/// Allocates an array of the specified object type. +/// +/// For non-POD types, the default constructor will be called. Default constructor is not available for POD types +/// (i.e., PAL_NEW_ARRAY(int, 3, pAllocator, AllocInternal)() won't work). +#if PAL_MEMTRACK +#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ + ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType), __FILE__, __LINE__) +#else +#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ + ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType)) +#endif + +/// Destroys an array of the specified object type. +/// +/// For non-POD types, the destructor will be called. +#define PAL_DELETE_ARRAY(_ptr, _allocator) ::Util::DeleteArray(_ptr, _allocator) + +/// Safe delete array macro. Pointer is set to null after the delete. +#define PAL_SAFE_DELETE_ARRAY(_ptr, _allocator) { PAL_DELETE_ARRAY(_ptr, _allocator); (_ptr) = nullptr; } + +namespace Util +{ +/// Specifies the usage of a system memory allocation made via a client allocation callback. +/// +/// The selected type gives the client an idea of the expected lifetime of the allocation, perhaps allowing intelligent +/// selection of sub-allocation pool, etc. +/// +/// @note This is a weak uint32 enum where all PAL values set the top bit. The client is free to use PAL's memory +/// utilities for their own allocations with their own uint32 enum using the range 0 to 0x7FFFFFFF. 
The client's +/// allocation callback can then separately handle any memory allocation category, whether allocated by PAL or +/// themselves. +/// +/// @see AllocCallbacks +/// @see AllocFunc +enum SystemAllocType : uint32 +{ + /// Indicates an allocation will be attached to a client-created PAL object and will not be freed until the client + /// frees the associated object. This type will be specified when allocation callbacks are made during a PAL create + /// call (e.g., IDevice::CreateGraphicsPipeline()). + AllocObject = 0x80000000, + + /// Indicates an allocation is for internal PAL use. The client should assume such allocations have a long + /// lifetime, and may not be freed until IPlatform::Destroy() is called. + AllocInternal = 0x80000001, + + /// Indicates an allocation is for internal PAL use and that the lifetime of the allocation will be short. + /// Typically this will be specified for heap allocations that will be freed before control is returned to the + /// client. + AllocInternalTemp = 0x80000002, + + /// Indicates an allocation was requested by the shader compiler. + AllocInternalShader = 0x80000003 +}; + +/// Function pointer type defining a callback for client-controlled system memory allocation. +/// +/// @see AllocCallbacks +/// +/// @ingroup LibInit +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter +/// to CreatePlatform() will be passed back to the client on every allocation callback. +/// @param [in] size Size of the requested allocation in bytes. Must be non-zero. +/// @param [in] alignment Required alignment of the requested allocation in bytes. Must be a power of two. +/// @param [in] allocType Hint to client about expected allocation usage and lifetime. See @ref SystemAllocType. +/// +/// @returns Pointer to system memory with the specified size and alignment. nullptr means that the allocation failed. 
+typedef void* (PAL_STDCALL *AllocFunc)( + void* pClientData, + size_t size, + size_t alignment, + SystemAllocType allocType); + +/// Function pointer type defining a callback for client-controlled system memory deallocation. +/// +/// @see AllocCallbacks +/// +/// @ingroup LibInit +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter +/// to CreatePlatform() will be passed back to the client on every free callback. +/// @param [in] pMem System memory pointer to be freed. The specified pointer must have been allocated by an +/// @ref AllocFunc callback. +typedef void (PAL_STDCALL *FreeFunc)( + void* pClientData, + void* pMem); + +/// Specifies client-provided system allocation callbacks. Used as a parameter to Pal::CreatePlatform(). +/// +/// @ingroup LibInit +struct AllocCallbacks +{ + void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to + /// every @ref AllocFunc and @ref FreeFunc call made by PAL. + AllocFunc pfnAlloc; ///< System memory allocation callback. @see AllocFunc. + FreeFunc pfnFree; ///< System memory deallocation callback. @see FreeFunc. +}; + +/// Information about requested allocation. +/// +/// Contains necessary information (size, alignment, etc.) to allocate new system memory. +/// +/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator +/// does not need to use any of this information, but can if desired. +/// +/// @see Allocators +struct AllocInfo +{ + /// Constructor. + AllocInfo( + size_t bytes, ///< [in] Number of bytes to allocate. + size_t alignment, ///< [in] Required alignment of the requested allocation in bytes. + bool zeroMem, ///< [in] True for calloc, false for malloc. + SystemAllocType allocType ///< [in] Hint on type of allocation and lifetime for client callbacks. 
+#if PAL_MEMTRACK + , MemBlkType blockType, ///< [in] Type of allocation (malloc, new, or new array). + const char* pFilename, ///< [in] Source filename that requested the memory allocation. + uint32 lineNumber ///< [in] Line number in the source file that requested the memory allocation. +#endif + ) + : + bytes(bytes), + alignment(alignment), + zeroMem(zeroMem), + allocType(allocType) +#if PAL_MEMTRACK + , blockType(blockType), + pFilename(pFilename), + lineNumber(lineNumber) +#endif + {} + + size_t bytes; ///< Number of bytes to allocate. + const size_t alignment; ///< Required alignment of the requested allocation in bytes. + const bool zeroMem; ///< True for calloc, false for malloc. + const SystemAllocType allocType; ///< Hint on type of allocation and lifetime for client callbacks. +#if PAL_MEMTRACK + const MemBlkType blockType; ///< Type of allocation (malloc, new, or new array). + const char* pFilename; ///< Source filename that requested the memory allocation. + const uint32 lineNumber; ///< Line number in the source file that requested the memory allocation. +#endif +}; + +/// Information about freeing a specified allocation. +/// +/// Contains necessary information about memory that needs to be freed. +/// +/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator +/// does not need to use any of this information, but can if desired. +/// +/// @see Allocators +struct FreeInfo +{ + /// Constructor. + FreeInfo( + void* pClientMem ///< [in] Pointer to memory allocation. +#if PAL_MEMTRACK + , MemBlkType blockType ///< [in] Type of free (free, delete, or delete array). +#endif + ) + : + pClientMem(pClientMem) +#if PAL_MEMTRACK + , blockType(blockType) +#endif + {} + + void* pClientMem; ///< Pointer to memory allocation. +#if PAL_MEMTRACK + const MemBlkType blockType; ///< Type of free (free, delete, or delete array). 
+#endif +}; + +/** + *********************************************************************************************************************** + * @brief Wraps a AllocCallbacks struct into a class compatible with PAL's Allocator concept. + *********************************************************************************************************************** + */ +class ForwardAllocator +{ +public: + /// Constructor. + ForwardAllocator(const AllocCallbacks& callbacks) : m_callbacks(callbacks) { } + + /// Allocates memory using the provided pfnAlloc callback. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + // Allocating zero bytes of memory results in undefined behavior. + PAL_ASSERT(allocInfo.bytes > 0); + + void* pMem = m_callbacks.pfnAlloc(m_callbacks.pClientData, + allocInfo.bytes, + allocInfo.alignment, + allocInfo.allocType); + + if ((pMem != nullptr) && allocInfo.zeroMem) + { + memset(pMem, 0, allocInfo.bytes); + } + + return pMem; + } + + /// Frees memory using the provided pfnFree callback. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + if (freeInfo.pClientMem != nullptr) + { + m_callbacks.pfnFree(m_callbacks.pClientData, freeInfo.pClientMem); + } + } + +private: + const AllocCallbacks m_callbacks; +}; + +/** +************************************************************************************************************************ +* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps ForwardAllocator. 
+************************************************************************************************************************ +*/ +#if PAL_MEMTRACK +class ForwardAllocatorTracked +{ +public: + /// Constructor + ForwardAllocatorTracked(const AllocCallbacks& callbacks) + : + m_allocator(callbacks), + m_memTracker(&m_allocator) + { + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + return m_memTracker.Alloc(allocInfo); + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + m_memTracker.Free(freeInfo); + } + +private: + Util::ForwardAllocator m_allocator; ///< The ForwardAllocator which this object wraps. + MemTracker m_memTracker; ///< Memory tracker for this ForwardAllocator. +}; +#else +using ForwardAllocatorTracked = ForwardAllocator; +#endif + +/** +************************************************************************************************************************ +* @brief A wrapper representing an allocator const-pointer. Can be implicitly constructed from any Allocator pointer. +* +* IndirectAllocator is a type-erasure replacement for `Allocator*const pAllocator`, to abstract around Allocator types. +* This allows classes to not need fully template on `typename Allocator`, at the cost of more pointer-indirection. +* +* Const-correctness should be treated as `Allocator*const pAllocator` - the pointed-to Allocator may be mutable. +************************************************************************************************************************ +*/ +class IndirectAllocator +{ +public: + /// Implicit conversion from any Allocator pointer. 
+ template + IndirectAllocator(Allocator*const pAllocator) + : + m_pAllocator(pAllocator), + m_pfnAlloc(&DispatchAlloc), + m_pfnFree(&DispatchFree) + { } + + /// Constructor specialization for a pointer to another IndirectAllocator, which acts like a copy constructor. + IndirectAllocator(const IndirectAllocator*const pAllocator) : IndirectAllocator(*pAllocator) { } + + /// Allocates memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) const { return m_pfnAlloc(m_pAllocator, allocInfo); } + + /// Frees memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) const { return m_pfnFree(m_pAllocator, freeInfo); } + + /// Returns true if the allocator == nullptr. Used in place of `pAllocator == nullptr`. + constexpr bool operator==(std::nullptr_t) const { return m_pAllocator == nullptr; } + +private: + /// @internal Allocation dispatch function. This is what the non-template @ref m_pfnAlloc callback references. + template + static void* DispatchAlloc(void*const pAllocator, const AllocInfo& allocInfo) + { + auto*const pTypedAllocator = static_cast(pAllocator); + return pTypedAllocator->Alloc(allocInfo); + } + + /// @internal Free dispatch function. This is what the non-template @ref m_pfnFree callback references. 
+ template + static void DispatchFree(void*const pAllocator, const FreeInfo& freeInfo) + { + auto*const pTypedAllocator = static_cast(pAllocator); + pTypedAllocator->Free(freeInfo); + } + + using DispatchAllocCb = void* (*)(void*const, const AllocInfo&); + using DispatchFreeCb = void (*)(void*const, const FreeInfo&); + + void*const m_pAllocator; + + const DispatchAllocCb m_pfnAlloc; + const DispatchFreeCb m_pfnFree; +}; + +/** + *********************************************************************************************************************** + * @brief A generic allocator class that allocate and free memory for general purpose use. + *********************************************************************************************************************** + */ +class GenericAllocator +{ +public: + /// Allocates memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + static void* Alloc(const AllocInfo& allocInfo); + + /// Frees memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + static void Free(const FreeInfo& freeInfo); +}; + +/** +************************************************************************************************************************ +* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps GenericAllocator. +************************************************************************************************************************ +*/ +#if PAL_MEMTRACK +class GenericAllocatorTracked +{ +public: + /// Constructor + GenericAllocatorTracked() + : + m_memTracker(&m_allocator) + { + Result result = m_memTracker.Init(); + PAL_ASSERT(result == Result::_Success); + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. 
+ /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + void* pMemory = m_memTracker.Alloc(allocInfo); + return pMemory; + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + m_memTracker.Free(freeInfo); + } + +private: + GenericAllocator m_allocator; ///< The GenericAllocator which this object wraps. + + MemTracker m_memTracker; ///< Memory tracker for this GenericAllocator. +}; +#else +using GenericAllocatorTracked = GenericAllocator; +#endif + +/// Returns the OS-specific page size. +/// +/// @note All virtual reservations/commits/decommits/releases must be aligned to the value returned by this function. +/// +/// @return The OS-specific size, in bytes, of a page. +extern size_t VirtualPageSize(); + +/// Reserves the specified amount of virtual address space. +/// +/// @param [in] sizeInBytes Size in bytes of the requested reservation. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [out] ppOut Pointer to reserved memory. Must not be null. +/// @param [in] pMem Pointer to the starting virtual address to reserve. +/// Subject to failure "ErrorOutOfMemory" if any pages in that range have already been reserved +/// elsewhere by the process. +/// @param [in] alignment Optional parameter specifying the alignment of the reserved cpu VA. +/// +/// @returns Success if reservation is successful. +/// Otherwise: +/// - ErrorOutOfMemory if memory reservation failed. +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if ppOut is null. +extern Result VirtualReserve(size_t sizeInBytes, void** ppOut, void* pMem = nullptr, size_t alignment = 1); + +/// Commits the specified amount of virtual address space, requesting backing memory from the OS. +/// +/// @param [in] pMem Pointer to the start of reserved memory to commit. 
Must be aligned to the page size +/// returned from @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of the requested commit. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] isExecutable Indicates whether the committed memory is used for execution on the GPU (for instance: +/// PM4, kernel). Used only on Windows. +/// +/// @returns Success if committing is successful. +/// Otherwise: +/// - ErrorOutOfMemory if memory committing failed. +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if pMem is null. +extern Result VirtualCommit(void* pMem, size_t sizeInBytes, bool isExecutable = false); + +/// Decommits the specified amount of virtual address space, freeing the backing memory back to the OS. +/// +/// @param [in] pMem Pointer to the start of committed memory. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of how much to decommit. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// +/// @returns Success if decommitting is successful. +/// Otherwise: +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if pMem is null. +extern Result VirtualDecommit(void* pMem, size_t sizeInBytes); + +/// Releases the specified amount of virtual address space, both freeing the backing memory and virtual address space +/// back to the OS. +/// +/// @param [in] pMem Pointer to the start of reserved memory. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of how much to release. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// +/// @returns Success if releasing is successful. +/// Otherwise: +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if pMem is null. 
+extern Result VirtualRelease(void* pMem, size_t sizeInBytes); + +/// @internal +/// +/// OS-specific implementation to install default allocation callbacks in the specified structure. Expected to be +/// called during CreatePlatform if the client doesn't specify their own allocation callbacks. +/// +/// @param [in,out] pAllocCb Allocation callback structure to be updated with the OS-specific default callbacks. +/// +/// @returns Success if successful, otherwise an appropriate error code. +extern Result OsInitDefaultAllocCallbacks(AllocCallbacks* pAllocCb); + +/// @internal Internal template implementation for calling a destructor from PAL_DELETE or PAL_DELETE_ARRAY. +/// Does nothing when p is null or when T is a trivial type. +/// +/// @param [in] p Object to be destructed. +template +void Destructor(T* p) +{ + if ((p != nullptr) && !std::is_trivial::value) + { + p->~T(); + } +} + +/// @internal +/// +/// Internal template implementation of PAL_NEW_ARRAY. +/// +/// For non-trivial types, allocate extra memory and store the array count for use by DeleteArray. The extra memory +/// is a header of Max(alignment, sizeof(size_t)) bytes placed in front of the array, sized so that the actual array +/// data is not misaligned. In practice, we rarely allocate an array of non-trivial objects, so the overhead is +/// likely not an issue. +/// +/// @param [in] arrayCnt Number of entries in the array. +/// @param [in] pAllocator The allocator that will allocate the memory for the array. +/// @param [in] allocType Hint to client on the lifetime/type of allocation. +/// @param [in] pFilename Source filename that requested the new array. Only present when PAL_MEMTRACK is enabled. +/// @param [in] lineNumber Line number in the source file that requested the new array. Only present when +/// PAL_MEMTRACK is enabled. +/// +/// @returns Pointer to the allocated array, nullptr if the allocation failed. 
+template +T* NewArray( + size_t arrayCnt, + Allocator* pAllocator, + SystemAllocType allocType +#if PAL_MEMTRACK + , + const char* pFilename, + uint32 lineNumber +#endif + ) +{ + size_t align = alignof(T); + size_t allocSize = sizeof(T) * arrayCnt; + size_t headerSize = 0; + + if (!std::is_trivial::value) + { + align = Max(align, alignof(size_t)); + headerSize = Max(align, sizeof(size_t)); + allocSize += headerSize; + } + +#if PAL_MEMTRACK + const Util::AllocInfo info(allocSize, align, false, allocType, MemBlkType::NewArray, pFilename, lineNumber); +#else + const Util::AllocInfo info(allocSize, align, false, allocType); +#endif + + T* pRet = static_cast(pAllocator->Alloc(info)); + + if ((!std::is_trivial::value) && (pRet != nullptr)) + { + pRet = static_cast(Util::VoidPtrInc(static_cast(pRet), headerSize)); + + size_t* pArrayCnt = static_cast(Util::VoidPtrDec(static_cast(pRet), sizeof(size_t))); + *pArrayCnt = arrayCnt; + + T* pCurObj = static_cast(pRet); + for (uint32 i = 0; i < arrayCnt; i++) + { + PAL_PLACEMENT_NEW(pCurObj) T; + pCurObj++; + } + } + + return pRet; +} + +/// @internal +/// +/// Internal template implementation for PAL_DELETE_ARRAY. +/// +/// For non-POD types, find the array count stored before the client pointer, and call the destructor on each object in +/// the array. +/// +/// @param [in] p Pointer to the memory to be deleted. +/// @param [in] pAllocator The allocator that will free the memory for the array. 
+template <typename T, typename Allocator>
+void DeleteArray(T* p, Allocator* pAllocator)
+{
+    if ((p != nullptr) && !std::is_trivial<T>::value)
+    {
+        // For non-trivial types the element count sits in the size_t immediately before the first element, inside a
+        // header of headerSize bytes that precedes the array.
+        const size_t  headerSize = Max(Max(alignof(T), alignof(size_t)), sizeof(size_t));
+        const size_t* pArrayCnt  = const_cast<size_t*>(reinterpret_cast<const size_t*>(p)) - 1;
+        const size_t  arrayCnt   = *pArrayCnt;
+
+        // The index is a size_t to match the stored count: a 32-bit index would wrap before reaching larger counts.
+        for (size_t i = 0; i < arrayCnt; i++)
+        {
+            Destructor(p + i);
+        }
+
+        // Step back over the header so the pointer handed to Free() is the start of the allocation.
+        p = const_cast<T*>(reinterpret_cast<const T*>((Util::VoidPtrDec(pArrayCnt + 1, headerSize))));
+    }
+
+#if PAL_MEMTRACK
+    const Util::FreeInfo info(const_cast<void*>(static_cast<const void*>(p)), MemBlkType::NewArray);
+#else
+    const Util::FreeInfo info(const_cast<void*>(static_cast<const void*>(p)));
+#endif
+
+    pAllocator->Free(info);
+}
+
+constexpr size_t FastMemCpyMaxSmallSize = 64;
+
+typedef void* (PAL_CDECL *FastMemCpySmallFunc)(void* pDst, const void* pSrc, size_t count);
+
+extern const FastMemCpySmallFunc FastMemCpySmallFuncTable[];
+
+/// A version of memcpy that has fewer branches for small copies. It computes an index into
+/// a table based on the size requested then jumps to a branchless memcpy for that size.
+/// Note that the compiler will NOT inline this if the count is known at compile time. The
+/// regular memcpy() will be inlined and should be used. Note further that it is NOT always
+/// obvious that the count is not known. Consider the case:
+///     void SetData(UINT count, UINT* pData)
+///     {
+///         ...
+///         memcpy(pBuf, pData, count*sizeof(UINT));
+///         ...
+///     }
+/// and SetData is called like this
+///     SetData(4, buf);
+/// In such a case, although the memcpy itself has an unknown size, if the SetData function
+/// inlines and is used with a fixed count, the memcpy count is actually known. Such cases need
+/// to be carefully managed.
+
+PAL_FORCE_INLINE void* FastMemCpy(void* pDst, const void* pSrc, size_t count)
+{
+    // The last entry in the table handles all copies larger than FastMemCpyMaxSmallSize bytes, so clamping the
+    // index to FastMemCpyMaxSmallSize + 1 selects the correct routine.
+ const size_t index = Min(count, FastMemCpyMaxSmallSize + 1); + return (FastMemCpySmallFuncTable[index])(pDst, pSrc, count); +} + +/// Get the default allocation callback. +/// +/// @param [out] pAllocCb Pointer to the allocation callback structure. Must not be null. +void PAL_STDCALL GetDefaultAllocCb(Util::AllocCallbacks* pAllocCb); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h new file mode 100644 index 0000000000..e17690a133 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h @@ -0,0 +1,816 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palSysUtil.h + * @brief PAL utility collection system functions. + *********************************************************************************************************************** + */ + +#pragma once + +// pal +#include "palAssert.h" +#include "palAutoBuffer.h" +#include "palFile.h" +#include "palSpan.h" +#include "palStringView.h" +#include "palTime.h" +#include "palUtil.h" +#include "palVector.h" + +// stl +#include +#include + +// platform +#if defined(_WIN32) +#define PAL_HAS_CPUID (_M_IX86 || _M_X64) +#include +#include +typedef void* HANDLE; +#elif defined(__unix__) +#define PAL_HAS_CPUID (__i386__ || __x86_64__) +#if PAL_HAS_CPUID +#include +#endif +#endif + +namespace Util +{ + +static constexpr uint32 RyzenMaxCcxCount = 4; +static constexpr uint32 CpuVendorAmd = 0x01000000; +static constexpr uint32 CpuVendorIntel = 0x02000000; + +/// Specifies a keyboard key for detecting key presses. +enum class KeyCode : uint32 +{ + Esc, + F1, + F2, + F3, + F4, + F5, + F6, + F7, + F8, + F9, + F10, + F11, + F12, + F13, + F14, + F15, + F16, + Backtick, // ` ~ + Minus, // - _ + Equal, // = + + LBrace, // [ { + RBrace, // ] } + Backslash, // \ | + Semicolon, // ; : + Apostrophe, // " ' + Comma, // , < + Dot, // . > + Slash, // / ? 
+ Enter, + Space, + Backspace, + Tab, + Capslock, + Shift, + LShift, + RShift, + Control, + LControl, + RControl, + Alt, + LAlt, + RAlt, + Scroll, + Insert, + Delete, + Home, + End, + PageUp, + PageDown, + ArrowUp, + ArrowDown, + ArrowLeft, + ArrowRight, + Numlock, + NumSlash, + NumAsterisk, + NumMinus, + NumPlus, + NumDot, + NumEnter, + Num0, + Num1, + Num2, + Num3, + Num4, + Num5, + Num6, + Num7, + Num8, + Num9, + Zero, + One, + Two, + Three, + Four, + Five, + Six, + Seven, + Eight, + Nine, + A, + B, + C, + D, + E, + F, + G, + H, + I, + J, + K, + L, + M, + N, + O, + P, + Q, + R, + S, + T, + U, + V, + W, + X, + Y, + Z, + Shift_F10, + Shift_F11, + Undefined // Used as the value when a key code is not defined in the enum +}; + +/// Enum to identify possible CPU configurations +enum class CpuType : uint32 +{ + Unknown = 0, ///< No capabilities set + AmdK5 = (CpuVendorAmd + 0), ///< No MMX, no cmov, no 3DNow + AmdK6 = (CpuVendorAmd + 1), ///< No MMX, no cmov, 3DNow (models 6 and 7) + AmdK6_2 = (CpuVendorAmd + 2), ///< MMX, no cmov, 3DNow (model 8, no HW WC but not part of cpuid) + AmdK6_3 = (CpuVendorAmd + 3), ///< MMX, no cmov, 3DNow (model 9) + AmdK7Basic = (CpuVendorAmd + 4), ///< K7 missing one of the features of K7 + AmdK7 = (CpuVendorAmd + 5), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext + AmdK7Sse = (CpuVendorAmd + 6), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext, SSE + AmdK8 = (CpuVendorAmd + 7), ///< Athlon 64, Athlon 64 FX, and Opteron + AmdK10 = (CpuVendorAmd + 8), ///< Barcelona, Phenom, Greyhound + AmdFamily12h = (CpuVendorAmd + 9), ///< Family 12h - Llano + AmdBobcat = (CpuVendorAmd + 10), ///< Bobcat + AmdFamily15h = (CpuVendorAmd + 11), ///< Family 15h - Orochi, Trinity, Komodo, Kaveri, Basilisk + AmdFamily16h = (CpuVendorAmd + 12), ///< Family 16h - Kabini + AmdRyzen = (CpuVendorAmd + 13), ///< Ryzen + IntelOld = (CpuVendorIntel + 0), ///< Indicates a CPU type before Intel Pentium III + IntelP3 = (CpuVendorIntel + 1), ///< Generic Pentium III + IntelP3Model7 = 
(CpuVendorIntel + 2), ///< PIII-7, PIII Xeon-7 + IntelP3Model8 = (CpuVendorIntel + 3), ///< PIII-8, PIII Xeon-8, Celeron-8 + IntelPMModel9 = (CpuVendorIntel + 4), ///< Pentium M Model 9 (Banias) + IntelXeonModelA = (CpuVendorIntel + 5), ///< Xeon-A + IntelP3ModelB = (CpuVendorIntel + 6), ///< PIII-B + IntelPMModelD = (CpuVendorIntel + 7), ///< Pentium M Model D (Dothan) + IntelP4 = (CpuVendorIntel + 8), ///< Pentium 4, Pentium 4-M, Xeon, Celeron + IntelPMModelE = (CpuVendorIntel + 9), ///< Pentium M Model E (Yonah) + IntelCoreModelF = (CpuVendorIntel + 10), ///< Core F (Conroe) +}; + +/// Specifies a struct that contains information about the system. +struct SystemInfo +{ + CpuType cpuType; ///< Cpu type + char cpuVendorString[16]; ///< Null-terminated cpu vendor string + char cpuBrandString[48]; ///< Null-terminated cpu brand string + uint32 cpuLogicalCoreCount; ///< Number of logical cores on the cpu + uint32 cpuPhysicalCoreCount; ///< Number of physical cores on the cpu + uint32 totalSysMemSize; ///< Total system memory (RAM) size in megabytes + uint32 cpuFrequency; ///< Reports CPU clock speed in MHz. + /// (From Registry for Windows, current average processor speed for Linux.) + uint32 displayFamily; ///< Display Family of cpu + uint32 displayModel; ///< Display Model of cpu + + union + { + struct + { + uint32 affinityMask[RyzenMaxCcxCount]; ///< Affinity mask for each core complex (CCX). + } amdRyzen; ///< Properties specific to AMD Ryzen CPUs. + } cpuArchInfo; ///< This member should be used only for Ryzen for now. +}; + +/// Returns an appropriate result from the given errno +/// +/// @param errnoIn Value from 'errno' (or functions that return errno_t) +/// +/// @returns Relevant Result value for the given errno -- never Success. 
+inline Result ConvertErrno( + int32 errnoIn) +{ + Result result = Result::ErrorUnknown; + switch (errnoIn) + { + case EAGAIN: + case EBUSY: + case EINTR: + result = Result::NotReady; + break; + case ETIMEDOUT: + case ETIME: + result = Result::Timeout; + break; + case EEXIST: + result = Result::AlreadyExists; + break; + case ENOENT: + case ENOTDIR: + result = Result::NotFound; + break; + case EACCES: + case EPERM: + case EROFS: + result = Result::ErrorPermissionDenied; + break; + case ENOSPC: + result = Result::ErrorDiskFull; + break; + case EISDIR: + case EINVAL: + case EBADF: + case ENAMETOOLONG: + case ELOOP: + result = Result::ErrorInvalidValue; + break; + case ENOMEM: + case EOVERFLOW: + result = Result::ErrorOutOfMemory; + break; + default: + PAL_ALERT_ALWAYS_MSG("Unknown result generated from errno %d (%s)", errnoIn, strerror(errnoIn)); + break; + } + return result; +} + +#if defined(_WIN32) +/// Helper function to convert Pal::Result to HRESULT +/// +/// @param result Pal::Result to convert +/// +/// @returns the equivalent HRESULT +extern HRESULT PalToHResult(Result result); + +/// Helper function to convert HRESULT to Pal::Result +/// +/// @param hr HRESULT to convert +/// +/// @returns the equivalent Pal::Result +extern Result HResultToPal(HRESULT hr); +#endif + +#if defined(_WIN32) +/// Returns an appropriate result from the given error from 'GetLastError'. +/// Certain OS functions may also return an 'ERROR_SUCCESS', which this helper also translates. +/// +/// @param winError System error code from 'GetLastError' +/// +/// @returns Relevant Result value for the given system error code. 
+inline Result ConvertWinError(
+    uint32 winError)
+{
+    // Anything not explicitly listed below is reported as ErrorUnknown (and raises an alert).
+    Result result = Result::ErrorUnknown;
+    switch (winError)
+    {
+    // File already exists
+    case ERROR_ALREADY_EXISTS:
+    case ERROR_FILE_EXISTS:
+        result = Result::AlreadyExists;
+        break;
+
+    // File or path does not exist
+    case ERROR_FILE_NOT_FOUND:
+    case ERROR_PATH_NOT_FOUND:
+    case ERROR_INVALID_NAME:
+        result = Result::NotFound;
+        break;
+
+    // File locked for specified access
+    case ERROR_SHARING_VIOLATION:
+    case ERROR_LOCK_VIOLATION:
+        result = Result::ErrorNotShareable;
+        break;
+    case ERROR_ACCESS_DENIED:
+        result = Result::ErrorPermissionDenied;
+        break;
+
+    // Temporary Errors
+    case ERROR_NOT_READY:
+    case ERROR_BUSY:
+        result = Result::NotReady;
+        break;
+    case WAIT_TIMEOUT:
+        result = Result::Timeout;
+        break;
+
+    // Usage errors
+    case ERROR_BUFFER_OVERFLOW:
+    case ERROR_INVALID_HANDLE:
+        result = Result::ErrorInvalidValue;
+        break;
+
+    // Various expected but rare error cases
+    case ERROR_TOO_MANY_OPEN_FILES:
+    case ERROR_FILE_TOO_LARGE:
+    case ERROR_VIRUS_INFECTED:
+    case ERROR_VIRUS_DELETED:
+        result = Result::ErrorUnavailable;
+        break;
+    case ERROR_HANDLE_DISK_FULL:
+    case ERROR_DISK_FULL:
+        result = Result::ErrorDiskFull;
+        break;
+    case ERROR_NOT_ENOUGH_MEMORY:
+    case ERROR_OUTOFMEMORY:
+        result = Result::ErrorOutOfMemory;
+        break;
+
+    case ERROR_SUCCESS:
+        result = Result::Success;
+        break;
+    default:
+        // winError is unsigned, so it is printed with %u; %d would show large codes as negative numbers.
+        PAL_ALERT_ALWAYS_MSG("Unknown result generated from Windows error code %u", winError);
+        break;
+    }
+    return result;
+}
+
+/// Windows Process Integrity Levels
+enum class ProcessIntegrityLevel
+{
+    Untrusted, ///< Untrusted Process (No System Resource Access)
+    Low,       ///< Restricted Process
+    Medium,    ///< Normal Process
+    High,      ///< Elevated Process
+    System     ///< System-Level or higher Process
+};
+
+/// Gets the integrity level of the current process.
+///
+/// @returns the Integrity Level of the current process
+extern ProcessIntegrityLevel GetProcessIntegrityLevel();
+
+/// Detects
if we're inside an App Container sandbox (ie, a Windows Store or UWP app) +/// +/// @returns whether the current process is in an App Container +extern bool IsProcessInAppContainer(); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 921 +/// Tests whether the passed handle is valid or not. +/// An invalid handle could be either nullptr or INVALID_HANDLE_VALUE. +/// INVALID_HANDLE_VALUE is the proper value to set a handle to when you intend to error out. +/// However, handles default-initialize to nullptr. So that's also a common value you see for an unset handle. +/// A common error is to check for one value but not the other. So PAL prescribes that clients use this helper. +/// +/// @param [in] handle Handle to test. +/// +/// @returns true if the handle is neither nullptr nor INVALID_HANDLE_VALUE. +constexpr bool IsValidHandle(HANDLE handle) +{ +#ifdef INVALID_HANDLE_VALUE + return (handle != nullptr) && (handle != INVALID_HANDLE_VALUE); +#else + return (handle != nullptr) && (handle != HANDLE(size_t(-1))); +#endif +} +#endif +#endif + +/// Queries system information. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns Success if querying the system info was successful. Otherwise, the following results will be returned: +/// + ErrorInvalidPointer returned if pSystemInfo is nullptr. +/// + ErrorOutOfMemory returned if the system ran out of memory during the function call. +/// + ErrorUnavailable returned if querying the system info is not supported. +/// + ErrorUnknown returned if an error occurs while calling OS functions. +extern Result QuerySystemInfo(SystemInfo* pSystemInfo); + +/// Query cpu type for AMD processor. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns none. +extern void QueryAMDCpuType(SystemInfo* pSystemInfo); + +/// Query cpu type for Intel processor. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns none. 
+extern void QueryIntelCpuType(SystemInfo* pSystemInfo); + +/// Gets the frequency of performance-related queries. +/// +/// @returns Current CPU performance counter frequency in Hz. +extern int64 GetPerfFrequency(); + +/// Gets the current time of a performance-related query. +/// +/// This is a high resolution time stamp that can be used in conjunction with GetPerfFrequency to measure time +/// intervals. +/// +/// @param [in] raw Whether to use a 'monotonic raw' clock which ignores smoothing. Ignored on Windows. +/// +/// @returns Current value of the CPU performance counter. +extern int64 GetPerfCpuTime(bool raw=false); + +/// Determines if a specific key is pressed down. +/// +/// @param [in] key Specifies which key to check. +/// @param [in, out] pPrevState The previous state of the key. Optional; defaults to nullptr. +/// +/// @returns True if the specified key is currently pressed down. +extern bool IsKeyPressed(KeyCode key, bool* pPrevState = nullptr); + +/// Determines if profiling is restricted +/// +/// @returns true if the process is not restricted for profiling, otherwise, false will be returned. +/// NOTE(review): this reads as inverted relative to the function name -- confirm against the implementation. +extern bool IsProfileRestricted(); + +/// Retrieves the fully resolved file name of the application binary. +/// +/// @param [out] pBuffer Character buffer to contain the application's executable and (fully-resolved) path +/// string. +/// @param [out] ppFilename Pointer to the location within the output buffer where the executable name begins. +/// @param [in] bufferLength Length of the output buffer, in bytes. +/// @returns Result::Success if GetModuleFileNameA succeeds. Otherwise, the following result codes would be returned: +/// + Result::ErrorInvalidMemorySize returned if pBuffer is not sufficiently large. +extern Result GetExecutableName( + char* pBuffer, + char** ppFilename, + size_t bufferLength); + +/// Retrieves the fully resolved wchar_t file name of the application binary. 
+/// +/// @param [out] pWcBuffer wchar_t buffer to contain the application's executable and (fully-resolved) path +/// string. +/// @param [out] ppWcFilename Pointer to the location within the wchar_t output buffer where the executable name begins. +/// @param [in] bufferLength Length of the output buffer, in bytes. +/// @returns Result::Success if GetModuleFileNameW succeeds. Otherwise, the following result codes would be returned: +/// + Result::ErrorInvalidMemorySize returned if pWcBuffer is not sufficiently large. +extern Result GetExecutableName( + wchar_t* pWcBuffer, + wchar_t** ppWcFilename, + size_t bufferLength); + +/// Gets the current library name, i.e. the name of the library containing the function +/// 'GetCurrentLibraryName'. Optionally, it will also return the extension if the input +/// buffer for extension is valid. +/// +/// @param [out] pLibBuffer Character buffer where the library name will be stored. +/// @param [in] libBufferLength Length of the output buffer that will hold the library name, in bytes. +/// @param [out] pExtBuffer Character buffer where the extension will be stored. +/// @param [in] extBufferLength Length of the output buffer that will hold the extension, in bytes. +/// @returns Result::Success if no error. Otherwise, returns one of the following codes: +/// Result::ErrorInvalidMemorySize - if incoming buffer is too small. +/// Result::ErrorUnknown - for all other types of errors. +extern Result GetCurrentLibraryName( + char* pLibBuffer, + size_t libBufferLength, + char* pExtBuffer, + size_t extBufferLength); + +/// Opaque build ID obtained with GetCurrentLibraryBuildId +struct BuildId { + uint8 data[16]; +}; + +/// Gets build-unique identifier for the executable or shared library PAL was built into. This will be some opaque +/// hash or timestamp embedded in the code if present and falls back to random bytes (constant until exit) for a +/// 'usable' build id to always exist. 
+/// +/// @param [out] pBuildId Will contain unique id for build +/// +/// @returns true if build id will be persistent, false if temporary +/// +/// @note We fallback to using a random build id here instead of failing so that even if this fails and nobody checks, +/// a new driver will always invalidate caches. It is better, of course, not to persist these on disk. +/// @note This may be heavy on first invocation but can be assumed to always return quickly after that. +extern bool GetCurrentLibraryBuildId( + BuildId* pBuildId); + +/// Splits a filename into its path and file components. +/// +/// @param [in] pFullPath Buffer containing the full path & file name. +/// @param [out] pPathBuf Optional. If non-null, will contain the path to the file name. On Windows, this will also +/// include the drive letter. +/// @param [in] pathLen Length of the pPathBuf buffer. Must be zero when pPathBuf is null. +/// @param [out] pFileBuf Optional. If non-null, will contain the base file name, and extension. +/// @param [in] fileLen Length of the pFileBuf buffer. Must be zero when pFileBuf is null. +extern void SplitFilePath( + const char* pFullPath, + char* pPathBuf, + size_t pathLen, + char* pFileBuf, + size_t fileLen); + +/// Creates a new directory at the specified path. +/// +/// @param [in] pPathName String specifying the new path to create. Note that this method can only create one +/// directory, if you specify "foo/bar" the "bar" directory can only be created if "foo" already +/// exists. +/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the +/// following result codes may be returned: +/// + Result::AlreadyExists if the specified directory already exists. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +extern Result MkDir( + const char* pPathName); + +/// Creates a new directory at the specified path and all intermediate directories. 
+/// +/// @param [in] pPathName String specifying the new path to create. +/// +/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the +/// following result codes may be returned: +/// + Result::AlreadyExists if the specified directory already exists. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +extern Result MkDirRecursively( + const char* pPathName); + +/// A tuple containing a file's name and statistics +struct StatName +{ + File::Stat stat; + char name[MaxPathStrLen]; +}; + +/// Gets file information for the files in a directory +/// +/// @param [in] dirPath string specifying the directory +/// @param [out] pFileInfos list of information on every file in the directory +/// +/// @returns Result::ErrorInvalidPointer if any of the input pointers are null +/// @returns Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows +/// @returns Result::ErrorInvalidValue for all file I/O errors on Linux +/// @returns Result::Success if the dir is empty +/// @returns Result::Success otherwise +extern Result GetFileInfoInDir( + StringView dirPath, + Vector* pFileInfos); + +/// Counts the number of files found within the directory. 
+/// +/// @param [in] dirPath string specifying the directory +/// @param [out] pFileCount the number of files in the directory +/// @param [out] pCharCount the number of characters in the names of all the files +/// +/// @returns Result::ErrorInvalidPointer if any of the input pointers are null +/// @returns Value of Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows +/// @returns Result::ErrorInvalidValue for all file I/O errors on Linux +/// @returns Result::Success if the dir is empty (pFileCount and pCharCount will be 0) +/// @returns Result::Success otherwise +extern Result CountFilesInDir( + Util::StringView dirPath, + size_t* pFileCount, + size_t* pCharCount); + +/// Lists the contents of the specified directory in an array of strings +/// +/// @param [in] dirPath String specifying the directory +/// @param [out] fileNames An array where pointers to the file names will be written. +/// @param [out] buffer Memory where the file names can be stored. +/// +/// @returns Result::ErrorInvalidPointer if any of the inputs are null or empty +/// @returns Result::ErrorInvalidValue if there are any file I/O errors +/// @returns Result::Success otherwise +extern Result GetFileNamesInDir( + Util::StringView dirPath, + Util::Span> fileNames, + Util::Span buffer); + +/// Non-recursively delete the least-recently-accessed files from a directory until the directory reaches the desired size in bytes. +/// +/// @param [in] pPathName string specifying the absolute path to the directory you want to remove files from +/// @param [in] desiredSize the size you want to shrink the directory to +/// +/// @returns Result::ErrorUnknown on File I/O error. +/// Result::Success otherwise. +Result RemoveOldestFilesOfDirUntilSize( + const char* pPathName, + uint64 desiredSize); + +/// Non-recursively delete the least-recently-accessed files until the total reaches the desired size in bytes. +/// +/// @param [in] dirPath String specifying the directory. 
+/// @param [in,out] pFileInfos List of files. This span will be modified to remove the files which were deleted. +/// @param [in] desiredSize The size you want to shrink the list of files to. +/// +/// @returns Result::ErrorUnknown on File I/O error. +/// Result::Success otherwise. +Result RemoveOldestFilesOfDirUntilSize( + StringView dirPath, + Span* pFileInfos, + uint64 desiredSize); + +/// Remove all files older than the threshold from the directory at the specified path. +/// +/// @param [in] pPathName String specifying the absolute path to remove. +/// @param [in] threshold Files whose time (from 1970/01/01 00:00:00) is older (smaller) than threshold will be removed. +/// +/// @returns Result::Success if all files are successfully removed. Otherwise, the +/// following result codes may be returned: +/// + Result::ErrorUnknown if the specified directory failed to open/remove. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +Result RemoveFilesOfDirOlderThan( + const char* pPathName, + SecondsSinceEpoch threshold); + +/// Get status of a directory at the specified path. +/// +/// @param [in] pPathName String specifying the absolute path. +/// @param [out] pTotalSize Total size, in bytes, of all files +/// @param [out] pOldestTime The oldest time (seconds from 1970/01/01 00:00:00) of all files +/// +/// @returns Result::Success if the directory status was successfully queried. Otherwise, the +/// following result codes may be returned: +/// + Result::ErrorUnknown if the specified directory failed to open. +Result GetStatusOfDir( + const char* pPathName, + uint64* pTotalSize, + SecondsSinceEpoch* pOldestTime); + +/// Almost-Posix-style rename file or directory: replaces already-existing file. +/// Posix says this operation is atomic; Windows does not specify. +/// +/// @param [in] pOldName Old file or directory name +/// @param [in] pNewName Name to rename to +/// +/// @returns Result::Success if file/directory successfully moved. 
+Result Rename( + const char* pOldName, + const char* pNewName); + +/// Get the Process ID of the current process +/// +/// @returns The Process ID of the current process +extern uint32 GetIdOfCurrentProcess(); + +/// OS-specific wrapper for printing stack trace information. +/// +/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] skipFrames Number of stack frames to skip. Implied skip of 1 (0 is 1). +/// +/// @returns The resultant length of the stack trace string. +extern size_t DumpStackTrace( + char* pOutput, + size_t bufSize, + uint32 skipFrames); + +/// Puts the calling thread to sleep for a specified number of milliseconds. +/// +/// @param [in] duration Amount of time to sleep for, in milliseconds. +extern void Sleep(std::chrono::milliseconds duration); + +/// Create Directory and SubDirectory of Executable's Name at pBaseDir +/// Like if pBaseDir="amdPal/" then final pLogDir will be created like this: amdpal/app.exe/ +/// +/// @param [in] pBaseDir Base Dir Path +/// @param [out] pLogDir Pointer to Memory where Final Path will be saved +/// @param [in] logDirSize Size of pLogDir in chars +/// +/// @returns Result::Success if the directories were created and pLogDir was populated. +extern Result CreateLogDir( + const char* pBaseDir, + char* pLogDir, + size_t logDirSize); + +/// Check if the requested key is combo key. +/// +/// @param [in] key The requested key value +/// @param [out] pKeys The array of keys the combo key composed of +/// +/// @returns If the requested key is a combo key. 
+inline bool IsComboKey( + KeyCode key, + KeyCode* pKeys) +{ + bool ret = false; + + if (key == KeyCode::Shift_F10) + { + ret = true; + pKeys[0] = KeyCode::Shift; + pKeys[1] = KeyCode::F10; + } + else if (key == KeyCode::Shift_F11) + { + ret = true; + pKeys[0] = KeyCode::Shift; + pKeys[1] = KeyCode::F11; + } + else + { + pKeys[0] = key; + } + + return ret; +} + +#if PAL_HAS_CPUID +/// Issue the cpuid instruction. +/// +/// @param [out] pRegValues EAX/EBX/ECX/EDX values +/// @param [in] level CpuId instruction feature level. +inline void CpuId( + uint32* pRegValues, + uint32 level) +{ +#if defined(_WIN32) + __cpuid(reinterpret_cast(pRegValues), static_cast(level)); +#elif defined(__unix__) + __get_cpuid(level, pRegValues, pRegValues + 1, pRegValues + 2, pRegValues + 3); +#else +#error "Not implemented for the current platform" +#endif +} + +/// Issue the cpuid instruction, with an additional sublevel code. +/// +/// @param [out] pRegValues EAX/EBX/ECX/EDX values +/// @param [in] level CpuId instruction feature level. +/// @param [in] sublevel CpuId instruction feature sublevel. +inline void CpuId( + uint32* pRegValues, + uint32 level, + uint32 sublevel) +{ +#if defined(_WIN32) + __cpuidex(reinterpret_cast(pRegValues), static_cast(level), static_cast(sublevel)); +#elif defined(__unix__) + __cpuid_count(level, sublevel, *pRegValues, *(pRegValues + 1), *(pRegValues + 2), *(pRegValues + 3)); +#else +#error "Not implemented for the current platform" +#endif +} +#endif + +/// Play beep sound. Currently function implemented only for WIN platform. +/// +/// @param [in] frequency Frequency in hertz of the beep sound. +/// @param [in] duration Duration in milliseconds of the beep sound. +extern void BeepSound( + uint32 frequency, + uint32 duration); + +/// Detect if a debugger is attached to the current process. +/// +/// @note This function is conservative, declaring the debugger not attached on error. +/// +/// @returns true if there is a debugger attached. 
+extern bool IsDebuggerAttached(); + +/// Set path to be accessible by everyone. +/// +/// In Linux world that is 777. For Windows it does nothing. +/// To be used for debug folders mainly. Use carefully for release paths as this will open them wide +/// and possibly create a security hole. +/// +/// @param [in] fileName Path to a file or folder, it must exist. +/// +/// @returns Returns success if permissions were set successfully, Result::ErrorUnknown otherwise +extern Result SetRwxFilePermissions(const char* pFileName); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palTime.h b/shared/amdgpu-windows-interop/pal/inc/util/palTime.h new file mode 100644 index 0000000000..69e3394966 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palTime.h @@ -0,0 +1,87 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palTime.h + * @brief PAL time-related utility collection. + *********************************************************************************************************************** + */ + +#pragma once + +#include + +namespace Util +{ + +/// Specifies a class that implements a timestamp. +class Timestamp +{ +public: + /// Creates a new timestamp object that records the time it was created. + Timestamp(); + + /// Returns the timestamp as a C-string. + const char* CStr() const { return m_data; } + +private: + char m_data[64]; +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 873 +/// Seconds stored as a float instead of an integer. +using fseconds = std::chrono::duration; +/// Milliseconds stored as a float instead of an integer. +using fmilliseconds = std::chrono::duration; +/// Microseconds stored as a float instead of an integer. +using fmicroseconds = std::chrono::duration; +/// Nanoseconds stored as a float instead of an integer. +using fnanoseconds = std::chrono::duration; + +/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. +/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. +/// system_clock's internal duration is still implementation defined. +/// On Windows it's hundreds-of-nanoseconds and on Linux it's seconds. +/// However time_point has it's own duration type. 
+/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. +using SecondsSinceEpoch = std::chrono::time_point; + +/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. +template +constexpr DestDuration TimeoutCast( + const std::chrono::duration& d) +{ + if (d == (std::chrono::duration::max)()) + { + return (DestDuration::max)(); + } + else + { + return std::chrono::duration_cast(d); + } +} +#endif + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h new file mode 100644 index 0000000000..878731e1db --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h @@ -0,0 +1,714 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palUtil.h + * @brief Common include for the PAL utility collection. Defines common types, macros, enums, etc. + *********************************************************************************************************************** + */ + +#pragma once + +/// Utility macro for turning another macro into a string literal. +#define _PAL_STRINGIFY(_x) #_x +#define PAL_STRINGIFY(_x) _PAL_STRINGIFY(_x) + +/// C++11 standard version. +#define PAL_CPLUSPLUS_11 201103L +/// C++14 standard version. +#define PAL_CPLUSPLUS_14 201402L +/// C++17 standard version. +#define PAL_CPLUSPLUS_17 201703L +/// C++ feature version from September 2017 contains a few C++20 features. +#define PAL_CPLUSPLUS_1709 201709L +/// C++20 standard version. +#define PAL_CPLUSPLUS_20 202002L + +/// C++ standard version used to compile PAL. +# ifdef _MSVC_LANG +// MSVC requires the /Zc:__cplusplus flag, otherwise __cplusplus is that of 199711L (C++98). _MSVC_LANG has the correct +// C++ standard version. +# define PAL_CPLUSPLUS _MSVC_LANG +# else +# define PAL_CPLUSPLUS __cplusplus +# endif + +/// Checks if PAL is compiled with C++ of at least version @p v. +#define PAL_CPLUSPLUS_AT_LEAST(v) (PAL_CPLUSPLUS >= (v)) + +static_assert( + PAL_CPLUSPLUS_AT_LEAST(PAL_CPLUSPLUS_1709), + "C++ standard version " PAL_STRINGIFY(PAL_CPLUSPLUS_1709) " is required to build PAL. 
" + "Found " PAL_STRINGIFY(PAL_CPLUSPLUS) "."); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 878 +/// We already declare NOMINMAX publicly, but that won't stop clients from defining their own min/max macros. +/// These macros confuse the compiler when using functions named min/max, leading to build errors. +#if defined(min) || defined(max) +static_assert(false, "Clients may not define macros named \"min\" or \"max\"."); +#endif +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 +#include +#endif +#include + +/// stdint is included instead of cstdint to allow Visual Studio Intellisense to work for Linux builds. This can be +/// removed if the error caused by including cstdint is figured out. +#include "stdint.h" + +/// Include in the class declaration in order to disallow use of the copy constructor and assignment operator for that +/// class. +#define PAL_DISALLOW_COPY_AND_ASSIGN(_typename) \ + _typename(const _typename&) = delete; \ + _typename& operator=(const _typename&) = delete; + +/// Include in the declaration in order to disallow use of the default constructor for a class. +#define PAL_DISALLOW_DEFAULT_CTOR(_typename) \ + _typename() = delete; + +#if !defined(__GNUC__) + +// Equates to the [__stdcall](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/stdcall.md) convention on Windows. +#define PAL_STDCALL __stdcall +// Equates to the [__cdecl](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/cdecl.md) convention on Windows. +#define PAL_CDECL __cdecl +// Equates to [__declspec(align(__x))](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/align-cpp.md) on Windows. +#define PAL_ALIGN(__x) __declspec(align(__x)) +#define PAL_FORCE_INLINE __forceinline +#else +/// Undefined on GCC platforms. +#define PAL_STDCALL +/// Undefined on GCC platforms. +#define PAL_CDECL +/// Undefined on GCC platforms. 
+#define PAL_ALIGN(__x) +#define PAL_FORCE_INLINE __attribute__((always_inline)) inline +#endif + +/// Platform cache line size in bytes. +#define PAL_CACHE_LINE_BYTES 64 +/// Platform system memory page size in bytes. +#define PAL_PAGE_BYTES 4096 + +/// Force cache line alignment. +#define PAL_ALIGN_CACHE_LINE PAL_ALIGN(PAL_CACHE_LINE_BYTES) + +#if !defined(_WIN32) +/// Value representing an invalid file descriptor on Linux systems. +constexpr int32_t InvalidFd = -1; +#endif + +#ifdef __has_builtin +/// A macro that checks for the presence of builtin functions. Will default to false if the compiler does not have +/// support for doing this check. +#define PAL_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +#define PAL_HAS_BUILTIN(builtin) 0 +#endif + +#if defined(__has_cpp_attribute) +#define PAL_HAS_CPP_ATTR(attr) __has_cpp_attribute(attr) +#else +#define PAL_HAS_CPP_ATTR(attr) 0 +#endif + +/// Library-wide namespace encapsulating all PAL utility collection entities. +namespace Util +{ + +typedef int8_t int8; ///< 8-bit integer. +typedef int16_t int16; ///< 16-bit integer. +typedef int32_t int32; ///< 32-bit integer. +typedef int64_t int64; ///< 64-bit integer. +typedef uint8_t uint8; ///< Unsigned 8-bit integer. +typedef uint16_t uint16; ///< Unsigned 16-bit integer. +typedef uint32_t uint32; ///< Unsigned 32-bit integer. +typedef uint64_t uint64; ///< Unsigned 64-bit integer. +typedef uint64_t gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from + /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. + +/// Error and return codes indicating outcome of a requested operation. Success result codes are greater than or equal +/// to 0, and error results codes are less than 0. +enum class Result : int32 +{ + /// @internal The operation completed successfully. + _Success = 0x00000000, + + // Unfortunately for Linux clients, X.h includes a "#define Success 0" macro. 
Clients have their choice of either + // undefing Success before including this header or using _Success when dealing with PAL. +#ifndef Success + /// The operation completed successfully. + Success = _Success, +#endif + + /// The operation is not supported. + Unsupported = 0x00000001, + + /// The operation completed successfully but the result is not ready. This result code normally applies to + /// situations where results of queued GPU operations such as queries and fences have not been written to memory + /// yet. + NotReady = 0x00000002, + + /// The wait operation completed due to a client-specified timeout condition. + Timeout = 0x00000003, + + /// The event is in the "set" state. @see IGpuEvent::GetStatus. + EventSet = 0x00000004, + + /// The event is in the "reset" state. @see IGpuEvent::GetStatus. + EventReset = 0x00000005, + + /// The operation was successful, but the client has reached the maximum allowable number of flippable GPU memory + /// objects. Future requests to create presentable Images or flippable GPU memory objects may fail due to + /// limitations within the underlying OS. + /// @see IDevice::CreateGpuMemory. + /// @see IDevice::CreatePresentableImage. + TooManyFlippableAllocations = 0x00000006, + + /// The present was successful, but some portion of the window is currently occluded by another window. + PresentOccluded = 0x00000007, + + /// The directory/file/etc. being created already exists. + AlreadyExists = 0x00000008, + + /// A warning indicates an operation is successful (supported by H/W) but out of a certain spec (e.g. VESA). + OutOfSpec = 0x00000009, + + /// The value being searched for was not found. + NotFound = 0x0000000A, + + /// End of file reached successfully. + Eof = 0x0000000B, + + /// If ReserveEntryOnMiss was specified, the entry was not found, and the entry was successfully reserved. + Reserved = 0x0000000C, + + /// If an operation is purposefully terminated early, rather than from an error. 
+ Aborted = 0x0000000D, + + /// The operation encountered an unknown error. + ErrorUnknown = -(0x00000001), + + /// The requested operation is unavailable at this time. + ErrorUnavailable = -(0x00000002), + + /// The initialization operation failed for unknown reasons. + ErrorInitializationFailed = -(0x00000003), + + /// The operation could not complete due to insufficient system memory. + ErrorOutOfMemory = -(0x00000004), + + /// The operation could not complete due to insufficient GPU memory. + ErrorOutOfGpuMemory = -(0x00000005), + + /// The device was lost due to its removal or a possible hang and recovery condition. The client should destroy all + /// devices (and objects attached to them) and re-enumerate the available devices be calling EnumerateDevices(). + ErrorDeviceLost = -(0x00000007), + + /// A required input pointer passed to the call was invalid (probably null). + ErrorInvalidPointer = -(0x00000008), + + /// An invalid value was passed to the call. + ErrorInvalidValue = -(0x00000009), + + /// An invalid ordinal was passed to the call. + ErrorInvalidOrdinal = -(0x0000000A), + + /// An invalid memory size was passed to the call. + ErrorInvalidMemorySize = -(0x0000000B), + + /// Invalid flags were passed to the call. + ErrorInvalidFlags = -(0x0000000C), + + /// An invalid alignment parameter was specified + ErrorInvalidAlignment = -(0x0000000D), + + /// An invalid resource format was specified. + ErrorInvalidFormat = -(0x0000000E), + + /// The requested operation cannot be performed on the provided @ref Pal::IImage object. + ErrorInvalidImage = -(0x0000000F), + + /// The descriptor set data is invalid or does not match the related pipeline. + ErrorInvalidDescriptorSetData = -(0x00000010), + + /// An invalid queue type was specified. + ErrorInvalidQueueType = -(0x00000011), + + /// An invalid object type was specified. + ErrorInvalidObjectType = -(0x00000012), + + /// The specified shader uses an unsupported version of AMD IL. 
+ ErrorUnsupportedShaderIlVersion = -(0x00000013), + + /// The specified shader code is invalid or corrupt. + ErrorBadShaderCode = -(0x00000014), + + /// The specified serialized pipeline data is invalid or corrupt. + ErrorBadPipelineData = -(0x00000015), + + /// The queue operation specified more GPU memory references than are supported. + /// @see Pal::IQueue::Submit + /// @see Pal::IDevice::AddGpuMemoryReferences + /// @see Pal::DeviceProperties::maxGpuMemoryRefsResident + ErrorTooManyMemoryReferences = -(0x00000016), + + /// The memory object cannot be mapped because it does not reside in a CPU visible heap. + ErrorNotMappable = -(0x00000017), + + /// The map operation failed due to an unknown or system reason. + ErrorGpuMemoryMapFailed = -(0x00000018), + + /// The unmap operation failed due to an unknown or system reason. + ErrorGpuMemoryUnmapFailed = -(0x00000019), + + /// The serialized pipeline load operation failed due to an incompatible device. + ErrorIncompatibleDevice = -(0x0000001A), + + /// The serialized pipeline load operation failed due to an incompatible PAL library. + ErrorIncompatibleLibrary = -(0x0000001B), + + /// The requested operation (such as command buffer submission) can't be completed because command buffer + /// construction is not complete. + ErrorIncompleteCommandBuffer = -(0x0000001C), + + /// The specified command buffer failed to build correctly. This error can be delayed from the original source of + /// the error since the command buffer building methods do not return error codes. + ErrorBuildingCommandBuffer = -(0x0000001D), + + /// The operation cannot complete since not all objects have valid GPU memory bound to them. + ErrorGpuMemoryNotBound = -(0x0000001E), + + /// The requested operation is not supported on the specified queue type. + ErrorIncompatibleQueue = -(0x0000001F), + + /// The object cannot be created or opened for sharing between multiple GPU devices. 
+ ErrorNotShareable = -(0x00000020), + + /// The operation failed because the specified fullscreen mode was unavailable. This could be a failure while + /// attempting to take fullscreen ownership, or when attempting to perform a fullscreen present and the user has + /// left fullscreen mode. + ErrorFullscreenUnavailable = -(0x00000021), + + /// The targeted screen of the operation has been removed from the system. + ErrorScreenRemoved = -(0x00000022), + + /// Present failed because the screen mode is no longer compatible with the source image. + ErrorIncompatibleScreenMode = -(0x00000023), + + /// The cross-GPU present failed, possibly due to a lack of system bus bandwidth to accommodate the transfer. + ErrorMultiDevicePresentFailed = -(0x00000024), + + /// The slave GPU(s) in an MGPU system cannot create BLTable present images. + ErrorWindowedPresentUnavailable = -(0x00000025), + + /// The attempt to enter fullscreen exclusive mode failed because the specified image doesn't properly match the + /// screen's current dimensions. + ErrorInvalidResolution = -(0x00000026), + + /// The shader specifies a thread group size that is bigger than what is supported by this device. 
+ ErrorThreadGroupTooBig = -(0x00000027), + + /// Invalid image create info: Specified both color target and depth usage + ErrorInvalidImageTargetUsage = -(0x00000028), + + /// Invalid image create info: Specified a 1D type for a color target + ErrorInvalidColorTargetType = -(0x00000029), + + /// Invalid image create info: Specified a non-2D type for a depth/stencil target + ErrorInvalidDepthTargetType = -(0x0000002A), + + /// Invalid image create info: The image format supports depth/stencil but depth/stencil usage was not specified + ErrorMissingDepthStencilUsage = -(0x0000002B), + + /// Invalid image create info: Specified MSAA and multiple mip levels + ErrorInvalidMsaaMipLevels = -(0x0000002C), + + /// Invalid image create info: The image format is incompatible with MSAA + ErrorInvalidMsaaFormat = -(0x0000002D), + + /// Invalid image create info: The image type is incompatible with MSAA + ErrorInvalidMsaaType = -(0x0000002E), + + /// The sample count is invalid + ErrorInvalidSampleCount = -(0x0000002F), + + /// Invalid image create info: Invalid block compressed image type + ErrorInvalidCompressedImageType = -(0x00000030), + + /// Invalid image create info: Format is incompatible with the specified image usage + ErrorInvalidUsageForFormat = -(0x00000032), + + /// Invalid image create info: Array size is invalid + ErrorInvalidImageArraySize = -(0x00000033), + + /// Invalid image create info: Array size is invalid for a 3D image + ErrorInvalid3dImageArraySize = -(0x00000034), + + /// Invalid image create info: Image width is invalid + ErrorInvalidImageWidth = -(0x00000035), + + /// Invalid image create info: Image height is invalid + ErrorInvalidImageHeight = -(0x00000036), + + /// Invalid image create info: Image depth is invalid + ErrorInvalidImageDepth = -(0x00000037), + + /// Invalid image create info: Mip count is invalid + ErrorInvalidMipCount = -(0x00000038), + + /// Invalid image create info: Image format is incompatible with the image usage specified. 
+ ErrorFormatIncompatibleWithImageUsage = -(0x00000039), + + /// Operation requested an image plane that is not available on the image. + ErrorImagePlaneUnavailable = -(0x0000003A), + + /// Another format is incompatible with an image's format. + ErrorFormatIncompatibleWithImageFormat = -(0x0000003B), + + /// Another format is incompatible with an image plane's format. + ErrorFormatIncompatibleWithImagePlane = -(0x0000003C), + + /// Operation requires a shader readable or writable image usage but the image does not support it. + ErrorImageNotShaderAccessible = -(0x0000003D), + + /// Format is paired with a channel mapping that contains invalid components. + ErrorInvalidFormatSwizzle = -(0x0000003E), + + /// A base mip level that is out of bounds or otherwise invalid was specified. + ErrorInvalidBaseMipLevel = -(0x0000003F), + + /// A view array size that was zero or otherwise invalid was specified. + ErrorInvalidViewArraySize = -(0x00000040), + + /// A view base array slice that was out of bounds or otherwise invalid was specified. + ErrorInvalidViewBaseSlice = -(0x00000041), + + /// A view image type was specified that is incompatible with the image's type. + ErrorViewTypeIncompatibleWithImageType = -(0x00000042), + + /// A view specifies an array slice range that is larger than what is supported by the image. + ErrorInsufficientImageArraySize = -(0x00000043), + + /// It is illegal to create a cubemap view into an MSAA image. + ErrorCubemapIncompatibleWithMsaa = -(0x00000044), + + /// A cubemap view was created to an image that does not have square width and height. + ErrorCubemapNonSquareFaceSize = -(0x00000045), + + /// An fmask view was created to an image that does not support an fmask. + ErrorImageFmaskUnavailable = -(0x00000046), + + /// A private screen was removed. + ErrorPrivateScreenRemoved = -(0x00000047), + + /// A private screen was already in exclusive use. 
+ ErrorPrivateScreenUsed = -(0x00000048), + + /// The image count created or opened on this private display exceed maximum. + ErrorTooManyPrivateDisplayImages = -(0x00000049), + + /// The private screen is not enabled. + ErrorPrivateScreenNotEnabled = -(0x0000004A), + + /// The private screen count exceeds the maximum (including emulated and physical ones). + ErrorTooManyPrivateScreens = -(0x0000004B), + + /// Invalid image create info: Image rowPitch does not equal the image's actual row pitch. + ErrorMismatchedImageRowPitch = -(0x0000004C), + + /// Invalid image create info: Image depthPitch does not equal the image's actual depth pitch. + ErrorMismatchedImageDepthPitch = -(0x0000004D), + + /// The given swap chain cannot be associated with any more presentable images. + ErrorTooManyPresentableImages = -(0x0000004E), + + /// A fence was used in GetStatus() or WaitForFences() without being used in any submission. + ErrorFenceNeverSubmitted = -(0x0000004F), + + /// The image used on the specified private screen has an invalid format. + ErrorPrivateScreenInvalidFormat = -(0x00000050), + + /// The timing data set on the specified private screen was invalid. + ErrorPrivateScreenInvalidTiming = -(0x00000051), + + /// The resolution set on the specified private screen was invalid. + ErrorPrivateScreenInvalidResolution = -(0x00000052), + + /// The scaling parameter set on the specified private screen was invalid. + ErrorPrivateScreenInvalidScaling = -(0x00000053), + + /// Invalid image create info: Invalid YUV image type + ErrorInvalidYuvImageType = -(0x00000054), + + /// The external shader cache found a matching hash but the with different key data. + ErrorShaderCacheHashCollision = -(0x00000055), + + /// The external shader cache is full + ErrorShaderCacheFull = -(0x00000056), + + /// The operation caused a pagefault. + ErrorGpuPageFaultDetected = -(0x00000057), + + /// The provided pipeline ELF uses an unsupported ABI version. 
+ ErrorUnsupportedPipelineElfAbiVersion = -(0x00000058), + + /// The provided pipeline ELF is invalid. + ErrorInvalidPipelineElf = -(0x00000059), + + /// The returned results were incomplete. + ErrorIncompleteResults = -(0x00000060), + + /// The display mode is imcompatible with framebuffer or CRTC. + ErrorIncompatibleDisplayMode = -(0x00000061), + + /// Implicit fullscreen exclusive mode is not safe because the specified window size doesn't match the + /// screen's current dimensions. + ErrorIncompatibleWindowSize = -(0x00000062), + + /// A semaphore was used in WaitForSemaphores() without being signaled. + ErrorSemaphoreNeverSignaled = -(0x00000063), + + /// Invalid image create info: specified metadataMode is invalid for the Image. + ErrorInvalidImageMetadataMode = -(0x00000064), + + /// Invalid external handle detected for the Image. + ErrorInvalidExternalHandle = -(0x00000065), + + /// The permission of operation is denied. + ErrorPermissionDenied = -(0x00000066), + + /// The operation failed because the disk is full. + ErrorDiskFull = -(0x00000067), + + /// The static VMID acquire/release operation failed. + ErrorStaticVmidOpFailed = -(0x00000068), + +}; + +///Specifies a ratio of two unsigned integers. +struct Rational +{ + uint32 numerator; ///< Numerator + uint32 denominator; ///< Denominator +}; + +/// Implements operator== for PAL's Rational struct where similar ratios like 2/3 and 4/6 are treated as equal. +/// +/// @param left Rational to be compared +/// @param right Rational to be compared +/// +/// @return true if the ratios are logically equal. +constexpr bool operator==( + const Rational& lhs, + const Rational& rhs) +{ + // Any ratio with a zero denominator is illegal/undefined, for example: "3/0 == 5/0" or "5/3 == 0/0". We must pick + // either "true" or "false" for these illegal cases. "true" seems like the most wrong option so we use "false". 
+ if ((lhs.denominator == 0) || (rhs.denominator == 0)) + { + return false; + } + // Otherwise, our equality check is: lhs_n / lhs_d == rhs_n / rhs_d + // Multiply both sides by lhs_d: lhs_n == rhs_n * lhs_d / rhs_d + // Multiply both sides by rhs_d: lhs_n * rhs_d == rhs_n * lhs_d + // This trick avoids dealing with common factors or remainders and uses no slow division instructions. + return (static_cast(lhs.numerator) * static_cast(rhs.denominator) == + static_cast(rhs.numerator) * static_cast(lhs.denominator)); +} + +// Flags to be passed to store operations. +struct StoreFlags +{ + union + { + struct + { + uint32 enableFileCache : 1; ///< If we should skip the file cache layer when we get to it. + uint32 enableCompression : 1; ///< If we should skip the compression layer when we get to it. + uint32 reserved : 30; + }; + uint32 all; + }; +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 +/// Seconds stored as a float instead of an integer. +using fseconds = std::chrono::duration; +/// Milliseconds stored as a float instead of an integer. +using fmilliseconds = std::chrono::duration; +/// Microseconds stored as a float instead of an integer. +using fmicroseconds = std::chrono::duration; +/// Nanoseconds stored as a float instead of an integer. +using fnanoseconds = std::chrono::duration; + +/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. +/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. +/// system_clock's internal duration is still implementation defined. +/// On Windows it's hundreds of nanoseconds and on Linux it's seconds. +/// However time_point has it's own duration type. +/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. +using SecondsSinceEpoch = std::chrono::time_point; + +/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. 
+template +constexpr ToDuration TimeoutCast( + const std::chrono::duration& d) +{ + if (d == (std::chrono::duration::max)()) + { + return (ToDuration::max)(); + } + else + { + return std::chrono::duration_cast(d); + } +} +#endif + +/// Inline function to determine if a Result enum is considered an error. +constexpr bool IsErrorResult(Result result) { return (static_cast(result) < 0); } + +/// Inline function to collapse two Result enums into the most useful Result code. It considers errors to be more +/// interesting than success codes and considers "Success" to be the least interesting success code. If both Results +/// are errors, the first Result is returned. +constexpr Result CollapseResults(Result lhs, Result rhs) + { return (IsErrorResult(lhs) || (static_cast(lhs) > static_cast(rhs))) ? lhs : rhs; } + +/** + *********************************************************************************************************************** + * @page UtilOverview Utility Collection + * + * In addition to its GPU-specific core functionality, PAL provides a lot of generic, OS-abstracted software utilities + * in the @ref Util namespace. The PAL core relies on these utilities, but they are also available for use by its + * clients. In fact, it is possible to build and use PAL only for its utility collection by building PAL with the + * PAL_BUILD_CORE build option set to 0. + * + * All available PAL utilities are defined in the @ref Util namespace, and are briefly summarized below. See the + * Reference topics for more detailed information on specific classes, enums, etc. + * + * ### System Memory Management + * palSysMemory.h defines a handful of macros that can be used for allocating and freeing system heap memory. These + * macros will use the client-specified allocation callbacks specified by the client at CreatePlatform() if specified. + * These macros are: + * + * - PAL_MALLOC: Equivalent to malloc(). + * - PAL_CALLOC: Equivalent to calloc(). 
+ * - PAL_FREE: Equivalent to free(). + * - PAL_SAFE_FREE: Equivalent to free(), then nulls out the specified pointer. + * - PAL_NEW: Equivalent to C++ new. + * - PAL_NEW_ARRAY: Equivalent to C++ new[]. + * - PAL_PLACEMENT_NEW: Equivalent to C++ placement new. + * - PAL_DELETE: Equivalent to C++ delete. + * - PAL_DELETE_THIS: Special version of PAL_DELETE that effectively does "delete this;" This is necessary for + * classes that have non-public destructors. + * - PAL_DELETE_ARRAY: Equivalent to C++ delete[]. + * - PAL_SAFE_DELETE_ARRAY: Equivalent to C++ delete[], then nulls out the specified pointer. + * - PAL_SAFE_DELETE: Equivalent to C++ delete, then nulls out the specified pointer. + * + * ### Allocators + * All of the memory management macros take in a templated allocator, which is required to have the following two + * functions defined: + * + * void* Alloc(const Util::AllocInfo) + * void Free(const Util::FreeInfo) + * + * It is expected that clients that specify their own allocators will handle cases that require specific alignments + * and/or zeroing the returned memory. + * + * Some allocators can be created for use by clients: + * - VirtualLinearAllocator: A linear allocator that allocates virtual memory and backs it with physical memory + * when needed. + * + * ### Debug Prints and Asserts + * palDbgPrint.h and palAssert.h provide a number of macros used widely by the PAL core and also available for use + * by clients. + * + * The PAL_DPF, PAL_DPINFO, PAL_DPERROR, and PAL_DPWARN macros can be used to issue debug prints. These macros will be nulled + * out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. SetDbgPrintMode() can be called to configure how the + * different categories of debug prints will be handled (e.g., print to the debugger, print to file, etc.). + * + * The PAL_ASSERT and PAL_ALERT macros can be used to verify expected states of the program at runtime.
PAL_ASSERT + * should be used for verifying expected invariants and assumptions, while PAL_ALERT should be used to alert the + * developer of a condition that is allowed, but not typically expected (e.g., failure of a system memory allocation). + * Note that the polarity of the condition check is different between assert and alert. Asserts "assert" that the + * specified condition is true (and complain if it's not), while alerts "alert" a developer if an unexpected condition + * is true. These macros will be nulled out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. EnableAssertMode() + * can be called to enable/disable asserts or alerts at runtime. + * + * ### Generic Containers + * Util includes a number of generic container data structure implementations. Note that most of these are broken up + * into two header files - for example, list.h and listImpl.h. The intention is that list.h will be included from + * other header files that need a full list definition, while listImpl.h will be included by .cpp files that actually + * interact with the list. This should keep build times down versus putting all implementations directly in list.h. + * - AutoBuffer: Allows dynamic arrays to be placed on the stack without a heap allocation in situations where a + * maximum reasonable expected size is known. + * - Deque: Double ended queue. + * - HashMap: Fast map implementation. Note that this implementation has some non-standard restrictions on the key + * (can't be 0) and value size (must fit in a cache line). + * - HashSet: Fast set implementation. Note the similar restrictions to HashMap. + * - IntervalTree: Interval tree implementation. + * - RingBuffer: A ringed buffer of variable length and size.
+ * + * ### Multithreading and Synchronization + * Util includes a number of OS-abstracted multithreading and CPU synchronization constructs: + * + * - Thread + * - Mutex + * - Semaphore + * - ConditionVariable + * - Event + * + * ### Files + * The File class provides an OS-abstracted interface for opening files and reading/writing data in those files. + * Further, the ElfReadContext and ElfWriteContext classes provide functionality for reading and writing buffers in the + * Executable and Linkable Format (ELF). + * The ELF utilities can be used in conjunction with File in order to read/write ELF files on disk. + * + * ### Inline Functions + * palInlineFuncs.h defines a bunch of simple inline functions that are used throughout PAL and might be useful to + * clients. Some examples include VoidPtrInc(), Pow2Pad(), Min(), Max(), Strncpy(), etc. + * + * palMath.h defines a Math namespace with various constants and functions related to floating point conversions and + * basic math routines like Sqrt(). + * + * Additionally, palInlineFuncs.h defines a template metaprogramming string hash implementation that can produce + * an FNV1A hash for a string specified in the source code without the string showing up in a compiled release build. + * + * ### System Utilities + * palSysUtil.h defines a few functions providing abstracted system-specific functionality: + * - Access to the high resolution CPU performance counters with GetPerfFrequency() and GetPerfCpuTime(). + * - Support for asynchronously querying if a particular keyboard key is currently pressed with IsKeyPressed().
+ * + * ### Cryptographic Algorithm Implementations + * Util provides the crypto algorithm Md5 + * + * Next: @ref GpuUtilOverview + *********************************************************************************************************************** + */ + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palVector.h b/shared/amdgpu-windows-interop/pal/inc/util/palVector.h new file mode 100644 index 0000000000..633b00d71d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/inc/util/palVector.h @@ -0,0 +1,455 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palVector.h +* @brief PAL utility collection Vector and VectorIterator class declarations. +*********************************************************************************************************************** +*/ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" +#include "palSpan.h" +#include "palSysMemory.h" +#include + +namespace Util +{ + +// Forward declarations. +template class Vector; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in Vector. + * + * Supports forward traversal. + *********************************************************************************************************************** + */ +template +class VectorIterator +{ +public: + /// Checks if the current index is within bounds of the number of elements in the vector. + /// + /// @returns True if the current element this iterator is pointing to is within the permitted range. + bool IsValid() const { return (m_curIndex < m_srcVector.m_numElements); } + + /// Returns the element the iterator is currently pointing to as a reference. + /// + /// @warning This may cause an access violation if the iterator is not valid. + /// + /// @returns The element the iterator is currently pointing to. + T& Get() const + { + PAL_ASSERT(IsValid()); + return (*(m_srcVector.m_pData + m_curIndex)); + } + + /// Advances the iterator to point to the next element. + /// + /// @warning Does not do bounds checking. + void Next() { ++m_curIndex; } + + /// Retrieves the current vector position of this iterator. + /// + /// @returns The location in the vector of the element the iterator is currently pointing to. 
+ uint32 Position() const { return m_curIndex; } + +private: + VectorIterator(uint32 index, const Vector& srcVec); + + uint32 m_curIndex; // The current index of the vector iterator. + const Vector& m_srcVector; // The vector container this iterator is used for. + + PAL_DISALLOW_DEFAULT_CTOR(VectorIterator); + + // Although this is a transgression of coding standards, it means that Vector does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class Vector; +}; + +/** + *********************************************************************************************************************** + * @brief Vector container. + * + * Vector is a templated array based storage that starts with a default-size allocation in the stack. If more space is + * needed it then resorts to dynamic allocation by doubling the size every time the capacity is exceeded. + * Operations which this class supports are: + * + * - Insertion at the end of the array. + * - Forward iteration. + * - Random access. + * + * @warning This class is not thread-safe. + *********************************************************************************************************************** + */ +template +class Vector +{ +public: + /// A convenient shorthand for VectorIterator. + typedef VectorIterator Iter; + + /// When this allocates, it doubles the old size of memory + static constexpr uint32 GrowthFactor = 2; + + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + Vector(Allocator*const pAllocator); + + /// Destructor. + ~Vector(); + + /// Move constructor. + /// + /// @param [in] vector Reference to a dying vector, from which resources will be stolen. + Vector(Vector&& vector); + + /// Increases maximal capacity to value greater or equal to the newCapacity. 
+ /// If newCapacity is greater than the maximal capacity, new storage is allocated, + /// otherwise the method does nothing. + /// + /// @note All existing iterators will not get invalidated, even in case new storage is allocated, + /// because iterators are referencing vector, rather than elements of that vector. + /// + /// @warning All pointers and references to elements of a vector will be invalidated, + /// in case new storage is allocated. + /// + /// @param [in] newCapacity The new capacity of a vector, which is lower limit of the maximal capacity. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Reserve(uint32 newCapacity); + + /// Increases maximum capacity to the number of elements in the vector, plus the specified increment amount. + /// Equivalent to this->Reserve(this->NumElements() + amount); + /// + /// @param [in] amount Number of items beyond the current element count to increas the capacity to. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Grow(uint32 amount) { return Reserve(NumElements() + amount); } + + /// Set size to newSize. + /// If size is decreased, elements at the end of the vector will be removed. + /// If size is increased, new elements will be set to newVal. + /// If newSize requires a higher capacity, a new allocation is made. See notes on Reserve. + /// + /// @note If size is decreased, any iterators referencing removed elements will become invalid. All other + /// iterators will remain valid. Otherwise, all iterators will still be valid. + /// + /// @warning All pointers and references to elements of a vector will be invalidated, + /// in case new storage is allocated. + /// + /// @param [in] newSize The new size of a vector. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Resize(uint32 newSize, const T& newVal = T()); + + /// Copy/Move an element to end of the vector. 
If not enough space is available, new space will be allocated and + /// the old data will be copied to the new space. + /// + /// @param [in] data The element to be pushed to the vector. The element will become the last element. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result PushBack(const T& data); + Result PushBack(T&& data); + + /// Constructs an object in-place at the end of the vector. If not enough space is available, new space will be + /// allocated and the old data will be copied to the new space. + /// + /// @param [in] args... The arguments passed to the constructor + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + template + Result EmplaceBack(Args&&... args); + + /// Returns the element at the end of the vector and destroys it. + /// + /// @param [out] pData The element at the end of the vector. + /// It is expected that pData is uninitialized as it will be overwritten and not destructed. + void PopBack(T* pData); + + /// Destroys all elements stored in the vector. All dynamically allocated memory will be saved for reuse. + void Clear(); + + ///@{ + /// Returns the element at the location specified. + /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + T& At(uint32 index) + { + PAL_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + const T& At(uint32 index) const + { + PAL_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + T& operator[](uint32 index) noexcept { return At(index); } + const T& operator[](uint32 index) const noexcept { return At(index); } + ///@} + + /// Returns the data at the front of the vector. + /// + /// @warning Calling this function on an empty vector will cause an access violation! + /// + /// @returns The data at the front of the vector. 
+ T& Front() const + { + PAL_ASSERT(IsEmpty() == false); + return *m_pData; + } + + /// Returns the data at the back of the vector. + /// + /// @warning Calling this function on an empty vector will cause an access violation! + /// + /// @returns The data at the back of the vector. + T& Back() const + { + PAL_ASSERT(IsEmpty() == false); + return *(m_pData + (m_numElements - 1)); + } + + /// Returns an iterator to the first element of the vector. + /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns An iterator to first element of the vector. + Iter Begin() const { return Iter(0, *this); } + + /// Returns an iterator to the last element of the vector. + /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns VectorIterator An iterator to last element of the vector. + Iter End() const { return Iter((m_numElements - 1), *this); } + + ///@{ + /// Implicitly gets the current contents of the vector as a Span. + /// + /// @returns The contents of the vector as a Span; same as Span(Data(), NumElements()). + operator Span() { return Span(Data(), NumElements()); } + operator Span() const { return Span(Data(), NumElements()); } + ///@} + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + NumElements()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! + /// + /// @returns Pointer to the underlying data storage for read & write access. + /// For a non-empty vector, the returned pointer contains address of the first element. + /// For an empty vector, the returned pointer may or may not be a null pointer. 
+ T* Data() { return m_pData; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + NumElements()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! + /// + /// @returns Pointer to the underlying data storage for read only access. + /// For a non-empty vector, the returned pointer contains address of the first element. + /// For an empty vector, the returned pointer may or may not be a null pointer. + const T* Data() const { return m_pData; } + + /// Returns the size of the vector. + /// + /// @returns An unsigned integer equal to the number of elements currently present in the vector. + uint32 NumElements() const { return m_numElements; } + + /// Returns true if the number of elements present in the vector is equal to zero. + /// + /// @returns True if the vector is empty. + bool IsEmpty() const { return (m_numElements == 0); } + + /// Returns a pointer to the allocator used for this container's memory management. + /// + /// @returns Allocator pointer. + Allocator* GetAllocator() const { return m_pAllocator; } + + ///@{ + /// @internal Satisfies concept `range_expression`, using T* as `iterator` and 32-bit size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as `range for`. + /// These should not be called directly as they do not adhere to PAL coding standards. 
+ using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = T*; + using const_iterator = const T*; + using difference_type = int32; + using size_type = uint32; + + iterator begin() noexcept { return m_pData; } + iterator end() noexcept { return (m_pData + m_numElements); } + const_iterator begin() const noexcept { return m_pData; } + const_iterator end() const noexcept { return (m_pData + m_numElements); } + const_iterator cbegin() const noexcept { return m_pData; } + const_iterator cend() const noexcept { return (m_pData + m_numElements); } + [[nodiscard]] bool empty() const noexcept { return IsEmpty(); } + size_type size() const noexcept { return m_numElements; } + ///@} + + /// Erases the element at the specified iterator. + void Erase(Iter it); + + /// Erases the element at the specified iterator. + void Erase(iterator it); + + /// Erases the element at the specified index. + void Erase(uint32 index); + + /// Erase the element at the specified iterator, and swap last element to that position. + /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(Iter it); + + /// Erase the element at the specified iterator, and swap last element to that position. + /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(iterator it); + + /// Erases the element at the specified index, and swap last element to that position. + /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(uint32 index); + +private: + // This is a POD-type that exactly fits one T value. + typedef typename std::aligned_storage::type ValueStorage; + + ValueStorage m_data[defaultCapacity]; // The initial data buffer stored within the vector object. + T* m_pData; // Pointer to the current data buffer. + uint32 m_numElements; // Number of elements present. 
+ uint32 m_maxCapacity; // Maximum size it can hold. + Allocator*const m_pAllocator; // Allocator for this Vector. + + PAL_DISALLOW_COPY_AND_ASSIGN(Vector); + + // Although this is a transgression of coding standards, it prevents VectorIterator requiring a public constructor; + // constructing a 'bare' VectorIterator (i.e. without calling Vector::GetIterator) can never be a legal operation, + // so this means that these two classes are much safer to use. + friend class VectorIterator; +}; + +// ===================================================================================================================== +template +VectorIterator::VectorIterator( + uint32 index, + const Vector& srcVec) + : + m_curIndex(index), + m_srcVector(srcVec) + { + } + +// ===================================================================================================================== +template +Vector::Vector( + Allocator*const pAllocator) + : + m_pData(reinterpret_cast(m_data)), + m_numElements(0), + m_maxCapacity(defaultCapacity), + m_pAllocator(pAllocator) + { + } + +// ===================================================================================================================== +template +Vector::~Vector() +{ + // Explicitly destroy all non-trivial types. + if (!std::is_trivial::value) + { + for (uint32 idx = 0; idx < m_numElements; ++idx) + { + m_pData[idx].~T(); + } + } + + // Check if we have dynamically allocated memory. + if (m_pData != reinterpret_cast(m_data)) + { + // Free the memory that was allocated dynamically. + PAL_FREE(m_pData, m_pAllocator); + } +} + +// ===================================================================================================================== +// Steals allocation from a dying vector, if data buffer uses storage from heap allocation. 
+// Moves objects between local buffers of new and dying vectors (for non-trivial types) or +// copies local buffer from a dying vector to a new vector (for trivial types), +// if data buffer uses storage from local buffer. +template +Vector::Vector( + Vector&& vector) + : + m_numElements(vector.m_numElements), + m_maxCapacity(vector.m_maxCapacity), + m_pAllocator(vector.m_pAllocator) +{ + if (vector.m_pData == reinterpret_cast(vector.m_data)) // Local buffer + { + // Data buffer will be using storage from local buffer. + m_pData = reinterpret_cast(m_data); + + if (std::is_trivial::value) + { + // Optimize trivial types by copying local buffer. + std::memcpy(m_pData, vector.m_pData, sizeof(T) * m_numElements); + } + else + { + // Move objects from local buffer of a dying vector to local buffer of a new vector. + for (uint32 idx = 0; idx < m_numElements; ++idx) + { + PAL_PLACEMENT_NEW(m_pData + idx) T(Move(vector.m_pData[idx])); + } + } + } + else // Heap allocation + { + // Steal heap allocation from dying vector. + m_pData = vector.m_pData; + + // After the allocation has been stolen, dying vector is just an empty shell. 
+ vector.m_pData = nullptr; + vector.m_numElements = 0; + vector.m_maxCapacity = 0; + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/.gitignore b/shared/amdgpu-windows-interop/pal/lib/Release/x64/.gitignore new file mode 100644 index 0000000000..27c54ac908 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/.gitignore @@ -0,0 +1,32 @@ +/addrlib.lib +/amdrdf.lib +/cwpack.lib +/ddCommon.lib +/ddCore.lib +/ddEventClient.lib +/ddEventParser.lib +/ddEventServer.lib +/ddEventStreamer.lib +/ddNet.lib +/ddRpcClient.lib +/ddRpcServer.lib +/ddRpcShared.lib +/ddSocket.lib +/ddYaml.lib +/dd_common.lib +/dd_libyaml.lib +/dd_settings.lib +/devdriver.lib +/DriverUtilsService.lib +/metrohash.lib +/mpack.lib +/pal.lib +/palCompilerDeps.lib +/palUtil.lib +/pal_lz4.lib +/pal_uuid.lib +/SettingsRpcService2.lib +/stb_sprintf.lib +/UberTraceService.lib +/vam.lib +/zstd.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc new file mode 100644 index 0000000000..9ce4af7fc2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: fd5f7481a122f40f73d1f638e3b9b027 + size: 16738 + hash: md5 + path: DriverUtilsService.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc new file mode 100644 index 0000000000..9d853eaa31 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: e09dbb1896128ac2b2bcac2b35878a40 + size: 9460 + hash: md5 + path: SettingsRpcService2.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc new file mode 100644 index 
0000000000..8ed81148d4 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 364bc94b5b81ef5bb337e6afb0060c55 + size: 13912 + hash: md5 + path: UberTraceService.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc new file mode 100644 index 0000000000..7fe0b48324 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 56362998d9feb9b0ce6ccad8441bf1c8 + size: 820446 + hash: md5 + path: addrlib.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc new file mode 100644 index 0000000000..603a536a3d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 35af646710d883bfe6184113cb88e96a + size: 702568 + hash: md5 + path: amdrdf.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc new file mode 100644 index 0000000000..21eab68dfb --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 06c7697ce380a8127e7478041aed7fc8 + size: 27894 + hash: md5 + path: cwpack.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc new file mode 100644 index 0000000000..c9b8c6bfd3 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 7c6ba83c44ee8bd70397a1458dbea7e0 + size: 82210 + hash: md5 + path: ddCommon.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc new file mode 100644 index 
0000000000..8312dd1b09 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 32f4aa9943ab5fde0da6f09bcbacf9be + size: 72778 + hash: md5 + path: ddCore.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc new file mode 100644 index 0000000000..2b15e06af2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 09fc5ce2eb8653cbd8ddda5d983ce836 + size: 137794 + hash: md5 + path: ddEventClient.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc new file mode 100644 index 0000000000..dbdd525e3b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 2f5e7eb06485bebea0ec2779d8df9f97 + size: 46862 + hash: md5 + path: ddEventParser.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc new file mode 100644 index 0000000000..8bf234218c --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: bdb3738c5bbd4ac6abd3a805930b89fe + size: 30582 + hash: md5 + path: ddEventServer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc new file mode 100644 index 0000000000..aebfdd6d57 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 43ea2bd7328593fab054754f9fd1a7c9 + size: 35768 + hash: md5 + path: ddEventStreamer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc 
b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc new file mode 100644 index 0000000000..449f6f8e2d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: d91f19ed479fd51b481f6b25566dfd31 + size: 13230 + hash: md5 + path: ddNet.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc new file mode 100644 index 0000000000..0f95f12baf --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 5b37ae9cc29dfaba3cb0d08a30bd684a + size: 23224 + hash: md5 + path: ddRpcClient.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc new file mode 100644 index 0000000000..13273407e5 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: b3f63ef6d9a9d6bfb3e5934a9a34465e + size: 179024 + hash: md5 + path: ddRpcServer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc new file mode 100644 index 0000000000..3565cd252d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: db87375bafb0d667ac054dbf7dc0dc36 + size: 16268 + hash: md5 + path: ddRpcShared.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc new file mode 100644 index 0000000000..37471b02ed --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: f88897c7d989d95f0352cf6e1a21df99 + size: 106228 + hash: md5 + path: ddSocket.lib diff --git 
a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc new file mode 100644 index 0000000000..4481f54001 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 4d64cf4c3b034f09a8a3abd1b7e657b4 + size: 35902 + hash: md5 + path: ddYaml.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc new file mode 100644 index 0000000000..27c10a4cb0 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 2394b7141b71f0b738dd3ad024dcbfc0 + size: 661222 + hash: md5 + path: dd_common.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc new file mode 100644 index 0000000000..58c95fb8c1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 2b9f0af04b216527b49338cc1b8fa1a5 + size: 264022 + hash: md5 + path: dd_libyaml.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc new file mode 100644 index 0000000000..b72ed57fbc --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: d9a1105679db9411bf2365aae2b6d2a5 + size: 212936 + hash: md5 + path: dd_settings.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc new file mode 100644 index 0000000000..e65c97aa25 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 18e7d04c4ecc9fb872de2e0ac9dffd61 + size: 2700190 + hash: md5 + path: devdriver.lib 
diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc new file mode 100644 index 0000000000..40ca7236c1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 64dbdb1c2d7c68e7ae3083ea35878a83 + size: 28682 + hash: md5 + path: metrohash.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc new file mode 100644 index 0000000000..6f403fe56b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 43f91cf1e53eef1411a6e4a40776cd79 + size: 218874 + hash: md5 + path: mpack.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc new file mode 100644 index 0000000000..a732119591 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 852e161ac4115309a2591db0b80f13dd + size: 24025742 + hash: md5 + path: pal.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc new file mode 100644 index 0000000000..35efc5ee4b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: a929ad3103021925d382e419b0e5343d + size: 433780 + hash: md5 + path: palCompilerDeps.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc new file mode 100644 index 0000000000..02e9c72251 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 3cca5923fa12cf564360058254c2c6db + size: 799750 + hash: md5 + path: palUtil.lib diff --git 
a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc new file mode 100644 index 0000000000..3941913ab4 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 36731971681f4a89f4e89b5ad44473ac + size: 291664 + hash: md5 + path: pal_lz4.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc new file mode 100644 index 0000000000..a9d688ea70 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 1ad5de7ebbb13b41f7d7dc0367d7d1d8 + size: 3460 + hash: md5 + path: pal_uuid.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc new file mode 100644 index 0000000000..596f2ad962 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: b2a1cd0f59d07aaa0cf21afa9235dbda + size: 25990 + hash: md5 + path: stb_sprintf.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc new file mode 100644 index 0000000000..aae38aec12 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 02784ea9d25a9a9c94c20acca001456c + size: 215198 + hash: md5 + path: vam.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc new file mode 100644 index 0000000000..2f44e47a41 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 6a1ac31db298434da1573cda69d9e4d3 + size: 1356642 + hash: md5 + path: zstd.lib diff --git 
a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h new file mode 100644 index 0000000000..0799878bf9 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h @@ -0,0 +1,269 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include + +#include +#include + +// Macros for conditional language support. 
+#ifdef _MSVC_LANG + #define DD_CPLUSPLUS _MSVC_LANG +#else + #define DD_CPLUSPLUS __cplusplus +#endif +// Denotes versions of the C++ standard from __cplusplus. +#define CPP98 (199711L) +#define CPP11 (201103L) +#define CPP14 (201402L) +#define CPP17 (201703L) +#define CPP20 (202002L) +#define DD_CPLUSPLUS_SUPPORTS(x) (DD_CPLUSPLUS >= (x)) + +static_assert(DD_CPLUSPLUS_SUPPORTS(CPP11), "C++11 is required to build devdriver."); + +#if defined(_MSC_VER) + #define DD_ALIGNAS(x)__declspec(align(x)) + #if _MSC_VER < 1900 + #define DD_STATIC_CONST static const + #else + #define DD_STATIC_CONST static constexpr + #endif + #ifndef va_copy + #define va_copy(d,s) ((d) = (s)) + #endif +#endif + +#if !defined(DD_STATIC_CONST) + #if defined(__cplusplus) && __cplusplus >= 201103L + #define DD_STATIC_CONST static constexpr + #else + #define DD_STATIC_CONST static const + #endif +#endif + +#if DD_CPLUSPLUS_SUPPORTS(CPP14) + #define DD_CPP14_CONSTEXPR_FN constexpr + #define DD_CPP14_STATIC_ASSERT(a, b) static_assert(a, b) +#else + #define DD_CPP14_CONSTEXPR_FN inline + #define DD_CPP14_STATIC_ASSERT(a, b) +#endif + +#if !defined(DD_ALIGNAS) + #if defined(__cplusplus) && __cplusplus >= 201103L + #define DD_ALIGNAS(x) alignas(x) + #else + static_assert(false, "Error: unsupported compiler detected. Support is required to build."); + #endif +#endif + +/// Remove the __FILE__ macro for release builds +#ifndef DD_FILE + #ifdef NDEBUG + #define DD_FILE "" + #else + #define DD_FILE __FILE__ + #endif +#endif + +// Creates a structure with the specified name and alignment. 
+#define DD_ALIGNED_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name + +// Creates a structure with the specified alignment, and mark it as final to ensure it cannot be used as a parent class +#define DD_NETWORK_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name final + +#define DD_CHECK_SIZE(x, size) static_assert(sizeof(x) == size_t(size), "sizeof(" # x ") should be " # size " bytes but has changed recently") + +#define DD_UNUSED(x) (static_cast(x)) + +#define _DD_STRINGIFY(str) #str +#define DD_STRINGIFY(x) _DD_STRINGIFY(x) + +#if DD_CPLUSPLUS_SUPPORTS(CPP17) + // Require that a function's return value, or an entire type, be used. + #define DD_NODISCARD [[nodiscard]] + + // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. + // switch (x) + // { + // case 0: DD_FALLTHROUGH(); + // case 1: DD_FALLTHROUGH(); + // case 2: + // printf("0, 1, or 2"); + // break; + // } + // + #define DD_FALLTHROUGH() [[fallthrough]] +#else + // Require that a function's return value, or an entire type, be used. + // This option is aggressive enough that we do not enable it when C++17 is not enabled + #define DD_NODISCARD + + // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. + // switch (x) + // { + // case 0: DD_FALLTHROUGH(); + // case 1: DD_FALLTHROUGH(); + // case 2: + // printf("0, 1, or 2"); + // break; + // } + // + #if defined(__clang__) + #define DD_FALLTHROUGH() [[clang::fallthrough]] + #elif defined(__GNUC__) + #if __GNUC__ >= 7 + // gnu::fallthrough isn't supported until GCC 7+ + #define DD_FALLTHROUGH() [[gnu::fallthrough]] + #else + // Not supported on older versions of GCC + #define DD_FALLTHROUGH() + #endif + #elif defined(_MSC_VER) + // Not supported on MSVC - who doesn't warn about this issue in the first place. + #define DD_FALLTHROUGH() + #else + // We don't know what compiler this is, so just no-op the macro. 
+ #define DD_FALLTHROUGH() + #endif +#endif + +// Include in the private section of a class declaration in order to disallow use of the copy and assignment operator +#define DD_DISALLOW_COPY_AND_ASSIGN(_typename) \ + _typename(const _typename&); \ + _typename& operator =(const _typename&); + +// Include in the private section of a class declaration in order to disallow use of the default constructor +#define DD_DISALLOW_DEFAULT_CTOR(_typename) \ + _typename(); + +// Detect the CPU architecture for the target. +// These are often evaluated during the preprocessor stage, so it's important that we don't rely on things like sizeof. +#if UINTPTR_MAX == 0xFFFFFFFF + #define DEVDRIVER_ARCHITECTURE_BITS 32 +#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFF + #define DEVDRIVER_ARCHITECTURE_BITS 64 +#else + static_assert(false, "Unknown or unsupported target architecture."); +#endif +static_assert(DEVDRIVER_ARCHITECTURE_BITS == (8 * sizeof(void*)), // Assume 8-bits-per-byte. + "DEVDRIVER_ARCHITECTURE_BITS does not match sizeof(void*)."); + +// Add a detailed function name macro +// These vary across platforms, so we'll just pick the first one that's defined +#if defined(__FUNCSIG__) + #define DD_FUNCTION_NAME __FUNCSIG__ +#elif defined(__PRETTY_FUNCTION__) + #define DD_FUNCTION_NAME __PRETTY_FUNCTION__ +#else + #define DD_FUNCTION_NAME __FUNCTION__ +#endif + +// Common Typedefs +// These types are shared between all platforms, +// and need to be defined before including a specific platform header. + +namespace DevDriver +{ + +typedef int8_t int8; ///< 8-bit integer. +typedef int16_t int16; ///< 16-bit integer. +typedef int32_t int32; ///< 32-bit integer. +typedef int64_t int64; ///< 64-bit integer. +typedef uint8_t uint8; ///< Unsigned 8-bit integer. +typedef uint16_t uint16; ///< Unsigned 16-bit integer. +typedef uint32_t uint32; ///< Unsigned 32-bit integer. +typedef uint64_t uint64; ///< Unsigned 64-bit integer. 
+ +typedef uint32_t ProcessId; +typedef uint32_t Size; +typedef uint64_t Handle; + +DD_STATIC_CONST Handle kNullPtr = 0; +DD_STATIC_CONST Handle kInvalidHandle = 0; + +//////////////////////////// +// Common result codes +enum struct Result : uint32 +{ + //// Generic Result Code //// + Success = 0, + Error = 1, + NotReady = 2, + VersionMismatch = 3, + Unavailable = 4, + Rejected = 5, + EndOfStream = 6, + Aborted = 7, + InsufficientMemory = 8, + InvalidParameter = 9, + InvalidClientId = 10, + ConnectionExists = 11, + FileNotFound = 12, + FunctionNotFound = 13, + InterfaceNotFound = 14, + EntryExists = 15, + FileAccessError = 16, + FileIoError = 17, + LimitReached = 18, + MemoryOverLimit = 19, + + //// URI PROTOCOL //// + UriServiceRegistrationError = 1000, + UriStringParseError = 1001, + UriInvalidParameters = 1002, + UriInvalidPostDataBlock = 1003, + UriInvalidPostDataSize = 1004, + UriFailedToAcquirePostBlock = 1005, + UriFailedToOpenResponseBlock = 1006, + UriRequestFailed = 1007, + UriPendingRequestError = 1008, + UriInvalidChar = 1009, + UriInvalidJson = 1010, + + //// Settings URI Service //// + SettingsUriInvalidComponent = 2000, + SettingsUriInvalidSettingName = 2001, + SettingsUriInvalidSettingValue = 2002, + SettingsUriInvalidSettingValueSize = 2003, + + //// Info URI Service //// + InfoUriSourceNameInvalid = 3000, + InfoUriSourceCallbackInvalid = 3001, + InfoUriSourceAlreadyRegistered = 3002, + InfoUriSourceWriteFailed = 3003, + + //// Settings Service //// + SettingsInvalidComponent = 4000, + SettingsInvalidSettingName = 4001, + SettingsInvalidSettingValue = 4002, + SettingsInsufficientValueSize = 4003, + SettingsInvalidSettingValueSize = 4004, +}; + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h new file mode 100644 index 0000000000..d09fa62c99 --- /dev/null +++ 
b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h @@ -0,0 +1,824 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include +// can not be used in the kernel +#if !DD_PLATFORM_IS_KM +#include +#endif + +#include +#include + +#define DD_CACHE_LINE_BYTES 64 + +#define DD_MALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, false) +#define DD_CALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, true) +#define DD_FREE(memory, allocCb) allocCb.Free(memory) + +#define DD_NEW(className, allocCb) new(allocCb, alignof(className), true, DD_FILE, __LINE__, __FUNCTION__) className +#define DD_DELETE(memory, allocCb) DevDriver::Platform::Destructor(memory); DD_FREE(memory, allocCb) + +#define DD_NEW_ARRAY(className, numElements, allocCb) DevDriver::Platform::NewArray(numElements, allocCb) +#define DD_DELETE_ARRAY(memory, allocCb) DevDriver::Platform::DeleteArray(memory, allocCb) + +// Always enable asserts in Debug builds +#if !defined(NDEBUG) + #if !defined(DD_OPT_ASSERTS_ENABLE) + #define DD_OPT_ASSERTS_ENABLE + #endif + #if !defined(DD_OPT_ASSERTS_DEBUGBREAK) + #define DD_OPT_ASSERTS_DEBUGBREAK + #endif +#endif + +#define DD_PTR_TO_HANDLE(x) ((DevDriver::Handle)(uintptr_t)(x)) + +#define DD_SANITIZE_RESULT(x) ((x != Result::Success) ? 
Result::Error : x) + +namespace DevDriver +{ + +//////////////////////////// +// Common logging levels +enum struct LogLevel : uint8 +{ + Debug = 0, + Verbose, + Info, + Warn, + Error, + Always, + Count, + + // Backwards compatibility for old alert log level + Alert = Warn, + + Never = 0xFF +}; + +typedef void*(*AllocFunc)(void* pUserdata, size_t size, size_t alignment, bool zero); +typedef void(*FreeFunc)(void* pUserdata, void* pMemory); + +struct AllocCb +{ + void* pUserdata; + AllocFunc pfnAlloc; + FreeFunc pfnFree; + + void* Alloc(size_t size, size_t alignment, bool zero) const; + void* Alloc(size_t size, bool zero) const; + void Free(void* pMemory) const; +}; + +namespace Platform +{ + +// Used by the Platform::Thread implementation. +typedef void (*ThreadFunction)(void* pThreadParameter); + +} // namespace Platform + +} // namespace DevDriver + +#if defined(DD_PLATFORM_WINDOWS_UM) + #include +#elif defined(DD_PLATFORM_WINDOWS_KM) + #include +#elif defined(DD_PLATFORM_DARWIN_UM) + #include +#elif defined(DD_PLATFORM_LINUX_UM) + #include +#else + // Legacy system for Ati Make + #if defined(_WIN32) && !defined(_KERNEL_MODE) + #define DD_PLATFORM_WINDOWS_UM + #include + #elif defined(__linux__) + #define DD_PLATFORM_LINUX_UM + #include + #else + #error "Unknown Platform - please configure your build system" + #endif + + #if __x86_64__ + #define DD_ARCH_BITS 64 + #else + #define DD_ARCH_BITS 32 + #endif +#endif + +#if !defined(DD_RESTRICT) + #error "DD_RESTRICT not defined by platform!" +#endif + +#if !defined(DD_DEBUG_BREAK) + #error "DD_DEBUG_BREAK not defined by platform!" +#endif + +// This only exists for 32bit Windows to specify callbacks as __stdcall. 
+#if !defined(DD_APIENTRY) + #define DD_APIENTRY +#endif + +// TODO: remove this and make kDebugLogLevel DD_STATIC_CONST when we use a version of visual studio that supports it +#ifdef DD_OPT_LOG_LEVEL + #define DD_OPT_LOG_LEVEL_VALUE static_cast(DD_OPT_LOG_LEVEL) +#else + #if defined(NDEBUG) + // In non-debug builds, default to printing asserts, Error, and Always log messages + #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Error + #else + // In debug builds, default to more messages + #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Verbose + #endif +#endif + +#define DD_WILL_PRINT(lvl) ((lvl >= DD_OPT_LOG_LEVEL_VALUE) && (lvl < DevDriver::LogLevel::Count)) +#define DD_PRINT(lvl, ...) DevDriver::LogString(__VA_ARGS__) + +#if defined(DD_OPT_ASSERTS_DEBUGBREAK) + #define DD_ASSERT_DEBUG_BREAK() DD_DEBUG_BREAK() +#else + #define DD_ASSERT_DEBUG_BREAK() +#endif + +#include + +// Calling `check_expr_is_bool(x)` when `x` is not exactly a bool will create a compile error. +// When it is a bool, it's a no-op. 
+// This allows us to enforce bool arguments to DD_ASSERT() macros +namespace DevDriver +{ + inline void check_expr_is_bool(bool) {} + + template + void check_expr_is_bool(const T&) = delete; +} + +#if !defined(DD_OPT_ASSERTS_ENABLE) + #define DD_WARN(statement) DD_UNUSED(0) + #define DD_WARN_REASON(reason) DD_UNUSED(0) + + #ifndef DD_ASSERT + #define DD_ASSERT(statement) DD_UNUSED(0) + #endif + + #define DD_ASSERT_REASON(reason) DD_UNUSED(0) +#else + #define DD_WARN(statement) do \ + { \ + DevDriver::check_expr_is_bool(statement); \ + if (!(statement)) \ + { \ + DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ + DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ + } \ + } while (0) + + #define DD_WARN_REASON(reason) do \ + { \ + DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ + DD_FILE, __LINE__, __func__, reason); \ + } while (0) + + #ifndef DD_ASSERT + #define DD_ASSERT(statement) do \ + { \ + DevDriver::check_expr_is_bool(statement); \ + if (!(statement)) \ + { \ + DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ + DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ + DD_ASSERT_DEBUG_BREAK(); \ + } \ + } while (0) + #endif + + #define DD_ASSERT_REASON(reason) do \ + { \ + DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ + DD_FILE, __LINE__, __func__, reason); \ + DD_ASSERT_DEBUG_BREAK(); \ + } while (0) +#endif + +/// Convenience macro that always warns. +#define DD_WARN_ALWAYS() DD_WARN_REASON("Unconditional Warning") + +/// Convenience macro that always asserts. +#define DD_ASSERT_ALWAYS() DD_ASSERT_REASON("Unconditional Assertion") + +/// Convenience macro that asserts if something has not been implemented. +#define DD_NOT_IMPLEMENTED() DD_ASSERT_REASON("Code not implemented!") + +/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. 
+#define DD_UNREACHABLE() DD_ASSERT_REASON("Unreachable code has been reached!") + +// Backwards compatibility for old alert macro +#define DD_ALERT(statement) DD_WARN(statement) +#define DD_ALERT_REASON(reason) DD_WARN_REASON(reason) +#define DD_ALERT_ALWAYS() DD_WARN_ALWAYS() + +// Debug utility to log an expression +// +// This works by taking the format specifier for a local variable, and an expression. +// The expression is evaluated once. +// It then prints that expression and its value: +// ```cpp +// int x = 5; +// int y = 10; +// int z = 0xf0; +// DD_DBG("0x%x", x + y + z); // Prints: foo/file.cpp:5 "x + y + z" == 0xff +// ``` +#define DD_DBG(level, fmt, expr) DD_PRINT( \ + level, \ + "%s:%d:\t\"" DD_STRINGIFY(expr) "\" == " fmt, \ + DD_FILE, \ + __LINE__, \ + (expr) \ + ) + +// Allocates memory using an AllocCb. +// This overload is declared noexcept, and will correctly handle AllocCb::pfnAlloc() returning NULL. +void* operator new( + size_t size, + const DevDriver::AllocCb& allocCb, + size_t align, + bool zero, + const char* pFilename, + int lineNumber, + const char* pFunction +) noexcept; + +#if DD_PLATFORM_IS_KM +// Provide a placement new function if is not available +inline void* operator new(size_t size, void *pMemory) +{ + return pMemory; +}; +#endif + +// Overload of operator delete that matches the previously declared operator new. +// The compiler can call this version automatically in the case of exceptions thrown in the Constructor +// ... even though we turn them off? +// Compilers are fussy. 
+void operator delete( + void* pObject, + const DevDriver::AllocCb& allocCb, + size_t align, + bool zero, + const char* pFilename, + int lineNumber, + const char* pFunction +) noexcept; + +namespace DevDriver +{ + +namespace Platform +{ + +template +inline void static Destructor(T* p) +{ + if (p != nullptr) + { + p->~T(); + } +} + +template +static T* NewArray(size_t numElements, const AllocCb& allocCb) +{ + size_t allocSize = (sizeof(T) * numElements) + DD_CACHE_LINE_BYTES; + size_t allocAlign = DD_CACHE_LINE_BYTES; + + T* pMem = reinterpret_cast(DD_MALLOC(allocSize, allocAlign, allocCb)); + if (pMem != nullptr) + { + pMem = reinterpret_cast(reinterpret_cast(pMem) + DD_CACHE_LINE_BYTES); + size_t* pNumElements = reinterpret_cast(reinterpret_cast(pMem) - sizeof(size_t)); + *pNumElements = numElements; + T* pCurrentElement = pMem; + for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) + { + new(pCurrentElement) T; + ++pCurrentElement; + } + } + + return pMem; +} + +template +static void DeleteArray(T* pElements, const AllocCb& allocCb) +{ + if (pElements != nullptr) + { + size_t numElements = *reinterpret_cast(reinterpret_cast(pElements) - sizeof(size_t)); + T* pCurrentElement = pElements; + for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) + { + pCurrentElement->~T(); + ++pCurrentElement; + } + + pElements = reinterpret_cast(reinterpret_cast(pElements) - DD_CACHE_LINE_BYTES); + } + + DD_FREE(pElements, allocCb); +} + +// Get the number of elements in a statically sized array +// Usage: +// char buffer[1024]; +// size_t size = ArraySize(buffer); // size == 1024 +// +// With a cast: +// char buffer[1024]; +// uint32 size = ArraySize(buffer); +// +template < + typename SizeT = size_t, // Type to return + typename T, // Inferred type of array elements - you should not need to supply this argument + size_t Size // Inferred length of array (in elements) - you should not need to supply this argument +> +constexpr SizeT 
ArraySize(const T(&)[Size]) +{ + return static_cast(Size); +} + +// Log to consoles and attached debuggers +void DebugPrint(LogLevel lvl, const char* pFormat, ...); + +// Platform-specific logger; this is called from DebugPrint. +void PlatformDebugPrint(LogLevel lvl, const char* pString); + +/// Get the absolute path to a file or directory that already exists +/// The path is returned through absPath, a caller-provided 256-character array. +/// Doing this is recommended whenever you need to display a path to a user. +Result GetAbsPathName( + const char* pPath, + char (&absPath)[256] +); + +/* platform functions for performing atomic operations */ + +int32 AtomicIncrement(Atomic* pVariable); +int32 AtomicDecrement(Atomic* pVariable); +int32 AtomicAdd(Atomic* pVariable, int32 num); +int32 AtomicSubtract(Atomic* pVariable, int32 num); + +int64 AtomicIncrement(Atomic64* pVariable); +int64 AtomicDecrement(Atomic64* pVariable); +int64 AtomicAdd(Atomic64* pVariable, int64 num); +int64 AtomicSubtract(Atomic64* pVariable, int64 num); + +// A generic AllocCb that defers allocation to Platform::AllocateMemory() +// Suitable for memory allocation when you don't care how the memory is allocated. 
+extern AllocCb GenericAllocCb; + +void* AllocateMemory(size_t size, size_t alignment, bool zero); +void FreeMemory(void* pMemory); + +/* fast locks */ +class AtomicLock +{ +public: + AtomicLock() : m_lock(0) {}; + ~AtomicLock() {}; + void Lock(); + bool TryLock(); + void Unlock(); + bool IsLocked() { return (m_lock != 0); }; +private: + Atomic m_lock; +}; + +class Mutex +{ +public: + Mutex(); + ~Mutex(); + void Lock(); + void Unlock(); +private: + MutexStorage m_mutex; +}; + +class Semaphore +{ +public: + explicit Semaphore(uint32 initialCount, uint32 maxCount); + ~Semaphore(); + Result Signal(); + Result Wait(uint32 millisecTimeout); +private: + SemaphoreStorage m_semaphore; +}; + +class Event +{ +public: + explicit Event(bool signaled); + ~Event(); + void Clear(); + void Signal(); + Result Wait(uint32 timeoutInMs); +private: + EventStorage m_event; +}; + +class Thread +{ +public: + Thread() = default; + + Thread(Thread&& other) noexcept = default; + Thread& operator=(Thread&& other) noexcept = default; + + // Copying a thread doesn't make sense + Thread(const Thread&) = delete; + Thread& operator= (const Thread& other) = delete; + + ~Thread(); + + Result Start(ThreadFunction pFnThreadFunc, void* pThreadParameter); + + // Set the user-visible name for the thread using printf-style formatters + // This should only be called on valid thread objects. (Threads that have been started) + // This function will return Result::Error if it's called on an invalid thread. 
+ // Note: This change is global to the thread and can be changed by other means + // Treat this as an aid for people + Result SetName(const char* pFmt, ...); + + Result Join(uint32 timeoutInMs); + + bool IsJoinable() const; + +private: + static ThreadReturnType DD_APIENTRY ThreadShim(void* pShimParam); + + // Reset our object to a default state + void Reset() + { + pFnFunction = nullptr; + pParameter = nullptr; + hThread = kInvalidThreadHandle; + + onExit.Clear(); + } + + // Set the thread name to a hard-coded string. + // The thread name passed to this function must be no larger than kThreadNameMaxLength including the NULL byte. + // If a larger string is passed, errors may occur on some platforms. + Result SetNameRaw(const char* pThreadName); + + ThreadFunction pFnFunction = nullptr; + void* pParameter = nullptr; + ThreadHandle hThread = kInvalidThreadHandle; + Event onExit = Event(false); // Start unsignaled +}; + +class Random +{ +public: + // Algorithm Constants + static constexpr uint64 kModulus = (uint64(1) << 48); + static constexpr uint64 kMultiplier = 0X5DEECE66Dull; + static constexpr uint16 kIncrement = 0xB; + + Random(); + Random(uint64 seed) + { + Reseed(seed); + } + ~Random() {} + + uint32 Generate(); + void Reseed(uint64 seed); +private: + uint64 m_prevState = 0; + + // Sanity checks. + static_assert(0 < kModulus, "Invalid modulus"); + static_assert(0 < kMultiplier, "Invalid multiplier"); + static_assert(kMultiplier < kModulus, "Invalid multiplier"); + static_assert(kIncrement < kModulus, "Invalid increment"); +}; + +class Library +{ +public: + Library() : m_hLib(nullptr) { } + ~Library() { Close(); } + + Result Load(const char* pLibraryName); + + void Close(); + + bool IsLoaded() const { return (m_hLib != nullptr); } + + void Swap(Library* pLibrary) + { + m_hLib = pLibrary->m_hLib; + pLibrary->m_hLib = nullptr; + } + + // Retrieve a function address from the dynamic library object. Returns true if successful, false otherwise. 
+ template + bool GetFunction(const char* pName, Func_t* ppfnFunc) const + { + (*ppfnFunc) = reinterpret_cast(GetFunctionHelper(pName)); + return ((*ppfnFunc) != nullptr); + } + +private: + void* GetFunctionHelper(const char* pName) const; + + LibraryHandle m_hLib; + + DD_DISALLOW_COPY_AND_ASSIGN(Library); +}; + +enum struct MkdirStatus +{ + Unknown, + Created, + Existed, +}; + +// Create a directory with default permissions +// On Windows, this uses NULL for LPSECURITY_ATTRIBUTES +// On Unix, this uses 0777 for the mode. +// When pStatus is non-NULL, *pStatus is set to +// MkdirStatus::Created if the directory did not exist and was created +// MkdirStatus::Existed if the directory already existed +// Returns: +// - Result::Success, if the directory already exists or was created +// - Result::FileIoError, if the directory failed to be created +Result Mkdir(const char* pDir, MkdirStatus* pStatus = nullptr); + +ProcessId GetProcessId(); + +uint64 GetCurrentTimeInMs(); + +uint64 QueryTimestampFrequency(); +uint64 QueryTimestamp(); + +// Todo: Remove Sleep() entirely from our platform API. It cannot be used in the KMD and should not be used +// anywhere else either. +void Sleep(uint32 millisecTimeout); + +void GetProcessName(char* buffer, size_t bufferSize); + +void Strncpy(char* pDst, const char* pSrc, size_t dstSize); + +template +void Strncpy(char(&dst)[DstSize], const char* pSrc) +{ + Strncpy(dst, pSrc, DstSize); +} + +char* Strtok(char* pDst, const char* pDelimiter, char** ppContext); + +void Strncat(char* pDst, const char* pSrc, size_t dstSize); + +template +void Strncat(char(&dst)[DstSize], const char* pSrc) +{ + Strncat(dst, pSrc, DstSize); +} + +int32 Strcmpi(const char* pSrc1, const char* pSrc2); + +int32 Snprintf(char* pDst, size_t dstSize, const char* pFormat, ...); +int32 Vsnprintf(char* pDst, size_t dstSize, const char* pFormat, va_list args); + +template +int32 Snprintf(char(&dst)[DstSize], const char* pFormat, Args&&... 
args) +{ + return Snprintf(dst, DstSize, pFormat, args...); +} + +struct OsInfo +{ + DD_STATIC_CONST const char* kOsTypeWindows = "Windows"; + DD_STATIC_CONST const char* kOsTypeLinux = "Linux"; + DD_STATIC_CONST const char* kOsTypeDarwin = "Darwin"; + + char type[16]; /// The type of the OS, either "Windows", "Linux", or "Darwin". + + char name[32]; /// A human-readable string to identify the version of the OS running + char description[256]; /// A human-readable string to identify the detailed version of the OS running + char hostname[128]; /// The hostname for the machine + + struct UserInfo { + char name[32]; /// Username for the current user + char homeDir[128]; /// Path to the current user's home directory + //< This is typically stored in $HOME or %HOMEPATH% and looks like one of: + //< C:\Users\BobMarley + //< /home/bob_ross + //< /Users/BobTheBuilder + } user; + + uint64 physMemory; /// Total amount of memory available on host in bytes + uint64 swapMemory; /// Total amount of swap memory available on host in bytes +}; + + Result QueryOsInfo(OsInfo* pInfo); + +struct EtwSupportInfo + { + bool isSupported; ///< If true, indicates that the OS platform supports system monitoring, false otherwise. + bool hasPermission; ///< If true, indicates the account has the required permissions, false otherwise. + uint32 statusCode; ///< The status result returned when attempting to open a monitoring session. + char statusDescription[256]; ///< The textual status result returned when attempting to open a monitoring. + }; + + Result QueryEtwInfo(EtwSupportInfo* pInfo); + +} // Platform + +#ifndef DD_PRINT_FUNC +#define DD_PRINT_FUNC Platform::DebugPrint +#else +void DD_PRINT_FUNC(LogLevel logLevel, const char* format, ...); +#endif + +template +inline void LogString(const char *format, Ts&&... 
args) +{ + if (DD_WILL_PRINT(logLevel)) + { + DD_PRINT_FUNC(logLevel, format, Platform::Forward(args)...); + } +} + +// Increments a const pointer by numBytes by first casting it to a const uint8*. +DD_NODISCARD +constexpr const void* VoidPtrInc( + const void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) + numBytes); +} + +// Increments a pointer by numBytes by first casting it to a uint8*. +DD_NODISCARD +constexpr void* VoidPtrInc( + void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) + numBytes); +} + +// Decrements a const pointer by numBytes by first casting it to a const uint8*. +DD_NODISCARD +constexpr const void* VoidPtrDec( + const void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) - numBytes); +} + +// Decrements a pointer by numBytes by first casting it to a uint8*. +DD_NODISCARD +constexpr void* VoidPtrDec( + void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) - numBytes); +} + +/// Convert a `DevDriver::Result` into a human recognizable string. 
+static inline const char* ResultToString(Result result) +{ + switch (result) + { + //// Generic Result Code //// + case Result::Success: return "Success"; + case Result::Error: return "Error"; + case Result::NotReady: return "NotReady"; + case Result::VersionMismatch: return "VersionMismatch"; + case Result::Unavailable: return "Unavailable"; + case Result::Rejected: return "Rejected"; + case Result::EndOfStream: return "EndOfStream"; + case Result::Aborted: return "Aborted"; + case Result::InsufficientMemory: return "InsufficientMemory"; + case Result::InvalidParameter: return "InvalidParameter"; + case Result::InvalidClientId: return "InvalidClientId"; + case Result::ConnectionExists: return "ConnectionExists"; + case Result::FileNotFound: return "FileNotFound"; + case Result::FunctionNotFound: return "FunctionNotFound"; + case Result::InterfaceNotFound: return "InterfaceNotFound"; + case Result::EntryExists: return "EntryExists"; + case Result::FileAccessError: return "FileAccessError"; + case Result::FileIoError: return "FileIoError"; + case Result::LimitReached: return "LimitReached"; + case Result::MemoryOverLimit: return "MemoryOverLimit"; + + //// URI PROTOCOL //// + case Result::UriServiceRegistrationError: return "UriServiceRegistrationError"; + case Result::UriStringParseError: return "UriStringParseError"; + case Result::UriInvalidParameters: return "UriInvalidParameters"; + case Result::UriInvalidPostDataBlock: return "UriInvalidPostDataBlock"; + case Result::UriInvalidPostDataSize: return "UriInvalidPostDataSize"; + case Result::UriFailedToAcquirePostBlock: return "UriFailedToAcquirePostBlock"; + case Result::UriFailedToOpenResponseBlock: return "UriFailedToOpenResponseBlock"; + case Result::UriRequestFailed: return "UriRequestFailed"; + case Result::UriPendingRequestError: return "UriPendingRequestError"; + case Result::UriInvalidChar: return "UriInvalidChar"; + case Result::UriInvalidJson: return "UriInvalidJson"; + + //// Settings URI Service //// 
+ case Result::SettingsUriInvalidComponent: return "SettingsUriInvalidComponent"; + case Result::SettingsUriInvalidSettingName: return "SettingsUriInvalidSettingName"; + case Result::SettingsUriInvalidSettingValue: return "SettingsUriInvalidSettingValue"; + case Result::SettingsUriInvalidSettingValueSize: return "SettingsUriInvalidSettingValueSize"; + + //// Info URI Service //// + case Result::InfoUriSourceNameInvalid: return "InfoUriSourceNameInvalid"; + case Result::InfoUriSourceCallbackInvalid: return "InfoUriSourceCallbackInvalid"; + case Result::InfoUriSourceAlreadyRegistered: return "InfoUriSourceAlreadyRegistered"; + case Result::InfoUriSourceWriteFailed: return "InfoUriSourceWriteFailed"; + + //// Settings Service //// + case Result::SettingsInvalidComponent: return "SettingsInvalidComponent"; + case Result::SettingsInvalidSettingName: return "SettingsInvalidSettingName"; + case Result::SettingsInvalidSettingValue: return "SettingsInvalidSettingValue"; + case Result::SettingsInsufficientValueSize: return "SettingsInsufficientValueSize"; + case Result::SettingsInvalidSettingValueSize: return "SettingsInvalidSettingValueSize"; + } + + DD_PRINT(LogLevel::Warn, "Result code %u is not handled", static_cast(result)); + return "Unrecognized DevDriver::Result"; +} + +// Helper function for converting bool values into Result enums +// Useful for cases where Results and bools are interleaved in logic +static inline Result BoolToResult(bool value) +{ + return (value ? Result::Success : Result::Error); +} + +// Use this macro to mark Result values that have not been or cannot be handled correctly. +#define DD_UNHANDLED_RESULT(x) DevDriver::MarkUnhandledResultImpl((x), DD_STRINGIFY(x), DD_FILE, __LINE__, __func__) + +// Implementation for DD_UNHANDLED_RESULT. +// This is a specialized assert that should be used through the macro, and not called directly. +// This is implemented in ddPlatform.h, so that it has access to DD_ASSERT. 
+static inline void MarkUnhandledResultImpl( + Result result, + const char* pExpr, + const char* pFile, + int lineNumber, + const char* pFunc) +{ +#if defined(DD_OPT_ASSERTS_ENABLE) + if (result != Result::Success) + { + DD_PRINT(DevDriver::LogLevel::Error, + "%s (%d): Unchecked Result in %s: \"%s\" == \"%s\" (0x%X)\n", + pFile, + lineNumber, + pFunc, + pExpr, + ResultToString(result), + result); + } +#else + DD_UNUSED(result); + DD_UNUSED(pExpr); + DD_UNUSED(pFile); + DD_UNUSED(lineNumber); + DD_UNUSED(pFunc); +#endif +} + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h new file mode 100644 index 0000000000..48dede89c2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h @@ -0,0 +1,378 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#if !defined(_MSC_VER) +#include +#endif + +namespace DevDriver +{ + namespace Platform + { + /// Templated LockGuard class. Works with any type that implements Lock() and Unlock() + template + class LockGuard + { + public: + explicit LockGuard(T &lock) : m_lock(lock) { lock.Lock(); } + ~LockGuard() { m_lock.Unlock(); } + private: + T &m_lock; + }; + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline uint32 Log2(T u) ///< Value to compute the logarithm of. + { + uint32 logValue = 0; + + while (u > 1) + { + ++logValue; + u >>= 1; + } + return logValue; + } + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline constexpr uint32 _ConstLog2(T u, uint32 logValue) ///< Value to compute the logarithm of. + { + return (u > 1) ? _ConstLog2(u >> 1, logValue + 1) : logValue; + } + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline constexpr uint32 ConstLog2(T u) ///< Value to compute the logarithm of. 
+ { + return _ConstLog2(u, 0); + } + + static_assert(ConstLog2(1) == 0, "ConstLog2 failure"); + static_assert(ConstLog2(2) == 1, "ConstLog2 failure"); + static_assert(ConstLog2(128) == 7, "ConstLog2 failure"); + static_assert(ConstLog2(255) == 7, "ConstLog2 failure"); + + /// Computes 2 raised to the power of the value provided + /// + /// @returns 2 ^ (u) + template + inline constexpr T Pow2(T u) + { + return ((T)1 << u); + } + + static_assert(Pow2(0) == 1, "Pow2 failure"); + static_assert(Pow2(1) == 2, "Pow2 failure"); + static_assert(Pow2(7) == 128, "Pow2 failure"); + + /// Determines if a value is a power of two. + /// + /// @returns True if it is a power of two, false otherwise. + inline constexpr bool IsPowerOfTwo(uint64 value) + { + return (value == 0) ? false : ((value & (value - 1)) == 0); + } + + /// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 + /// alignments are supported by this function. + /// + /// @returns Aligned value. + template + inline constexpr T Pow2Align( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). + { + return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); + } + + /// Rounds the specified uint 'value' up to the nearest power of 2 + /// + /// @returns Power of 2 padded value. + template + inline T Pow2Pad(T value) ///< Value to pad. + { + T ret = 1; + if (IsPowerOfTwo(value)) + { + ret = value; + } + else + { + while (ret < value) + { + ret <<= 1; + } + } + + return ret; + } + + /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr variant. + /// + /// @returns Power of 2 padded value. + template + inline constexpr T _ConstPow2Pad(T value, T padded) ///< Value to pad. + { + return (padded < value) ? _ConstPow2Pad(value, padded << 1) : padded; + } + + /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr variant. + /// + /// @returns Power of 2 padded value. 
+ template + inline constexpr T ConstPow2Pad(T value) ///< Value to pad. + { + return (IsPowerOfTwo(value)) ? value : _ConstPow2Pad(value, (T)1); + } + + static_assert(ConstPow2Pad(512) == 512, "ConstPow2Pad failure"); + static_assert(ConstPow2Pad(511) == 512, "ConstPow2Pad failure"); + static_assert(ConstPow2Pad(257) == 512, "ConstPow2Pad failure"); + + /// Finds the smallest of two values + /// + /// @returns a if a < b, otherwise b. + template + inline constexpr T Min(const T &a, const T &b) + { + return ((a < b) ? a : b); + } + + /// Finds the larger of two values + /// + /// @returns a if a > b, otherwise b. + template + inline constexpr T Max(const T &a, const T &b) + { + return ((a > b) ? a : b); + } + + // Given a type T, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // Given a type T&, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // Given a type T&&, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // std::move equivalent + template + inline typename RemoveRef::Type&& Move(T&& obj) + { + return static_cast::Type&&>(obj); + } + + // std::forward equivalent + template + inline T&& Forward(typename RemoveRef::Type&& args) + { + return static_cast(args); + } + + // std::forward equivalent + template + inline T&& Forward(typename RemoveRef::Type& args) + { + return static_cast(args); + } + + // Returns the contents of Value in a new variable, and assign newValue into the memory occupied by value. 
+ template + inline T Exchange(T& value, U&& newValue) + { + T oldValue = Move(value); + value = Forward(newValue); + return (oldValue); + } + + // Convenience structure that defines Value as either true or false, and Type as either TrueType or FalseType + template + struct BoolType + { + static const bool Value = value; + using Type = BoolType; + }; + + using FalseType = BoolType; + using TrueType = BoolType; + + // Struct whose ::Type member is undefined if the first condition is not true + template + struct EnableIf + { + }; + + // Struct whose ::Type member is equal to T if the first condition is true. + template + struct EnableIf + { + typedef T Type; + }; + + template + struct IsPointer : FalseType + { + }; + + template + struct IsPointer : TrueType + { + }; + +#if defined(_MSC_VER) + // If we are building with MSVC we want to use the compiler intrinsics here. This is primarily because building with + // the /kernel flag precludes the use of the C++ type traits library. For all other compilers we simply implement this + // using the standard C++ library. + + // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. + template + struct IsConvertible : BoolType<__is_convertible_to(T, U)> + { + }; + + // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments + // provided. + template + struct IsConstructible : BoolType<__is_constructible(T, Args...)> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsAbstract : BoolType<__is_abstract(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is a POD (plain old data) type, and false otherwise. + template + struct IsPod : BoolType<__is_pod(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T has a standard layout, and false otherwise. 
+ template + struct IsStandardLayout : BoolType<__is_standard_layout(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is trivially destructible, and false otherwise. + template + struct IsTriviallyDestructible : BoolType<__is_trivially_destructible(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. + template + struct IsEnum : BoolType<__is_enum(T)> + { + + }; +#else + // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. + template + struct IsConvertible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments + // provided. + template + struct IsConstructible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsAbstract : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is a POD (plain old data) type, and false otherwise. + template + struct IsPod : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T has a standard layout, and false otherwise. + template + struct IsStandardLayout : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is trivially destructible, and false otherwise. + template + struct IsTriviallyDestructible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. 
+ template + struct IsEnum : BoolType::value> + { + + }; +#endif + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h new file mode 100644 index 0000000000..300bae754e --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h @@ -0,0 +1,115 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +#pragma once + +#if defined(_KERNEL_MODE) +static_assert(false, "This header is for user mode windows, and it does not work in kernel mode."); +#endif + +// Our code expects these defined before including Windows.h. +// However, we need to guard against clients defining them too. +#ifndef _CRT_RAND_S + #define _CRT_RAND_S +#endif + +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif + +#ifndef NOMINMAX + #define NOMINMAX +#endif + +// WIN32_NO_STATUS makes Windows.h not include macro definitions from winnt.h +// which collide with those from ntstatus.h. This avoids compilation errors +// when other files that include ntstatus.h also include this file. +#define WIN32_NO_STATUS +#include +#undef WIN32_NO_STATUS + +#include + +#define DD_RESTRICT __restrict + +#define DD_DEBUG_BREAK() __debugbreak() + +namespace DevDriver +{ + namespace Platform + { + /* platform functions for performing atomic operations */ + typedef volatile LONG Atomic; + DD_CHECK_SIZE(Atomic, sizeof(int32)); + + typedef volatile LONG64 Atomic64; + DD_CHECK_SIZE(Atomic64, sizeof(int64)); + + struct EmptyStruct {}; + + struct MutexStorage + { + CRITICAL_SECTION criticalSection; +#if !defined(NDEBUG) + Atomic lockCount; +#endif + }; + typedef Handle SemaphoreStorage; + typedef HANDLE EventStorage; + typedef HANDLE ThreadHandle; + typedef DWORD ThreadReturnType; + typedef HMODULE LibraryHandle; + + constexpr ThreadHandle kInvalidThreadHandle = NULL; + + // Maximum supported size for thread names, including NULL byte + // This exists because some platforms have hard limits on thread name size. + // Windows doesn't seem to have a thread name size limit, but we use this variable to control + // a formatting buffer as well and we want to keep it reasonably small since it's stack allocated. 
+ static constexpr size_t kThreadNameMaxLength = 64; + + #define DD_APIENTRY APIENTRY + + namespace Windows + { + // Windows specific functions required for in-memory communication + Handle CreateSharedSemaphore(uint32 initialCount, uint32 maxCount); + Handle CopySemaphoreFromProcess(ProcessId processId, Handle hObject); + Result SignalSharedSemaphore(Handle pSemaphore); + Result WaitSharedSemaphore(Handle pSemaphore, uint32 millisecTimeout); + void CloseSharedSemaphore(Handle pSemaphore); + + Handle CreateSharedBuffer(Size bufferSizeInBytes); + void CloseSharedBuffer(Handle hSharedBuffer); + + Handle MapSystemBufferView(Handle hBuffer, Size bufferSizeInBytes); + Handle MapProcessBufferView(Handle hBuffer, ProcessId processId); + void UnmapBufferView(Handle hSharedBuffer, Handle hSharedBufferView); + + // Whether or not the user has enabled Windows Developer Mode on their system + // See: https://github.com/MicrosoftDocs/windows-uwp/blob/docs/hub/apps/get-started/enable-your-device-for-development.md + bool IsWin10DeveloperModeEnabled(); + } + } +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h new file mode 100644 index 0000000000..21748a8e18 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h @@ -0,0 +1,62 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "protocolServer.h" + +namespace DevDriver +{ + class IMsgChannel; + + class BaseProtocolServer : public IProtocolServer + { + public: + virtual ~BaseProtocolServer(); + + Protocol GetProtocol() const override final { return m_protocol; }; + SessionType GetType() const override final { return SessionType::Server; }; + Version GetMinVersion() const override final { return m_minVersion; }; + Version GetMaxVersion() const override final { return m_maxVersion; }; + + bool GetSupportedVersion(Version minVersion, Version maxVersion, Version * version) const override final; + + virtual void Finalize() override; + protected: + BaseProtocolServer(IMsgChannel* pMsgChannel, Protocol protocol, Version minVersion, Version maxVersion); + + // Helper functions for working with SizedPayloadContainers + Result SendPayload(ISession* pSession, const SizedPayloadContainer* pPayload, uint32 timeoutInMs); + Result ReceivePayload(ISession* pSession, SizedPayloadContainer* pPayload, uint32 timeoutInMs); + + IMsgChannel* const m_pMsgChannel; + const Protocol m_protocol; + const Version m_minVersion; + const Version m_maxVersion; + + bool m_isFinalized; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h new file mode 100644 index 0000000000..67cc6aecb9 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h @@ -0,0 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcDefs.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h new file mode 100644 index 0000000000..c34d52b362 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h @@ -0,0 +1,48 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#ifndef DD_PLATFORM_WINDOWS_UM + #if _WIN32 && !_KERNEL_MODE + #define DD_PLATFORM_WINDOWS_UM 1 + #define DD_PLATFORM_IS_UM 1 + #endif +#endif + +#ifndef DD_PLATFORM_WINDOWS_KM + #if _WIN32 && _KERNEL_MODE + #define DD_PLATFORM_WINDOWS_KM 1 + #define DD_PLATFORM_IS_KM 1 + #endif +#endif + +#ifndef DD_PLATFORM_LINUX_UM + #ifdef __linux__ + #define DD_PLATFORM_LINUX_UM 1 + #define DD_PLATFORM_IS_UM 1 + #define DD_PLATFORM_IS_GNU 1 + #endif +#endif diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h new file mode 100644 index 0000000000..481338b52b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h @@ -0,0 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcPlatform.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h new file mode 100644 index 0000000000..5dd7d29db6 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h @@ -0,0 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcTemplate.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h new file mode 100644 index 0000000000..2a61155c9e --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h @@ -0,0 +1,291 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "ddPlatform.h" + +namespace DevDriver +{ + namespace TransferProtocol + { + class ServerBlock; + } + + // The maximum allowed name for a service name + DD_STATIC_CONST size_t kMaxUriServiceNameLength = 128; + + enum struct URIDataFormat : uint32 + { + Unknown = 0, + Text, + Binary, + Count + }; + + // An interface to write bytes. + class IByteWriter + { + protected: + virtual ~IByteWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Write exactly `length` bytes. + virtual void WriteBytes(const void* pBytes, size_t length) = 0; + + // Write a value as a byte array. + // N.B.: Be mindful of your struct's implicit padding! + template + void Write(const T& value) + { + static_assert(!Platform::IsPointer::Value, "Writing a pointer is likely an error. Cast to an integer type if you mean it."); + WriteBytes(&value, sizeof(value)); + } + }; + + // An interface to write and validate text. + class ITextWriter + { + protected: + virtual ~ITextWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Write formatted text. + // Try and only pass string literals as `pFmt`. Prefer: Write("%s", myGeneratedBuffer); + virtual void Write(const char* pFmt, ...) 
= 0; + + // Write specific types + virtual void Write(uint64 value) = 0; + virtual void Write(uint32 value) = 0; + virtual void Write(uint16 value) = 0; + virtual void Write(uint8 value) = 0; + virtual void Write(int64 value) = 0; + virtual void Write(int32 value) = 0; + virtual void Write(int16 value) = 0; + virtual void Write(double value) = 0; + virtual void Write(float value) = 0; + virtual void Write(bool value) = 0; + virtual void Write(char value) = 0; + }; + + // An interface to write and validate structured data - e.g. json or message pack + class IStructuredWriter + { + protected: + virtual ~IStructuredWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Structured data is often nullable. + // Write a "null" value. + virtual void ValueNull() = 0; + + // ===== Collection Writers ==================================================================================== + + // Begin writing a new list collection. + virtual void BeginList() = 0; + + // End the current list collection. + virtual void EndList() = 0; + + // Begin writing a new map collection. + virtual void BeginMap() = 0; + + // End the current map collection. + virtual void EndMap() = 0; + + // Write a key into a map. 
+ virtual void Key(const char* pKey) = 0; + + // ===== Value Writers ========================================================================================= + + virtual void Value(const char* pValue) = 0; + virtual void Value(const char* pValue, size_t length) = 0; + + virtual void Value(uint64 value) = 0; + virtual void Value(uint32 value) = 0; + virtual void Value(uint16 value) = 0; + virtual void Value(uint8 value) = 0; + virtual void Value(int64 value) = 0; + virtual void Value(int32 value) = 0; + virtual void Value(int16 value) = 0; + virtual void Value(int8 value) = 0; + virtual void Value(double value) = 0; + virtual void Value(float value) = 0; + virtual void Value(bool value) = 0; + virtual void Value(char value) = 0; + + /// Writes an enum value as a String or hex value + /// If DevDriver::ToString(Enum) returns NULL or an empty string, it will hex-encode the integer value. + /// Otherwise, it will write that string (a NULL name is never passed on to Value()) + template <typename Enum> + void ValueEnumOrHex(Enum value) + { + const char* pString = ToString(value); + if ((pString != nullptr) && (strcmp(pString, "") != 0)) + { + Value(pString); + } + else + { + Valuef("0x%x", value); + } + } + + // Write a formatted string; the text is formatted into a 1024-byte stack buffer before being written + template <typename... Args> + void Valuef(const char* pFmt, Args&&... args) + { + char buffer[1024]; + Platform::Snprintf(buffer, pFmt, args...); + Value(buffer); + } + + // ===== Key + Value Writers =================================================================================== + + // Write a key-value pair where the value will be a list. + void KeyAndBeginList(const char* pKey) { Key(pKey); BeginList(); } + + // Write a key-value pair where the value will be a map. + void KeyAndBeginMap(const char* pKey) { Key(pKey); BeginMap(); } + + // Write a key-value pair. 
+ void KeyAndValue(const char* pKey, const char* pValue) { Key(pKey); Value(pValue); } + void KeyAndValue(const char* pKey, const char* pValue, size_t length) { Key(pKey); Value(pValue, length); } + void KeyAndValue(const char* pKey, uint64 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, uint32 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, int64 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, int32 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, double value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, float value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, bool value) { Key(pKey); Value(value); } + + template <typename Enum> + void KeyAndValueEnumOrHex(const char* pKey, Enum value) { Key(pKey); ValueEnumOrHex(value); } + + // Write a key-value pair where the value will be a "null" value. + void KeyAndValueNull(const char* pKey) { Key(pKey); ValueNull(); } + + // Write a key-value pair with a formatted value + template <typename... Args> + void KeyAndValuef(const char* pKey, const char* pFmt, Args&&... args) { Key(pKey); Valuef(pFmt, args...); } + }; + + // An aggregate of the POST metadata for a request. + struct PostDataInfo + { + const void* pData; // Immutable view of the post data + uint32 size; // Size of the post data in bytes + URIDataFormat format; // Format of the post data - i.e. how to read it + + // Zero initialize the struct. 
+ PostDataInfo() + { + memset(this, 0, sizeof(*this)); + } + }; + + // An interface that represents a unique URI request + class IURIRequestContext + { + protected: + virtual ~IURIRequestContext() {} + + public: + // Retrieve the request argument string + // N.B: This is non-const and designed to be mutated + virtual char* GetRequestArguments() = 0; + + // Retrieve information about the post data of this request + virtual const PostDataInfo& GetPostData() const = 0; + + // Creates and returns a Writer to copy bytes into the response block. + // Only a single writer is allowed per request context. + // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginByteResponse(IByteWriter** ppWriter) = 0; + + // Creates and returns a Writer to copy text into the response block. + // Only a single writer is allowed per request context. + // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginTextResponse(ITextWriter** ppWriter) = 0; + + // Creates and returns a Writer to copy json into the response block. + // Only a single writer is allowed per request context. 
+ // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginJsonResponse(IStructuredWriter** ppWriter) = 0; + }; + + struct URIResponseHeader + { + // The size of the response data in bytes + size_t responseDataSizeInBytes; + + // The format of the response data + URIDataFormat responseDataFormat; + }; + + // Base class for URI services + class IService + { + public: + virtual ~IService() {} + + // Returns the name of the service + virtual const char* GetName() const = 0; + + // Returns the service version + virtual Version GetVersion() const = 0; + + // Attempts to handle a request from a client + virtual Result HandleRequest(IURIRequestContext* pContext) = 0; + + // Determines the size limit for post data requests for the client request. By default services + // will not accept any post data. The pArguments parameter must remain non-const because the + // service may need to manipulate it for further processing. + virtual size_t QueryPostSizeLimit(char* pArguments) const + { + DD_UNUSED(pArguments); + return 0; + } + + protected: + IService() {}; + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h new file mode 100644 index 0000000000..123e9eb7db --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h @@ -0,0 +1,103 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "msgChannel.h" +#include "msgTransport.h" +#include "protocols/systemProtocols.h" +#include "protocols/typemap.h" + +namespace DevDriver +{ + class IProtocolServer; + + // Server Creation Info + // This struct extends the MessageChannelCreateInfo struct and adds information about the destination host + // the client will connect to. It additionally allows specifying protocol servers to enable during initialization. + // See msgChannel.h for a full list of members. + struct ServerCreateInfo : public MessageChannelCreateInfo + { + HostInfo connectionInfo; // Connection information describing how the Server should connect + // to the message bus. 
+ ProtocolFlags servers; // Set of boolean values indicating which servers should be created + // during initialization. + }; + + DD_STATIC_CONST uint32 kQueryStatusTimeoutInMs = 50; + + class DevDriverServer + { + public: + static bool IsConnectionAvailable(const HostInfo& hostInfo, uint32 timeout = kQueryStatusTimeoutInMs); + + explicit DevDriverServer(const AllocCb& allocCb, const ServerCreateInfo& createInfo); + ~DevDriverServer(); + + Result Initialize(); + void Finalize(); + void Destroy(); + + const AllocCb& GetAllocCb() const { return m_allocCb; } + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + // Called by the driver to mark the end of Platform and the start of device initialization. + // Starting with GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION the driver should call the driver control + // functions directly. + void StartDeviceInit(); +#endif + + bool IsConnected() const; + IMsgChannel* GetMessageChannel() const; + + DriverControlProtocol::DriverControlServer* GetDriverControlServer(); + RGPProtocol::RGPServer* GetRGPServer(); + EventProtocol::EventServer* GetEventServer(); + SettingsURIService::SettingsService* GetSettingsService(); + InfoURIService::InfoService* GetInfoService(); + + bool ShouldShowOverlay(); + + private: + Result InitializeProtocols(); + void DestroyProtocols(); + + Result RegisterProtocol(Protocol protocol); + void UnregisterProtocol(Protocol protocol); + void FinalizeProtocol(Protocol protocol); + + IMsgChannel* m_pMsgChannel; + AllocCb m_allocCb; + ServerCreateInfo m_createInfo; + + template + inline Result RegisterProtocol(Args... 
args); + + template + inline ProtocolServerType* GetServer(); + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h new file mode 100644 index 0000000000..ac24de416a --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h @@ -0,0 +1,590 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include + +#define GPUOPEN_INTERFACE_MAJOR_VERSION 42 + +#define GPUOPEN_INTERFACE_MINOR_VERSION 1 + +#define GPUOPEN_INTERFACE_VERSION ((GPUOPEN_INTERFACE_MAJOR_VERSION << 16) | GPUOPEN_INTERFACE_MINOR_VERSION) + +#define GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION 38 + +#ifndef GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION + static_assert(false, "Client must define GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION."); +#else + static_assert((GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION) && + (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION <= GPUOPEN_INTERFACE_MAJOR_VERSION), + "The specified GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION is not supported."); +#endif + +// Next version number for interface breaking changes +#define DD_UNRELEASED_MAJOR_VERSION 40 + +/* +*********************************************************************************************************************** +*| Version | Change Description | +*| ------- | ---------------------------------------------------------------------------------------------------------| +*| 42.1 | Move Escape Commands to the shared header for access outside of message.h | +*| 42.0 | Updates RGP Protocol to support SPM counters and SE masking. | +*| 41.0 | Updates DriverControlProtocol to allow user to query device clock frequencies for a given | +*| | clock mode without changing the clock mode. | +*| 40.0 | Moves DriverStatus enum out of DriverControlProtocol and into gpuopen.h, and renames several | +*| | DriverControlProtocol functions. | +*| 39.0 | Simplified the LoggingClient interface to remove the internal pending message requirement. | +*| | Removed kInfiniteTimeout and replaced its uses with kLogicFailureTimeout. | +*| | Decoupled RGP trace parameters from trace execution. 
| +*| 38.0 | Added support for specifying hostname in ListenerCreateInfo and renamed enableUWP flag to | +*| | enableKernelTransport. | +*| 37.0 | Added support for Querying ClientInfo from DriverControlProtocol | +*| 36.1 | Removed internal log message queue inside LoggingClient. This improves performance significantly. | +*| 36.0 | Added support for capturing the RGP trace on specific frame or dispatch. | +*| | Added bitfield to control whether driver internal code objects are included in the code object database. | +*| 35.0 | Updated Settings URI enum SettingType to avoid X11 macro name collision. | +*| 34.0 | Updated URI services to define a version number for each service. | +*| 33.0 | Abstracts URIRequestContext into an abstract interface. | +*| 32.0 | Updated RGPClient::EndTrace to support user specified timeout values. This allows tools to support | +*| | long running traces via user controlled cancellation dialogs. | +*| 31.0 | Clean up DevDriverClient and DevDriverServer create info structs. Replace TransportCreateInfo | +*| | struct with MessageChannelCreateInfo and HostInfo structs. | +*| 30.2 | Added support for RGP v6 protocol which supports trace trigger markers. | +*| 30.1 | Add Push transfer support to the transfer protocol. Added PushBlock class, added v2 of the | +*| | TransferProtocol, and did a lot of internal cleanup. Legacy interfaces will be deprecated in a future | +*| | interface version change alongside URI changes. | +*| 30.0 | Remove CloseSession and OrphanSession from the public ISession object interface, and move the | +*| | functionality into the Session class. | +*| 29.0 | Added a ResponseDataFormat enum to the URI protocol to distinguish between binary and text responses. | +*| 28.0 | Formally deprecate legacy KMD client manager support in the Listener. | +*| 27.2 | Updated FindFirstClient to support returning the matching ClientMetadata struct. | +*| 27.1 | Added PipelineDumpsEnabled status flag. 
| +*| 27.0 | Deprecate global client status flags + replace it with client metadata. | +*| 26.0 | Add new trace parameters in the RGP protocol. | +*| 25.0 | Initial refactor of LoggingProtocol. Removes Subcategories, being able to set/clear filter outside of a | +*| | trace, and significantly reduces the complexity that is involved in using it. | +*| 24.0 | Expanded driver initialization concept in driver control protocol. | +*| 23.0 | Modified RGP client API usage pattern to be uniform across protocol versions. | +*| 22.0 | Refactor RGP client interface to support calculating transfer progress. | +*| 21.1 | Added backwards compatible workaround for the session termination bug until we get the fix in mainline. | +*| 21.0 | Enable link disconnection detection for socket based transports. | +*| 20.0 | Added support for specifying the clock mode used during RGP profiling. | +*| 19.0 | Refactor platform thread functions so that they are contained in a class. This is the last part of the | +*| | platform library that needed to be refactored, so future work will be focused on migrating the message | +*| | bus components to use the main platform library again. | +*| 18.0 | Consolidate DevDriver::DebugLevel and DevDriver::Message::DebugLevel into DevDriver::LogLevel. | +*| 17.0 | Rename DD_VERSION_IS_SUPPORTED macro to DD_VERSION_SUPPORTS for conciseness. | +*| 16.1 | Rework session version negotiation to allow clients to support multiple server versions. | +*| 16.0 | Change semantics of CreateProtocolClient to AcquireProtocolClient. This aligns better with | +*| | ReleaseProtocolClient in terms of semantics. | +*| 15.0 | Added support for memory allocator callbacks via AllocCb. | +*| 14.1 | Added DisableTrace call in RGP server to allow drivers to disable future traces if necessary. | +*| 14.0 | Add TraceParameters to the RGP protocol to allow for configuration of trace behavior. 
| +*| 13.0 | Deprecate DevDriverClient::CreateProtocolClient() in favor of typesafe templated version. | +*| 12.0 | Deprecate API features tied to legacy network protocol versions: | +*| | * Replace ConnectToRemoteClient with Connect, which now returns more detailed errors on failure | +*| | * Eliminate Send and ReceiveSessionMessage functions in IMsgChannel and SessionManager | +*| | * Rename AuthenticationFailed to VersionMismatch since it is more semantically accurate | +*| | * Update IMsgChannel::Update so that it takes a default timeout value, get rid of m_receiveTimeoutInMs | +*| 11.5 | Updated server to remove GetVersion() call and pass version into AcceptSession() instead. This allows | +*| | servers to potentially implement backwards compatibility for older client versions. Additionally, | +*| | completely eliminate SessionTermination type in favor of expanding Result type. This allows propagating | +*| | more information on connection failures back to clients, as well as streamlines some code. | +*| 11.4 | Implement per-protocol versioning. Client protocol is sent as part of session request, server decides | +*| | whether or not to accept session both from client and from version. Also rearrange how certain network | +*| | operations work: Syn now stores the initial session ID in the sessionId field of the message, Rst now | +*| | includes a result code, and closing a session now implicitly flushes both the client/server. Rst | +*| | Is also sent on just about every unknown session packet received, allowing faster error detection and | +*| | recovery. Bump network version number | +*| 11.3 | Change ProcessId type from 64bit to 32bit integer and bump network protocol version. | +*| 11.2 | Update the network protocol to give external protocols values from 0-223 and system protocols 224-255. | +*| | Also clean up + deprecate some of the constants associated with protocols. | +*| 11.1 | Force alignment of all network transmitted structs, as well as pad. 
This is a breaking change for the | +*| | network protocol, but is otherwise API compatible. | +*| 11.0 | Deprecate the Protocol::ClientManangement enum, as well as ReadMessageBuffer and SendMessageBuffer in | +*| | message.lib. | +*| 10.0 | Remove callback from MessageChannel to prevent usage that can cause deadlocking. | +*| 9.0 | Formalized support for selective discard of non-session messages based on right in the message. | +*| | Implementation is that the sequence field of a message can be populated with the contents of a | +*| | ClientMetadata struct, which is then used by the receiving message channel to determine if it should | +*| | respond. Decision is based on whether or not the metadata matches the metadata of the receiving client. | +*| 8.0 | Added support for default settings values in the settings protocol. Removed support for min and max | +*| | settings values since the scripts don't actually support those anyways. | +*| 7.0 | Added a Finalize function to DevDriverServer and all protocol server objects. This function now handles | +*| | the wait on start functionality for drivers internally. Finalize should now be called instead of the old | +*| | wait on start logic in client drivers. | +*| 6.0 | Update client protocol management so that DevDriverClient no longer caches a single instance of each | +*| | client protocol, and add ability for clients to directly create more than one client protocol instance. | +*| | Additionally, make changes to underlying message channel/transport API that is not backwards compatible, | +*| | as well as rename QueryClientInfoResponse to ClientInfoResponse and QueryClientInfoResponsePayload to | +*| | ClientInfoStruct. | +*| 5.0 | Update network protocol to allow specifying status flags at registration time, and add system message. | +*| | to indicate when a driver has been halted. Additionally, this changes the format of the client | +*| | registration packets so as to better detect version mismatch. 
It also fixes the ClientManangement typo. | +*| 4.0 | Refactor interface so as to better delineate between system protocols/client protocols, as well as add | +*| | ability to query protocol availability. Requires version bump, so also formally deprecated | +*| | Result::Timeout and ClientStatusFlags::ProfilingEnabled, as well as moved entire SessionProtocol | +*| | namespace out of the public headers. | +*| 3.1 | Introduce kNumberClientProtocols to replace usage of Protocol::Count | +*| 3.0 | Rename SettingsProtocol::SettingType::Bool to Boolean to avoid conflict with Xlib macro. Additionally | +*| | formally deprecate Result::Timeout. | +*| 2.2 | Added None (0) to ClientStatusFlags enum. | +*| 2.1 | Added kNamedPipeName to global namespace. | +*| 2.0 | Added functionality for enabling and disabling traces in RGPServer. Traces must now be explicitly | +*| | enabled before remote trace requests will succeed. | +*| 1.2 | Added AbortTrace() function to RGPServer. | +*| 1.1 | Added support for RGP protocol. | +*| 1.0 | Initial versioned release. 
| +*********************************************************************************************************************** +*/ + +#define GPUOPEN_RGP_SPM_COUNTERS_VERSION 42 +#define GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION 41 +#define GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION 40 +#define GPUOPEN_DECOUPLED_RGP_PARAMETERS_VERSION 39 +#define GPUOPEN_SIMPLER_LOGGING_VERSION 39 +#define GPUOPEN_LISTENER_HOSTNAME_VERSION 38 +#define GPUOPEN_SETTINGS_URI_LINUX_BUILD 35 +#define GPUOPEN_VERSIONED_URI_SERVICES_VERSION 34 +#define GPUOPEN_URIINTERFACE_CLEANUP_VERSION 33 +#define GPUOPEN_LONG_RGP_TRACES_VERSION 32 +#define GPUOPEN_CREATE_INFO_CLEANUP_VERSION 31 +#define GPUOPEN_SESSION_INTERFACE_CLEANUP_VERSION 30 +#define GPUOPEN_URI_RESPONSE_FORMATS_VERSION 29 +#define GPUOPEN_DEPRECATE_LEGACY_KMD_VERSION 28 +#define GPUOPEN_DISTRIBUTED_STATUS_FLAGS_VERSION 27 +#define GPUOPEN_RGP_TRACE_PARAMETERS_V3_VERSION 26 +#define GPUOPEN_LOGGING_SIMPLIFICATION_VERSION 25 +#define GPUOPEN_DRIVERCONTROL_INITIALIZATION_VERSION 24 +#define GPUOPEN_RGP_UNIFORM_API_VERSION 23 +#define GPUOPEN_RGP_PROGRESS_VERSION 22 +#define GPUOPEN_KEEPALIVE_VERSION 21 +#define GPUOPEN_PROFILING_CLOCK_MODES_VERSION 20 +#define GPUOPEN_THREAD_REFACTOR_VERSION 19 +#define GPUOPEN_LOGLEVEL_CLEANUP_VERSION 18 +#define GPUOPEN_RENAME_MACRO_VERSION 17 +#define GPUOPEN_PROTOCOL_CLIENT_REUSE_VERSION 16 +#define GPUOPEN_MEMORY_ALLOCATORS_VERSION 15 +#define GPUOPEN_RGP_TRACE_PARAMETERS_VERSION 14 +#define GPUOPEN_DEPRECATE_CREATEPROTOCOLCLIENT_VERSION 13 +#define GPUOPEN_DEPRECATE_LEGACY_NETAPI_VERSION 12 +#define GPUOPEN_POST_GDC_CLEANUP_VERSION 11 +#define GPUOPEN_DEPRECATE_EXTERNAL_CALLBACK_VERSION 10 +#define GPUOPEN_SELECTIVE_RESPOND_VERSION 9 +#define GPUOPEN_DEFAULT_SETTINGS_VERSION 8 +#define GPUOPEN_SERVER_FINALIZE_VERSION 7 +#define GPUOPEN_DEPRECATE_LEGACY_VERSION 6 +#define GPUOPEN_CLIENT_REGISTRATION_VERSION 5 +#define GPUOPEN_PROTOCOL_CLEANUP_VERSION 4 +#define GPUOPEN_LINUX_BUILD_VERSION 3 
+#define GPUOPEN_EXPLICIT_ENABLE_RGP_VERSION 2 +#define GPUOPEN_INITIAL_VERSION 1 + +// This will be properly defined when RMV 1.1 features are complete, defining it now allows +// clients to code to the interface ahead of all of the work being complete. +#define GPUOPEN_RMV_1_1_VERSION 0xFFFF + +#define DD_VERSION_SUPPORTS(x) (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= x) + +namespace DevDriver +{ + typedef uint16_t ClientId; + typedef uint32_t SessionId; + typedef uint8_t MessageCode; + typedef uint16_t WindowSize; + typedef uint64_t Sequence; + typedef uint16_t Version; + typedef uint16_t StatusFlags; + +#if DD_VERSION_SUPPORTS(GPUOPEN_SIMPLER_LOGGING_VERSION) + // A common timeout in milliseconds for components to use when they do not expect timeout to fail. + // If an operation that uses this timeout returns Result::NotReady, consider it a fatal error. + DD_STATIC_CONST uint32 kLogicFailureTimeout = 1000; +#else + DD_STATIC_CONST uint32 kInfiniteTimeout = ~(0u); + DD_STATIC_CONST uint32 kLogicFailureTimeout = kInfiniteTimeout; +#endif + DD_STATIC_CONST uint32 kNoWait = (0u); + + //////////////////////////// + // Driver states + enum struct DriverStatus : uint32 + { + Running = 0, + Paused, + HaltedOnDeviceInit, + EarlyDeviceInit, + LateDeviceInit, + PlatformInit, + HaltedOnPlatformInit, + HaltedPostDeviceInit, + Count + }; + + //////////////////////////// + // Client status codes + enum struct ClientStatusFlags : StatusFlags + { + None = 0, + DeveloperModeEnabled = (1 << 0), + DeviceHaltOnConnect = (1 << 1), + GpuCrashDumpsEnabled = (1 << 2), + PipelineDumpsEnabled = (1 << 3), + PlatformHaltOnConnect = (1 << 4), + DriverInitializer = (1 << 5) + }; + + DD_CHECK_SIZE(ClientId, 2); + DD_STATIC_CONST int16 kRouterPrefixWidth = 3; + DD_STATIC_CONST int16 kRouterPrefixShift = (int16)(16 - kRouterPrefixWidth); + DD_STATIC_CONST ClientId kClientIdMask = (1 << kRouterPrefixShift) - 1; + DD_STATIC_CONST ClientId kRouterPrefixMask = static_cast(~(kClientIdMask)); + + 
union ProtocolFlags + { + struct DD_ALIGNAS(4) + { + // TODO: Replace logging, settings, and gpuCrashDump with "reserved" once all driver usage is removed. + uint32 logging : 1; + uint32 settings : 1; + uint32 driverControl : 1; + uint32 rgp : 1; + uint32 etw : 1; + uint32 gpuCrashDump : 1; + uint32 event : 1; + uint32 reserved : 25; + }; + uint32 value; + }; + + DD_CHECK_SIZE(ProtocolFlags, 4); + + //////////////////////////// + // Component definitions + enum struct Component : uint8 + { + Unknown = 0, + Server, + Tool, + Driver, + Count + }; + + struct DD_ALIGNAS(4) ClientMetadata + { + ProtocolFlags protocols; + Component clientType; + uint8 reserved; + StatusFlags status; + + // For System messages, which are not session-based, we alias the sequence field as ClientMetadata. This constructor + // is provided to help unpack the raw 64-bit sequence field into a ClientMetadata struct without needing to type-cast + explicit ClientMetadata(uint64 value) + { + // If we're going to alias as a 64-bit value, make sure the struct is still just 64-bits) + static_assert(sizeof(uint64) == sizeof(ClientMetadata), + "Size of ClientMetadata is no longer 64-bits, alias constructor needs updating"); + + // Bits 0-31 are the ProtocolFlags + protocols.value = static_cast(value & 0xFFFF); + + // Bits 32-39 are the Component + clientType = static_cast((value & 0xFF00000000) >> 32); + + // Bits 40-47 are reserved, ignore them and zero initialize + reserved = 0; + + // Bits 48-63 are the StatusFlags + status = static_cast((value & 0xFFFF000000000000) >> 48); + } + + // Default constructor, default initialize everything + ClientMetadata() = default; + + // Returns true if all values are default values + bool IsDefault() const + { + return ((protocols.value == 0) && (clientType == Component::Unknown) && (status == 0)); + } + + // Test if all non-zero fields in the ClientMetadata value are contained in the function parameter + bool Matches(const ClientMetadata &right) const + { + bool 
result = true; + + // The Matches function treats this struct as a filter, so a ClientMetadata with all default (zero) values + // by definition always matches. + if (IsDefault() == false) + { + // Component is an enum, so the comparison needs to be equality + const bool clientTypeMatches = + (clientType != Component::Unknown) + ? (clientType == right.clientType) + : true; + + // ProtocolFlags is a bit field, so we can do a bitwise comparison + const bool protocolMatches = + (protocols.value != 0) + ? (protocols.value & right.protocols.value) == protocols.value + : true; + // StatusFlags is a bit field, so we can do a bitwise comparison + const bool statusMatches = + (status != 0) + ? (status & right.status) == status + : true; + result = clientTypeMatches & protocolMatches & statusMatches; + } + + return result; + } + + // Test if any non-zero fields in the ClientMetadata value are contained in the function parameter + bool MatchesAny(const ClientMetadata &right) const + { + bool result = true; + + // The MatchesAny function treats this struct as a filter, so a ClientMetadata with all default (zero) values + // by definition always matches. 
+ if (IsDefault() == false) + { + // Component is an enum, so the comparison needs to be equality + const bool clientTypeMatches = (clientType == right.clientType); + // ProtocolFlags is a bit field, so we can do a bitwise comparison + const bool protocolMatches = (protocols.value & right.protocols.value) != 0; + // StatusFlags is a bit field, so we can do a bitwise comparison + const bool statusMatches = (status & right.status) != 0; + result = clientTypeMatches | protocolMatches | statusMatches; + } + + return result; + } + }; + + DD_CHECK_SIZE(ClientMetadata, 8); + + //////////////////////////// + // Protocol definitions + enum struct Protocol : uint8 + { + DriverControl = 0, + Reserved0, + Reserved1, + RGP, + ETW, + Reserved2, + Event, + DefinedProtocolCount, + + // System enumerations + MaxUserProtocol = 223, + /* RESERVED FOR SYSTEM USE */ + Transfer = 251, + URI = 252, + Session = 253, + ClientManagement = 254, + System = 255, + }; + + // this gives you the number of pre-defined user protocols that exist + DD_STATIC_CONST uint32 kNumberClientProtocols = static_cast(Protocol::DefinedProtocolCount); + + // this gives you the maximum number of client protocols you can reserve. 
+ DD_STATIC_CONST uint32 kMaxClientProtocolId = static_cast(Protocol::MaxUserProtocol); + + static_assert(kNumberClientProtocols <= (kMaxClientProtocolId + 1), "Invalid protocol definitions specified"); + + /////////////////////// + // General definitions + DD_STATIC_CONST uint32 kMessageVersion = 1011; + + // Max string size for names and messages + DD_STATIC_CONST Size kMaxStringLength = 128; + + // Broadcast client ID + DD_STATIC_CONST ClientId kBroadcastClientId = 0; + + // Invalid Session ID + DD_STATIC_CONST SessionId kInvalidSessionId = 0; + + // Default network port number + DD_STATIC_CONST uint16_t kDefaultNetworkPort = 27300; + + // Transport type enumeration + enum class TransportType : uint32 + { + Local = 0, + Remote, +#if defined(DD_PLATFORM_WINDOWS_UM) + MessageBus, +#endif + }; + + // Struct used to designate a transport type, port number, and hostname + struct HostInfo + { + TransportType type; // Transport type, as defined above + uint16_t port; // Port number if applicable + const char* pHostname; // Host address, address, or path + }; + + // Default local host information + DD_STATIC_CONST HostInfo kDefaultLocalHost = + { + TransportType::Remote, + kDefaultNetworkPort, + "localhost" + }; + + // Default named pipe information + DD_STATIC_CONST HostInfo kDefaultNamedPipe = + { + TransportType::Local, + 0, + nullptr + }; + +#if defined(DD_PLATFORM_WINDOWS_UM) + // Default message bus information + DD_STATIC_CONST HostInfo kMessageBus = + { + TransportType::MessageBus, + 0, + nullptr + }; +#endif + + //////////////////////////// + // Common definition of a message header + // + // todo: better packing of these values + // - payloadSize needs to be moved to where windowSize is currently + // - windowSize, sessionId, and sequence need to be moved into protocol specific payloads + // - minimum alignment could then be reduced to 2 bytes, and min packet size would be 8 bytes + // - downside is that pretty much every protocol would need to define some 
extra data + + DD_NETWORK_STRUCT(MessageHeader, 8) + { + // source and destination client ids + ClientId srcClientId; // 0 - 15 + ClientId dstClientId; // 16 - 31 + + // protocol and command + Protocol protocolId; // 32 - 39 + MessageCode messageId; // 40 - 47 + WindowSize windowSize; // 48 - 63 + + // payload size + current session ID + Size payloadSize; // 64 - 95 + SessionId sessionId; // 96 - 127 + + // sequence number when using a session + Sequence sequence; // 128 - 191 + }; + + DD_CHECK_SIZE(MessageHeader, 24); + + DD_STATIC_CONST Size kMaxMessageSizeInBytes = 1408; + DD_STATIC_CONST Size kMaxPayloadSizeInBytes = (kMaxMessageSizeInBytes - sizeof(MessageHeader)); + + DD_NETWORK_STRUCT(MessageBuffer, 8) + { + MessageHeader header; + char payload[kMaxPayloadSizeInBytes]; + }; + + DD_CHECK_SIZE(MessageBuffer, sizeof(MessageHeader) + kMaxPayloadSizeInBytes); + + // Helper function used to validate message buffers that arrive from an external source + // Returns Success if the message buffer is valid, InvalidParameter if the pointer is null or the size is zero, and Error otherwise. + inline Result ValidateMessageBuffer(const void* pMsgBuffer, size_t msgBufferSize) + { + Result result = Result::Error; + + // Ensure that we've been passed valid parameters + if ((pMsgBuffer != nullptr) && (msgBufferSize > 0)) + { + // A valid message buffer must be no larger than the full size message buffer structure + // and it must also be large enough to contain a valid header. + if ((msgBufferSize <= sizeof(MessageBuffer)) && (msgBufferSize >= sizeof(MessageHeader))) + { + // Calculate the total size of the message from the data encoded in the buffer. 
+ const MessageHeader* pHeader = reinterpret_cast(pMsgBuffer); + const size_t encodedMessageSize = (sizeof(MessageHeader) + pHeader->payloadSize); + + // The encoded message size should match our expected size exactly + if (encodedMessageSize == msgBufferSize) + { + result = Result::Success; + } + } + } + else + { + result = Result::InvalidParameter; + } + + return result; + } + + // tripwire - this intentionally will break if the message version changes. Since these are breaking changes already, we need to address + // this problem when it happens. + static_assert(kMessageVersion == 1011, "ClientInfoStruct needs to be updated so that clientName is long enough to support a full path"); + // todo: shorten clientDescription to 64bytes and make clientName 320bytes to support full path + DD_NETWORK_STRUCT(ClientInfoStruct, 4) + { + char clientName[kMaxStringLength]; + char clientDescription[kMaxStringLength]; + // reserve 128bytes in case we need another string in the future + char reserved[kMaxStringLength]; + ClientMetadata metadata; + ProcessId processId; + // pad this out to 512 bytes for future expansion + char padding[116]; + }; + + DD_CHECK_SIZE(ClientInfoStruct, 512); + + /////////////////////// + // GPU Open Message codes + enum struct EscapeCommand : uint32 + { + Unknown = 0, + QueryStatus, // Will be deprecated in a future change + RegisterClient, + UnregisterClient, + RegisterExternalClient, // Will be deprecated in a future change + UnregisterExternalClient, // Will be deprecated in a future change + UpdateClientStatus, // Will be deprecated in a future change + QueryCapabilities, + EnableDeveloperMode, + DisableDeveloperMode, + QueryDeveloperModeStatus, + RegisterRouter, + UnregisterRouter, + AmdLogEvent, + Count + }; +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h new file mode 100644 index 0000000000..fe23bd9828 --- 
/dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h @@ -0,0 +1,271 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolClient.h" +#include "protocolServer.h" +#include "msgTransport.h" +#include "ddUriInterface.h" +#include "util/string.h" +#include "util/vector.h" + +namespace DevDriver +{ + class IMsgChannel; + class IService; + class ISession; + + namespace TransferProtocol + { + class TransferManager; + } + + namespace InfoURIService + { + class InfoService; + } + + // Temporarily changing from 10ms to 15ms to workaround a timing issue with Windows named pipes, should change back once that + // transport is refactored/replaced. + DD_STATIC_CONST uint32 kDefaultUpdateTimeoutInMs = 15; + DD_STATIC_CONST uint32 kFindClientTimeout = 500; + + // Enumeration of events that can occur on the message bus. + enum class BusEventType : uint32 + { + Unknown = 0, + ClientHalted, + PongRequest, + }; + + /// Event data structure for the ClientHalted bus event + struct BusEventClientHalted + { + ClientId clientId; /// Id of the client that is currently halted + ClientInfoStruct clientInfo; /// Additional information about the client + }; + + /// Event data structure for the PongRequest bus event + struct BusEventPongRequest + { + ClientId clientId; /// Id of the client that is requesting a pong message + const ClientInfoStruct* pClientInfo; /// Additional information about the client + /// Note: May be nullptr for older clients + bool* pShouldRespond; /// Set this to false if a pong should not be sent + /// The default value is true. 
+ /// Note: This will never be nullptr + }; + + // Callback function used to handle bus events + typedef void (*PFN_BusEventCallback)(void* pUserdata, BusEventType type, const void* pEventData, size_t eventDataSize); + + // Helper structure used to contain a bus event callback + struct BusEventCallback + { + PFN_BusEventCallback pfnEventCallback; // Message bus event callback function + void* pUserdata; // Message bus event callback userdata + + /// Returns true if this callback contains a valid function + bool IsValid() const { return (pfnEventCallback != nullptr); } + + /// Executes the function stored within the callback + void operator()(BusEventType type, const void* pEventData, size_t eventDataSize) + { + DD_ASSERT(IsValid()); + + pfnEventCallback(pUserdata, type, pEventData, eventDataSize); + } + }; + + // Struct of information required to initialize an IMsgChannel instance + struct MessageChannelCreateInfo + { + StatusFlags initialFlags; // Initial client status flags. + Component componentType; // Type of component the message channel represents. + bool createUpdateThread; // Create a background processing thread for the message + // channel. This should only be set to false if the + // owning object is able to call IMsgChannel::Update() + // at least once per frame. + char clientDescription[kMaxStringLength]; // Description of the client provided to other clients on + // the message bus. 
+ }; + + // Information required to establish a new session + struct EstablishSessionInfo + { + Protocol protocol; + Version minProtocolVersion; + Version maxProtocolVersion; + ClientId remoteClientId; + const char* pSessionName; + }; + + // "Temporary" structure to pack all create info without breaking back-compat + struct MessageChannelCreateInfo2 + { + MessageChannelCreateInfo channelInfo; + HostInfo hostInfo; + AllocCb allocCb; + }; + + // Data structure that contains information about a client that has been discovered + struct DiscoveredClientInfo + { + ClientId id; /// Id of the client + ClientMetadata metadata; /// Metadata for the client + + // Structure that contains additional information about the discovered client + // This information may or may not be valid depending on the value of the "valid" field. + struct + { + bool valid; + ClientInfoStruct data; + } clientInfo; + }; + + // Callback function used to handle client discovery + // Return true from this callback to indicate that the discovery process should be continued. + typedef bool (*PFN_ClientDiscoveredCallback)(void* pUserdata, const DiscoveredClientInfo& clientInfo); + + // Data structure that describes how a client discovery operation should be performed + struct DiscoverClientsInfo + { + PFN_ClientDiscoveredCallback pfnCallback; /// Callback function pointer + void* pUserdata; /// Userdata for callback + ClientMetadata filter; /// Filters out incoming clients from the callback + uint32 timeoutInMs; /// Timeout in milliseconds + }; + + // Create a new message channel object + Result CreateMessageChannel(const MessageChannelCreateInfo2& createInfo, IMsgChannel** ppMessageChannel); + + class IMsgChannel + { + public: + virtual ~IMsgChannel() {} + + // Register, unregister, or check connected status. 
+ virtual Result Register(uint32 timeoutInMs = ~(0u)) = 0; + virtual void Unregister() = 0; + virtual bool IsConnected() = 0; + + virtual void SetBusEventCallback(const BusEventCallback& callback) = 0; + + // Send, receive, and forward messages + virtual Result Send(ClientId dstClientId, + Protocol protocol, + MessageCode message, + const ClientMetadata& metadata, + uint32 payloadSizeInBytes, + const void* pPayload) = 0; + virtual Result Receive(MessageBuffer& message, uint32 timeoutInMs) = 0; + virtual Result Forward(const MessageBuffer& messageBuffer) = 0; + + // Register, unregister, and retrieve IProtocolServer objects + virtual Result RegisterProtocolServer(IProtocolServer* pServer) = 0; + virtual Result UnregisterProtocolServer(IProtocolServer* pServer) = 0; + virtual IProtocolServer* GetProtocolServer(Protocol protocol) = 0; + + // Initiates a connection to the specified destination client id + // Returns the intermediate session via ppSession + virtual Result EstablishSessionForClient(SharedPointer* ppSession, + const EstablishSessionInfo& sessionInfo) = 0; + + // Register or Unregister an IService object + virtual Result RegisterService(IService* pService) = 0; + virtual Result UnregisterService(IService* pService) = 0; + + // Get the allocator used to create this message channel + virtual const AllocCb& GetAllocCb() const = 0; + + // Attempts to discover clients on the message bus + virtual Result DiscoverClients(const DiscoverClientsInfo& info) = 0; + + // Returns client information for the first client to respond that matches the specified filter + virtual Result FindFirstClient(const ClientMetadata& filter, + ClientId* pClientId, + uint32 timeoutInMs = kFindClientTimeout, + ClientMetadata* pClientMetadata = nullptr) = 0; + + // Get the client ID, or returns kBroadcastClientId if disconnected. + virtual ClientId GetClientId() const = 0; + + // Get the client information struct for the message channel. 
+ virtual const ClientInfoStruct& GetClientInfo() const = 0; + + // Get a human-readable string describing the connection type. + virtual const char* GetTransportName() const = 0; + + // Set and get all client status flags. + virtual Result SetStatusFlags(StatusFlags flags) = 0; + virtual StatusFlags GetStatusFlags() const = 0; + + // Set the specified client status flag. + template + Result SetStatusFlag(bool enable) + { + Result toggleResult = Result::Success; + StatusFlags oldFlags = GetStatusFlags(); + StatusFlags newFlags; + + if (enable) + { + // Toggle developer mode + newFlags = oldFlags | static_cast(flag); + } + else + { + // Toggle developer mode + newFlags = oldFlags & ~static_cast(flag); + } + + if (newFlags != oldFlags) + { + toggleResult = SetStatusFlags(newFlags); + } + return toggleResult; + } + + // Get the specified client status flag. + template + bool GetStatusFlag() const + { + return ((GetStatusFlags() & static_cast(flag)) != 0); + } + + virtual InfoURIService::InfoService& GetInfoService() = 0; + + // Utility functions that should probably not be publicly exposed. + // TODO: Refactor surrounding code to eliminate these. + virtual TransferProtocol::TransferManager& GetTransferManager() = 0; + virtual void Update(uint32 timeoutInMs = kDefaultUpdateTimeoutInMs) = 0; + + protected: + IMsgChannel() {}; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h new file mode 100644 index 0000000000..187109e76e --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h @@ -0,0 +1,70 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" + +namespace DevDriver +{ + class IMsgTransport + { + public: + virtual ~IMsgTransport() {} + + // Connect and disconnect from the transport. + virtual Result Connect(ClientId* pClientId, uint32 timeoutInMs) = 0; + virtual Result Disconnect() = 0; + + // Read and Write messages from a connected transport + virtual Result WriteMessage(const MessageBuffer &messageBuffer) = 0; + virtual Result ReadMessage(MessageBuffer &messageBuffer, uint32 timeoutInMs) = 0; + + // Get a human-readable string describing the connection type. 
+ virtual const char* GetTransportName() const = 0; + + // Static method to be implemented by individual transports + // true indicates that the transport is incapable of detecting + // dropped connections and some form of keep-alive is required + // false indicates that the transport can properly detect dropped + // connections + DD_STATIC_CONST bool RequiresKeepAlive() + { + return false; + } + + // Static method to be implemented by individual transports + // true indicates that Connect is expected to also negotiate a client ID + // false indicates that the MessageChannel needs to do it's own client ID + // negotiation, e.g. in the case of network connections + DD_STATIC_CONST bool RequiresClientRegistration() + { + return false; + } + protected: + IMsgTransport() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h new file mode 100644 index 0000000000..bb18c9d498 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h @@ -0,0 +1,28 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../../core/inc/platforms/ddcWinPlatform.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h new file mode 100644 index 0000000000..1649f888e1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h @@ -0,0 +1,54 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolSession.h" + +namespace DevDriver +{ + class Session; + + class IProtocolClient : public IProtocolSession + { + public: + virtual ~IProtocolClient() {} + + virtual Version GetSessionVersion() const = 0; + + virtual Result Connect(ClientId clientId, uint32 timeoutInMs) = 0; + virtual Result Connect(ClientId clientId) = 0; + virtual void Disconnect() = 0; + + virtual bool IsConnected() const = 0; + virtual ClientId GetRemoteClientId() const = 0; + + virtual bool QueryConnectionStatus() = 0; + protected: + IProtocolClient() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h new file mode 100644 index 0000000000..d238091e18 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h @@ -0,0 +1,53 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolSession.h" + +namespace DevDriver +{ + class IMsgChannel; + class Session; + + class IProtocolServer : public IProtocolSession + { + public: + virtual ~IProtocolServer() {} + + virtual void Finalize() = 0; + + virtual bool GetSupportedVersion(Version minVersion, Version maxVersion, Version *version) const = 0; + virtual bool AcceptSession(const SharedPointer& pSession) = 0; + + virtual void SessionEstablished(const SharedPointer &pSession) = 0; + virtual void UpdateSession(const SharedPointer &pSession) = 0; + virtual void SessionTerminated(const SharedPointer &pSession, Result terminationReason) = 0; + protected: + IProtocolServer() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h new file mode 100644 index 0000000000..beaf32a746 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h @@ -0,0 +1,175 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace DevDriver +{ + // A container struct that can hold any protocol's payload and keep track of its size. + // Not intended for network transport. This struct is intended to help simplify code that works with variably sized payloads. + // The struct is 8 byte aligned because the internal payload field requires 8 byte alignment. + DD_ALIGNED_STRUCT(SizedPayloadContainer, 8) + { + uint32 payloadSize; + uint32 padding; + char payload[kMaxPayloadSizeInBytes]; + + // For safety purposes we limit the CreatePayload and GetPayload methods to types that: + // + // 1. Have a standard layout, to ensure that the contents are safe to transmit across the network + // 2. 
Are trivially destructible, to ensure that a user doesn't construct an object and then overwrite it + // without destroying it + // 3. Small enough to fit inside the payload field of this struct + template + struct CanUseAsPayload + { + static_assert(Platform::IsStandardLayout::Value, "Type provided does not meet standard layout requirements"); + static_assert(Platform::IsTriviallyDestructible::Value, "Type provided is not trivially destructible"); + static_assert((sizeof(T) <= kMaxPayloadSizeInBytes), "Type provided is too large to fit in the container"); + + DD_STATIC_CONST bool Value = Platform::IsStandardLayout::Value && + Platform::IsTriviallyDestructible::Value && + (sizeof(T) <= kMaxPayloadSizeInBytes); + }; + + // We additionally only allow creation of a payload if the type is constructible using the arguments specified + template + struct CanCreatePayload + { + static_assert(CanUseAsPayload::Value, "Type specified cannot be used as a payload"); + static_assert(Platform::IsConstructible::Value, "Type provided cannot be constructed with the provided arguments"); + + DD_STATIC_CONST bool Value = CanUseAsPayload::Value && + Platform::IsConstructible::Value; + }; + + // Convenience function to allow in-place construction of a payload object using placement new. + template::Value>::Type> + void CreatePayload(Args&&... args) + { + // This is tremendously unsafe, but we use placement new to construct an object inside the buffer. + // Why do we do this? The big benefit is that it lets us skip having to create a temporary object + // and then copy it into this buffer. + // + // There are a couple of other ancillary benefits that are useful. The biggest is that if an object has + // a constexpr constructor it can initialize the memory using a memcpy/move instead of having to actually + // call the constructor. The other benefit is that if the constructor omits initializing memory (e.g., + // a giant data buffer) it will also skip re-initializing the memory here. 
This is not the case + // when you create another instance of the object and copy it - the temporary object is almost certainly + // zero initialized, and the copy/move will result in the entire struct being copied. + + static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); + + new(reinterpret_cast(&payload[0])) T(Platform::Forward(args)...); + payloadSize = sizeof(T); + padding = 0; + } + + // Convenience function to allow accessing the payload as if it was the specified type. + template::Value>::Type> + T& GetPayload() + { + return *GetPayloadPointer(&payload[0]); + } + + private: + // Convenience function to allow accessing the payload as if it was the specified type. + template::Value>::Type> + static constexpr T* GetPayloadPointer(char* DD_RESTRICT pPointer) + { + static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); + return (T*)(pPointer); + } + }; + + DD_CHECK_SIZE(SizedPayloadContainer, 8 + kMaxPayloadSizeInBytes); + + class IMsgChannel; + class Session; + + enum struct SessionType + { + Unknown = 0, + Client, + Server + }; + + class ISession + { + public: + virtual ~ISession() {}; + + virtual Result Send(uint32 payloadSizeInBytes, const void* pPayload, uint32 timeoutInMs) = 0; + virtual Result Receive(uint32 payloadSizeInBytes, void *pPayload, uint32 *pBytesReceived, uint32 timeoutInMs) = 0; + virtual Result WaitForConnection(uint32 timeoutInMs) = 0; + virtual Result WaitForDisconnection(uint32 timeoutInMs) = 0; + + virtual bool IsClosed() const = 0; + + virtual void* SetUserData(void* pUserdata) = 0; + virtual void* GetUserData() const = 0; + virtual SessionId GetSessionId() const = 0; + virtual ClientId GetDestinationClientId() const = 0; + virtual Version GetVersion() const = 0; + virtual Protocol GetProtocol() const = 0; + + // Helper functions for working with SizedPayloadContainers and managing back-compat. 
+ Result SendPayload(const SizedPayloadContainer& payload, uint32 timeoutInMs) + { + return Send(payload.payloadSize, payload.payload, timeoutInMs); + } + + Result ReceivePayload(SizedPayloadContainer* pPayload, uint32 timeoutInMs) + { + DD_ASSERT(pPayload != nullptr); + return Receive(sizeof(pPayload->payload), pPayload->payload, &pPayload->payloadSize, timeoutInMs); + } + + protected: + ISession() {} + }; + + class IProtocolSession + { + public: + virtual ~IProtocolSession() {} + + virtual Protocol GetProtocol() const = 0; + virtual SessionType GetType() const = 0; + virtual Version GetMinVersion() const = 0; + virtual Version GetMaxVersion() const = 0; + + protected: + IProtocolSession() {} + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h new file mode 100644 index 0000000000..afd88a00e2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h @@ -0,0 +1,531 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" + +#define DRIVERCONTROL_PROTOCOL_VERSION 10 + +#define DRIVERCONTROL_PROTOCOL_MINIMUM_VERSION 1 + +/* +*********************************************************************************************************************** +*| Version | Change Description | +*| ------- | ---------------------------------------------------------------------------------------------------------| +*| 10.0 | Add ability to set clock mode on all adapters. | +*| 9.0 | Added a feature that allows tools to indicate when they will be ignoring a specific driver. | +*| 8.0 | Added a new version of the step driver response that contains the current driver status. | +*| 7.0 | Corrected a back-compat issue related to the new device clock query code. | +*| 6.0 | Added ability to query device clock frequencies for a given clock mode. | +*| 5.0 | Cleaned up the driver facing interface. | +*| 4.0 | Added HaltedOnPostDeviceInit state. | +*| 3.0 | Added QueryClientInfoRequest support. | +*| 2.1 | Added initialization time step functionality. | +*| 2.0 | Added initialization time driver status values and a terminate driver command. 
| +*| 1.0 | Initial version | +*********************************************************************************************************************** +*/ + +#define DRIVERCONTROL_SET_CLOCKS_ALL_ADAPTERS_VERSION 10 +#define DRIVERCONTROL_IGNORE_DRIVER_VERSION 9 +#define DRIVERCONTROL_STEP_RETURN_STATUS_VERSION 8 +#define DRIVERCONTROL_QUERY_BY_MODE_BACK_COMPAT_VERSION 7 +#define DRIVERCONTROL_QUERY_DEVICE_CLOCKS_BY_MODE_VERSION 6 +#define DRIVERCONTROL_DRIVER_INTERFACE_CLEANUP_VERSION 5 +#define DRIVERCONTROL_HALTEDPOSTDEVICEINIT_VERSION 4 +#define DRIVERCONTROL_QUERYCLIENTINFO_VERSION 3 +#define DRIVERCONTROL_INITIALIZATION_STATUS_VERSION 2 +#define DRIVERCONTROL_INITIAL_VERSION 1 + +namespace DevDriver +{ + namespace DriverControlProtocol + { + /////////////////////// + // DriverControl Constants + DD_STATIC_CONST uint32 kLegacyDriverControlPayloadSize = 16; + + /////////////////////// + // DriverControl Protocol + enum struct DriverControlMessage : MessageCode + { + Unknown = 0, + PauseDriverRequest, + PauseDriverResponse, + ResumeDriverRequest, + ResumeDriverResponse, + QueryNumGpusRequest, + QueryNumGpusResponse, + QueryDeviceClockModeRequest, + QueryDeviceClockModeResponse, + SetDeviceClockModeRequest, + SetDeviceClockModeResponse, + QueryDeviceClockRequest, + QueryDeviceClockResponse, + QueryMaxDeviceClockRequest, + QueryMaxDeviceClockResponse, + QueryDriverStatusRequest, + QueryDriverStatusResponse, + StepDriverRequest, + StepDriverResponse, + QueryClientInfoRequest, + QueryClientInfoResponse, + QueryDeviceClockByModeRequest, + QueryDeviceClockByModeResponse, + StepDriverResponseV2, + IgnoreDriverRequest, + IgnoreDriverResponse, + Count + }; + + /////////////////////// + // DriverControl Types + enum struct DeviceClockMode : uint32 + { + Unknown = 0, + Default, + Profiling, + MinimumMemory, + MinimumEngine, + Peak, + Count + }; + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + typedef DevDriver::DriverStatus 
DriverStatus; +#endif + + /////////////////////// + // DriverControl Payloads + DD_NETWORK_STRUCT(DriverControlHeader, 4) + { + DriverControlMessage command; + char _padding[3]; + + constexpr DriverControlHeader(DriverControlMessage message) + : command(message) + , _padding() + { + } + }; + + DD_CHECK_SIZE(DriverControlHeader, 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Pause Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(PauseDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr PauseDriverRequestPayload() + : header(DriverControlMessage::PauseDriverRequest) + { + } + }; + + DD_CHECK_SIZE(PauseDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(PauseDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr PauseDriverResponsePayload(Result result) + : header(DriverControlMessage::PauseDriverResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(PauseDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Resume Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(ResumeDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr ResumeDriverRequestPayload() + : header(DriverControlMessage::ResumeDriverRequest) + { + } + }; + + DD_CHECK_SIZE(ResumeDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(ResumeDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr ResumeDriverResponsePayload(Result result) + : header(DriverControlMessage::ResumeDriverResponse) + , result(result) + { + } + }; + + 
DD_CHECK_SIZE(ResumeDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Num Gpus Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryNumGpusRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryNumGpusRequestPayload() + : header(DriverControlMessage::QueryNumGpusRequest) + { + } + }; + + DD_CHECK_SIZE(QueryNumGpusRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryNumGpusResponsePayload, 4) + { + DriverControlHeader header; + Result result; + uint32 numGpus; + + constexpr QueryNumGpusResponsePayload(Result result, uint32 numGpus) + : header(DriverControlMessage::QueryNumGpusResponse) + , result(result) + , numGpus(numGpus) + { + } + }; + + DD_CHECK_SIZE(QueryNumGpusResponsePayload, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryDeviceClockModeRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryDeviceClockModeRequest) + , gpuIndex(gpuIndex) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockModeRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryDeviceClockModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + DeviceClockMode mode; + + constexpr QueryDeviceClockModeResponsePayload(Result result, DeviceClockMode mode) + : header(DriverControlMessage::QueryDeviceClockModeResponse) + , result(result) + , mode(mode) + { + } + }; + + 
DD_CHECK_SIZE(QueryDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Set Device Clock Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(SetDeviceClockModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + DeviceClockMode mode; + + constexpr SetDeviceClockModeRequestPayload(uint32 gpuIndex, DeviceClockMode mode) + : header(DriverControlMessage::SetDeviceClockModeRequest) + , gpuIndex(gpuIndex) + , mode(mode) + { + } + }; + + DD_CHECK_SIZE(SetDeviceClockModeRequestPayload, sizeof(DriverControlHeader) + 8); + + DD_NETWORK_STRUCT(SetDeviceClockModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr SetDeviceClockModeResponsePayload(Result result) + : header(DriverControlMessage::SetDeviceClockModeResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(SetDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryDeviceClockRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryDeviceClockRequest) + , gpuIndex(gpuIndex) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryDeviceClockResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float gpuClock; + float memClock; + + constexpr QueryDeviceClockResponsePayload(Result result, float gpuClock, float memClock) + : 
header(DriverControlMessage::QueryDeviceClockResponse) + , result(result) + , gpuClock(gpuClock) + , memClock(memClock) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock By Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockByModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + DeviceClockMode deviceClockMode; + + constexpr QueryDeviceClockByModeRequestPayload(uint32 gpuIndex, DeviceClockMode clockMode) + : header(DriverControlMessage::QueryDeviceClockByModeRequest) + , gpuIndex(gpuIndex) + , deviceClockMode(clockMode) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockByModeRequestPayload, sizeof(DriverControlHeader) + 8); + + DD_NETWORK_STRUCT(QueryDeviceClockByModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float gpuClock; + float memClock; + + constexpr QueryDeviceClockByModeResponsePayload(Result result, float gpuClock, float memClock) + : header(DriverControlMessage::QueryDeviceClockByModeResponse) + , result(result) + , gpuClock(gpuClock) + , memClock(memClock) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockByModeResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Max Device Clock Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryMaxDeviceClockRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryMaxDeviceClockRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryMaxDeviceClockRequest) + , gpuIndex(gpuIndex) + { + } + }; + + 
DD_CHECK_SIZE(QueryMaxDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryMaxDeviceClockResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float maxGpuClock; + float maxMemClock; + + constexpr QueryMaxDeviceClockResponsePayload(Result result, float maxGpuClock, float maxMemClock) + : header(DriverControlMessage::QueryMaxDeviceClockResponse) + , result(result) + , maxGpuClock(maxGpuClock) + , maxMemClock(maxMemClock) + { + } + }; + + DD_CHECK_SIZE(QueryMaxDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Driver Status Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDriverStatusRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryDriverStatusRequestPayload() + : header(DriverControlMessage::QueryDriverStatusRequest) + { + } + }; + + DD_CHECK_SIZE(QueryDriverStatusRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryDriverStatusResponsePayload, 4) + { + DriverControlHeader header; + DriverStatus status; + + constexpr QueryDriverStatusResponsePayload(DriverStatus status) + : header(DriverControlMessage::QueryDriverStatusResponse) + , status(status) + { + } + }; + + DD_CHECK_SIZE(QueryDriverStatusResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Step Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(StepDriverRequestPayload, 4) + { + DriverControlHeader header; + uint32 count; + + constexpr StepDriverRequestPayload(uint32 count) + : header(DriverControlMessage::StepDriverRequest) + , count(count) + { + } + 
}; + + DD_CHECK_SIZE(StepDriverRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(StepDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr StepDriverResponsePayload(Result result) + : header(DriverControlMessage::StepDriverResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(StepDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(StepDriverResponsePayloadV2, 4) + { + DriverControlHeader header; + Result result; + DriverStatus status; + + constexpr StepDriverResponsePayloadV2(Result result, DriverStatus status) + : header(DriverControlMessage::StepDriverResponseV2) + , result(result) + , status(status) + { + } + }; + + DD_CHECK_SIZE(StepDriverResponsePayloadV2, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Client Info Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryClientInfoRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryClientInfoRequestPayload() + : header(DriverControlMessage::QueryClientInfoRequest) + { + } + }; + + DD_CHECK_SIZE(QueryClientInfoRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryClientInfoResponsePayload, 4) + { + DriverControlHeader header; + ClientInfoStruct clientInfo; + + constexpr QueryClientInfoResponsePayload(const ClientInfoStruct& clientInfo) + : header(DriverControlMessage::QueryClientInfoResponse) + , clientInfo(clientInfo) + { + } + }; + + DD_CHECK_SIZE(QueryClientInfoResponsePayload, sizeof(DriverControlHeader) + sizeof(ClientInfoStruct)); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ignore Driver Request/Response + 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(IgnoreDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr IgnoreDriverRequestPayload() + : header(DriverControlMessage::IgnoreDriverRequest) + { + } + }; + + DD_CHECK_SIZE(IgnoreDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(IgnoreDriverResponsePayload, 4) + { + DriverControlHeader header; + + constexpr IgnoreDriverResponsePayload() + : header(DriverControlMessage::IgnoreDriverResponse) + { + } + }; + + DD_CHECK_SIZE(IgnoreDriverResponsePayload, sizeof(DriverControlHeader)); + } +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h new file mode 100644 index 0000000000..428ff76b1b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h @@ -0,0 +1,157 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "baseProtocolServer.h" +#include "driverControlProtocol.h" + +namespace DevDriver +{ + namespace DriverControlProtocol + { + DD_STATIC_CONST uint32 kMaxNumGpus = 16; + + typedef Result(*SetDeviceClockModeCallback)(uint32 gpuIndex, DeviceClockMode clockMode, void* pUserdata); + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION + typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, float* pGpuClock, float* pMemClock, void* pUserdata); + typedef Result(*QueryMaxDeviceClockCallback)(uint32 gpuIndex, float* pMaxGpuClock, float* pMaxMemClock, void* pUserdata); + + struct DeviceClockCallbackInfo + { + QueryDeviceClockCallback queryClockCallback; + QueryMaxDeviceClockCallback queryMaxClockCallback; + SetDeviceClockModeCallback setCallback; + void* pUserdata; + }; +#else + typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, DevDriver::DriverControlProtocol::DeviceClockMode clockMode, float* pGpuClock, float* pMemClock, void* pUserdata); + + struct DeviceClockCallbackInfo + { + QueryDeviceClockCallback queryClockCallback; + SetDeviceClockModeCallback setCallback; + void* pUserdata; + }; +#endif + + enum class SessionState; + + class DriverControlServer : public BaseProtocolServer + { + public: + explicit DriverControlServer(IMsgChannel* pMsgChannel); + ~DriverControlServer(); + + 
void Finalize() override; + + // Session handling functions + bool AcceptSession(const SharedPointer& pSession) override; + void SessionEstablished(const SharedPointer& pSession) override; + void UpdateSession(const SharedPointer& pSession) override; + void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; + + // Driver state functions +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + // These functions just pass through to the new renamed variants to preserve backward compatibility + void WaitForDriverResume() { DriverTick(); } + void StartDeviceInit() { StartEarlyDeviceInit(); } + void FinishDriverInitialization() { FinishDeviceInit(); } +#endif + void StartEarlyDeviceInit(); + void StartLateDeviceInit(); + void FinishDeviceInit(); + void PauseDriver(); + void ResumeDriver(); + void DriverTick(); + + // Other public functions + bool IsDriverInitialized() const; + DriverStatus QueryDriverStatus(); + void SetNumGpus(uint32 numGpus); + void SetDeviceClockCallback(const DeviceClockCallbackInfo& deviceClockCallbackInfo); + uint32 GetNumGpus(); + DeviceClockMode GetDeviceClockMode(uint32 gpuIndex); + + // Sets the client id that's expected to walk us through the driver initialization process. + // If this isn't set, the server will attempt to find a suitable client itself via broadcast + discovery. 
+ void SetDriverInitClientId(ClientId clientId) { m_driverInitClientId = clientId; } + + /// Returns true if this driver will be ignored by tools + bool IsDriverIgnored() const { return m_isIgnored; } + + private: + void LockData(); + void UnlockData(); + + // Private driver state functions + void AdvanceDriverInitState(); + void WaitForResume(); + bool DiscoverHaltRequests(); + void HandleDriverHalt(); + bool IsHalted() const + { + return ((m_driverStatus == DriverStatus::HaltedOnPlatformInit) || + (m_driverStatus == DriverStatus::HaltedOnDeviceInit) || + (m_driverStatus == DriverStatus::HaltedPostDeviceInit)); + } + + // Protocol message handlers + SessionState HandlePauseDriverRequest(SizedPayloadContainer& container); + SessionState HandleResumeDriverRequest(SizedPayloadContainer& container); + SessionState HandleQueryDeviceClockModeRequest(SizedPayloadContainer& container); + SessionState HandleSetDeviceClockModeRequest(SizedPayloadContainer& container); + SessionState HandleQueryDeviceClockRequest(SizedPayloadContainer& container); + SessionState HandleQueryDeviceClockByModeRequest(SizedPayloadContainer& container); + SessionState HandleQueryMaxDeviceClockRequest(SizedPayloadContainer& container); + SessionState HandleQueryNumGpusRequest(SizedPayloadContainer& container); + SessionState HandleQueryDriverStatusRequest(SizedPayloadContainer& container, const Version sessionVersion); + SessionState HandleStepDriverRequest(SizedPayloadContainer& container, const Version sessionVersion); + SessionState HandleIgnoreDriverRequest(SizedPayloadContainer& container); + + Platform::Mutex m_mutex; + DriverStatus m_driverStatus; + Platform::Event m_driverResumedEvent; + + uint32 m_numGpus; + DeviceClockMode m_deviceClockModes[kMaxNumGpus]; + DeviceClockCallbackInfo m_deviceClockCallbackInfo; + Platform::Atomic m_numSessions; + Platform::Atomic m_stepCounter; + bool m_initStepRequested; + + // The client id of the remote client who's responsible for walking us through the 
driver initialization + // process. + ClientId m_driverInitClientId; + + // This value is set to true if a remote tool has indicated that this driver will be ignored + bool m_isIgnored; + + DD_STATIC_CONST uint32 kBroadcastIntervalInMs = 100; + DD_STATIC_CONST uint32 kDefaultDriverStartTimeoutMs = 1000; + }; + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h new file mode 100644 index 0000000000..027b3568b5 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h @@ -0,0 +1,537 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************************************************************************/
+
+#pragma once
+
+#pragma pack(push)
+
+#include "gpuopen.h"
+
+#define RGP_PROTOCOL_VERSION 11 // Newest version listed in the change table below
+
+#define RGP_PROTOCOL_MINIMUM_VERSION 2 // NOTE(review): presumably the oldest version still negotiable - confirm
+
+/*
+***********************************************************************************************************************
+*| Version | Change Description |
+*| ------- | ---------------------------------------------------------------------------------------------------------|
+*| 11.0    | Version bumped to indicate driver support for SE mask only applying to detailed instruction tracing |
+*| 10.0    | Added support for SPM counters and SE masking. |
+*| 9.0     | Decoupled trace parameters from execute trace request. |
+*| 8.0     | Added support for capturing the RGP trace on specific frame or dispatch |
+*|         | Added bitfield to control whether driver internal code objects are included in the code object database |
+*| 7.0     | Added support for aborting traces that are still in the pending state on the server. |
+*| 6.0     | Added support for trace trigger markers. |
+*| 5.0     | Added support for allow compute presents trace parameter and removed unused clock mode parameter. |
+*| 4.0     | Added support for reporting trace transfer progress. |
+*| 3.0     | Updated TraceParameters struct to allow for specifying profiling clock mode. |
+*| 2.0     | Add TraceParameters struct and ExecuteTraceRequestPayload so a client can specify trace options. |
+*| 1.0     | Initial version |
+***********************************************************************************************************************
+*/
+
+#define RGP_DETAILED_SEMASK_VERSION 11 // One named constant per row of the table above
+#define RGP_SPM_COUNTERS_VERSION 10
+#define RGP_DECOUPLED_TRACE_PARAMETERS 9
+#define RGP_FRAME_CAPTURE_VERSION 8
+#define RGP_PENDING_ABORT_VERSION 7
+#define RGP_TRIGGER_MARKERS_VERSION 6
+#define RGP_COMPUTE_PRESENTS_VERSION 5
+#define RGP_TRACE_PROGRESS_VERSION 4
+#define RGP_PROFILING_CLOCK_MODES_VERSION 3
+#define RGP_TRACE_PARAMETERS_VERSION 2
+#define RGP_INITIAL_VERSION 1
+
+namespace DevDriver
+{
+    namespace RGPProtocol
+    {
+        ///////////////////////
+        // RGP Protocol
+        enum struct RGPMessage : MessageCode // Message identifier stored in RGPPayload::command
+        {
+            Unknown = 0,
+            ExecuteTraceRequest,
+            TraceDataChunk,
+            TraceDataSentinel,
+            QueryProfilingStatusRequest,
+            QueryProfilingStatusResponse,
+            EnableProfilingRequest,
+            EnableProfilingResponse,
+            TraceDataHeader,
+            AbortTrace,
+            QueryTraceParametersRequest,
+            QueryTraceParametersResponse,
+            UpdateTraceParametersRequest,
+            UpdateTraceParametersResponse,
+            UpdateSpmConfigRequest,
+            UpdateSpmConfigData,
+            UpdateSpmConfigResponse,
+            Count // Number of message codes above; keep last
+        };
+
+        // @note: We currently subtract sizeof(uint32) instead of sizeof(RGPMessage) to work around struct packing issues.
+        //        The compiler pads out RGPMessage to 4 bytes when it's included in the payload struct. It also pads out
+        //        the TraceDataChunk data field to 1000 bytes. This causes the total payload size to be 1004 bytes which is
+        //        4 bytes larger than the maximum size allowed.
+        DD_STATIC_CONST Size kMaxTraceDataChunkSize = (kMaxPayloadSizeInBytes - sizeof(uint32) - sizeof(uint32)); // minus command word, minus TraceDataChunk::dataSize
+
+        ///////////////////////
+        // RGP Constants
+        const uint32 kMarkerStringLength = 256; // Size of the beginMarker/endMarker char arrays in the TraceParameters structs
+        const uint32 kMaxSpmCountersPerUpdate = 320; // Capacity of UpdateSpmConfigDataPayload::counters
+
+        // Define the number of bits per SPM id value
+        constexpr uint32 kSpmBlockIdBits    = 8;
+        constexpr uint32 kSpmInstanceIdBits = 12;
+        constexpr uint32 kSpmEventIdBits    = 12;
+
+        // Define the number of distinct SPM id values (an exclusive upper bound, i.e. one past the largest id that fits)
+        // based on the number of bits we allocate for them in the network packet
+        constexpr uint32 kMaxSpmBlockId    = (1 << kSpmBlockIdBits);
+        constexpr uint32 kMaxSpmInstanceId = (1 << kSpmInstanceIdBits);
+        constexpr uint32 kMaxSpmEventId    = (1 << kSpmEventIdBits);
+
+        // The application can specify this value for the instance id and it will be expanded into
+        // all available instances on the driver side.
+        // The counter fields are bit packed when transferred over the network so we need to account for that here
+        // rather than simply setting all bits.
+        constexpr uint32 kSpmAllInstancesId = (kMaxSpmInstanceId - 1); // Every one of the kSpmInstanceIdBits bits set
+
+        ///////////////////////
+        // RGP Types
+        DD_NETWORK_STRUCT(TraceDataChunk, 4)
+        {
+            uint32 dataSize;
+            uint8  data[kMaxTraceDataChunkSize];
+        };
+
+        DD_CHECK_SIZE(TraceDataChunk, kMaxTraceDataChunkSize + sizeof(int32));
+
+        DD_NETWORK_STRUCT(TraceParameters, 4) // Original trace parameter layout
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 reserved : 31;
+                };
+                uint32 u32All;
+            } flags;
+        };
+
+        DD_CHECK_SIZE(TraceParameters, 12);
+
+        enum struct ProfilingClockMode : uint32
+        {
+            Stable = 0,
+            Max,
+            Normal,
+            Count
+        };
+
+        DD_NETWORK_STRUCT(TraceParametersV2, 4) // TraceParameters plus clockMode
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            ProfilingClockMode clockMode;
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 reserved : 31;
+                };
+                uint32 u32All;
+            } flags;
+        };
+
+        DD_CHECK_SIZE(TraceParametersV2, 16);
+
+        DD_NETWORK_STRUCT(TraceParametersV3, 4) // TraceParameters plus the allowComputePresents flag (no clockMode)
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 allowComputePresents : 1;
+                    uint32 reserved : 30;
+                };
+                uint32 u32All;
+            } flags;
+        };
+
+        DD_CHECK_SIZE(TraceParametersV3, 12);
+
+        DD_NETWORK_STRUCT(TraceParametersV4, 4) // V3 plus begin/end tags and begin/end marker strings
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 allowComputePresents : 1;
+                    uint32 reserved : 30;
+                };
+                uint32 u32All;
+            } flags;
+
+            // Begin Tag
+            uint32 beginTagHigh;
+            uint32 beginTagLow;
+
+            // End Tag
+            uint32 endTagHigh;
+            uint32 endTagLow;
+
+            // Begin/End Marker Strings
+            char beginMarker[kMarkerStringLength];
+            char endMarker[kMarkerStringLength];
+        };
+
+        DD_CHECK_SIZE(TraceParametersV4, 540); // 3 * 4 + 4 * 4 + 2 * kMarkerStringLength
+
+        enum struct CaptureTriggerMode : uint32
+        {
+            Present = 0,
+            Markers,
+            Index,
+            Count
+        };
+
+        DD_NETWORK_STRUCT(TraceParametersV5, 4) // V4 plus capture index range, captureMode and captureDriverCodeObjects
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            uint32 captureStartIndex;
+            uint32 captureStopIndex;
+            CaptureTriggerMode captureMode;
+
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 allowComputePresents : 1;
+                    uint32 captureDriverCodeObjects : 1;
+                    uint32 reserved : 29;
+                };
+                uint32 u32All;
+            } flags;
+
+            // Begin Tag
+            uint32 beginTagHigh;
+            uint32 beginTagLow;
+
+            // End Tag
+            uint32 endTagHigh;
+            uint32 endTagLow;
+
+            // Begin/End Marker Strings
+            char beginMarker[kMarkerStringLength];
+            char endMarker[kMarkerStringLength];
+        };
+
+        DD_CHECK_SIZE(TraceParametersV5, 552);
+
+        DD_NETWORK_STRUCT(TraceParametersV6, 4) // V5 plus pipelineHashHi/pipelineHashLo
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            uint32 captureStartIndex;
+            uint32 captureStopIndex;
+            CaptureTriggerMode captureMode;
+
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 allowComputePresents : 1;
+                    uint32 captureDriverCodeObjects : 1;
+                    uint32 reserved : 29;
+                };
+                uint32 u32All;
+            } flags;
+
+            // Begin Tag
+            uint32 beginTagHigh;
+            uint32 beginTagLow;
+
+            // End Tag
+            uint32 endTagHigh;
+            uint32 endTagLow;
+
+            // Begin/End Marker Strings
+            char beginMarker[kMarkerStringLength];
+            char endMarker[kMarkerStringLength];
+
+            // Target pipeline hash
+            uint32 pipelineHashHi;
+            uint32 pipelineHashLo;
+        };
+
+        DD_CHECK_SIZE(TraceParametersV6, 560);
+
+        DD_NETWORK_STRUCT(TraceParametersV7, 4) // V6 plus the enableSpm flag and seMask
+        {
+            uint32 gpuMemoryLimitInMb;
+            uint32 numPreparationFrames;
+            uint32 captureStartIndex;
+            uint32 captureStopIndex;
+            CaptureTriggerMode captureMode;
+
+            union
+            {
+                struct
+                {
+                    uint32 enableInstructionTokens : 1;
+                    uint32 allowComputePresents : 1;
+                    uint32 captureDriverCodeObjects : 1;
+                    uint32 enableSpm : 1;
+                    uint32 reserved : 28;
+                };
+                uint32 u32All;
+            } flags;
+
+            // Begin Tag
+            uint32 beginTagHigh;
+            uint32 beginTagLow;
+
+            // End Tag
+            uint32 endTagHigh;
+            uint32 endTagLow;
+
+            // Begin/End Marker Strings
+            char beginMarker[kMarkerStringLength];
+            char endMarker[kMarkerStringLength];
+
+            // Target pipeline hash
+            uint32 pipelineHashHi;
+            uint32 pipelineHashLo;
+
+            // Shader Engine Mask
+            uint32 seMask;
+        };
+
+        DD_CHECK_SIZE(TraceParametersV7, 564);
+
+        DD_NETWORK_STRUCT(SpmCounterId, 4) // Three ids bit-packed into one 32-bit word (8 + 12 + 12 bits)
+        {
+            uint32 blockId    : kSpmBlockIdBits;
+            uint32 instanceId : kSpmInstanceIdBits;
+            uint32 eventId    : kSpmEventIdBits;
+        };
+
+        DD_CHECK_SIZE(SpmCounterId, 4);
+
+        static_assert(
+            kSpmBlockIdBits + kSpmInstanceIdBits + kSpmEventIdBits == 8 * sizeof(SpmCounterId),
+            "SpmCounterId is wasting bits");
+
+        enum struct ProfilingStatus : uint32
+        {
+            NotAvailable = 0,
+            Available,
+            Enabled,
+            Count
+        };
+
+        ///////////////////////
+        // RGP Payloads
+
+        DD_NETWORK_STRUCT(ExecuteTraceRequestPayload, 4)
+        {
+            TraceParameters parameters;
+        };
+
+        DD_CHECK_SIZE(ExecuteTraceRequestPayload, 12);
+
+        DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV2, 4)
+        {
+            TraceParametersV2 parameters;
+        };
+
+        DD_CHECK_SIZE(ExecuteTraceRequestPayloadV2, 16);
+
+        DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV3, 4)
+        {
+            TraceParametersV3 parameters;
+        };
+
+        DD_CHECK_SIZE(ExecuteTraceRequestPayloadV3, 12);
+
+        DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV4, 4)
+        {
+            TraceParametersV4 parameters;
+        };
+
+        DD_CHECK_SIZE(ExecuteTraceRequestPayloadV4, 540);
+
+        DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV5, 4)
+        {
+            TraceParametersV5 parameters;
+        };
+
+        DD_CHECK_SIZE(ExecuteTraceRequestPayloadV5, 552);
+
+        DD_NETWORK_STRUCT(TraceDataChunkPayload, 4)
+        {
+            TraceDataChunk chunk;
+        };
+
+        DD_CHECK_SIZE(TraceDataChunkPayload, kMaxTraceDataChunkSize + sizeof(int32));
+
+        DD_NETWORK_STRUCT(TraceDataSentinelPayload, 4)
+        {
+            Result result;
+        };
+
+        DD_CHECK_SIZE(TraceDataSentinelPayload, 4);
+
+        DD_NETWORK_STRUCT(TraceDataHeaderPayload, 4)
+        {
+            Result result;
+            uint32 numChunks;
+            uint32 sizeInBytes;
+        };
+
+        DD_CHECK_SIZE(TraceDataHeaderPayload, 12);
+
+        DD_NETWORK_STRUCT(QueryProfilingStatusResponsePayload, 4)
+        {
+            ProfilingStatus status;
+        };
+
+        DD_CHECK_SIZE(QueryProfilingStatusResponsePayload, 4);
+
+        DD_NETWORK_STRUCT(EnableProfilingResponsePayload, 4)
+        {
+            Result result;
+        };
+
+        DD_CHECK_SIZE(EnableProfilingResponsePayload, 4);
+
+        DD_NETWORK_STRUCT(QueryTraceParametersResponsePayload, 4)
+        {
+            Result result;
+            TraceParametersV6 parameters;
+        };
+
+        DD_CHECK_SIZE(QueryTraceParametersResponsePayload, 564);
+
+        DD_NETWORK_STRUCT(QueryTraceParametersResponsePayloadV2, 4)
+        {
+            Result result;
+            TraceParametersV7 parameters;
+        };
+
+        DD_CHECK_SIZE(QueryTraceParametersResponsePayloadV2, 568);
+
+        DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayload, 4)
+        {
+            TraceParametersV6 parameters;
+        };
+
+        DD_CHECK_SIZE(UpdateTraceParametersRequestPayload, 560);
+
+        DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayloadV2, 4)
+        {
+            TraceParametersV7 parameters;
+        };
+
+        DD_CHECK_SIZE(UpdateTraceParametersRequestPayloadV2, 564);
+
+        DD_NETWORK_STRUCT(UpdateTraceParametersResponsePayload, 4)
+        {
+            Result result;
+        };
+
+        DD_CHECK_SIZE(UpdateTraceParametersResponsePayload, 4);
+
+        DD_NETWORK_STRUCT(UpdateSpmConfigRequestPayload, 4)
+        {
+            uint32 sampleFrequency;
+            uint32 memoryLimitInMb;
+            uint32 numDataPayloads;
+        };
+
+        DD_CHECK_SIZE(UpdateSpmConfigRequestPayload, 12);
+
+        DD_NETWORK_STRUCT(UpdateSpmConfigDataPayload, 4)
+        {
+            uint32 numCounters;
+            SpmCounterId counters[kMaxSpmCountersPerUpdate];
+        };
+
+        DD_CHECK_SIZE(UpdateSpmConfigDataPayload, 1284); // 4 + kMaxSpmCountersPerUpdate * 4
+
+        DD_NETWORK_STRUCT(UpdateSpmConfigResponsePayload, 4)
+        {
+            Result result;
+        };
+
+        DD_CHECK_SIZE(UpdateSpmConfigResponsePayload, 4);
+
+        DD_NETWORK_STRUCT(RGPPayload, 4) // Top-level message: command code followed by a union of every payload above
+        {
+            RGPMessage command;
+            // pad out to 4 bytes for alignment requirements
+            char padding[3];
+            union
+            {
+                ExecuteTraceRequestPayload executeTraceRequest;
+                ExecuteTraceRequestPayloadV2 executeTraceRequestV2;
+                ExecuteTraceRequestPayloadV3 executeTraceRequestV3;
+                ExecuteTraceRequestPayloadV4 executeTraceRequestV4;
+                ExecuteTraceRequestPayloadV5 executeTraceRequestV5;
+                TraceDataChunkPayload traceDataChunk;
+                TraceDataSentinelPayload traceDataSentinel;
+                TraceDataHeaderPayload traceDataHeader;
+                QueryProfilingStatusResponsePayload queryProfilingStatusResponse;
+                EnableProfilingResponsePayload enableProfilingStatusResponse;
+                QueryTraceParametersResponsePayload queryTraceParametersResponse;
+                QueryTraceParametersResponsePayloadV2 queryTraceParametersResponseV2;
+                UpdateTraceParametersRequestPayload updateTraceParametersRequest;
+                UpdateTraceParametersRequestPayloadV2 updateTraceParametersRequestV2;
+                UpdateTraceParametersResponsePayload updateTraceParametersResponse;
+                UpdateSpmConfigRequestPayload updateSpmConfigRequest;
+                UpdateSpmConfigDataPayload updateSpmConfigData;
+                UpdateSpmConfigResponsePayload updateSpmConfigResponse;
+            };
+        };
+
+        DD_CHECK_SIZE(RGPPayload, kMaxPayloadSizeInBytes);
+    }
+}
+
+#pragma pack(pop)
diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h
new file mode 100644
index 0000000000..05002b2c8b
--- /dev/null
+++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h
@@ -0,0 +1,174 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "baseProtocolServer.h" +#include "util/vector.h" + +#include "rgpProtocol.h" + +namespace DevDriver +{ + namespace RGPProtocol + { + enum class TraceStatus : uint32 + { + Idle = 0, + Pending, + Running, + Finishing, + Aborting + }; + + struct ServerTraceParametersInfo + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + uint32 captureStartIndex; + uint32 captureStopIndex; + CaptureTriggerMode captureMode; + + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 captureDriverCodeObjects : 1; + uint32 enableSpm : 1; + uint32 reserved : 28; + }; + uint32 u32All; + } flags; + + uint64 beginTag; + uint64 endTag; + + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + + uint64 pipelineHash; + +#if DD_VERSION_SUPPORTS(GPUOPEN_RGP_SPM_COUNTERS_VERSION) + uint32 seMask; +#endif + }; + + struct ServerSpmCounterId + { + uint32 blockId; + uint32 instanceId; + uint32 eventId; + }; + + struct ServerSpmConfig + { + uint32 sampleFrequency; + uint32 memoryLimitInMb; + }; + + typedef bool (*PFN_ValidateSpmConfig)(void* pUserdata, const ServerSpmConfig* pConfig, const Vector* pCounterData); + struct ValidateSpmCallbackInfo + { + void* pUserdata; + PFN_ValidateSpmConfig pfnValidateSpmConfig; + }; + + struct RGPSession; + + class RGPServer : public 
BaseProtocolServer + { + public: + explicit RGPServer(IMsgChannel* pMsgChannel); + ~RGPServer(); + + void Finalize() override; + + bool AcceptSession(const SharedPointer& pSession) override; + void SessionEstablished(const SharedPointer& pSession) override; + void UpdateSession(const SharedPointer& pSession) override; + void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; + + // Returns true if traces are currently enabled. + bool TracesEnabled(); + + // Allows remote clients to request traces. + Result EnableTraces(); + + // Disable support for traces. + Result DisableTraces(); + + // Returns true if a client has requested a trace and it has not been started yet. + bool IsTracePending(); + + // Returns true if a client has requested a trace and it is currently running. + bool IsTraceRunning(); + + // Returns true if the server is currently outputting trace results to a client. + bool IsTraceOutputInProgress(); + + // Starts a new trace. This will only succeed if a trace was previously pending. + Result BeginTrace(); + + // Ends a trace. This will only succeed if a trace was previously in progress. + Result EndTrace(); + + // Aborts a trace. This will only succeed if a trace was previously in progress. + Result AbortTrace(); + + // Writes data into the current trace. This can only be performed when there is a trace in progress. + Result WriteTraceData(const uint8* pTraceData, size_t traceDataSize); + + // Returns the current profiling status on the rgp server. + ProfilingStatus QueryProfilingStatus(); + + // Returns the current trace parameters on the rgp server. 
+ ServerTraceParametersInfo QueryTraceParameters(); + + // Populates the provided structure with the current perf counter config and returns data for each counter + // in the provided vector + Result QuerySpmConfig(ServerSpmConfig* pConfig, Vector* pCounterData); + + // Sets a validation callback that will be used to validate SPM configuration data + void SetSpmValidationCallback(const ValidateSpmCallbackInfo& callback); + + private: + void LockData(); + void UnlockData(); + void ClearCurrentSession(); + Result UpdateSpmConfig(const ServerSpmConfig& config, const Vector& counters); + + Platform::Mutex m_mutex; + TraceStatus m_traceStatus; + RGPSession* m_pCurrentSessionData; + ProfilingStatus m_profilingStatus; + ServerTraceParametersInfo m_traceParameters; + ServerSpmConfig m_spmConfig; + Vector m_spmCounterData; + ValidateSpmCallbackInfo m_spmValidationCb; + }; + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h new file mode 100644 index 0000000000..fcc817123f --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h @@ -0,0 +1,217 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ **********************************************************************************************************************/
+
+#pragma once
+
+#include "gpuopen.h"
+
+namespace DevDriver
+{
+    namespace SystemProtocol
+    {
+        ///////////////////////
+        // GPU Open System Protocol
+        enum struct SystemMessage : MessageCode
+        {
+            Unknown = 0,
+            ClientConnected,
+            ClientDisconnected,
+            Ping,
+            Pong,
+            QueryClientInfo,
+            ClientInfo,
+            Halted,
+            Count,
+        };
+    }
+
+    namespace SessionProtocol
+    {
+        ///////////////////////
+        // GPU Open Session Protocol
+        enum struct SessionMessage : MessageCode
+        {
+            Unknown = 0,
+            Syn,
+            SynAck,
+            Fin,
+            Data,
+            Ack,
+            Rst,
+            Count
+        };
+
+        typedef uint8 SessionVersion; // Version of the session protocol itself; carried in SynPayload and SynAckPayload
+        // Session protocol 2 lets session servers return session version as part of the synack
+        DD_STATIC_CONST SessionVersion kSessionProtocolVersionSynAckVersion = 2;
+        // Session protocol 1 lets session clients specify a max range supported as part of the syn
+        DD_STATIC_CONST SessionVersion kSessionProtocolRangeVersion = 1;
+        // current version is 2
+        DD_STATIC_CONST SessionVersion kSessionProtocolVersion = kSessionProtocolVersionSynAckVersion;
+        // not mentioned is session version 0. It only supported min version in SynAck, servers reporting it cannot
+        // cleanly terminate in response to a Fin packet.
+
+        // tripwire - this will intentionally break if the message version changes. Since that implies a breaking change,
+        // we need to re-baseline this as version 0 and update the SynPayload struct at the same time
+        static_assert(kMessageVersion == 1011, "Session packets need to be cleaned up as part of the next protocol version");
+
+        DD_NETWORK_STRUCT(SynPayload, 4)
+        {
+            Version minVersion;
+            Protocol protocol;
+            // pad out to 4 bytes
+            SessionVersion sessionVersion;
+
+            // New fields read if sessionVersion != 0
+            Version maxVersion;
+            // pad out to 8 bytes
+            uint8 reserved[2];
+        };
+
+        DD_CHECK_SIZE(SynPayload, 8);
+
+        //
+        // SynPayloadV2 is here so that we can use it with the next breaking message bus change.
+        //
+        //DD_NETWORK_STRUCT(SynPayloadV2, 4)
+        //{
+        //    Protocol protocol;
+        //    SessionVersion sessionVersion;
+        //    Version minVersion;
+        //    Version maxVersion;
+        //    // pad out to 8 bytes
+        //    uint8 reserved[2];
+        //};
+
+        //DD_CHECK_SIZE(SynPayloadV2, 8);
+
+        DD_NETWORK_STRUCT(SynAckPayload, 8)
+        {
+            Sequence sequence;
+            SessionId initialSessionId;
+            Version version;
+            SessionVersion sessionVersion;
+            uint8 reserved[1];
+        };
+
+        DD_CHECK_SIZE(SynAckPayload, 16);
+    }
+
+    namespace ClientManagementProtocol
+    {
+
+        ///////////////////////
+        // GPU Open ClientManagement Protocol
+        enum struct ManagementMessage : MessageCode
+        {
+            Unknown = 0,
+            ConnectRequest,
+            ConnectResponse,
+            DisconnectNotification,
+            DisconnectResponse,
+            SetClientFlags,
+            SetClientFlagsResponse,
+            QueryStatus,
+            QueryStatusResponse,
+            KeepAlive,
+            Count
+        };
+
+        DD_STATIC_CONST MessageBuffer kOutOfBandMessage =
+        {
+            { // header
+                kBroadcastClientId,         //srcClientId
+                kBroadcastClientId,         //dstClientId
+                Protocol::ClientManagement, //protocolId
+                0,                          //messageId
+                0,                          //windowSize
+                0,                          //payloadSize
+                0,                          //sessionId
+                kMessageVersion             //sequence (carries the message version; checked by IsValidOutOfBandMessage)
+            },
+            {} // payload
+        };
+
+        inline bool IsOutOfBandMessage(const MessageBuffer &message)
+        {
+            // an out of band message is denoted by both the dstClientId and srcClientId
+            // being initialized to kBroadcastClientId.
+            static_assert(kBroadcastClientId == 0, "Error, kBroadcastClientId is non-zero. IsOutOfBandMessage needs to be fixed");
+            return ((message.header.dstClientId | message.header.srcClientId) == kBroadcastClientId); // OR of both ids is 0 only if both are 0
+        }
+
+        inline bool IsValidOutOfBandMessage(const MessageBuffer &message)
+        {
+            // an out of band message is only valid if the sequence field is initialized with the correct version
+            // and the protocolId is equal to the receiving client's Protocol::ClientManagement value
+            return ((message.header.sequence == kMessageVersion) & // bitwise &: both comparisons are always evaluated
+                    (message.header.protocolId == Protocol::ClientManagement));
+        }
+
+        DD_NETWORK_STRUCT(ConnectRequestPayload, 4)
+        {
+            StatusFlags initialClientFlags;
+            uint8 padding[2];
+            Component componentType;
+            uint8 reserved[3];
+        };
+
+        DD_CHECK_SIZE(ConnectRequestPayload, 8);
+
+        DD_NETWORK_STRUCT(ConnectResponsePayload, 4)
+        {
+            Result result;
+            ClientId clientId;
+            // pad this out to 8 bytes for future expansion
+            uint8 padding[2];
+        };
+
+        DD_CHECK_SIZE(ConnectResponsePayload, 8);
+
+        DD_NETWORK_STRUCT(SetClientFlagsPayload, 4)
+        {
+            StatusFlags flags;
+            uint8 padding[2];
+        };
+
+        DD_CHECK_SIZE(SetClientFlagsPayload, 4);
+
+        DD_NETWORK_STRUCT(SetClientFlagsResponsePayload, 4)
+        {
+            Result result;
+        };
+
+        DD_CHECK_SIZE(SetClientFlagsResponsePayload, 4);
+
+        DD_NETWORK_STRUCT(QueryStatusResponsePayload, 4)
+        {
+            Result result;
+            StatusFlags flags;
+            uint8 reserved[2];
+        };
+
+        DD_CHECK_SIZE(QueryStatusResponsePayload, 8);
+    }
+}
diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h
new file mode 100644
index 0000000000..7935107d10
--- /dev/null
+++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h
@@ -0,0 +1,168 @@
+/*
+ ***********************************************************************************************************************
+ *
+ * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + **********************************************************************************************************************/ + +#pragma once + +namespace DevDriver +{ + class IProtocolServer; + class IProtocolClient; + + template + struct ProtocolServerMap + { + typedef IProtocolServer type; + }; + + template + struct ProtocolClientMap + { + typedef IProtocolClient type; + }; + + template + using ProtocolServerType = typename ProtocolServerMap::type; + + template + using ProtocolClientType = typename ProtocolClientMap::type; + + namespace DriverControlProtocol + { + class DriverControlServer; + class DriverControlClient; + } + + template <> + struct ProtocolServerMap + { + typedef DriverControlProtocol::DriverControlServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef DriverControlProtocol::DriverControlClient type; + }; + + namespace RGPProtocol + { + class RGPServer; + class RGPClient; + } + + template <> + struct ProtocolServerMap + { + typedef RGPProtocol::RGPServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef RGPProtocol::RGPClient type; + }; + + namespace EventProtocol + { + class EventServer; + class EventClient; + } + + template <> + struct ProtocolServerMap + { + typedef EventProtocol::EventServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef EventProtocol::EventClient type; + }; + + namespace ETWProtocol + { + class ETWServer; + class ETWClient; + } + + namespace SettingsURIService + { + class SettingsService; + } + + namespace InfoURIService + { + class InfoService; + } + + template <> + struct ProtocolServerMap + { + typedef ETWProtocol::ETWServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef ETWProtocol::ETWClient type; + }; + + namespace TransferProtocol + { + class TransferServer; + class TransferClient; + } + + template <> + struct ProtocolServerMap + { + typedef TransferProtocol::TransferServer type; + }; + + template <> + struct ProtocolClientMap + { + 
typedef TransferProtocol::TransferClient type; + }; + + namespace URIProtocol + { + class URIServer; + class URIClient; + } + + template <> + struct ProtocolServerMap + { + typedef URIProtocol::URIServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef URIProtocol::URIClient type; + }; +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h new file mode 100644 index 0000000000..ed16ad2714 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h @@ -0,0 +1,86 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ + +namespace MetroHash +{ +/// 128-bit hash structure +struct Hash +{ + union + { + uint32 dwords[4]; ///< Output hash in dwords. + uint8 bytes[16]; ///< Output hash in bytes. + }; +}; + +// Compacts a 128-bit hash into a 64-bit one by XOR'ing the low and high 64-bits together. +inline uint64 Compact64( + const Hash* pHash) +{ + return (static_cast(pHash->dwords[3] ^ pHash->dwords[1]) | + (static_cast(pHash->dwords[2] ^ pHash->dwords[0]) << 32)); +} + +// Compacts a 128-bit hash into a 32-bit one by XOR'ing all four 32-bit dwords together. +inline uint32 Compact32( + const Hash* pHash) +{ + return pHash->dwords[3] ^ pHash->dwords[2] ^ pHash->dwords[1] ^ pHash->dwords[0]; +} + +// Compacts a 64-bit hash checksum into a 32-bit one by XOR'ing each 32-bit chunk together. +inline uint32 Compact32( + const uint64 hash) +{ + return static_cast(hash) ^ static_cast(hash >> 32); +} + +inline uint64 MetroHash64(const uint8* pData, const uint64 dataSize) +{ + uint64 hash = 0; + Util::MetroHash64::Hash(pData, dataSize, reinterpret_cast(&hash)); + return hash; +} + +inline uint32 MetroHash32(const uint8* pData, const uint64 dataSize) +{ + return Compact32(MetroHash64(pData, dataSize)); +} + +inline uint64 HashCStr64(const char* pString) +{ + return MetroHash64(reinterpret_cast(pString), strlen(pString)); +} + +} // MetroHash +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h new file mode 100644 index 0000000000..5008db7748 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h @@ -0,0 +1,335 @@ +/* + 
*********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +#pragma once + +#include + +namespace DevDriver +{ + // The value half of a key-value pair from a StructuredReader. + // This always wraps a valid IValue pointer, but the value semantically stored may be empty. (e.g. a Json null) + class StructuredValue + { + public: + // In order to avoid leaking internal headers, we treat this member as an opaque data type. + // Its size and alignment is checked in the cpp file. + /// This is an internal type, only exposed due to limitations in C++ semantics. 
+ struct OpaqueNode + { + void* blob[2] = {}; + }; + + ~StructuredValue() = default; + + StructuredValue() + : m_opaque() + {} + + explicit StructuredValue(OpaqueNode opaque) + : m_opaque(opaque) + {} + + StructuredValue(StructuredValue&& other) = default; + StructuredValue(const StructuredValue& other) = default; + + StructuredValue& operator=(StructuredValue&& other) = default; + StructuredValue& operator=(const StructuredValue& other) = default; + + enum class Type + { + Null = 0, + Array, + Map, + Str, + Bool, + Int, + Uint, + Double, + Float, + }; + + // Type of data contained in this node. + Type GetType() const; + + const char* GetTypeString() const + { + switch (GetType()) + { + case StructuredValue::Type::Null: return "Null"; + + case StructuredValue::Type::Array: return "Array"; + case StructuredValue::Type::Map: return "Map"; + case StructuredValue::Type::Str: return "Str"; + + case StructuredValue::Type::Bool: return "Bool"; + + case StructuredValue::Type::Int: return "Int"; + case StructuredValue::Type::Uint: return "Uint"; + + case StructuredValue::Type::Double: return "Double"; + case StructuredValue::Type::Float: return "Float"; + default: + DD_WARN_ALWAYS(); + return "Unknown"; + } + } + + // Create a new empty value + StructuredValue MakeNull() const; + + // Return whether this is an empty, or "null" node. + bool IsNull() const; + + /// ===== Unsigned Integer Types + + /// Returns true when this node contains a Uint8. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint8(uint8* pValue) const; + // Returns true when this node contains a Uint16. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint16(uint16* pValue) const; + // Returns true when this node contains a Uint32. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint32(uint32* pValue) const; + // Returns true when this node contains a Uint64. 
If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint64(uint64* pValue) const; + + /// ===== Signed Integer Types + + /// Returns true when this node contains a Int8. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt8(int8* pValue) const; + // Returns true when this node contains a Int16. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt16(int16* pValue) const; + // Returns true when this node contains a Int32. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt32(int32* pValue) const; + // Returns true when this node contains a Int64. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt64(int64* pValue) const; + + /// ===== Floating Point Types + + /// Returns true when this node contains a Float. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetFloat(float* pValue) const; + // Returns true when this node contains a Double. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetDouble(double* pValue) const; + + /// ===== Other Types + + // Returns true when this node contains a Bool. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetBool(bool* pValue) const; + + // Copy a string value from a node into a buffer + // If the StructuredValue is not a string, + // false is returned and no writes occur + // + // If pStringSize is not NULL, + // the string length is written and processing continues + // + // If pBuffer is not NULL, + // not more than bufferSize bytes (including a NULL terminator) are written. + // If the buffer is large enough to hold the entire string, + // true is returned + // + // If both pBuffer and pStringSize are NULL and the value *is* a string, + // true is returned + // + // TODO: ... this is complicated. Should we use a Result? + // The other types are simple enough that they benefit from using bool instead of a Result, but Strings may not. 
+ DD_NODISCARD bool GetStringCopy(char* pBuffer, size_t bufferSize, size_t* pStringSize) const; + + template + DD_NODISCARD bool GetStringCopy(char(&buffer)[BufferSize]) const + { + return GetStringCopy(buffer, BufferSize, nullptr); + } + + // Return a NULL-terminated string from the backing messagepack data. + // This will fail and return NULL if the embedded string does not end with a NULL byte. Use GetStringCopy() if this is the case. + DD_NODISCARD const char* GetStringPtr() const; + + // Lookup a value in a map by a string key + // If the key does not exist, returns false and writes a Null value to `*pValue` + DD_NODISCARD bool GetValueByKey(const char* pKey, StructuredValue* pValue) const; + + // Lookup a value in an array. + // If `index` is out of bounds, returns false and writes a Null value to `*pValue` + DD_NODISCARD bool GetValueByIndex(size_t index, StructuredValue* pValue) const; + + // Query information about Maps and Arrays + + // Returns whether this node has key-value pairs + bool IsMap() const; + + // Returns whether this node has numeric indices + bool IsArray() const; + + // Returns the length of the array if this node is an array, otherwise 0. + size_t GetArrayLength() const; + + // Get-methods with defaults + // If you don't want to check the `bool` value anyway, prefer these. 
+ + uint8 GetUint8Or(uint8 defaultValue) const + { + const bool ok = GetUint8(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint16 GetUint16Or(uint16 defaultValue) const + { + const bool ok = GetUint16(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint32 GetUint32Or(uint32 defaultValue) const + { + const bool ok = GetUint32(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint64 GetUint64Or(uint64 defaultValue) const + { + const bool ok = GetUint64(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int8 GetInt8Or(int8 defaultValue) const + { + const bool ok = GetInt8(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int16 GetInt16Or(int16 defaultValue) const + { + const bool ok = GetInt16(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int32 GetInt32Or(int32 defaultValue) const + { + const bool ok = GetInt32(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int64 GetInt64Or(int64 defaultValue) const + { + const bool ok = GetInt64(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + float GetFloatOr(float defaultValue) const + { + const bool ok = GetFloat(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + double GetDoubleOr(double defaultValue) const + { + const bool ok = GetDouble(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + bool GetBoolOr(bool defaultValue) const + { + const bool ok = GetBool(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + // Index methods + + StructuredValue operator[](const char* pKey) const + { + StructuredValue next = MakeNull(); + + // Ignore the result of this fetch, `next` is already an empty value + const bool ok = GetValueByKey(pKey, &next); + DD_UNUSED(ok); + + return next; + } + + StructuredValue operator[](size_t index) const + { + StructuredValue next = MakeNull(); + + // Ignore the result of this fetch, `next` is already an empty value + const bool ok = 
GetValueByIndex(index, &next); + DD_UNUSED(ok); + + return next; + } + + template + StructuredValue operator[](T index) const + { + // This overload exists so that we don't get ambiguous calls when calling operator[] with integer types. + // If the type of the index can't be statically cast to a size_t, this will fail to compile. + // Note: pointer types cannot be static_cast()'d, which is great! + return this->operator[](static_cast(index)); + } + + private: + bool ResetInternalErrorStateImpl(const char* pFile, int line, const char* pCallingFunction) const; + + OpaqueNode m_opaque; + }; + + // Top level container of structured data + class IStructuredReader + { + public: + virtual ~IStructuredReader() {}; + + DD_NODISCARD static Result CreateFromJson( + const void* pBytes, + size_t numBytes, + const AllocCb& allocCb, + IStructuredReader** ppReader + ); + + DD_NODISCARD static Result CreateFromMessagePack( + const uint8* pBytes, + size_t numBytes, + const AllocCb& allocCb, + IStructuredReader** ppReader + ); + + static void Destroy(IStructuredReader **ppReader); + + /// Get the root object being read + virtual StructuredValue GetRoot() const = 0; + + /// Get the allocation callbacks + virtual const AllocCb& GetAllocCb() const = 0; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h new file mode 100644 index 0000000000..fc022a9401 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h @@ -0,0 +1,102 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ + +/// ==================================================================================================================== +/// Hashes the bytes of a Key using MetroHash +template +struct DefaultHashFunc +{ + uint32 operator()(const Key& key) const + { + return MetroHash::MetroHash32(reinterpret_cast(&key), sizeof(Key)); + } +}; + +/// ==================================================================================================================== +/// Hashes a const char* CString Key using Metrohash +template<> +struct DefaultHashFunc +{ + uint32 operator()(const char* pKey) const + { + // We cannot pass NULL strings to strlen() and friends, so guard against it anyway. 
+ uint32 hash = 0; + DD_ASSERT(pKey != nullptr); + if (pKey != nullptr) + { + hash = MetroHash::MetroHash32(reinterpret_cast(pKey), strlen(pKey)); + } + + return hash; + } +}; + +/// Pointer keys are usually a mistake, so this version is explicitly 'delete'd +/// Specialize this template if you're sure you need this. (See: const char* above) +template +struct DefaultHashFunc +{ + uint32 operator()(const T* pKey) const = delete; +}; + +/// Generic compare functor for types that have defined the comparison operator +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct DefaultEqualFunc +{ + bool operator()(const Key& key1, const Key& key2) const { return (key1 == key2); } +}; + +/// String compare functor for use with C-style strings +template<> +struct DefaultEqualFunc +{ + bool operator()(const char* pKey1, const char* pKey2) const + { + DD_ASSERT(pKey1 != nullptr); + DD_ASSERT(pKey2 != nullptr); + return (strcmp(pKey1, pKey2) == 0); + } +}; + +/// Bitwise compare functor for types with arbitrary size (compares the raw bytes of both keys with memcmp) +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct BitwiseEqualFunc +{ + bool operator()(const Key& key1, const Key& key2) const { return (memcmp(&key1, &key2, sizeof(Key)) == 0); } +}; + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h new file mode 100644 index 0000000000..5f92ed61de --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h @@ -0,0 +1,291 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include + +namespace DevDriver +{ + // Helper structure that sets value to true if T is not abstract and is constructable using the provided + // arguments, otherwise it returns false. We use this to prevent the Create() function from being defined + // for classes that are not creatable. This in turn prevents code from being generated that causes + // Container to be defined, which leads to compile errors even if the client application never attempts + // to directly create an object of that type. 
+ template + struct CanConstruct + { + DD_STATIC_CONST bool Value = !Platform::IsAbstract::Value && Platform::IsConstructible::Value; + }; + + // SharedPointerBase is the common parent class used by SharedPointer<> + // It implements common functions (e.g., pointer management) but cannot ever be + // used directly. The purpose of this separation is to enable SharedPointer to perform + // typecasts between derived and base types by casting through SharedPointerBase. + class SharedPointerBase + { + // SharedPointer is a subclass, but all need to have access to the protected + private members + template + friend class SharedPointer; + // public functions available to all subclasses + public: + // check to see if the class has been set + bool IsNull() const { return m_pObject == nullptr; } + + // clear the pointer and, if required, delete the underlying allocation + void Clear() + { + if (m_pContainer != nullptr) + { + if (m_pContainer->Release() == 0) + { + // ContainerBase has been declared with a virtual destructor, which guarantees + // that the specific ContainerBase subclass destructor is called + DD_DELETE(m_pContainer, m_pContainer->GetAllocCb()); + } + m_pContainer = nullptr; + m_pObject = nullptr; + } + } + + protected: + // Inner class that provides a standardized reference counted container interface + // Subclassed by SharedPointer to include an actual object + class ContainerBase + { + public: + // Construct container and initialize ref count to zero. This class should never be + // constructed directly by anything other than a subclass. + constexpr ContainerBase(const AllocCb &allocCb) + : m_allocCb(allocCb) + , m_refCount(0) + { + //DD_PRINT(LogLevel::Never, "Created reference counted container %i", m_refCount); + } + + // Destroy the container. Since this class is never directly created, this ensures + // subclasses (and the contained object) are always destroyed correctly. 
+ virtual ~ContainerBase() + { + DD_ASSERT(m_refCount == 0); + DD_PRINT(LogLevel::Never, "Deleted reference counted container %i", m_refCount); + } + + // Increments the reference count of the container + int32 Retain(void) + { + DD_ASSERT(m_refCount >= 0); + int32 result = Platform::AtomicIncrement(&m_refCount); + DD_ASSERT(result >= 1); + DD_PRINT(LogLevel::Never, "Incremented reference count: %i", result); + return result; + } + + // Decrements the reference count of the container + int32 Release(void) + { + int32 result = Platform::AtomicDecrement(&m_refCount); + DD_ASSERT(result >= 0); + DD_PRINT(LogLevel::Never, "Decremented reference count: %i", result); + return result; + } + + // Returns the reference count of the container + int32 QueryReferenceCount(void) const + { + return m_refCount; + } + + // Retrieve the allocator callbacks so it can be destroyed + const AllocCb& GetAllocCb() const { return m_allocCb; } + private: + // Allocator callbacks + const AllocCb m_allocCb; + // Reference count + Platform::Atomic m_refCount; + + }; + + // Default constructor that is constexpr. Allows the compiler to inline this if it wants to. + constexpr SharedPointerBase() + : m_pContainer(nullptr) + , m_pObject(nullptr) + { + } + + // Initialize the object using the provided pointer + SharedPointerBase(ContainerBase* pContainer, void* pObject) + : m_pContainer(pContainer) + , m_pObject(pObject) + { + // We should always have a valid object if the container is valid. + DD_ASSERT((m_pContainer == nullptr) || (m_pObject != nullptr)); + + // If we have a valid container, increment the reference count. 
+ if (m_pContainer != nullptr) + { + m_pContainer->Retain(); + } + } + + // Copy constructor copies the container pointer and increments the reference count + SharedPointerBase(const SharedPointerBase &right) + : SharedPointerBase(right.m_pContainer, right.m_pObject) + { + } + + // Move constructor takes the container pointer and clears the other container's pointer + SharedPointerBase(SharedPointerBase &&right) + : m_pContainer(Platform::Exchange(right.m_pContainer, nullptr)) + , m_pObject(Platform::Exchange(right.m_pObject, nullptr)) + { + } + + // On deletion of the object clear the pointer + ~SharedPointerBase() + { + Clear(); + } + private: + // Pointer to the shared container + ContainerBase* m_pContainer; + // Pointer to the object inside the shared container. We keep a copy of this to allow + // direct access to the object since we might not know the actual parent type of it. + void* m_pObject; + }; + + template + class SharedPointer : public SharedPointerBase + { + public: + // Create SharedPointer object with the default constructor + constexpr SharedPointer() : SharedPointerBase() {}; + + SharedPointer(const SharedPointer&) = default; + + // Copy conversion constructor. Creates a new object if you can cast from type U to type T. + template ::Value>::Type> + SharedPointer(const SharedPointer &right) + : SharedPointerBase(Platform::Forward(right)) + { + } + + // Move conversion constructor. Takes ownership of the shared container if you can cast from type U to type T. + template ::Value>::Type> + SharedPointer(SharedPointer &&right) + : SharedPointerBase(Platform::Forward(right)) + { + } + + // Assignment operator to allow copy + swap idiom + SharedPointer &operator= (SharedPointer right) + { + m_pContainer = Platform::Exchange(right.m_pContainer, m_pContainer); + m_pObject = Platform::Exchange(right.m_pObject, m_pObject); + return *this; + } + + // Indirection operator. Returns a reference to the object in the shared container. 
+ // This operator is unsafe to use if the container hasn't been allocated. + T& operator*() const + { + DD_ASSERT(m_pObject != nullptr); + return *Get(); + } + + // Member of pointer operator. Returns a pointer to the object in the shared container. + // This operator is unsafe to use if the container hasn't been allocated. + T* operator->() const + { + DD_ASSERT(m_pObject != nullptr); + return Get(); + } + + // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertible to T. + template ::Value>::Type> + bool operator== (const SharedPointer< U >&right) const + { + return m_pObject == right.m_pObject; + } + + // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertible to T. + template ::Value>::Type> + bool operator!= (const SharedPointer< U >&right) const + { + return m_pObject != right.m_pObject; + } + + // Get a pointer to the contained object + T* Get() const + { + return static_cast(m_pObject); + } + + // Returns the reference count for the container. Unsafe to call if the container hasn't been allocated. + int32 QueryReferenceCount() const + { + return m_pContainer->QueryReferenceCount(); + } + + // Create a SharedPointer using the provided allocator callbacks and arguments + // This function is only enabled if the class is creatable (see CanConstruct) + template::Value>::Type> + static SharedPointer Create(const AllocCb& allocCb, Args&&... args) + { + SharedPointer result; + Container *pContainer = + DD_NEW(Container, allocCb)(allocCb, Platform::Forward(args)...); + + if (pContainer != nullptr) + { + result = SharedPointer(pContainer, &pContainer->m_object); + } + return result; + } + private: + // Templated Container class that inherits the type from the outer (SharedPointer) class + class Container : public ContainerBase + { + public: + // Constructor that initializes ContainerBase class and the object using the provided parameters + template + explicit constexpr Container(const AllocCb& allocCb, Args&&... 
args) + : ContainerBase(allocCb) + , m_object(Platform::Forward(args)...) + { + } + + // Actual object that the SharedPointer instance encapsulates + T m_object; + }; + + // Private constructor to allow direct initialization using an externally created Container + explicit SharedPointer(Container* pContainer, T* pObject) + : SharedPointerBase(static_cast(pContainer), pObject) + { + } + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h new file mode 100644 index 0000000000..4452fd0eca --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h @@ -0,0 +1,292 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ +// A String class that stores the string inline with a compile-time maximum size. +// This class facilitates passing bounded-size C Strings around without dynamic allocation. It has POD semantics +// when copied or passed by value into functions, and can be stored in a vector. +template +class FixedString +{ +private: + char m_data[FixedSize]; + +public: + FixedString() + { + // Only the first byte needs to be initialized - we actively do not want to zero the entire array! + m_data[0] = 0; + } + + FixedString(const FixedString&) = default; + FixedString(FixedString&&) = default; + + ~FixedString() {} + + FixedString& operator=(FixedString& pOther) + { + Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); + return *this; + } + + FixedString& operator=(FixedString&& pOther) + { + Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); + return *this; + } + + bool operator==(const FixedString& other) const + { + return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) == 0; + } + + bool operator!=(const FixedString& other) const + { + return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) != 0; + } + + // Create a FixedString from a C String, truncating the copy if pString is too long + FixedString(const char* pString) { Platform::Strncpy(m_data, pString, sizeof(m_data)); } + + // Return a pointer to the inline C String. + const char* AsCStr() const { return m_data; } + + char* AsCStr() { return m_data; } + + // Computes the length of the string. + // Note! This is an O(N) operation! 
+ size_t Size() const { return strlen(m_data); } +}; + +// Sanity check for class size. +static_assert(sizeof(FixedString<16>) == 16, "FixedString<16> should be exactly 16 bytes"); + +/// ==================================================================================================================== +// Hashes a FixedString<> Key using Metrohash +template +struct DefaultHashFunc> +{ + uint32 operator()(const FixedString& key) const { return DefaultHashFunc()(key.AsCStr()); } +}; + +/// ==================================================================================================================== +/// Utility functions for strings +/// ==================================================================================================================== + +enum struct HexStringFmt +{ + Lowercase, + Uppercase, +}; + +/// ==================================================================================================================== +// Encode not more than `numBytes` from `pBytes` into hexadecimal, storing not more than stringBufferSize characters +// into pStringBuffer. +// +// This is the compliment of DecodeFromHexString() and is suitable for saving large binary blocks in text formats such +// as Json. +// +// This function NULL terminates its output if it writes anything. +// Hex pairs are written to `pStrBuff` in pairs - either both digits are written or neither is. A lone nibble +// is never written to the buffer. +// Thus, Hex strings are always an even length (+ a NULL byte) +// +// Returns the number of characters written out through `pStrBuff` (including the NULL terminator). 
+template
+inline size_t EncodeToHexString(const void* pBytesIn, size_t numBytes, char* pStrBuff, size_t strBuffSize)
+{
+    const uint8* pBytes = static_cast(pBytesIn);
+
+    // Nothing at all is written - not even the terminator - unless every argument is usable.
+    if ((pBytes == nullptr) || (numBytes == 0) || (pStrBuff == nullptr) || (strBuffSize == 0))
+    {
+        return 0;
+    }
+
+    // Both tables are indexed by nibble; the requested format picks one of them once, up front.
+    constexpr const char kHexStringLookupLower[] = "0123456789abcdef";
+    constexpr const char kHexStringLookupUpper[] = "0123456789ABCDEF";
+    const char* const pDigits = (fmt == HexStringFmt::Lowercase) ? kHexStringLookupLower : kHexStringLookupUpper;
+
+    // Index at which the next character pair goes. It outlives the loop so the terminator lands right after
+    // the last complete pair.
+    size_t writeIdx = 0;
+
+    // Each iteration needs room for the high nibble, the low nibble and the NULL terminator, i.e. offsets
+    // 0, 1 and 2 from writeIdx must all be inside the buffer. Encoding stops at the first byte that no longer fits.
+    for (size_t byteIdx = 0; (byteIdx < numBytes) && ((writeIdx + 2) < strBuffSize); ++byteIdx)
+    {
+        const uint8 byte = pBytes[byteIdx];
+
+        pStrBuff[writeIdx + 0] = pDigits[byte >> 4];  // High nibble first
+        pStrBuff[writeIdx + 1] = pDigits[byte & 0xf]; // Low nibble
+        writeIdx += 2;
+    }
+
+    pStrBuff[writeIdx] = '\0';
+
+    // Hex characters (if any) + NULL
+    return writeIdx + 1;
+}
+
+/// ====================================================================================================================
+// Helper function that translates hex digits into numeric values.
+// Returns 0xff if the value is not a hex digit +#if DD_CPLUSPLUS_SUPPORTS(CPP17) +constexpr uint8 HexDigitToValue(char c) +#else +inline uint8 HexDigitToValue(char c) +#endif +{ + // We use a switch case here to get the point across + // gcc9, clang8, and MSVC all turn this into a lookup table indexing with c (sometimes subtracting from it first) + + switch (c) + { + // clang-format off + case '0': + case '1': case '2': case '3': + case '4': case '5': case '6': + case '7': case '8': case '9': + return c - '0'; + + case 'a': case 'A': return 0xa; + case 'b': case 'B': return 0xb; + case 'c': case 'C': return 0xc; + case 'd': case 'D': return 0xd; + case 'e': case 'E': return 0xe; + case 'f': case 'F': return 0xf; + + default: + return 0xff; + // clang-format on + } +} + +#if DD_CPLUSPLUS_SUPPORTS(CPP17) + static_assert(HexDigitToValue('Z') == 0xff); + + static_assert(HexDigitToValue('0') == 0); + static_assert(HexDigitToValue('1') == 1); + static_assert(HexDigitToValue('2') == 2); + static_assert(HexDigitToValue('3') == 3); + static_assert(HexDigitToValue('4') == 4); + static_assert(HexDigitToValue('5') == 5); + static_assert(HexDigitToValue('6') == 6); + static_assert(HexDigitToValue('7') == 7); + static_assert(HexDigitToValue('8') == 8); + static_assert(HexDigitToValue('9') == 9); + + static_assert(HexDigitToValue('a') == 10); + static_assert(HexDigitToValue('b') == 11); + static_assert(HexDigitToValue('c') == 12); + static_assert(HexDigitToValue('d') == 13); + static_assert(HexDigitToValue('e') == 14); + static_assert(HexDigitToValue('f') == 15); + + static_assert(HexDigitToValue('A') == 10); + static_assert(HexDigitToValue('B') == 11); + static_assert(HexDigitToValue('C') == 12); + static_assert(HexDigitToValue('D') == 13); + static_assert(HexDigitToValue('E') == 14); + static_assert(HexDigitToValue('F') == 15); +#endif + +/// ==================================================================================================================== +// Decode not 
more than `strLength` hex characters from `pStrBuff` into their binary representation, storing +// not more than `numBytes` into `pBytesOut`. +// +// This is the compliment of EncodeToHexString() and is suitable for decoding large binary blocks out of text formats +// such as Json. +// +// Returns the number of bytes written out through `pBytesOut`. +inline size_t DecodeFromHexString(const char* pStrBuff, size_t strLength, void* pBytesOut, size_t numBytes) +{ + uint8* pBytes = static_cast(pBytesOut); + + // Byte offset that we've written into pBytes + size_t bytesProcessed = 0; + + // Note: Only even-length hex strings are supported + if ((strLength % 2 == 0) && (pBytes != nullptr) && (numBytes != 0) && (pStrBuff != nullptr) && (strLength != 0)) + { + size_t byteIdx = 0; + + // Process two characters (one byte) per iteration. + // This loop is bounded on two sizes: the string buffer and the byte buffer + for (size_t strIdx = 0; + ((strIdx + 1) < strLength) && (byteIdx < numBytes); + strIdx += 2, byteIdx += 1) + { + const uint8 hi = HexDigitToValue(pStrBuff[strIdx + 0]); // High nibble first + const uint8 lo = HexDigitToValue(pStrBuff[strIdx + 1]); // Low nibble + + if ((lo != 0xff) && (hi != 0xff)) + { + pBytes[byteIdx] = (hi << 4) | lo; + bytesProcessed += 1; + } + else + { + // Non-hex digit encountered, this is a parsing error. + // This log statement is compiled out, but may be useful for debugging something funny. 
+ DD_PRINT(LogLevel::Never, + "[DecodeFromHexString] Expected hex digits ([0-9a-fA-F]), but found \"%c%c\"", + pStrBuff[strIdx + 0], + pStrBuff[strIdx + 1]); + break; + } + } + } + + return bytesProcessed; +} + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h new file mode 100644 index 0000000000..82c10bbc43 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h @@ -0,0 +1,605 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include + +#if !DD_PLATFORM_WINDOWS_KM +#include +#endif +#include + +namespace DevDriver +{ + template + class Vector + { + public: + class Iterator; + + // The capacity this Vector can hold without allocating extra space. + static constexpr size_t DefaultCapacity = defaultCapacity; + + // Standard constructor + explicit Vector(const AllocCb& allocCb) + : m_pData(m_data) + , m_size(0) + , m_capacity(defaultCapacity) + , m_allocCb(allocCb) + { + } + + // Move constructor + Vector(Vector &&rhs) + : m_pData(m_data) // default initialize it to the default allocation + , m_size(Platform::Exchange(rhs.m_size, (size_t)0)) // move the rhs size value into ours + , m_capacity(defaultCapacity) // initialize the capacity to default + , m_allocCb(rhs.m_allocCb) // copy the allocator callback + { + // if the vector will fit inside the default allocation, move it into it + if (m_size <= defaultCapacity) + { + for (size_t index = 0; index < m_size; index++) + { + m_data[index] = Platform::Move(rhs.m_pData[index]); + } + } + else // otherwise, we want to move the allocation + replace the capacity + { + m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); + m_capacity = Platform::Exchange(rhs.m_capacity, defaultCapacity); + } + } + + // Destructor + ~Vector() + { + Clear(); + } + + void operator=(Vector&& rhs) + { + Swap(rhs); + } + + // Convenience methods + size_t Size() const { return m_size; } + size_t Capacity() const { return m_capacity; } + bool IsEmpty() const { return (m_size == 0); } + + // Subscript operator + T& operator[](size_t index) { DD_ASSERT(index < m_size); return m_pData[index]; } + const T& operator[](size_t index) const { DD_ASSERT(index < m_size); return m_pData[index]; } + + // Insert elements into the back of the Vector + template + bool PushBack(Args&&... 
args) + { + bool result = false; + Reserve(m_size + 1); + if (m_size < m_capacity) + { + m_pData[m_size] = T(Platform::Forward(args)...); + ++m_size; + result = true; + } + return result; + } + + // Insert elements from another Vector to the back of the Vector + bool Append(const Vector& other) + { + return Append(other.Data(), other.Size()); + } + + template + bool Append(const T (&buffer)[Len]) + { + return Append(buffer, Len); + } + + // This is un-used by default, but may be overloaded for some Ts + bool Append(const T* pTs); + + // Insert elements from a buffer to the back of the Vector + // An empty slice (countOfTs == 0) is effectively a no-op + bool Append(const T* pTs, size_t countOfTs) + { + // Check that we get a valid pointer. If this fires, we'll crash but this is more visible than just crashing + // in a memcpy below. + if (countOfTs != 0) + { + DD_ASSERT(pTs != nullptr); + } + + // Pre-allocate all the new elements, since we know how many there are. + const size_t oldSize = Grow(countOfTs); + + // Some types can be bulk-transferred with a memcpy. + // Instead of letting the compiler guess, we dictate when dealing with Pods. + if (Platform::IsPod::Value) + { + memcpy(&m_pData[oldSize], pTs, (sizeof(T) * countOfTs)); + } + else + { + for (size_t i = 0; i < countOfTs; ++i) + { + m_pData[oldSize + i] = pTs[i]; + } + } + + // Pretend Grow() cannot fail, since we cannot check allocation failure with it right now. 
+ return true; + } + + // Pop elements out of the Vector + bool PopBack(T* pData) + { + bool result = !IsEmpty(); + if (result) + { + --m_size; + if (pData != nullptr) + { + *pData = Platform::Move(m_pData[m_size]); + } + } + return result; + } + + // Pop elements out of the Vector + bool PopFront(T* pData) + { + bool result = !IsEmpty(); + if (result) + { + if (pData != nullptr) + { + *pData = Platform::Move(m_pData[0]); + } + + --m_size; + + if (m_size > 0) + { + for (size_t i = 0; i < m_size; i++) + { + m_pData[i] = Platform::Move(m_pData[i + 1]); + } + } + } + return result; + } + + // Remove the object at the specified index. Does not maintain order. + void Remove(size_t index) + { + DD_ASSERT(index < m_size); + + const size_t lastIndex = m_size - 1; + + // If the index is the last index, we move the last element into it's place + if (index != lastIndex) + { + m_pData[index] = Platform::Move(m_pData[lastIndex]); + } + // Otherwise, if it is the last element and not a POD we replace it with a default constructed object + else if (!Platform::IsPod::Value) + { + m_pData[index] = T(); + } + + --m_size; + } + + // Remove all instances of the specified object from the vector. Does not maintain order. + size_t Remove(const T& object) + { + size_t numRemoved = 0; + + for (size_t index = m_size; index > 0; index--) + { + if (m_pData[index - 1] == object) + { + Remove(index - 1); + numRemoved++; + } + } + return numRemoved; + } + + // Free all memory + void Clear() + { + if (m_pData != m_data) + { + // If the object is not a POD we explicitly destroy all objects prior to freeing the allocation. + if (!Platform::IsPod::Value) + { + for (size_t i = 0; i < m_capacity; i++) + { + m_pData[i].~T(); + } + } + DD_FREE(m_pData, m_allocCb); + m_pData = m_data; + m_capacity = defaultCapacity; + m_size = 0; + } + else + { + Reset(); + } + } + + // Clears all objects stored, but doesn't free memory. 
+ void Reset() + { + // If the object is not a POD we need to destroy all instances and replace them with default constructed + // instances. + if (!Platform::IsPod::Value) + { + for (size_t index = 0; index < m_size; index++) + { + m_pData[index] = T(); + } + } + m_size = 0; + } + + // Swaps the contents of the current vector with the provided vector + void Swap(Vector& rhs) + { + // If we can, we swap allocations directly + if ((m_pData != m_data) && (rhs.m_pData != rhs.m_data)) + { + m_pData = Platform::Exchange(rhs.m_pData, m_pData); + } + // Else if the other object is using the default allocation we move it's contents here + // and give ownership of our allocation to it + else if (m_pData != m_data) + { + for (size_t index = 0; index < rhs.m_size; index++) + { + m_data[index] = Platform::Move(rhs.m_data[index]); + } + rhs.m_pData = Platform::Exchange(m_pData, m_data); + } + // Else if this object is using the default allocation we move our data into it's allocation + // and take ownership of our allocation to it + else if (rhs.m_pData != rhs.m_data) + { + for (size_t index = 0; index < m_size; index++) + { + rhs.m_data[index] = Platform::Move(m_data[index]); + } + m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); + } + // Otherwise we just exchange all the objects that we need to + else + { + for (size_t index = 0; index < Platform::Max(m_size, rhs.m_size); index++) + { + m_data[index] = Platform::Exchange(rhs.m_data[index], m_data[index]); + } + } + + // Finally, we exchange the rest of the data + m_allocCb = Platform::Exchange(rhs.m_allocCb, m_allocCb); + m_capacity = Platform::Exchange(rhs.m_capacity, m_capacity); + m_size = Platform::Exchange(rhs.m_size, m_size); + } + + // Get a pointer to the beginning of the data + // + // Returns nullptr if there is no data available + const T* Data() const + { + return (Size() != 0) ? 
m_pData : nullptr; + } + + // Get a pointer to the beginning of the data + // + // Returns nullptr if there is no data available + T* Data() + { + return (Size() != 0) ? m_pData : nullptr; + } + + // Allocates enough memory to hold the specified number of elements + void Reserve(size_t newSize) + { + if (m_capacity < newSize) + { + const size_t newCapacity = Platform::Pow2Pad(Platform::Max(newSize, (size_t)1)); + const size_t allocSize = sizeof(T) * newCapacity; + T* pData = static_cast(DD_MALLOC(allocSize, alignof(T), m_allocCb)); + + DD_ASSERT(pData != nullptr); + + // If the struct is not a POD, then we need to construct objects + if (is_type_trivial() == false) + { + size_t i = 0; + // First, we move all existing objects into the vector. + for (; i < m_size; i++) + { + new(&pData[i]) T(Platform::Move(m_pData[i])); + } + // Then we construct new objects with the remaining memory. + for (; i < newCapacity; i++) + { + new(&pData[i]) T(); + } + } + // Otherwise, we just copy the existing data into the new vector and call it good. + else + { + // Need to use reinterpret_cast here because gcc can't seem to evaluate + // `is_trivial_v` at compile-time, thus generating a no-class-memaccess warning. + // `if constexpr` fixes the issue, but AMDLOG's toolchain doesn't support c++17. + std::memcpy(reinterpret_cast(pData), m_pData, m_size * sizeof(T)); + } + + if (m_pData != m_data) + { + // If the object wasn't a POD we need to destroy all instances before freeing the memory. + if (!Platform::IsPod::Value) + { + for (size_t i = 0; i < m_capacity; i++) + { + m_pData[i].~T(); + } + } + DD_FREE(m_pData, m_allocCb); + } + m_pData = pData; + m_capacity = newCapacity; + } + } + + // Resizes the vector. Implicitly destroys objects if newSize is smaller than the existing size. 
+ void Resize(size_t newSize) + { + // TODO: Reserve should return whether allocation failed + Reserve(newSize); + + // If the object isn't a POD and we are shrinking the size, we need to replace destroyed objects with + // default constructed instances. + if (!Platform::IsPod::Value) + { + for (size_t i = newSize; i < m_size; i++) + { + m_pData[i] = T(); + } + } + m_size = newSize; + } + + // Resizes the vector, zeroing additional elements + // + // Warning: This will break badly if your type cannot be safely memset() to 0! + void ResizeAndZero(size_t newSize) + { + // TODO: Reserve should return whether allocation failed + Reserve(newSize); + + if (newSize > m_size) + { + memset(&m_pData[m_size], 0, (newSize - m_size) * sizeof(T)); + } + + m_size = newSize; + } + + // Grows the vector by the specified number of elements and returns the previous size + size_t Grow(size_t numElements) + { + const size_t oldSize = m_size; + + Resize(m_size + numElements); + + return oldSize; + } + + // Iterator creation function + Iterator Begin() const + { + return CreateIterator(0); + } + + // Iterator creation function + constexpr Iterator End() const + { + return Iterator(nullptr, 0); + } + + // Iterator creation function + Iterator CreateIterator(size_t index) const + { + if (index < m_size) + return Iterator(this, index); + return End(); + } + + // Finds the first index for the provided object + Iterator Find(const T& object) const + { + auto it = Begin(); + for (; it != End(); ++it) + { + if (*it == object) + { + break; + } + } + return it; + } + + // Removes the element represented by the provided iterator. Does not maintain order. + Iterator Remove(const Iterator& it) + { + DD_ASSERT(it.m_pContainer == this); + + Remove(it.m_index); + if (it.m_index < m_size) + return it; + + return End(); + } + + /// Returns the internal allocator + const AllocCb& GetAllocCb() const { return m_allocCb; } + + private: + // Disallow copy construct. 
+ Vector(Vector& rhs) = delete; + + // This indirection fixes the warning comparision of a constant with another constant. This should be + // replace with `if constexpr` once AMDLog upgrades to support C++17. + constexpr bool is_type_trivial() + { +#if !DD_PLATFORM_WINDOWS_KM + return std::is_trivial_v; +#else + // not available in kernel so treat every type as nontrivial + return false; +#endif + } + + T m_data[defaultCapacity]; + T* m_pData; + size_t m_size; + size_t m_capacity; + AllocCb m_allocCb; + }; + + // Iterator class for the Vector type + template + class Vector::Iterator + { + friend Vector; + public: + // Comparison operators + bool operator==(const Iterator& rhs) const + { + return ((m_pContainer == rhs.m_pContainer) && (m_index == rhs.m_index)); + } + + bool operator!=(const Iterator& rhs) const + { + return ((m_pContainer != rhs.m_pContainer) || (m_index != rhs.m_index)); + } + + // Prefix operator to increment the iterator + Iterator& operator++() + { + if (m_pContainer != nullptr) + { + m_index += 1; + if (m_index >= m_pContainer->m_size) + { + m_index = 0; + m_pContainer = nullptr; + } + } + return *this; + } + + // Addition operator to add to the iterator + Iterator& operator+(size_t value) + { + if (m_pContainer != nullptr) + { + m_index += value; + if (m_index >= m_pContainer->m_size) + { + m_index = 0; + m_pContainer = nullptr; + } + } + return *this; + } + + // Indirection operator + T& operator*() const + { + DD_ASSERT(m_pContainer != nullptr); + return m_pContainer->m_pData[m_index]; + } + + // Member of pointer operator. Returns a pointer to the object in the shared container. 
+ T* operator->() const + { + DD_ASSERT(m_pContainer != nullptr); + return &m_pContainer->m_pData[m_index]; + } + private: + // Constructor is private to ensure it cannot be created by anything other than the Vector itself + Iterator(const Vector* pContainer, size_t index) : + m_pContainer(pContainer), + m_index(index) + { + }; + + const Vector* m_pContainer; + size_t m_index; + }; + + // + // functions necessary for C++ ranged based for loop support + // + + // Implement begin() function for range-based for loops + template + inline typename Vector::Iterator begin(Vector& rhs) + { + return rhs.Begin(); + } + + // Implement end() function for range-based for loops + template + inline constexpr typename Vector::Iterator end(const Vector& rhs) + { + return rhs.End(); + } + + // Specialized functions for using Vector<> like a String + template <> + inline bool Vector::Append(const char* pStr) + { + return Append(pStr, strlen(pStr)); + } + + template <> + template + inline bool Vector::Append(const char (&str)[Len]) + { + return Append(str, strlen(str)); + } + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h new file mode 100644 index 0000000000..530c32cf8a --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h @@ -0,0 +1,77 @@ +//--------------------------------------------------------------------- +// CRC32 +// +// Calculate a 32bit crc using a the Sarwate look up table method. The original algorithm was created by +// Dilip V. Sarwate, and is based off of Stephan Brumme's implementation. See also: +// https://dl.acm.org/citation.cfm?doid=63030.63037 +// http://create.stephan-brumme.com/crc32/#sarwate +// +//// Copyright (c) 2011-2016 Stephan Brumme. All rights reserved. 
+//***************************************************************************************************************** +// * This software is provided 'as-is', without any express or implied warranty. In no event will the author be held +// * liable for any damages arising from the use of this software. Permission is granted to anyone to use this +// * software for any purpose, including commercial applications, and to alter it and redistribute it freely, +// * subject to the following restrictions: +// * 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original +// * software +// * 2. If you use this software in a product, an acknowledgment in the product documentation would be +// * appreciated but is not required. +// * 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the +// * original software. +// ***************************************************************************************************************** +// +// Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved +// +// This software program is licensed subject to the BSD License, +// available at http://www.opensource.org/licenses/bsd-license.html. 
+// +// +// Tables for software CRC generation +// + +#include +static inline uint32_t CRC32(const void *pData, size_t length, uint32_t lastCRC = 0) +{ + DD_STATIC_CONST uint32_t lookupTable[256] = + { + 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 
0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + }; + + uint32_t crc = ~lastCRC; // same as lastCRC ^ 0xFFFFFFFF + const unsigned char* DD_RESTRICT pCurrent = (const unsigned char*)pData; + while (length--) + crc = (crc >> 8) ^ lookupTable[(crc & 0xFF) ^ *pCurrent++]; + return ~crc; +} + diff --git a/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h b/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h new file mode 100644 index 0000000000..80f5ca6617 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h @@ -0,0 +1,187 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file trackedCmdLocation.h + * @brief Defines the format used for correlation buffers reported through + * ICmdBufferReporting::CorrelationReportOnSubmit, + * - enum class TrackedCmdLocationMode + * - struct TrackedCmdLocation + * + * Plus the helper functions + * - TrackedCmdLocationGetDeltaInDwords + * - TrackedCmdLocationGetDeltaInBytes + *********************************************************************************************************************** + */ + +#pragma once + +namespace Pal +{ + +namespace CmdDisassembly +{ + +/// @brief enum class TrackedCmdLocationMode +/// Defines how to interpret the unions within struct TrackedCmdLocation +/// +enum class TrackedCmdLocationMode : uint8_t +{ + Invalid = 0, + Before, + After, + Delta, // before and after + ClientId, + ClientEventId +}; + +/// @brief struct TrackedCmdLocation defines the format used for correlation data submitted through +/// ICmdBufferReporting::CorrelationReportOnSubmit, and is two DWORDs in size (uint64_t) +/// +/// @detail struct TrackedCmdLocation has a number of flavors interpreted by its member m_mode +/// +/// For m_mode == TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After or +/// TrackedCmdLocationMode::Delta, m_correlateInternal will be used +/// +/// For m_mode == TrackedCmdLocationMode::ClientId, m_clientId will be used +/// +/// For m_mode == TrackedCmdLocationMode::ClientEventId, m_clientEvent will be used +/// +/// +/// For use as m_correlateInternal +/// m_correlateInternal.m_event refers to an internal function that can be converted to a name via +/// Pal::CmdDisassembly::TrackedCmdSupportBase +/// m_correlateInternal.m_ptr is address within the cmdList being correlated by this +/// TrackedCmdLocation +/// For 
a cmdList with baseAddress and sizeInBytes, m_ptr is in the range +/// [baseAddress, baseAddress+sizeInBytes) +/// m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta +/// And describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair +/// when the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr +/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) +/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written between to the corresponding cmdList +/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the +/// event described by m_event +/// +/// For use as m_clientEvent +/// m_clientEvent.m_clientEventId is a number provided by the client, provided by a call to +/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) +/// The m_ptr for this event will be the next TrackedCmdLocation, which will have +/// m_correlateInternal.m_mode == TrackedCmdLocationMode::Delta +/// m_correlateInternal.m_event == PostClientEvent +/// m_correlateInternal.m_deltaInDWords == 0 +/// +/// For use as m_clientId +/// m_clientId.m_clientId is an identifier use by the client for the cmdList that corresponds to +/// this array of correlation data. This will have been set through a call to +/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) +/// When internal correlation is not active, this will be the first tracked location. Otherwise +/// it will not appear until what tracking occurs during Reset is complete. 
+struct TrackedCmdLocation +{ + static constexpr uint32_t DeltaBitCount = 5; // width in bits of m_correlateInternal.m_deltaInDWords + static constexpr uint32_t MaxDelta = (1LL << DeltaBitCount) - 1; // largest value m_deltaInDWords can hold (31) + static constexpr uint32_t DwordDeltaShift = 3; // left shift applied by TrackedCmdLocationGetDeltaInBytes. NOTE(review): a shift of 3 multiplies by 8, whereas that helper is documented as multiplying by sizeof(DWORD) with DWORD == uint32_t (4 bytes) - confirm the intended unit + static constexpr uint8_t PostClientEvent = 0xff; // m_event value documented for the Delta entry that follows a ClientEventId entry + + static constexpr uint64_t PtrBitCount = 48; // width in bits of m_correlateInternal.m_ptr + /// NoCorrespondingBaseAddress is set to an impossible pointer value that still fits in the 48 bits + /// used for m_correlateInternal.m_ptr (all 48 bits set); + static constexpr uint64_t NoCorrespondingBaseAddress = (1LL << PtrBitCount) - 1; + + union + { + struct + { + uint64_t m_mode : 3; // TrackedCmdLocationMode tag; declared first in every view below so it can be read before a view is chosen + }; + + struct + { + uint64_t m_mode : 3; // TrackedCmdLocationMode::Before/After/Delta + uint64_t m_event : 8; // TrackedEvents + uint64_t m_ptr : PtrBitCount; // Note, can probably use two bits fewer, + // since these addresses appear to be at a minimum 4-byte aligned. + uint64_t m_deltaInDWords : DeltaBitCount; // only meaningful when m_mode == Delta; 3 + 8 + 48 + 5 = 64 bits in total + } m_correlateInternal; + + struct + { + uint64_t m_mode : 3; // TrackedCmdLocationMode::ClientId + uint64_t m_clientId : 61; + } m_clientId; + + struct + { + uint64_t m_mode : 3; // TrackedCmdLocationMode::ClientEventId + uint64_t m_clientEventId : 61; + + } m_clientEvent; + + uint64_t m_all; // every view as a single 64-bit value + }; +}; + +// ===================================================================================================================== +/// @brief Helper function to obtain DeltaInDwords from TrackedCmdLocation +/// +/// @detail m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta +/// and describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair +/// where the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr +/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) +/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written to the corresponding cmdList +/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the +/// event described by m_event +/// +/// +/// @returns 0 if m_mode != TrackedCmdLocationMode::Delta +/// 
m_correlateInternal.m_deltaInDWords otherwise +constexpr uint64_t TrackedCmdLocationGetDeltaInDwords( + const TrackedCmdLocation location) +{ + const TrackedCmdLocationMode mode = static_cast<TrackedCmdLocationMode>(location.m_mode); // m_mode is a raw 3-bit field; convert it back to the enum before comparing + if (mode == TrackedCmdLocationMode::Delta) + { + return location.m_correlateInternal.m_deltaInDWords; + } + else + { + return 0; // every other mode carries no delta + } +} + +// ===================================================================================================================== +/// @brief Helper function to convert DeltaInDwords from TrackedCmdLocation to "InBytes" +/// +/// @returns 0 if m_mode != TrackedCmdLocationMode::Delta +/// m_correlateInternal.m_deltaInDWords * sizeof(DWORD) otherwise - where DWORD is uint32_t +constexpr uint64_t TrackedCmdLocationGetDeltaInBytes( + const TrackedCmdLocation location) +{ + return TrackedCmdLocationGetDeltaInDwords(location) << TrackedCmdLocation::DwordDeltaShift; // NOTE(review): DwordDeltaShift is 3 (x8) while the comment above promises sizeof(uint32_t) (x4) - confirm which is intended +} + +} // namespace CmdDisassembly +} // namespace Pal diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt b/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt new file mode 100644 index 0000000000..544afca91d --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt @@ -0,0 +1,70 @@ +## + ####################################################################################################################### + # + # Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ # + ####################################################################################################################### + +cmake_minimum_required(VERSION 3.1...3.21) + +project(MetroHash VERSION 1.0.0 LANGUAGES CXX) + +option(METROHASH_ENABLE_WERROR "Build with -Werror enabled" OFF) # Only consulted in the GNU/Clang branch below; the MSVC branch always passes /WX. + +add_library(metrohash STATIC "") # Empty source list; the sources are attached by target_sources() below. + +target_include_directories(metrohash PUBLIC src) + +target_sources(metrohash PRIVATE src/metrohash64.cpp + src/metrohash128.cpp) + + +set_target_properties(metrohash PROPERTIES CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + POSITION_INDEPENDENT_CODE ON) + + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + + if(METROHASH_ENABLE_WERROR) + target_compile_options(metrohash PRIVATE -Werror) + endif() + + # [GCC] Exceptions + # https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_exceptions.html + # + # [GCC] Options Controlling C++ Dialect + # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/C_002b_002b-Dialect-Options.html + target_compile_options(metrohash PRIVATE + -fno-exceptions # Disable exception handling support. + -fno-rtti) # Disable run-time type information support. + + # [GCC] Options to Request or Suppress Warnings + # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/Warning-Options.html + target_compile_options(metrohash PRIVATE + -Wall + -Wextra + -Wpedantic) + +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + + # [MSVC] Exception Handling Model + # + # [MSVC] Enable Run-Time Type Information + # + # [MSVC] Buffer Security Check + target_compile_options(metrohash PRIVATE + /EHsc # Catches only C++ exceptions and assumes + # functions declared as extern "C" never throw a C++ exception. + /GR- # Disables run-time type information. + /GS-) # Disables detection of buffer overruns. + + # [MSVC] Warning Level + target_compile_options(metrohash PRIVATE + /W4 # Enable warning level 4. + /WX) # Treat warnings as errors (unconditional, unlike -Werror above). 
+ +else() + message(FATAL_ERROR "Compiler ${CMAKE_CXX_COMPILER_ID} is not supported!") +endif() diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE b/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md b/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md new file mode 100644 index 0000000000..2ac16b1437 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md @@ -0,0 +1,56 @@ +## MetroHash: Faster, Better Hash Functions + +MetroHash is a set of state-of-the-art hash functions for *non-cryptographic* use cases. They are notable for being algorithmically generated in addition to their exceptional performance. The set of published hash functions may be expanded in the future, having been selected from a very large set of hash functions that have been constructed this way. + +* Fastest general-purpose functions for bulk hashing. +* Fastest general-purpose functions for small, variable length keys. +* Robust statistical bias profile, similar to the MD5 cryptographic hash. +* Hashes can be constructed incrementally (**new**) +* 64-bit, 128-bit, and 128-bit CRC variants currently available. +* Optimized for modern x86-64 microarchitectures. +* Elegant, compact, readable functions. + +You can read more about the design and history [here](http://www.jandrewrogers.com/2015/05/27/metrohash/). + +## News + +### 23 October 2018 + +The project has been re-licensed under Apache License v2.0. 
The purpose of this license change is consistency with the imminent release of MetroHash v2.0, which is also licensed under the Apache license. + +### 27 July 2015 + +Two new 64-bit and 128-bit algorithms add the ability to construct hashes incrementally. In addition to supporting incremental construction, the algorithms are slightly superior to the prior versions. + +A big change is that these new algorithms are implemented as C++ classes that support both incremental and stateless hashing. These classes also have a static method for verifying the implementation against the test vectors built into the classes. Implementations are now fully contained by their respective headers e.g. "metrohash128.h". + +*Note: an incremental version of the 128-bit CRC version is on its way but is not included in this push.* + +**Usage Example For Stateless Hashing** + +`MetroHash128::Hash(key, key_length, hash_ptr, seed)` + +**Usage Example For Incremental Hashing** + +`MetroHash128 hasher;` +`hasher.Update(partial_key, partial_key_length);` +`...` +`hasher.Update(partial_key, partial_key_length);` +`hasher.Finalize(hash_ptr);` + +An `Initialize(seed)` method allows the hasher objects to be reused. + + +### 27 May 2015 + +Six hash functions have been included in the initial release: + +* 64-bit hash functions, "metrohash64_1" and "metrohash64_2" +* 128-bit hash functions, "metrohash128_1" and "metrohash128_2" +* 128-bit hash functions using CRC instructions, "metrohash128crc_1" and "metrohash128crc_2" + +Hash functions in the same family are effectively statistically unique. In other words, if you need two hash functions for a bloom filter, you can use "metrohash64_1" and "metrohash64_2" in the same implementation without issue. An unbounded set of statistically unique functions can be generated in each family. The functions in this repo were generated specifically for public release. + +The hash function generation software made no effort toward portability. 
While these hash functions should be easily portable to big-endian microarchitectures, they have not been tested on them and the performance optimization algorithms were not targeted at them. ARM64 microarchitectures might be a worthwhile hash function generation target if I had the hardware. + + diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash b/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash new file mode 100644 index 0000000000..3c5ee421fd --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash @@ -0,0 +1,20 @@ +#### +# +# Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. +# +#### + +#----------------------------------------------------------------------- +# Common MetroHash Implementation Files +#----------------------------------------------------------------------- + +vpath %.cpp $(METROHASH_DEPTH)/src + +CPPFILES += metrohash64.cpp \ + metrohash128.cpp + +#----------------------------------------------------------------------- +# Common MetroHash Includes +#----------------------------------------------------------------------- + +LCXXINCS += -I$(METROHASH_DEPTH)/src diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h new file mode 100644 index 0000000000..ffab03216b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h @@ -0,0 +1,24 @@ +// metrohash.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef METROHASH_METROHASH_H +#define METROHASH_METROHASH_H + +#include "metrohash64.h" +#include "metrohash128.h" +#include "metrohash128crc.h" + +#endif // #ifndef METROHASH_METROHASH_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp new file mode 100644 index 0000000000..2763de5f88 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp @@ -0,0 +1,419 @@ +// metrohash128.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#include <string.h> +#include "platform.h" +#include "metrohash128.h" + +namespace Util +{ + +const char * MetroHash128::test_string = "012345678901234567890123456789012345678901234567890123456789012"; + +const uint8_t MetroHash128::test_seed_0[16] = { + 0xC7, 0x7C, 0xE2, 0xBF, 0xA4, 0xED, 0x9F, 0x9B, + 0x05, 0x48, 0xB2, 0xAC, 0x50, 0x74, 0xA2, 0x97 + }; + +const uint8_t MetroHash128::test_seed_1[16] = { + 0x45, 0xA3, 0xCD, 0xB8, 0x38, 0x19, 0x9D, 0x7F, + 0xBD, 0xD6, 0x8D, 0x86, 0x7A, 0x14, 0xEC, 0xEF + }; + + + +MetroHash128::MetroHash128(const uint64_t seed) +{ + Initialize(seed); +} + + +void MetroHash128::Initialize(const uint64_t seed) +{ + // initialize internal hash registers + state.v[0] = (static_cast<uint64_t>(seed) - k0) * k3; + state.v[1] = (static_cast<uint64_t>(seed) + k1) * k2; + state.v[2] = (static_cast<uint64_t>(seed) + k0) * k2; + state.v[3] = (static_cast<uint64_t>(seed) - k1) * k3; + + // initialize total length of input + bytes = 0; +} + + +void MetroHash128::Update(const uint8_t * const buffer, const uint64_t length) +{ + const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer); + const uint8_t * const end = ptr + length; + + // input buffer may be partially filled + if (bytes % 32) + { + uint64_t fill = 32 - (bytes % 32); + if (fill > length) + fill = length; + + memcpy(input.b + (bytes % 32), ptr, static_cast<size_t>(fill)); + ptr += fill; + bytes += fill; + + // input buffer is still partially filled + if ((bytes % 32) != 0) return; + + // process full input buffer + state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // bulk update + bytes += (end - ptr); // counts everything that is left, including the tail buffered after the loop + while ((end - ptr) >= 32) // compare the remaining length instead of forming (end - 32), which points before the buffer when fewer than 32 bytes remain + { + // process directly from the source, bypassing the input buffer + 
state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // store remaining bytes in input buffer + if (ptr < end) + memcpy(input.b, ptr, end - ptr); +} + + +void MetroHash128::Finalize(uint8_t * const hash) +{ + // finalize bulk loop, if used + if (bytes >= 32) + { + state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 21) * k1; + state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 21) * k0; + state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 21) * k1; + state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 21) * k0; + } + + // process any bytes remaining in the input buffer + const uint8_t * ptr = reinterpret_cast<const uint8_t*>(input.b); // the (bytes % 32) bytes buffered by Update() + const uint8_t * const end = ptr + (bytes % 32); + + if ((end - ptr) >= 16) + { + state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; + state.v[1] += read_u64(ptr) * k2; ptr += 8; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 45) * k1; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 45) * k0; + } + + if ((end - ptr) >= 8) + { + state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 27) * k1; + } + + if ((end - ptr) >= 4) + { + state.v[1] += read_u32(ptr) * k2; ptr += 4; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 46) * k0; + } + + if ((end - ptr) >= 2) + { + state.v[0] += read_u16(ptr) * k2; ptr += 2; state.v[0] = rotate_right(state.v[0],33) 
* k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 22) * k1; + } + + if ((end - ptr) >= 1) + { + state.v[1] += read_u8 (ptr) * k2; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 58) * k0; + } + + state.v[0] += rotate_right((state.v[0] * k0) + state.v[1], 13); + state.v[1] += rotate_right((state.v[1] * k1) + state.v[0], 37); + state.v[0] += rotate_right((state.v[0] * k2) + state.v[1], 13); + state.v[1] += rotate_right((state.v[1] * k3) + state.v[0], 37); + + bytes = 0; + + // do any endian conversion here + + memcpy(hash, state.v, 16); +} + + +void MetroHash128::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) +{ + const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer); + const uint8_t * const end = ptr + length; + + uint64_t v[4]; // v[2] and v[3] are only initialised and used when length >= 32 + + v[0] = (static_cast<uint64_t>(seed) - k0) * k3; + v[1] = (static_cast<uint64_t>(seed) + k1) * k2; + + if (length >= 32) + { + v[2] = (static_cast<uint64_t>(seed) + k0) * k2; + v[3] = (static_cast<uint64_t>(seed) - k1) * k3; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 21) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 21) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 21) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 21) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 45) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 45) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * 
k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 27) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 46) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 58) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 13); + v[1] += rotate_right((v[1] * k1) + v[0], 37); + v[0] += rotate_right((v[0] * k2) + v[1], 13); + v[1] += rotate_right((v[1] * k3) + v[0], 37); + + // do any endian conversion here + + memcpy(hash, v, 16); +} + + +bool MetroHash128::ImplementationVerified() +{ + uint8_t hash[16]; + const uint8_t * key = reinterpret_cast<const uint8_t*>(MetroHash128::test_string); + + // verify one-shot implementation + MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 0); + if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; + + MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 1); + if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; + + // verify incremental implementation + MetroHash128 metro; + + metro.Initialize(0); + metro.Update(reinterpret_cast<const uint8_t*>(MetroHash128::test_string), strlen(MetroHash128::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; + + metro.Initialize(1); + metro.Update(reinterpret_cast<const uint8_t*>(MetroHash128::test_string), strlen(MetroHash128::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; + + return true; +} + + +void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + 
static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 26) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 26) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 26) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 17) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 20) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 24) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 24) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 13); + v[1] 
+= rotate_right((v[1] * k1) + v[0], 37); + v[0] += rotate_right((v[0] * k2) + v[1], 13); + v[1] += rotate_right((v[1] * k3) + v[0], 37); + + // do any endian conversion here + + memcpy(out, v, 16); +} + + +void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 29) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],29) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 25) * 
k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],29) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 33); + v[1] += rotate_right((v[1] * k1) + v[0], 33); + v[0] += rotate_right((v[0] * k2) + v[1], 33); + v[1] += rotate_right((v[1] * k3) + v[0], 33); + + // do any endian conversion here + + memcpy(out, v, 16); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h new file mode 100644 index 0000000000..1517024b18 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h @@ -0,0 +1,85 @@ +// metrohash128.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef METROHASH_METROHASH_128_H +#define METROHASH_METROHASH_128_H + +#include + +namespace Util +{ + +class MetroHash128 +{ +public: + static const uint32_t bits = 128; + + // Constructor initializes the same as Initialize() + MetroHash128(const uint64_t seed=0); + + // Initializes internal state for new hash with optional seed + void Initialize(const uint64_t seed=0); + + // Update the hash state with a string of bytes. If the length + // is sufficiently long, the implementation switches to a bulk + // hashing algorithm directly on the argument buffer for speed. + void Update(const uint8_t * buffer, const uint64_t length); + + // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. + template + void Update(const T& object) + { + Update(reinterpret_cast(&object), sizeof(object)); + } + + // Constructs the final hash and writes it to the argument buffer. + // After a hash is finalized, this instance must be Initialized()-ed + // again or the behavior of Update() and Finalize() is undefined. + void Finalize(uint8_t * const hash); + + // A non-incremental function implementation. This can be significantly + // faster than the incremental implementation for some usage patterns. + static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); + + // Does implementation correctly execute test vectors? 
+ static bool ImplementationVerified(); + + // test vectors -- Hash(test_string, seed=0) => test_seed_0 + static const char * test_string; + static const uint8_t test_seed_0[16]; + static const uint8_t test_seed_1[16]; + +private: + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + struct { uint64_t v[4]; } state; + struct { uint8_t b[32]; } input; + uint64_t bytes; +}; + + +// Legacy 128-bit hash functions -- do not use +void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_128_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp new file mode 100644 index 0000000000..4e988223d1 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp @@ -0,0 +1,178 @@ +// metrohash128crc.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include "metrohash.h" +#include "platform.h" + +namespace Util +{ + +void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; + v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; + v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; + v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 34) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],34) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],36) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; + v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; + v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] ^= _mm_crc32_u64(v[0], 
read_u8 (ptr)); + v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 11); + v[1] += rotate_right((v[1] * k1) + v[0], 26); + v[0] += rotate_right((v[0] * k0) + v[1], 11); + v[1] += rotate_right((v[1] * k1) + v[0], 26); + + memcpy(out, v, 16); +} + + +void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xEE783E2F; + static const uint64_t k1 = 0xAD07C493; + static const uint64_t k2 = 0x797A90BB; + static const uint64_t k3 = 0x2E4B2E1B; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; + v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; + v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; + v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],41) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],41) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; + v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0; + } + 
+ if ((end - ptr) >= 2) + { + v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; + v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 15); + v[1] += rotate_right((v[1] * k1) + v[0], 27); + v[0] += rotate_right((v[0] * k0) + v[1], 15); + v[1] += rotate_right((v[1] * k1) + v[0], 27); + + memcpy(out, v, 16); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h new file mode 100644 index 0000000000..7877394d3b --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h @@ -0,0 +1,33 @@ +// metrohash128crc.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef METROHASH_METROHASH_128_CRC_H +#define METROHASH_METROHASH_128_CRC_H + +#include + +namespace Util +{ + +// Legacy 128-bit hash functions +void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_128_CRC_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp new file mode 100644 index 0000000000..aec3563dd2 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp @@ -0,0 +1,415 @@ +// metrohash64.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "platform.h" +#include "metrohash64.h" + +#include + +namespace Util +{ + +const char * MetroHash64::test_string = "012345678901234567890123456789012345678901234567890123456789012"; + +const uint8_t MetroHash64::test_seed_0[8] = { 0x6B, 0x75, 0x3D, 0xAE, 0x06, 0x70, 0x4B, 0xAD }; +const uint8_t MetroHash64::test_seed_1[8] = { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0xB9, 0xB8, 0xDF }; + + + +MetroHash64::MetroHash64(const uint64_t seed) +{ + Initialize(seed); +} + + +void MetroHash64::Initialize(const uint64_t seed) +{ + vseed = (static_cast(seed) + k2) * k0; + + // initialize internal hash registers + state.v[0] = vseed; + state.v[1] = vseed; + state.v[2] = vseed; + state.v[3] = vseed; + + // initialize total length of input + bytes = 0; +} + + +void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + // input buffer may be partially filled + if (bytes % 32) + { + uint64_t fill = 32 - (bytes % 32); + if (fill > length) + fill = length; + + memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); + ptr += fill; + bytes += fill; + + // input buffer is still partially filled + if ((bytes % 32) != 0) return; + + // process full input buffer + state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // bulk update + bytes += static_cast(end - ptr); + while (ptr <= (end - 32)) + { + // process directly from the source, bypassing the input buffer + state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = 
rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // store remaining bytes in input buffer + if (ptr < end) + memcpy(input.b, ptr, static_cast(end - ptr)); +} + + +void MetroHash64::Finalize(uint8_t * const hash) +{ + // finalize bulk loop, if used + if (bytes >= 32) + { + state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 37) * k1; + state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 37) * k0; + state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 37) * k1; + state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 37) * k0; + + state.v[0] = vseed + (state.v[0] ^ state.v[1]); + } + + // process any bytes remaining in the input buffer + const uint8_t * ptr = reinterpret_cast(input.b); + const uint8_t * const end = ptr + (bytes % 32); + + if ((end - ptr) >= 16) + { + state.v[1] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3; + state.v[2] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[2] = rotate_right(state.v[2],29) * k3; + state.v[1] ^= rotate_right(state.v[1] * k0, 21) + state.v[2]; + state.v[2] ^= rotate_right(state.v[2] * k3, 21) + state.v[1]; + state.v[0] += state.v[2]; + } + + if ((end - ptr) >= 8) + { + state.v[0] += read_u64(ptr) * k3; ptr += 8; + state.v[0] ^= rotate_right(state.v[0], 55) * k1; + } + + if ((end - ptr) >= 4) + { + state.v[0] += read_u32(ptr) * k3; ptr += 4; + state.v[0] ^= rotate_right(state.v[0], 26) * k1; + } + + if ((end - ptr) >= 2) + { + state.v[0] += read_u16(ptr) * k3; ptr += 2; + state.v[0] ^= rotate_right(state.v[0], 48) * k1; + } + + if ((end - ptr) >= 1) + { + state.v[0] += read_u8 (ptr) * k3; + state.v[0] ^= rotate_right(state.v[0], 37) * k1; + } + + state.v[0] ^= rotate_right(state.v[0], 28); + 
state.v[0] *= k0; + state.v[0] ^= rotate_right(state.v[0], 29); + + bytes = 0; + + // do any endian conversion here + + memcpy(hash, state.v, 8); +} + + +void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + uint64_t h = (static_cast(seed) + k2) * k0; + + if (length >= 32) + { + uint64_t v[4]; + v[0] = h; + v[1] = h; + v[2] = h; + v[3] = h; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 37) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 37) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; + h += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = h + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; + uint64_t v1 = h + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; + v0 ^= rotate_right(v0 * k0, 21) + v1; + v1 ^= rotate_right(v1 * k3, 21) + v0; + h += v1; + } + + if ((end - ptr) >= 8) + { + h += read_u64(ptr) * k3; ptr += 8; + h ^= rotate_right(h, 55) * k1; + } + + if ((end - ptr) >= 4) + { + h += read_u32(ptr) * k3; ptr += 4; + h ^= rotate_right(h, 26) * k1; + } + + if ((end - ptr) >= 2) + { + h += read_u16(ptr) * k3; ptr += 2; + h ^= rotate_right(h, 48) * k1; + } + + if ((end - ptr) >= 1) + { + h += read_u8 (ptr) * k3; + h ^= rotate_right(h, 37) * k1; + } + + h ^= rotate_right(h, 28); + h *= k0; + h ^= rotate_right(h, 29); + + memcpy(hash, &h, 8); +} + + +bool MetroHash64::ImplementationVerified() +{ + uint8_t hash[8]; + 
const uint8_t * key = reinterpret_cast(MetroHash64::test_string); + + // verify one-shot implementation + MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 0); + if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; + + MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 1); + if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; + + // verify incremental implementation + MetroHash64 metro; + + metro.Initialize(0); + metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; + + metro.Initialize(1); + metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; + + return true; +} + + +void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t hash = ((static_cast(seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; + hash 
+= v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1; + uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2; + v0 ^= rotate_right(v0 * k0, 35) + v1; + v1 ^= rotate_right(v1 * k3, 35) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + hash += read_u64(ptr) * k3; ptr += 8; + hash ^= rotate_right(hash, 33) * k1; + + } + + if ((end - ptr) >= 4) + { + hash += read_u32(ptr) * k3; ptr += 4; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += read_u16(ptr) * k3; ptr += 2; + hash ^= rotate_right(hash, 13) * k1; + } + + if ((end - ptr) >= 1) + { + hash += read_u8 (ptr) * k3; + hash ^= rotate_right(hash, 25) * k1; + } + + hash ^= rotate_right(hash, 33); + hash *= k0; + hash ^= rotate_right(hash, 33); + + memcpy(out, &hash, 8); +} + + +void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t hash = ((static_cast(seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; + hash += 
v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; + uint64_t v1 = hash + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; + v0 ^= rotate_right(v0 * k0, 34) + v1; + v1 ^= rotate_right(v1 * k3, 34) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + hash += read_u64(ptr) * k3; ptr += 8; + hash ^= rotate_right(hash, 36) * k1; + } + + if ((end - ptr) >= 4) + { + hash += read_u32(ptr) * k3; ptr += 4; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += read_u16(ptr) * k3; ptr += 2; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 1) + { + hash += read_u8 (ptr) * k3; + hash ^= rotate_right(hash, 23) * k1; + } + + hash ^= rotate_right(hash, 28); + hash *= k0; + hash ^= rotate_right(hash, 29); + + memcpy(out, &hash, 8); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h new file mode 100644 index 0000000000..43926bbf20 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h @@ -0,0 +1,86 @@ +// metrohash64.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef METROHASH_METROHASH_64_H +#define METROHASH_METROHASH_64_H + +#include + +namespace Util +{ + +class MetroHash64 +{ +public: + static const uint32_t bits = 64; + + // Constructor initializes the same as Initialize() + MetroHash64(const uint64_t seed=0); + + // Initializes internal state for new hash with optional seed + void Initialize(const uint64_t seed=0); + + // Update the hash state with a string of bytes. If the length + // is sufficiently long, the implementation switches to a bulk + // hashing algorithm directly on the argument buffer for speed. + void Update(const uint8_t * buffer, const uint64_t length); + + // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. + template + void Update(const T& object) + { + Update(reinterpret_cast(&object), sizeof(object)); + } + + // Constructs the final hash and writes it to the argument buffer. + // After a hash is finalized, this instance must be Initialized()-ed + // again or the behavior of Update() and Finalize() is undefined. + void Finalize(uint8_t * const hash); + + // A non-incremental function implementation. This can be significantly + // faster than the incremental implementation for some usage patterns. + static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); + + // Does implementation correctly execute test vectors? 
+ static bool ImplementationVerified(); + + // test vectors -- Hash(test_string, seed=0) => test_seed_0 + static const char * test_string; + static const uint8_t test_seed_0[8]; + static const uint8_t test_seed_1[8]; + +private: + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + struct { uint64_t v[4]; } state; + struct { uint8_t b[32]; } input; + uint64_t bytes; + uint64_t vseed; +}; + + +// Legacy 64-bit hash functions -- do not use +void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_64_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h new file mode 100644 index 0000000000..3762acdd14 --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h @@ -0,0 +1,62 @@ +// platform.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef METROHASH_PLATFORM_H +#define METROHASH_PLATFORM_H + +#include +#include + +namespace Util +{ + +// rotate right idiom recognized by most compilers +inline static uint64_t rotate_right(uint64_t v, unsigned k) +{ + return (v >> k) | (v << (64 - k)); +} + +inline static uint64_t read_u64(const void * const ptr) +{ + uint64_t val; + memcpy(&val, ptr, sizeof(val)); + return val; +} + +inline static uint64_t read_u32(const void * const ptr) +{ + uint32_t val; + memcpy(&val, ptr, sizeof(val)); + return static_cast(val); +} + +inline static uint64_t read_u16(const void * const ptr) +{ + uint16_t val; + memcpy(&val, ptr, sizeof(val)); + return static_cast(val); +} + +inline static uint64_t read_u8 (const void * const ptr) +{ + return static_cast(*reinterpret_cast(ptr)); +} + +} // Util + +#endif // #ifndef METROHASH_PLATFORM_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h new file mode 100644 index 0000000000..ba56b316ae --- /dev/null +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h @@ -0,0 +1,63 @@ +// testvector.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#ifndef METROHASH_TESTVECTOR_H
+#define METROHASH_TESTVECTOR_H
+
+#include "metrohash.h"
+
+// Common signature of the one-shot hash routines listed in TestVector below.
+typedef void (*HashFunction) (const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * hash);
+
+struct TestVectorData
+{
+    HashFunction function;  // hash routine under test
+    uint32_t bits;          // digest width in bits (64 or 128 in the table below)
+    const char * key;       // input message, a C string
+    uint32_t seed;          // seed passed to the routine
+    uint8_t hash[64];       // expected digest, stored as the hexadecimal text given in the table
+};
+
+// The test vector string is selected such that it will properly exercise every
+// internal branch of the hash function. Currently that requires a string with
+// a length of (at least) 63 bytes.
+
+static const char * test_key_63 = "012345678901234567890123456789012345678901234567890123456789012";
+
+// The hash assumes a little-endian architecture. Treating the hash results
+// as an array of uint64_t should enable conversion for big-endian implementations.
+const TestVectorData TestVector [] =
+{
+    // seed = 0
+    { metrohash64_1, 64, test_key_63, 0, "658F044F5C730E40" },
+    { metrohash64_2, 64, test_key_63, 0, "073CAAB960623211" },
+    { metrohash128_1, 128, test_key_63, 0, "ED9997ED9D0A8B0FF3F266399477788F" },
+    { metrohash128_2, 128, test_key_63, 0, "7BBA6FE119CF35D45507EDF3505359AB" },
+    { metrohash128crc_1, 128, test_key_63, 0, "B329ED67831604D3DFAC4E4876D8262F" },
+    { metrohash128crc_2, 128, test_key_63, 0, "0502A67E257BBD77206BBCA6BBEF2653" },
+
+    // seed = 1
+    { metrohash64_1, 64, test_key_63, 1, "AE49EBB0A856537B" },
+    { metrohash64_2, 64, test_key_63, 1, "CF518E9CF58402C0" },
+    { metrohash128_1, 128, test_key_63, 1, "DDA6BA67F7DE755EFDF6BEABECCFD1F4" },
+    { metrohash128_2, 128, test_key_63, 1, "2DA6AF149A5CDBC12B09DB0846D69EF0" },
+    { metrohash128crc_1, 128, test_key_63, 1, "E8FAB51AF19F18A7B10D0A57D4276DF2" },
+    { metrohash128crc_2, 128, test_key_63, 1, "2D54F87181A0CF64B02C50D95692BC19" },
+};
+
+
+
+#endif // #ifndef METROHASH_TESTVECTOR_H
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/CMakeLists.txt b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/CMakeLists.txt
new file mode 100644
index 
0000000000..9a66965cd6 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/CMakeLists.txt @@ -0,0 +1,82 @@ +#----------------------------------------------------------------------------- +# Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved. +#----------------------------------------------------------------------------- + +# amdhsacode library +# +# This file is expected to be included from top-level CMakeLists.txt. +# +# Dependencies: +# - Compiler definitions +# - elf library +# - hsail library +# - sp3 library +# +# Defines: +# - amdhsacode library and target include directories + +set(USE_AMD_LIBELF "no" CACHE STRING "Do not use AMD LIBELF by default") +set(NO_SI_SP3 "no" CACHE STRING "Disable using SP3") + +file(GLOB sources *.cpp *.hpp) + +add_library(amdhsacode STATIC ${sources}) +set_target_properties(amdhsacode PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ON +) + +if(CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|(Apple)?Clang)$") + target_compile_options(amdhsacode PRIVATE + -Werror + -Wno-inconsistent-missing-override + ) +endif() + +set(LIBELF_LIB) +if(${USE_AMD_LIBELF} STREQUAL "yes") + target_compile_definitions(amdhsacode PRIVATE AMD_LIBELF) + target_include_directories(amdhsacode PUBLIC ${HSAIL_ELFTOOLCHAIN_DIR}/common) + if(WIN32) + target_include_directories(amdhsacode PUBLIC ${HSAIL_ELFTOOLCHAIN_DIR}/common/win32) + endif() + target_include_directories(amdhsacode PUBLIC ${HSAIL_ELFTOOLCHAIN_DIR}/libelf) + set(LIBELF_LIB oclelf) + message(STATUS "Using AMD LIBELF") +else() + find_package(LibElf REQUIRED) + set(LIBELF_LIB elf) + message(STATUS "Using SYSTEM LIBELF") +endif() + +target_include_directories(amdhsacode PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(amdhsacode PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include) +target_include_directories(amdhsacode PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../hsail-tools/libHSAIL) +if(${NO_SI_SP3} STREQUAL 
"no") + target_include_directories(amdhsacode PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../../Chip/sp3) +endif() +#if defined(GFX102_BUILD) +target_compile_definitions(amdhsacode PRIVATE GFX102_BUILD) +#endif +#if defined(GFX11_BUILD) +target_compile_definitions(amdhsacode PRIVATE GFX11_BUILD) +#endif +#if defined(GFX115_BUILD) +target_compile_definitions(amdhsacode PRIVATE GFX115_BUILD) +#endif +#if defined(GFX12_BUILD) +target_compile_definitions(amdhsacode PRIVATE GFX12_BUILD) +#endif + +if(${NO_SI_SP3} STREQUAL "yes") + target_compile_definitions(amdhsacode PRIVATE NO_SI_SP3) +endif() + +target_link_libraries(amdhsacode PRIVATE ${LIBELF_LIB}) + +if(${NO_SI_SP3} STREQUAL "no") + set(SC_BUILD_SP3 ON) + if(SC_BUILD_SP3) + target_link_libraries(amdhsacode PUBLIC sp3) + endif() +endif() diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_elf_image.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_elf_image.cpp new file mode 100644 index 0000000000..f48c1dcb47 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_elf_image.cpp @@ -0,0 +1,1717 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_elf_image.hpp" +#include "amd_hsa_code_util.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#define alignof __alignof +#endif // _WIN32 +#include + +#ifndef _WIN32 +#define _open open +#define _close close +#define _tempnam tempnam +#include +#include +#endif + +#if defined(USE_MEMFILE) + +#include "memfile.h" +#define OpenTemp(f) mem_open(NULL, 0, 0) +#define CloseTemp(f) mem_close(f) +#define _read(f, b, l) mem_read((f), (b), (l)) +#define _write(f, b, l) mem_write((f), (b), (l)) +#define _lseek(f, l, w) mem_lseek((f), (l), (w)) +#define _ftruncate(f, l) mem_ftruncate((f), (size_t)(l)) +#define sendfile(o, i, p, s) mem_sendfile((o), (i), (p), (s)) + +#else // USE_MEMFILE + +#define OpenTemp(f) amd::hsa::OpenTempFile(f); +#define CloseTemp(f) amd::hsa::CloseTempFile(f); + +#ifndef _WIN32 +#define _read read +#define _write write +#define _lseek lseek +#define _ftruncate ftruncate +#include +#else +#define _ftruncate _chsize +#endif // !_WIN32 + +#endif // !USE_MEMFILE + +#if !defined(BSD_LIBELF) + #define elf_setshstrndx elfx_update_shstrndx +#endif + +#define NOTE_RECORD_ALIGNMENT 4 + +using amd::hsa::alignUp; + +namespace amd { + namespace elf { + + class FileImage { + public: + FileImage(); + ~FileImage(); + bool create(); + bool readFrom(const std::string& filename); + bool copyFrom(const void* data, size_t size); + bool writeTo(const std::string& filename); + bool copyTo(void** buffer, size_t* size = 0); + bool copyTo(void* buffer, size_t size); + size_t getSize(); + + std::string output() { return out.str(); } + + int fd() { return d; } + + private: + int d; + std::ostringstream out; + + bool error(const char* msg); + bool perror(const char *msg); + std::string werror(); + }; + + FileImage::FileImage() + : d(-1) + { + } + + FileImage::~FileImage() + { + if (d != -1) { CloseTemp(d); } + 
} + + bool FileImage::error(const char* msg) + { + out << "Error: " << msg << std::endl; + return false; + } + + bool FileImage::perror(const char* msg) + { + out << "Error: " << msg << ": " << strerror(errno) << std::endl; + return false; + } + +#ifdef _WIN32 + std::string FileImage::werror() + { + LPVOID lpMsgBuf; + DWORD dw = GetLastError(); + + FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + dw, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPTSTR)&lpMsgBuf, + 0, NULL); + std::string result((LPTSTR)lpMsgBuf); + LocalFree(lpMsgBuf); + return result; + } +#endif // _WIN32 + + bool FileImage::create() + { + d = OpenTemp("amdelf"); + if (d == -1) { return error("Failed to open temporary file for elf image"); } + return true; + } + + bool FileImage::readFrom(const std::string& filename) + { +#ifdef _WIN32 + std::unique_ptr buffer(new char[32 * 1024 * 1024]); + HANDLE in = CreateFile(filename.c_str(), GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (in == INVALID_HANDLE_VALUE) { out << "Failed to open " << filename << ": " << werror() << std::endl; return false; } + DWORD read; + unsigned write; + int written; + do { + if (!ReadFile(in, buffer.get(), sizeof(buffer), &read, NULL)) { + out << "Failed to read " << filename << ": " << werror() << std::endl; + CloseHandle(in); + return false; + } + if (read > 0) { + write = read; + do { + written = _write(d, buffer.get(), write); + if (written < 0) { + out << "Failed to write image file: " << werror() << std::endl; + CloseHandle(in); + } + write -= written; + } while (write > 0); + } + } while (read > 0); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(0) failed"); } + CloseHandle(in); + return true; +#else // _WIN32 + int in = _open(filename.c_str(), O_RDONLY); + if (in < 0) { return perror("open failed"); } + if (_lseek(in, 0L, SEEK_END) < 0) { return perror("lseek failed"); } + off_t size; + if ((size = 
_lseek(in, 0L, SEEK_CUR)) < 0) { return perror("lseek(2) failed"); } + if (_lseek(in, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + ssize_t written; + do { + written = sendfile(d, in, NULL, size); + if (written < 0) { + _close(in); + return perror("sendfile failed"); + } + size -= written; + } while (size > 0); + _close(in); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(0) failed"); } + return true; +#endif // _WIN32 + } + + bool FileImage::copyFrom(const void* data, size_t size) + { + assert(d != -1); + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek failed"); } + if (_ftruncate(d, 0) < 0) { return perror("ftruncate failed"); } + int written, offset = 0; + while (size > 0) { + written = _write(d, (const char*) data + offset, size); + if (written < 0) { + return perror("write failed"); + } + size -= written; + offset += written; + } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek failed"); } + return true; + } + + size_t FileImage::getSize() + { + assert(d != -1); + if (_lseek(d, 0L, SEEK_END) < 0) { return perror("lseek failed"); } + long seek = 0; + if ((seek = _lseek(d, 0L, SEEK_CUR)) < 0) { return perror("lseek(2) failed"); } + if (_lseek(d, 0L, SEEK_SET) < 0) { return perror("lseek(3) failed"); } + return seek; + } + + bool FileImage::copyTo(void** buffer, size_t* size) + { + size_t size1 = getSize(); + void* buffer1 = malloc(size1); + if (_read(d, buffer1, size1) < 0) { free(buffer1); return perror("read failed"); } + *buffer = buffer1; + if (size) { *size = size1; } + return true; + } + + bool FileImage::copyTo(void* buffer, size_t size) + { + size_t size1 = getSize(); + if (size < size1) { return error("Buffer size is not enough"); } + if (_read(d, buffer, size1) < 0) { return perror("read failed"); } + return true; + } + + bool FileImage::writeTo(const std::string& filename) + { + bool res = false; + size_t size = 0; + void *buffer = nullptr; 
+ if (copyTo(&buffer, &size)) { + res = true; + std::ofstream out(filename.c_str(), std::ios::binary); + out.write((char*)buffer, size); + } + free(buffer); + return res; + } + + class Buffer { + public: + typedef unsigned char byte_type; + typedef size_t size_type; + + Buffer(); + Buffer(const byte_type *src, size_type size, size_type align = 0); + virtual ~Buffer(); + + const byte_type* raw() const + { return this->isConst() ? ptr_ : data_.data(); } + size_type align() const + { return align_; } + size_type size() const + { return this->isConst() ? size_ : data_.size(); } + bool isConst() const + { return 0 != size_; } + bool isEmpty() + { return size() == 0; } + bool hasRaw(const byte_type *src) const + { return (src >= this->raw()) && (src < this->raw() + this->size()); } + template + bool has(const T *src) const + { return this->hasRaw((const byte_type*)src); } + bool has(size_type offset) const + { return offset < this->size(); } + + template + size_type getOffset(const T *src) const + { return this->getRawOffset((const byte_type*)src); } + template + T get(size_type offset) const + { return (T)this->getRaw(offset); } + size_type addString(const std::string &str, size_type align = 0); + size_type addStringLength(const std::string &str, size_type align = 0); + size_type nextOffset(size_type align) const { return alignUp(this->size(), align); } + template + size_type add(const T *src, size_type size, size_type align) + { return this->addRaw((const byte_type*)src, size, align); } + template + size_type add(const T &src, size_type align = 0) + { return this->addRaw((const byte_type*)&src, sizeof(T), align == 0 ? 
alignof(T) : align); } + size_type align(size_type align); + + template + size_type reserve() + { + Buffer::size_type offset = this->align(alignof(T)); + data_.insert(data_.end(), sizeof(T), 0x0); + return offset; + } + + private: + size_type getRawOffset(const byte_type *src) const; + const byte_type* getRaw(size_type offset) const; + size_type addRaw(const byte_type *src, size_type size, size_type align); + + std::vector data_; + const byte_type *ptr_; + size_type size_; + size_type align_; + }; + + Buffer::Buffer() + : ptr_(nullptr) + , size_(0) + , align_(0) + { + } + + Buffer::Buffer(const Buffer::byte_type *src, Buffer::size_type size, Buffer::size_type align) + : ptr_(src) + , size_(size) + , align_(align) + { + } + + Buffer::~Buffer() + { + } + + Buffer::size_type Buffer::getRawOffset(const Buffer::byte_type *src) const + { + assert(this->has(src)); + return src - this->raw(); + } + + const Buffer::byte_type* Buffer::getRaw(Buffer::size_type offset) const + { + assert(this->has(offset)); + return this->raw() + offset; + } + + Buffer::size_type Buffer::addRaw(const Buffer::byte_type *src, Buffer::size_type size, Buffer::size_type align) + { + assert(!this->isConst()); + assert(nullptr != src); + assert(0 != size); + assert(0 != align); + Buffer::size_type offset = this->align(align); + data_.insert(data_.end(), src, src + size); + return offset; + } + + Buffer::size_type Buffer::addString(const std::string &str, size_type align) + { + return this->add(str.c_str(), str.length() + 1, align == 0 ? alignof(char) : align); + } + + Buffer::size_type Buffer::addStringLength(const std::string &str, size_type align) + { + return this->add((uint32_t)(str.length() + 1), align == 0 ? 
alignof(uint32_t) : align); + } + + Buffer::size_type Buffer::align(Buffer::size_type align) + { + assert(!this->isConst()); + assert(0 != align); + Buffer::size_type offset = alignUp(this->size(), align); + align_ = (std::max)(align_, align); + data_.insert(data_.end(), offset - this->size(), 0x0); + return offset; + } + + class GElfImage; + class GElfSegment; + + class GElfSection : public virtual Section { + public: + GElfSection(GElfImage* elf); + + bool push(const char* name, uint32_t shtype, uint64_t shflags, uint16_t shlink, uint32_t info, uint32_t align, uint64_t entsize = 0); + bool pull0(); + bool pull(uint16_t ndx); + virtual bool pullData() { return true; } + bool push(); + uint16_t getSectionIndex() const override; + uint32_t type() const override { return hdr.sh_type; } + std::string Name() const override; + uint64_t offset() const override { return hdr.sh_offset; } + uint64_t addr() const override { return hdr.sh_addr; } + bool updateAddr(uint64_t addr) override; + uint64_t addralign() const override { return data0.size() == 0 ? data.align() : data0.align(); } + uint64_t flags() const override { return hdr.sh_flags; } + uint64_t size() const override { return data0.size() == 0 ? data.size() : data0.size(); } + uint64_t nextDataOffset(uint64_t align) const override; + uint64_t addData(const void *src, uint64_t size, uint64_t align) override; + bool getData(uint64_t offset, void* dest, uint64_t size) override; + bool hasRelocationSection() const override { return reloc_sec != 0; } + RelocationSection* relocationSection(SymbolTable* symtab = 0) override; + Segment* segment() override { return seg; } + RelocationSection* asRelocationSection() override { return 0; } + bool setMemSize(uint64_t s) override { memsize_ = s; return true; } + uint64_t memSize() const override { return memsize_ ? memsize_ : size(); } + bool setAlign(uint64_t a) override { align_ = a; return true; } + uint64_t memAlign() const override { return align_ ? 
align_ : addralign(); } + + protected: + GElfImage* elf; + Segment* seg; + GElf_Shdr hdr; + Buffer data0, data; + uint64_t memsize_; + uint64_t align_; + RelocationSection *reloc_sec; + + size_t ndxscn; + + friend class GElfSymbol; + friend class GElfSegment; + friend class GElfImage; + }; + + class GElfSegment : public Segment { + public: + GElfSegment(GElfImage* elf, uint16_t index); + GElfSegment(GElfImage* elf, uint16_t index, uint32_t type, uint32_t flags, uint64_t paddr = 0); + bool push(uint64_t vaddr); + bool pull(); + uint64_t type() const override { return phdr.p_type; } + uint64_t memSize() const override { return phdr.p_memsz; } + uint64_t align() const override { return phdr.p_align; } + uint64_t imageSize() const override { return phdr.p_filesz; } + uint64_t vaddr() const override { return phdr.p_vaddr; } + uint64_t flags() const override { return phdr.p_flags; } + uint64_t offset() const override { return phdr.p_offset; } + const char* data() const override; + uint16_t getSegmentIndex() override; + bool updateAddSection(Section *section) override; + + private: + GElfImage* elf; + uint16_t index; + GElf_Phdr phdr; + std::vector sections; + }; + + class GElfStringTable : public GElfSection, public StringTable { + public: + GElfStringTable(GElfImage* elf); + bool push(const char* name, uint32_t shtype, uint64_t shflags); + bool pullData() override; + const char* addString(const std::string& s) override; + size_t addString1(const std::string& s) override; + const char* getString(size_t ndx) override; + size_t getStringIndex(const char* name) override; + + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return GElfSection::addr(); } + uint64_t offset() const override { return GElfSection::offset(); } + bool updateAddr(uint64_t addr) override { return 
GElfSection::updateAddr(addr); }
+      uint64_t addralign() const override { return GElfSection::addralign(); }
+      uint64_t flags() const override { return GElfSection::flags(); }
+      uint64_t size() const override { return GElfSection::size(); }
+      Segment* segment() override { return GElfSection::segment(); }
+      uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); }
+      uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); }
+      bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); }
+      bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); }
+      RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); }  // NOTE(review): symtab is not forwarded to GElfSection -- confirm this is intended
+      RelocationSection* asRelocationSection() override { return 0; }
+      uint64_t memSize() const override { return GElfSection::memSize(); }
+      bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); }
+      uint64_t memAlign() const override { return GElfSection::memAlign(); }
+      bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); }
+    };
+
+    class GElfSymbolTable;
+    // One symbol-table entry: accessors read the GElf_Sym record found at byte offset eindex of the backing Buffer.
+    class GElfSymbol : public Symbol {
+    public:
+      GElfSymbol(GElfSymbolTable* symtab, Buffer &data, size_t index);
+
+      bool push(const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, uint16_t shndx, unsigned char other);
+
+      uint32_t index() override { return static_cast<uint32_t>(eindex / sizeof(GElf_Sym)); }  // ordinal = byte offset / size of one symbol record (was sizeof(GElf_Rela), a relocation record)
+      uint32_t type() override { return GELF_ST_TYPE(Sym()->st_info); }
+      uint32_t binding() override { return GELF_ST_BIND(Sym()->st_info); }
+      uint64_t size() override { return Sym()->st_size; }
+      uint64_t value() override { return Sym()->st_value; }
+      unsigned char other() override { return Sym()->st_other; }
+      std::string name() override;
+      Section* section() override;
+
+      void setValue(uint64_t value) 
override { Sym()->st_value = value; } + void setSize(uint64_t size) override { Sym()->st_size = size; } + + private: + GElf_Sym* Sym() { return edata.get(eindex); } + GElfSymbolTable* symtab; + Buffer &edata; + size_t eindex; + friend class GElfSymbolTable; + }; + + class GElfSymbolTable : public GElfSection, public SymbolTable { + private: + Symbol* addSymbolInternal(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other = 0); + + GElfStringTable* strtab; + std::vector> symbols; + friend class GElfSymbol; + + public: + GElfSymbolTable(GElfImage* elf); + bool push(const char* name, GElfStringTable* strtab); + bool pullData() override; + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t offset() const override { return GElfSection::offset(); } + uint64_t addr() const override { return GElfSection::addr(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + 
Symbol* addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other = 0) override; + size_t symbolCount() override; + Symbol* symbol(size_t i) override; + RelocationSection* asRelocationSection() override { return 0; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + }; + + class GElfNoteSection : public GElfSection, public NoteSection { + public: + GElfNoteSection(GElfImage* elf); + bool push(const std::string& name); + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return GElfSection::addr(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t offset() const override { return GElfSection::offset(); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return 
GElfSection::relocationSection(); } + bool addNote(const std::string& name, uint32_t type, const void* desc, uint32_t desc_size) override; + bool getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size) override; + RelocationSection* asRelocationSection() override { return 0; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + }; + + class GElfRelocationSection; + + class GElfRelocation : public Relocation { + private: + GElf_Rela *Rela() { return edata.get(eindex); } + + GElfRelocationSection* rsection; + Buffer &edata; + size_t eindex; + + public: + GElfRelocation(GElfRelocationSection* rsection_, Buffer &edata_, size_t eindex_) + : rsection(rsection_), + edata(edata_), eindex(eindex_) + { + } + + bool push(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend); + + RelocationSection* section() override; + uint32_t type() override { return GELF_R_TYPE(Rela()->r_info); } + uint32_t symbolIndex() override { return GELF_R_SYM(Rela()->r_info); } + Symbol* symbol() override; + uint64_t offset() override { return Rela()->r_offset; } + int64_t addend() override { return Rela()->r_addend; } + }; + + class GElfRelocationSection : public GElfSection, public RelocationSection { + private: + Section* section; + GElfSymbolTable* symtab; + std::vector> relocations; + + public: + GElfRelocationSection(GElfImage* elf, Section* targetSection = 0, GElfSymbolTable* symtab_ = 0); + bool push(const std::string& name); + bool pullData() override; + uint16_t getSectionIndex() const override { return GElfSection::getSectionIndex(); } + uint32_t type() const override { return GElfSection::type(); } + std::string Name() const override { return GElfSection::Name(); } + uint64_t addr() const override { return 
GElfSection::addr(); } + uint64_t offset() const override { return GElfSection::offset(); } + bool updateAddr(uint64_t addr) override { return GElfSection::updateAddr(addr); } + uint64_t addralign() const override { return GElfSection::addralign(); } + uint64_t flags() const override { return GElfSection::flags(); } + uint64_t size() const override { return GElfSection::size(); } + Segment* segment() override { return GElfSection::segment(); } + uint64_t nextDataOffset(uint64_t align) const override { return GElfSection::nextDataOffset(align); } + uint64_t addData(const void *src, uint64_t size, uint64_t align) override { return GElfSection::addData(src, size, align); } + bool getData(uint64_t offset, void* dest, uint64_t size) override { return GElfSection::getData(offset, dest, size); } + bool hasRelocationSection() const override { return GElfSection::hasRelocationSection(); } + RelocationSection* relocationSection(SymbolTable* symtab) override { return GElfSection::relocationSection(); } + RelocationSection* asRelocationSection() override { return this; } + + size_t relocationCount() const override { return relocations.size(); } + Relocation* relocation(size_t i) override { return relocations[i].get(); } + Relocation* addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) override; + Section* targetSection() override { return section; } + uint64_t memSize() const override { return GElfSection::memSize(); } + bool setMemSize(uint64_t s) override { return GElfSection::setMemSize(s); } + uint64_t memAlign() const override { return GElfSection::memAlign(); } + bool setAlign(uint64_t a) override { return GElfSection::setAlign(a); } + friend class GElfRelocation; + }; + + class GElfImage : public Image { + public: + GElfImage(int elfclass); + ~GElfImage(); + bool initNew(uint16_t machine, uint16_t type, uint8_t os_abi = 0, uint8_t abi_version = 0, uint32_t e_flags = 0) override; + bool loadFromFile(const std::string& filename) override; + bool 
saveToFile(const std::string& filename) override; + bool initFromBuffer(const void* buffer, size_t size) override; + bool initAsBuffer(const void* buffer, size_t size) override; + bool close(); + bool writeTo(const std::string& filename) override; + bool copyToBuffer(void** buf, size_t* size = 0) override; + bool copyToBuffer(void* buf, size_t size) override; + + const char* data() override { assert(buffer); return buffer; } + uint64_t size() override; + + bool push(); + + bool Freeze() override; + bool Validate() override; + + uint16_t Machine() override { return ehdr.e_machine; } + uint16_t Type() override { return ehdr.e_type; } + uint32_t EFlags() override { return ehdr.e_flags; } + uint32_t ABIVersion() override { return (uint32_t)(ehdr.e_ident[EI_ABIVERSION]); } + uint32_t EClass() override { return (uint32_t)(ehdr.e_ident[EI_CLASS]); } + uint32_t OsAbi() override { return (uint32_t)(ehdr.e_ident[EI_OSABI]); } + + GElfStringTable* shstrtab() override; + GElfStringTable* strtab() override; + GElfSymbolTable* getSymtab(uint16_t index) override + { + return static_cast(section(index)); + } + + GElfStringTable* addStringTable(const std::string& name) override; + GElfStringTable* getStringTable(uint16_t index) override; + + GElfSymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) override; + GElfSymbolTable* symtab() override; + + GElfSegment* segment(size_t i) override { return segments[i].get(); } + Segment* segmentByVAddr(uint64_t vaddr) override; + size_t sectionCount() override { return sections.size(); } + GElfSection* section(size_t i) override { return sections[i].get(); } + Section* sectionByVAddr(uint64_t vaddr) override; + uint16_t machine() const; + uint16_t etype() const; + int eclass() const { return elfclass; } + bool elfError(const char* msg); + + GElfNoteSection* note() override; + GElfNoteSection* addNoteSection(const std::string& name) override; + + size_t segmentCount() override { return segments.size(); } + Segment* 
initSegment(uint32_t type, uint32_t flags, uint64_t paddr = 0) override; + bool addSegments() override; + + Section* addSection(const std::string &name, + uint32_t type, + uint64_t flags = 0, + uint64_t entsize = 0, + Segment* segment = 0) override; + + RelocationSection* addRelocationSection(Section* sec, SymbolTable* symtab); + RelocationSection* relocationSection(Section* sec, SymbolTable* symtab = 0) override; + + private: + bool frozen; + int elfclass; + FileImage img; + const char* buffer; + size_t bufferSize; + Elf* e; + GElf_Ehdr ehdr; + GElfStringTable* shstrtabSection; + GElfStringTable* strtabSection; + GElfSymbolTable* symtabSection; + GElfNoteSection* noteSection; + std::vector> segments; + std::vector> sections; + + bool imgError(); + const char *elfError(); + bool elfBegin(Elf_Cmd cmd); + bool elfEnd(); + bool push0(); + bool pullElf(); + + friend class GElfSection; + friend class GElfSymbolTable; + friend class GElfNoteSection; + friend class GElfRelocationSection; + friend class GElfSegment; + friend class GElfSymbol; + }; + + GElfSegment::GElfSegment(GElfImage* elf_, uint16_t index_) + : elf(elf_), + index(index_) + { + memset(&phdr, 0, sizeof(phdr)); + } + + GElfSegment::GElfSegment(GElfImage* elf_, uint16_t index_, + uint32_t type, uint32_t flags, uint64_t paddr) + : elf(elf_), + index(index_) + { + memset(&phdr, 0, sizeof(phdr)); + phdr.p_type = type; + phdr.p_flags = flags; + phdr.p_paddr = paddr; + } + + const char* GElfSegment::data() const + { + return (const char*) elf->data() + phdr.p_offset; + } + + bool GElfImage::Freeze() + { + assert(!frozen); + if (!push()) { return false; } + frozen = true; + return true; + } + + bool GElfImage::Validate() + { + if (ELFMAG0 != ehdr.e_ident[EI_MAG0] || + ELFMAG1 != ehdr.e_ident[EI_MAG1] || + ELFMAG2 != ehdr.e_ident[EI_MAG2] || + ELFMAG3 != ehdr.e_ident[EI_MAG3]) { + out << "Invalid ELF magic" << std::endl; + return false; + } + if (EV_CURRENT != ehdr.e_version) { + out << "Invalid ELF version" << 
std::endl; + return false; + } + return true; + } + + bool GElfSegment::push(uint64_t vaddr) + { + phdr.p_align = 0; + phdr.p_offset = 0; + if (!sections.empty()) { + phdr.p_offset = sections[0]->offset(); + } + for (Section* section : sections) { + phdr.p_align = (std::max)(phdr.p_align, section->memAlign()); + } + phdr.p_vaddr = alignUp(vaddr, (std::max)(phdr.p_align, (uint64_t) 1)); + phdr.p_filesz = 0; + phdr.p_memsz = 0; + for (Section* section : sections) { + phdr.p_memsz = alignUp(phdr.p_memsz, (std::max)(section->memAlign(), (uint64_t) 1)); + phdr.p_filesz = alignUp(phdr.p_filesz, (std::max)(section->memAlign(), (uint64_t) 1)); + if (!section->updateAddr(phdr.p_vaddr + phdr.p_memsz)) { return false; } + phdr.p_filesz += (section->type() == SHT_NOBITS) ? 0 : section->size(); + phdr.p_memsz += section->memSize(); + } + if (!gelf_update_phdr(elf->e, index, &phdr)) { return elf->elfError("gelf_update_phdr failed"); } + return true; + } + + bool GElfSegment::pull() + { + if (!gelf_getphdr(elf->e, index, &phdr)) { return elf->elfError("gelf_getphdr failed"); } + return true; + } + + uint16_t GElfSegment::getSegmentIndex() + { + return index; + } + + bool GElfSegment::updateAddSection(Section *section) + { + sections.push_back(section); + return true; + } + + GElfSection::GElfSection(GElfImage* elf_) + : elf(elf_), + memsize_(0), + align_(0), + reloc_sec(nullptr), + ndxscn(0) + { + } + + uint16_t GElfSection::getSectionIndex() const + { + return (uint16_t)ndxscn; + } + + std::string GElfSection::Name() const + { + return std::string(elf->shstrtab()->getString(hdr.sh_name)); + } + + bool GElfSection::updateAddr(uint64_t addr) + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + hdr.sh_addr = addr; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + bool GElfSection::push(const char* name, uint32_t shtype, 
uint64_t shflags, uint16_t shlink, uint32_t info, uint32_t align, uint64_t entsize) + { + Elf_Scn *scn = elf_newscn(elf->e); + if (!scn) { return false; } + ndxscn = elf_ndxscn(scn); + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + align = (std::max)(align, (uint32_t) 8); + hdr.sh_name = elf->shstrtab()->addString1(name); + hdr.sh_type = shtype; + hdr.sh_flags = shflags; + hdr.sh_link = shlink; + hdr.sh_addr = 0; + hdr.sh_info = info; + hdr.sh_addralign = align; + hdr.sh_entsize = entsize; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + bool GElfSection::pull0() + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + if (!scn) { return false; } + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + return true; + } + + bool GElfSection::pull(uint16_t ndx) + { + ndxscn = (size_t) ndx; + if (!pull0()) { return false; } + Elf_Scn *scn = elf_getscn(elf->e, ndx); + if (!scn) { return false; } + Elf_Data *edata0 = elf_getdata(scn, NULL); + if (edata0) { + data0 = Buffer((const Buffer::byte_type*)edata0->d_buf, edata0->d_size, edata0->d_align); + } + seg = elf->segmentByVAddr(hdr.sh_addr); + return true; + } + + bool GElfSection::push() + { + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + Elf_Data *edata = nullptr; + edata = elf_newdata(scn); + if (!edata) { return elf->elfError("elf_newdata failed"); } + if (hdr.sh_type == SHT_NOBITS) { + edata->d_buf = 0; + edata->d_size = memsize_; + if (align_ != 0) { + edata->d_align = align_; + } + } else { + edata->d_buf = (void*)data.raw(); + edata->d_size = data.size(); + if (data.align() != 0) { + edata->d_align = data.align(); + } + } + edata->d_align = (std::max)(edata->d_align, (uint64_t) 8); + switch (hdr.sh_type) { + case SHT_RELA: + edata->d_type = ELF_T_RELA; + break; + case SHT_SYMTAB: + edata->d_type = ELF_T_SYM; + break; + default: + edata->d_type = ELF_T_BYTE; + break; + } + 
edata->d_version = EV_CURRENT; + if (!gelf_getshdr(scn, &hdr)) { return elf->elfError("gelf_get_shdr failed"); } + hdr.sh_size = edata->d_size; + hdr.sh_addralign = edata->d_align; + if (!gelf_update_shdr(scn, &hdr)) { return elf->elfError("gelf_update_shdr failed"); } + return true; + } + + uint64_t GElfSection::nextDataOffset(uint64_t align) const + { + return data.nextOffset(align); + } + + uint64_t GElfSection::addData(const void *src, uint64_t size, uint64_t align) + { + return data.add(src, size, align); + } + + bool GElfSection::getData(uint64_t offset, void* dest, uint64_t size) + { + Elf_Data* edata = 0; + uint64_t coffset = 0; + uint64_t csize = 0; + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + if ((edata = elf_getdata(scn, edata)) != 0) { + if (coffset <= offset && offset <= coffset + edata->d_size) { + csize = (std::min)(size, edata->d_size - offset); + memcpy(dest, (const char*) edata->d_buf + offset - coffset, csize); + coffset += csize; + dest = (char*) dest + csize; + size -= csize; + if (!size) { return true; } + } + } + return false; + } + + RelocationSection* GElfSection::relocationSection(SymbolTable* symtab) + { + if (!reloc_sec) { + reloc_sec = elf->addRelocationSection(this, symtab); + } + return reloc_sec; + } + + GElfStringTable::GElfStringTable(GElfImage* elf) + : GElfSection(elf) + { + } + + bool GElfStringTable::push(const char* name, uint32_t shtype, uint64_t shflags) + { + if (!GElfSection::push(name, shtype, shflags, SHN_UNDEF, 0, 0)) { return false; } + return true; + } + + bool GElfStringTable::pullData() + { + return true; + } + + const char* GElfStringTable::addString(const std::string& s) + { + if (data0.size() == 0 && data.size() == 0) { + data.add('\0'); + } + return data.get(data.addString(s)); + } + + size_t GElfStringTable::addString1(const std::string& s) + { + if (data0.size() == 0 && data.size() == 0) { + data.add('\0'); + } + return data.addString(s); + } + + const char* GElfStringTable::getString(size_t 
ndx) + { + if (data0.has(ndx)) { return data0.get(ndx); } + else if (data.has(ndx)) { return data.get(ndx); } + return nullptr; + } + + size_t GElfStringTable::getStringIndex(const char* s) + { + if (data0.has(s)) { + return data0.getOffset(s); + } else if (data.has(s)) { + return data.getOffset(s); + } else { + assert(false); + return 0; + } + } + + GElfSymbol::GElfSymbol(GElfSymbolTable* symtab_, Buffer &data_, size_t index_) + : symtab(symtab_), + edata(data_), + eindex(index_) + { + } + + Section* GElfSymbol::section() + { + if (Sym()->st_shndx != SHN_UNDEF) { + return symtab->elf->section(Sym()->st_shndx); + } + return 0; + } + + bool GElfSymbol::push(const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, uint16_t shndx, unsigned char other) + { + Sym()->st_name = symtab->strtab->addString1(name.c_str()); + Sym()->st_value = value; + Sym()->st_size = size; + Sym()->st_info = GELF_ST_INFO(binding, type); + Sym()->st_shndx = shndx; + Sym()->st_other = other; + return true; + } + + std::string GElfSymbol::name() + { + return symtab->strtab->getString(Sym()->st_name); + } + + GElfSymbolTable::GElfSymbolTable(GElfImage* elf) + : GElfSection(elf), + strtab(0) + { + } + + bool GElfSymbolTable::push(const char* name, GElfStringTable* strtab) + { + if (!strtab) { strtab = elf->strtab(); } + this->strtab = strtab; + if (!GElfSection::push(name, SHT_SYMTAB, 0, strtab->getSectionIndex(), 0, 0, sizeof(Elf64_Sym))) { return false; } + return true; + } + + bool GElfSymbolTable::pullData() + { + strtab = elf->getStringTable(hdr.sh_link); + for (size_t i = 0; i < data0.size() / sizeof(GElf_Sym); ++i) { + symbols.push_back(std::unique_ptr(new GElfSymbol(this, data0, i * sizeof(GElf_Sym)))); + } + return true; + } + + Symbol* GElfSymbolTable::addSymbolInternal(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other) + { + GElfSymbol *sym = new 
(std::nothrow) GElfSymbol(this, data, data.reserve()); + uint16_t shndx = section ? section->getSectionIndex() : (uint16_t) SHN_UNDEF; + if (!sym->push(name, value, size, type, binding, shndx, other)) { + delete sym; + return nullptr; + } + symbols.push_back(std::unique_ptr(sym)); + return sym; + } + + Symbol* GElfSymbolTable::addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other) + { + if (symbols.size() == 0) { + this->addSymbolInternal(nullptr, "", 0, 0, 0, 0, 0); + } + return this->addSymbolInternal(section, name, value, size, type, binding, other); + } + + size_t GElfSymbolTable::symbolCount() + { + return symbols.size(); + } + + Symbol* GElfSymbolTable::symbol(size_t i) + { + return symbols[i].get(); + } + + GElfNoteSection::GElfNoteSection(GElfImage* elf) + : GElfSection(elf) + { + } + + bool GElfNoteSection::push(const std::string& name) + { + return GElfSection::push(name.c_str(), SHT_NOTE, 0, 0, 0, 8); + } + + bool GElfNoteSection::addNote(const std::string& name, uint32_t type, const void* desc, uint32_t desc_size) + { + data.addStringLength(name, NOTE_RECORD_ALIGNMENT); + data.add(desc_size, NOTE_RECORD_ALIGNMENT); + data.add(type, NOTE_RECORD_ALIGNMENT); + data.addString(name, NOTE_RECORD_ALIGNMENT); + data.align(NOTE_RECORD_ALIGNMENT); + if (desc_size > 0) { + assert(desc); + data.add(desc, desc_size, NOTE_RECORD_ALIGNMENT); + data.align(NOTE_RECORD_ALIGNMENT); + } + return true; + } + + bool GElfNoteSection::getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size) + { + Elf_Data* data = 0; + Elf_Scn *scn = elf_getscn(elf->e, ndxscn); + assert(scn); + while ((data = elf_getdata(scn, data)) != 0) { + uint32_t note_offset = 0; + while (note_offset < data->d_size) { + char* notec = (char *) data->d_buf + note_offset; + Elf64_Nhdr* note = (Elf64_Nhdr*) notec; + if (type == note->n_type) { + std::string note_name = 
GetNoteString(note->n_namesz, notec + sizeof(Elf64_Nhdr)); + if (name == note_name) { + *desc = notec + sizeof(Elf64_Nhdr) + alignUp(note->n_namesz, 4); + *desc_size = note->n_descsz; + return true; + } + } + note_offset += sizeof(Elf64_Nhdr) + alignUp(note->n_namesz, 4) + alignUp(note->n_descsz, 4); + } + } + return false; + } + + bool GElfRelocation::push(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) + { + Rela()->r_info = GELF_R_INFO((uint64_t) symbol->index(), type); + Rela()->r_offset = offset; + Rela()->r_addend = addend; + return true; + } + + RelocationSection* GElfRelocation::section() + { + return rsection; + } + + Symbol* GElfRelocation::symbol() + { + return rsection->symtab->symbol(symbolIndex()); + } + + GElfRelocationSection::GElfRelocationSection(GElfImage* elf, Section* section_, GElfSymbolTable* symtab_) + : GElfSection(elf), + section(section_), + symtab(symtab_) + { + } + + bool GElfRelocationSection::push(const std::string& name) + { + return GElfSection::push(name.c_str(), SHT_RELA, 0, symtab->getSectionIndex(), section->getSectionIndex(), 0, sizeof(Elf64_Rela)); + } + + Relocation* GElfRelocationSection::addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) + { + GElfRelocation *rela = new (std::nothrow) GElfRelocation(this, data, data.reserve()); + if (!rela || !rela->push(type, symbol, offset, addend)) { + delete rela; + return nullptr; + } + relocations.push_back(std::unique_ptr(rela)); + return rela; + } + + bool GElfRelocationSection::pullData() + { + section = elf->section(hdr.sh_info); + symtab = elf->getSymtab(hdr.sh_link); + Elf_Scn *lScn = elf_getscn(elf->e, ndxscn); + assert(lScn); + Elf_Data *lData = elf_getdata(lScn, nullptr); + assert(lData); + data0 = Buffer((const Buffer::byte_type*)lData->d_buf, lData->d_size, lData->d_align); + for (size_t i = 0; i < data0.size() / sizeof(GElf_Rela); ++i) { + relocations.push_back(std::unique_ptr(new GElfRelocation(this, data0, i * 
sizeof(GElf_Rela)))); + } + return true; + } + + GElfImage::GElfImage(int elfclass_) + : frozen(true), + elfclass(elfclass_), + buffer(0), bufferSize(0), + e(0), + shstrtabSection(0), strtabSection(0), + symtabSection(0), + noteSection(0) + { + if (EV_NONE == elf_version(EV_CURRENT)) { + assert(false); + } + } + + GElfImage::~GElfImage() + { + elf_end(e); + } + + bool GElfImage::imgError() + { + out << img.output(); + return false; + } + + const char *GElfImage::elfError() + { + return elf_errmsg(-1); + } + + bool GElfImage::elfBegin(Elf_Cmd cmd) + { + if ((e = elf_begin(img.fd(), cmd, NULL +#ifdef AMD_LIBELF + , NULL +#endif + )) == NULL) { + out << "elf_begin failed: " << elfError() << std::endl; + return false; + } + return true; + } + + bool GElfImage::initNew(uint16_t machine, uint16_t type, uint8_t os_abi, uint8_t abi_version, uint32_t e_flags) + { + if (!img.create()) { return imgError(); } + if (!elfBegin(ELF_C_WRITE)) { return false; } + if (!gelf_newehdr(e, elfclass)) { return elfError("gelf_newehdr failed"); } + if (!gelf_getehdr(e, &ehdr)) { return elfError("gelf_getehdr failed"); } + ehdr.e_ident[EI_DATA] = ELFDATA2LSB; + ehdr.e_ident[EI_VERSION] = EV_CURRENT; + ehdr.e_ident[EI_OSABI] = os_abi; + ehdr.e_ident[EI_ABIVERSION] = abi_version; + ehdr.e_machine = machine; + ehdr.e_type = type; + ehdr.e_version = EV_CURRENT; + ehdr.e_flags = e_flags; + if (!gelf_update_ehdr(e, &ehdr)) { return elfError("gelf_updateehdr failed"); } + sections.push_back(std::unique_ptr()); + if (!shstrtab()->push(".shstrtab", SHT_STRTAB, SHF_STRINGS)) { return elfError("Failed to create shstrtab"); } + ehdr.e_shstrndx = shstrtab()->getSectionIndex(); + if (!gelf_update_ehdr(e, &ehdr)) { return elfError("gelf_updateehdr failed"); } + if (!strtab()->push(".strtab", SHT_STRTAB, SHF_STRINGS)) { return elfError("Failed to create strtab"); } + frozen = false; + return true; + } + + bool GElfImage::loadFromFile(const std::string& filename) + { + if (!img.create()) { return imgError(); 
}
+      if (!img.readFrom(filename)) { return imgError(); }
+      if (!elfBegin(ELF_C_RDWR)) { return false; }
+      return pullElf();
+    }
+
+    // Writes the image to `filename`. When the image wraps an external buffer
+    // (set by initAsBuffer) those bytes are written verbatim; otherwise pending
+    // changes are pushed to libelf first and the backing FileImage is written.
+    bool GElfImage::saveToFile(const std::string& filename)
+    {
+      if (buffer) {
+        std::ofstream out(filename.c_str(), std::ios::binary);
+        if (out.fail()) { return false; }
+        out.write(buffer, bufferSize);
+        return !out.fail();
+      } else {
+        if (!push()) { return false; }
+        return img.writeTo(filename);
+      }
+    }
+
+    // Copies `size` bytes of `buffer` into the backing FileImage, opens it with
+    // ELF_C_RDWR and loads the model. A `size` of 0 means "derive the size from
+    // the ELF headers" (ElfSize).
+    bool GElfImage::initFromBuffer(const void* buffer, size_t size)
+    {
+      if (size == 0) { size = ElfSize(buffer); }
+      if (!img.create()) { return imgError(); }
+      if (!img.copyFrom(buffer, size)) { return imgError(); }
+      if (!elfBegin(ELF_C_RDWR)) { return false; }
+      return pullElf();
+    }
+
+    // Opens the ELF image in place with elf_memory(): the caller's pointer is
+    // stored in this->buffer, not copied. A `size` of 0 means "derive the size
+    // from the ELF headers" (ElfSize).
+    bool GElfImage::initAsBuffer(const void* buffer, size_t size)
+    {
+      if (size == 0) { size = ElfSize(buffer); }
+      if ((e = elf_memory(reinterpret_cast<char*>(const_cast<void*>(buffer)), size
+#ifdef AMD_LIBELF
+                      , NULL
+#endif
+                      )) == NULL) {
+        out << "elf_begin(buffer) failed: " << elfError() << std::endl;
+        return false;
+      }
+      this->buffer = reinterpret_cast<const char*>(buffer);
+      this->bufferSize = size;
+      return pullElf();
+    }
+
+    // Rebuilds the in-memory model (segments, sections and the well-known
+    // .strtab/.symtab/.note pointers) from the libelf handle `e`.
+    // Returns false if any header or section cannot be read.
+    bool GElfImage::pullElf()
+    {
+      if (!gelf_getehdr(e, &ehdr)) { return elfError("gelf_getehdr failed"); }
+
+      // Load every program header exactly once, and before the sections:
+      // GElfSection::pull() looks up its owning segment via segmentByVAddr().
+      // (Previously the headers were loaded here and appended a second time at
+      // the end of this function, doubling segmentCount().)
+      size_t phnum;
+      if (elf_getphdrnum(e, &phnum) < 0) { return elfError("elf_getphdrnum failed"); }
+      segments.reserve(phnum);
+      for (size_t i = 0; i < phnum; ++i) {
+        segments.push_back(std::unique_ptr<GElfSegment>(new GElfSegment(this, i)));
+        if (!segments[i]->pull()) { return false; }
+      }
+
+      // The section-name string table is needed by every other section, so it
+      // is created up front and slotted into `sections` at its own index below.
+      shstrtabSection = new GElfStringTable(this);
+      if (!shstrtabSection->pull(ehdr.e_shstrndx)) {
+        // Not owned by `sections` yet, so release it here.
+        delete shstrtabSection;
+        shstrtabSection = 0;
+        return false;
+      }
+      Elf_Scn* scn = 0;
+      for (unsigned n = 0; n < ehdr.e_shnum; ++n) {
+        scn = elf_getscn(e, n);
+        if (n == ehdr.e_shstrndx) {
+          sections.push_back(std::unique_ptr<GElfSection>(shstrtabSection));
+          continue;
+        }
+        GElf_Shdr shdr;
+        if (!gelf_getshdr(scn, &shdr)) { return elfError("Failed to get shdr"); }
+        GElfSection* section = 0;
+        if (shdr.sh_type == SHT_NOTE) {
+          section = new GElfNoteSection(this);
+        } else if (shdr.sh_type == SHT_RELA) {
+          section = new GElfRelocationSection(this);
+        } else if (shdr.sh_type == SHT_STRTAB) {
+          section = new GElfStringTable(this);
+        } else if (shdr.sh_type == SHT_SYMTAB || shdr.sh_type == SHT_DYNSYM) {
+          section = new GElfSymbolTable(this);
+        } else if (shdr.sh_type == SHT_NULL) {
+          // Keep an empty slot so that vector indices match section indices.
+          section = 0;
+          sections.push_back(std::unique_ptr<GElfSection>());
+        } else {
+          section = new GElfSection(this);
+        }
+        if (section) {
+          sections.push_back(std::unique_ptr<GElfSection>(section));
+          if (!section->pull(n)) { return false; }
+        }
+      }
+
+      // Section payloads are pulled in dependency order: string tables first,
+      // then symbol tables, then everything else. Empty (SHT_NULL) slots are
+      // skipped in every pass.
+      for (size_t n = 1; n < sections.size(); ++n) {
+        GElfSection* section = sections[n].get();
+        if (!section) { continue; }
+        if (section->type() == SHT_STRTAB) {
+          if (!section->pullData()) { return false; }
+        }
+      }
+
+      for (size_t n = 1; n < sections.size(); ++n) {
+        GElfSection* section = sections[n].get();
+        if (!section) { continue; }
+        if (section->type() == SHT_SYMTAB || section->type() == SHT_DYNSYM) {
+          if (!section->pullData()) { return false; }
+        }
+      }
+
+      for (size_t n = 1; n < sections.size(); ++n) {
+        GElfSection* section = sections[n].get();
+        if (!section) { continue; }
+        if (section->type() != SHT_STRTAB && section->type() != SHT_SYMTAB && section->type() != SHT_DYNSYM) {
+          if (!section->pullData()) { return false; }
+        }
+      }
+
+      // Remember the well-known sections by name.
+      for (size_t i = 1; i < sections.size(); ++i) {
+        if (i == ehdr.e_shstrndx) { continue; }
+        GElfSection* section = sections[i].get();
+        if (!section) { continue; }
+        if (section->Name() == ".strtab") { strtabSection = static_cast<GElfStringTable*>(section); }
+        if (section->Name() == ".symtab") { symtabSection = static_cast<GElfSymbolTable*>(section); }
+        if (section->Name() == ".note") { noteSection = static_cast<GElfNoteSection*>(section); }
+      }
+
+      return true;
+    }
+
+    // Logs `msg` together with the current libelf error text; always returns
+    // false so callers can write `return elfError("...")`.
+    bool GElfImage::elfError(const char* msg)
+    {
+      out << "Error: " << msg << ": " << elfError() << std::endl;
+      return false;
+    }
+
+    uint64_t 
GElfImage::size()
+    {
+      if (buffer) {
+        return ElfSize(buffer);
+      } else {
+        return img.getSize();
+      }
+    }
+
+    // Pushes all section data and headers into libelf, lets libelf lay the
+    // file out (elf_update with ELF_C_NULL), then assigns segment addresses.
+    // Nothing is written out here; see push().
+    bool GElfImage::push0()
+    {
+      assert(e);
+      for (std::unique_ptr<GElfSection>& section : sections) {
+        if (section && !section->push()) { return false; }
+      }
+
+      for (std::unique_ptr<GElfSection>& section : sections) {
+        if (section && !section->pull0()) { return false; }
+      }
+
+      if (!segments.empty()) {
+        if (!gelf_newphdr(e, segments.size())) { return elfError("gelf_newphdr failed"); }
+      }
+      if (elf_update(e, ELF_C_NULL) < 0) { return elfError("elf_update (1.1) failed"); }
+      if (!segments.empty()) {
+        for (std::unique_ptr<GElfSection>& section : sections) {
+          // Update section offsets.
+          if (section && !section->pull0()) { return false; }
+        }
+        // Segments are placed back to back: each one starts where the
+        // previous one ended (GElfSegment::push applies its own alignment).
+        uint64_t vaddr = 0;
+        for (std::unique_ptr<GElfSegment>& segment : segments) {
+          if (!segment->push(vaddr)) { return false; }
+          vaddr = segment->vaddr() + segment->memSize();
+        }
+      }
+      return true;
+    }
+
+    // push0() followed by the actual write (elf_update with ELF_C_WRITE).
+    bool GElfImage::push()
+    {
+      if (!push0()) { return false; }
+      if (elf_update(e, ELF_C_WRITE) < 0) { return elfError("elf_update (2) failed"); }
+      return true;
+    }
+
+    // Returns the first segment whose [vaddr, vaddr + memSize) range contains
+    // `vaddr`, or 0 if there is none.
+    Segment* GElfImage::segmentByVAddr(uint64_t vaddr)
+    {
+      for (std::unique_ptr<GElfSegment>& seg : segments) {
+        if (seg->vaddr() <= vaddr && vaddr < seg->vaddr() + seg->memSize()) {
+          return seg.get();
+        }
+      }
+      return 0;
+    }
+
+    // Returns the first section (index >= 1) whose [addr, addr + size) range
+    // contains `vaddr`, or nullptr if there is none.
+    Section* GElfImage::sectionByVAddr(uint64_t vaddr)
+    {
+      for (size_t n = 1; n < sections.size(); ++n) {
+        GElfSection* sec = sections[n].get();
+        // Slots in `sections` may be empty (push0() checks for that as well);
+        // skip them instead of dereferencing a null pointer.
+        if (!sec) { continue; }
+        if (sec->addr() <= vaddr && vaddr < sec->addr() + sec->size()) {
+          return sec;
+        }
+      }
+      return nullptr;
+    }
+
+    bool GElfImage::elfEnd()
+    {
+      return false;
+    }
+
+    bool GElfImage::writeTo(const std::string& filename)
+    {
+      if (!img.writeTo(filename)) { return imgError(); }
+      return true;
+    }
+
+    // Hands back a copy of the image. For a wrapped external buffer the copy is
+    // allocated with malloc() (so the caller releases it with free()) and its
+    // length is reported through `size` when `size` is non-null. Returns false
+    // if the allocation fails.
+    bool GElfImage::copyToBuffer(void** buf, size_t* size)
+    {
+      if (buffer) {
+        *buf = malloc(bufferSize);
+        if (!*buf) { return false; }
+        memcpy(*buf, buffer, bufferSize);
+        if (size) { *size = bufferSize; }
+        return true;
+      } else {
+        return img.copyTo(buf, size);
+      }
+    }
+
+    bool 
GElfImage::copyToBuffer(void* buf, size_t size)
+    {
+      if (buffer) {
+        if (size < bufferSize) { return false; }
+        memcpy(buf, buffer, bufferSize);
+        return true;
+      } else {
+        return img.copyTo(buf, size);
+      }
+    }
+
+    // Creates a string table object and appends it to `sections`.
+    // NOTE: `name` is not used here; this function does not call push() on the
+    // new table.
+    GElfStringTable* GElfImage::addStringTable(const std::string& name)
+    {
+      GElfStringTable* stab = new GElfStringTable(this);
+      sections.push_back(std::unique_ptr<GElfSection>(stab));
+      return stab;
+    }
+
+    // NOTE(review): no bounds or type check; assumes `index` refers to a
+    // string table section.
+    GElfStringTable* GElfImage::getStringTable(uint16_t index)
+    {
+      return static_cast<GElfStringTable*>(sections[index].get());
+    }
+
+    // Creates a symbol table named `name`, linked to `stab` (default: strtab()).
+    // NOTE(review): the result of symtab->push() is ignored here.
+    GElfSymbolTable* GElfImage::addSymbolTable(const std::string& name, StringTable* stab)
+    {
+      if (!stab) { stab = strtab(); }
+      const char* name0 = shstrtab()->addString(name);
+      GElfSymbolTable* symtab = new GElfSymbolTable(this);
+      symtab->push(name0, static_cast<GElfStringTable*>(stab));
+      sections.push_back(std::unique_ptr<GElfSection>(symtab));
+      return symtab;
+    }
+
+    // Lazily created section-name string table.
+    GElfStringTable* GElfImage::shstrtab() {
+      if (!shstrtabSection) {
+        shstrtabSection = addStringTable(".shstrtab");
+      }
+      return shstrtabSection;
+    }
+
+    // Lazily created symbol-name string table. The label used to be
+    // ".shstrtab" (copy/paste); addStringTable ignores the name, so this is a
+    // labeling fix only.
+    GElfStringTable* GElfImage::strtab() {
+      if (!strtabSection) {
+        strtabSection = addStringTable(".strtab");
+      }
+      return strtabSection;
+    }
+
+    // Lazily created ".symtab", linked to strtab().
+    GElfSymbolTable* GElfImage::symtab()
+    {
+      if (!symtabSection) {
+        symtabSection = addSymbolTable(".symtab", strtab());
+      }
+      return symtabSection;
+    }
+
+
+    // Lazily created ".note" section.
+    GElfNoteSection* GElfImage::note()
+    {
+      if (!noteSection) { noteSection = addNoteSection(".note"); }
+      return noteSection;
+    }
+
+    // NOTE(review): the result of note->push() is ignored here.
+    GElfNoteSection* GElfImage::addNoteSection(const std::string& name)
+    {
+      GElfNoteSection* note = new GElfNoteSection(this);
+      note->push(name);
+      sections.push_back(std::unique_ptr<GElfSection>(note));
+      return note;
+    }
+
+    // Appends a new segment with the given type, flags and physical address.
+    // Returns nullptr if the segment cannot be allocated.
+    Segment* GElfImage::initSegment(uint32_t type, uint32_t flags, uint64_t paddr)
+    {
+      GElfSegment *seg = new (std::nothrow) GElfSegment(this, segments.size(), type, flags, paddr);
+      // Never store a null entry: push0() and segmentByVAddr() dereference
+      // every element of `segments` unconditionally.
+      if (!seg) { return nullptr; }
+      segments.push_back(std::unique_ptr<GElfSegment>(seg));
+      return seg;
+    }
+
+    bool GElfImage::addSegments()
+    {
+      return true;
+    }
+
+    Section* 
GElfImage::addSection(const std::string &name, + uint32_t type, + uint64_t flags, + uint64_t entsize, Segment* segment) + { + GElfSection *section = new (std::nothrow) GElfSection(this); + if (!section || !section->push(name.c_str(), type, flags, 0, 0, 0, entsize)) { + delete section; + return nullptr; + } + if (segment) { + if (!segment->updateAddSection(section)) { + delete section; + return nullptr; + } + } + sections.push_back(std::unique_ptr(section)); + return section; + } + + RelocationSection* GElfImage::addRelocationSection(Section* sec, SymbolTable* symtab) + { + std::string section_name = ".rela" + sec->Name(); + if (!symtab) { symtab = this->symtab(); } + GElfRelocationSection *rsec = new GElfRelocationSection(this, sec, (GElfSymbolTable*) symtab); + if (!rsec || !rsec->push(section_name)) { + delete rsec; + return nullptr; + } + sections.push_back(std::unique_ptr(rsec)); + return rsec; + } + + RelocationSection* GElfImage::relocationSection(Section* sec, SymbolTable* symtab) + { + return sec->relocationSection(symtab); + } + + uint16_t GElfImage::machine() const + { + return ehdr.e_machine; + } + + uint16_t GElfImage::etype() const + { + return ehdr.e_type; + } + + Image* NewElf32Image() { return new GElfImage(ELFCLASS32); } + Image* NewElf64Image() { return new GElfImage(ELFCLASS64); } + + uint64_t ElfSize(const void* emi) + { + const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*) emi; + if (NULL == ehdr || EV_CURRENT != ehdr->e_version) { + return false; + } + + const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + ehdr->e_shoff); + if (NULL == shdr) { + return false; + } + + uint64_t max_offset = ehdr->e_shoff; + uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; + + for (uint16_t i = 0; i < ehdr->e_shnum; ++i) { + uint64_t cur_offset = static_cast(shdr[i].sh_offset); + if (max_offset < cur_offset) { + max_offset = cur_offset; + total_size = max_offset; + if (SHT_NOBITS != shdr[i].sh_type) { + total_size += 
static_cast(shdr[i].sh_size); + } + } + } + + return total_size; + } + + std::string GetNoteString(uint32_t s_size, const char* s) + { + if (!s_size) { return ""; } + if (s[s_size-1] == '\0') { + return std::string(s, s_size-1); + } else { + return std::string(s, s_size); + } + } + + } +} diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code.cpp new file mode 100644 index 0000000000..546bd81d71 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code.cpp @@ -0,0 +1,1910 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include "amd_hsa_code.hpp" +#include "amd_hsa_code_util.hpp" +#include +#include "amd_hsa_elf.h" +#include +#include +#include +#include + +#if !defined(NO_SI_SP3) +#include "public/lib/sp3.h" +#endif // !defined(NO_SI_SP3) + +#ifndef _WIN32 +#define _alloca alloca +#endif + +namespace amd { +namespace hsa { +namespace code { + + using amd::elf::GetNoteString; + + bool Symbol::IsDeclaration() const + { + return elfsym->type() == STT_COMMON; + } + + bool Symbol::IsDefinition() const + { + return !IsDeclaration(); + } + + bool Symbol::IsAgent() const + { + return elfsym->section()->flags() & SHF_AMDGPU_HSA_AGENT ? true : false; + } + + hsa_symbol_linkage_t Symbol::Linkage() const + { + return elfsym->binding() == STB_GLOBAL ? HSA_SYMBOL_LINKAGE_PROGRAM : HSA_SYMBOL_LINKAGE_MODULE; + } + + hsa_variable_allocation_t Symbol::Allocation() const + { + return IsAgent() ? HSA_VARIABLE_ALLOCATION_AGENT : HSA_VARIABLE_ALLOCATION_PROGRAM; + } + + hsa_variable_segment_t Symbol::Segment() const + { + return elfsym->section()->flags() & SHF_AMDGPU_HSA_READONLY ? 
HSA_VARIABLE_SEGMENT_READONLY : HSA_VARIABLE_SEGMENT_GLOBAL;
+    }
+
+    // Size recorded in the underlying ELF symbol.
+    uint64_t Symbol::Size() const
+    {
+      return elfsym->size();
+    }
+
+    // Size() narrowed to 32 bits; asserts that the value fits.
+    uint32_t Symbol::Size32() const
+    {
+      assert(elfsym->size() < UINT32_MAX);
+      return (uint32_t) Size();
+    }
+
+    // Address alignment of the section that holds the symbol, narrowed to
+    // 32 bits; asserts that the value fits.
+    uint32_t Symbol::Alignment() const
+    {
+      assert(elfsym->section()->addralign() < UINT32_MAX);
+      return uint32_t(elfsym->section()->addralign());
+    }
+
+    // A variable is constant when its section is NOT writable. The previous
+    // code returned true when SHF_WRITE was set, i.e. the inverse.
+    bool Symbol::IsConst() const
+    {
+      return (elfsym->section()->flags() & SHF_WRITE) == 0;
+    }
+
+    // Answers the attributes common to all code symbols. `value` must point to
+    // storage of the type implied by `attribute`. For the two *_NAME attributes
+    // exactly <name length> bytes are copied and no terminating NUL is written,
+    // so callers should query the matching *_NAME_LENGTH attribute first.
+    // Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for attributes not handled here.
+    hsa_status_t Symbol::GetInfo(hsa_code_symbol_info_t attribute, void *value)
+    {
+      assert(value);
+
+      switch (attribute) {
+        case HSA_CODE_SYMBOL_INFO_TYPE: {
+          *((hsa_symbol_kind_t*)value) = Kind();
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_NAME_LENGTH: {
+          *((uint32_t*)value) = GetSymbolName().size();
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_NAME: {
+          std::string SymbolName = GetSymbolName();
+          memset(value, 0x0, SymbolName.size());
+          memcpy(value, SymbolName.c_str(), SymbolName.size());
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH: {
+          *((uint32_t*)value) = GetModuleName().size();
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_MODULE_NAME: {
+          std::string ModuleName = GetModuleName();
+          memset(value, 0x0, ModuleName.size());
+          memcpy(value, ModuleName.c_str(), ModuleName.size());
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_LINKAGE: {
+          *((hsa_symbol_linkage_t*)value) = Linkage();
+          break;
+        }
+        case HSA_CODE_SYMBOL_INFO_IS_DEFINITION: {
+          *((bool*)value) = IsDefinition();
+          break;
+        }
+        default: {
+          return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+        }
+      }
+      return HSA_STATUS_SUCCESS;
+    }
+
+    // Text before the first ':' of the full name, or "" when the name has no
+    // ':' at all.
+    std::string Symbol::GetModuleName() const {
+      std::string FullName = Name();
+      return FullName.rfind(":") != std::string::npos ?
+        FullName.substr(0, FullName.find(":")) : "";
+    }
+
+    std::string Symbol::GetSymbolName() const {
+      std::string FullName = Name();
+      return FullName.rfind(":") != std::string::npos ? 
+ FullName.substr(FullName.rfind(":") + 1) : FullName; + } + + hsa_code_symbol_t Symbol::ToHandle(Symbol* sym) + { + hsa_code_symbol_t s; + s.handle = reinterpret_cast(sym); + return s; + } + + Symbol* Symbol::FromHandle(hsa_code_symbol_t s) + { + return reinterpret_cast(s.handle); + } + + KernelSymbol::KernelSymbol(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc) + : Symbol(elfsym_) + , kernarg_segment_size(0) + , kernarg_segment_alignment(0) + , group_segment_size(0) + , private_segment_size(0) + , is_dynamic_callstack(0) + { + if (akc) { + kernarg_segment_size = (uint32_t) akc->kernarg_segment_byte_size; + kernarg_segment_alignment = (uint32_t) (1 << akc->kernarg_segment_alignment); + group_segment_size = uint32_t(akc->workgroup_group_segment_byte_size); + private_segment_size = uint32_t(akc->workitem_private_segment_byte_size); + is_dynamic_callstack = + AMD_HSA_BITS_GET(akc->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? true : false; + } + } + + hsa_status_t KernelSymbol::GetInfo(hsa_code_symbol_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: { + *((uint32_t*)value) = kernarg_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT: { + *((uint32_t*)value) = kernarg_segment_alignment; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: { + *((uint32_t*)value) = group_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: { + *((uint32_t*)value) = private_segment_size; + break; + } + case HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK: { + *((bool*)value) = is_dynamic_callstack; + break; + } + default: { + return Symbol::GetInfo(attribute, value); + } + } + return HSA_STATUS_SUCCESS; + } + + hsa_status_t VariableSymbol::GetInfo(hsa_code_symbol_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION: { + 
*((hsa_variable_allocation_t*)value) = Allocation(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT: { + *((hsa_variable_segment_t*)value) = Segment(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT: { + *((uint32_t*)value) = Alignment(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE: { + *((uint32_t*)value) = Size(); + break; + } + case HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST: { + *((bool*)value) = IsConst(); + break; + } + default: { + return Symbol::GetInfo(attribute, value); + } + } + return HSA_STATUS_SUCCESS; + } + + AmdHsaCode::AmdHsaCode(bool combineDataSegments_) + : img(nullptr), + combineDataSegments(combineDataSegments_), + hsatext(0), imageInit(0), samplerInit(0), + debugInfo(0), debugLine(0), debugAbbrev(0) + { + for (unsigned i = 0; i < AMDGPU_HSA_SEGMENT_LAST; ++i) { + for (unsigned j = 0; j < 2; ++j) { + hsaSegments[i][j] = 0; + } + } + for (unsigned i = 0; i < AMDGPU_HSA_SECTION_LAST; ++i) { + hsaSections[i] = 0; + } + } + + AmdHsaCode::~AmdHsaCode() + { + for (Symbol* sym : symbols) { delete sym; } + } + + bool AmdHsaCode::PullElf() + { + uint32_t majorVersion, minorVersion; + if (!GetCodeObjectVersion(&majorVersion, &minorVersion)) { + return false; + } + if (majorVersion >= 2) { + return PullElfV2(); + } else { + return PullElfV1(); + } + } + + bool AmdHsaCode::PullElfV1() + { + for (size_t i = 0; i < img->segmentCount(); ++i) { + Segment* s = img->segment(i); + if (s->type() == PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM || + s->type() == PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT || + s->type() == PT_AMDGPU_HSA_LOAD_READONLY_AGENT || + s->type() == PT_AMDGPU_HSA_LOAD_CODE_AGENT) { + dataSegments.push_back(s); + } + } + for (size_t i = 0; i < img->sectionCount(); ++i) { + Section* sec = img->section(i); + if (!sec) { continue; } + if ((sec->type() == SHT_PROGBITS || sec->type() == SHT_NOBITS) && + (sec->flags() & (SHF_AMDGPU_HSA_AGENT | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_CODE))) { + 
dataSections.push_back(sec); + } else if (sec->type() == SHT_RELA) { + relocationSections.push_back(sec->asRelocationSection()); + } + if (sec->Name() == ".hsatext") { + hsatext = sec; + } + } + for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + Symbol* sym = 0; + switch (elfsym->type()) { + case STT_AMDGPU_HSA_KERNEL: { + amd::elf::Section* sec = elfsym->section(); + amd_kernel_code_t akc; + if (!sec) { + out << "Failed to find section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!(sec->flags() & (SHF_AMDGPU_HSA_AGENT | SHF_AMDGPU_HSA_CODE | SHF_EXECINSTR))) { + out << "Invalid code section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!sec->getData(elfsym->value(), &akc, sizeof(amd_kernel_code_t))) { + out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; + return false; + } + sym = new KernelSymbol(elfsym, &akc); + break; + } + case STT_OBJECT: + case STT_FUNC: + case STT_COMMON: + sym = new VariableSymbol(elfsym); + break; + default: + break; // Skip unknown symbols. 
+        }
+        if (sym) { symbols.push_back(sym); }
+      }
+
+      return true;
+    }
+
+    // Loads an ELF code object from `filename`, creating the 64-bit ELF image
+    // on first use, then extracts its segments, sections and symbols with
+    // PullElf(). Failures are reported through ElfImageError().
+    bool AmdHsaCode::LoadFromFile(const std::string& filename)
+    {
+      if (!img) { img.reset(amd::elf::NewElf64Image()); }
+      if (!img->loadFromFile(filename)) { return ElfImageError(); }
+      if (!PullElf()) { return ElfImageError(); }
+      return true;
+    }
+
+    // Writes the current image to `filename`. Requires an initialized image.
+    bool AmdHsaCode::SaveToFile(const std::string& filename)
+    {
+      return img->saveToFile(filename) || ElfImageError();
+    }
+
+    // Copies ElfSize() bytes of the image into `buffer`; the caller must
+    // provide at least that much space.
+    bool AmdHsaCode::WriteToBuffer(void* buffer)
+    {
+      return img->copyToBuffer(buffer, ElfSize()) || ElfImageError();
+    }
+
+
+    // Initializes the image through initFromBuffer() and then pulls the ELF
+    // contents. NOTE(review): whether the image copies `buffer` is decided by
+    // the ELF image implementation, which is not visible here.
+    bool AmdHsaCode::InitFromBuffer(const void* buffer, size_t size)
+    {
+      if (!img) { img.reset(amd::elf::NewElf64Image()); }
+      if (!img->initFromBuffer(buffer, size)) { return ElfImageError(); }
+      if (!PullElf()) { return ElfImageError(); }
+      return true;
+    }
+
+    // Same sequence as InitFromBuffer but uses the image's initAsBuffer()
+    // entry point.
+    bool AmdHsaCode::InitAsBuffer(const void* buffer, size_t size)
+    {
+      if (!img) { img.reset(amd::elf::NewElf64Image()); }
+      if (!img->initAsBuffer(buffer, size)) { return ElfImageError(); }
+      if (!PullElf()) { return ElfImageError(); }
+      return true;
+    }
+
+    // Initializes from an HSA code-object handle, whose value is the address
+    // of the ELF image in memory. A zero handle is rejected.
+    bool AmdHsaCode::InitAsHandle(hsa_code_object_t code_object)
+    {
+      // reinterpret_cast requires an explicit target type.
+      void *elfmemrd = reinterpret_cast<void*>(code_object.handle);
+      if (!elfmemrd) { return false; }
+      // The size is passed as 0 -- presumably the image derives it from the
+      // ELF header; confirm against initAsBuffer().
+      return InitAsBuffer(elfmemrd, 0);
+    }
+
+    // Creates a fresh, empty AMDGPU HSA (ABI version 2) image. Returns false
+    // when an image already exists.
+    bool AmdHsaCode::InitNew(bool xnack)
+    {
+      if (!img) {
+        img.reset(amd::elf::NewElf64Image());
+        uint32_t flags = 0;
+        if (xnack) { flags |= ELF::EF_AMDGPU_FEATURE_XNACK_V2; }
+        return img->initNew(ELF::EM_AMDGPU, ET_EXEC, ELF::ELFOSABI_AMDGPU_HSA, ELF::ELFABIVERSION_AMDGPU_HSA_V2, flags) ||
+          ElfImageError(); // FIXME: elfutils libelf does not allow program headers in ET_REL file type, so change it later in finalizer.
+      }
+      return false;
+    }
+
+    // Forwards to the image's Freeze() and reports failure via ElfImageError().
+    bool AmdHsaCode::Freeze()
+    {
+      return img->Freeze() || ElfImageError();
+    }
+
+    // Returns an HSA code-object handle whose value is the address of the
+    // image data (the inverse of InitAsHandle).
+    hsa_code_object_t AmdHsaCode::GetHandle()
+    {
+      hsa_code_object_t code_object;
+      code_object.handle = reinterpret_cast<uint64_t>(img->data());
+      return code_object;
+    }
+
+    // Pointer to the image bytes.
+    const char* AmdHsaCode::ElfData()
+    {
+      return img->data();
+    }
+
+    // Size of the image in bytes.
+    uint64_t AmdHsaCode::ElfSize()
+    {
+      return img->size();
+    }
+
+    // Validates the image and additionally requires the AMDGPU machine type.
+    bool AmdHsaCode::Validate()
+    {
+      if (!img->Validate()) { return ElfImageError(); }
+      if (img->Machine() != ELF::EM_AMDGPU) {
+        out << "ELF error: Invalid machine" << std::endl;
+        return false;
+      }
+      return true;
+    }
+
+    // Appends a note owned by "AMD" with the given type and descriptor bytes.
+    void AmdHsaCode::AddAmdNote(uint32_t type, const void* desc, uint32_t desc_size)
+    {
+      img->note()->addNote("AMD", type, desc, desc_size);
+    }
+
+    // Records the code object version as an AMD note.
+    void AmdHsaCode::AddNoteCodeObjectVersion(uint32_t major, uint32_t minor)
+    {
+      amdgpu_hsa_note_code_object_version_t desc;
+      desc.major_version = major;
+      desc.minor_version = minor;
+      AddAmdNote(NT_AMD_HSA_CODE_OBJECT_VERSION, &desc, sizeof(desc));
+    }
+
+    // Determines the code object version from the ELF ABI version.
+    //  - ABI V2: the version is read from the version note and accepted only
+    //    when the major version is at most 2.
+    //  - ABI V3..V6: major is 3..6 respectively and minor is 0.
+    //  - any other ABI version: returns false.
+    bool AmdHsaCode::GetCodeObjectVersion(uint32_t* major, uint32_t* minor)
+    {
+      switch (img->ABIVersion()) {
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+        amdgpu_hsa_note_code_object_version_t* desc;
+        if (GetAmdNote(NT_AMD_HSA_CODE_OBJECT_VERSION, &desc)) {
+          *major = desc->major_version;
+          *minor = desc->minor_version;
+          return *major <= 2;
+        }
+        return false;
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+        *major = 3;
+        *minor = 0;
+        return true;
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+        *major = 4;
+        *minor = 0;
+        return true;
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
+        *major = 5;
+        *minor = 0;
+        return true;
+      case ELF::ELFABIVERSION_AMDGPU_HSA_V6:
+        *major = 6;
+        *minor = 0;
+        return true;
+      }
+
+      return false;
+    }
+
+    bool AmdHsaCode::GetNoteCodeObjectVersion(std::string& version)
+    {
+      amdgpu_hsa_note_code_object_version_t* desc;
+      if (!GetAmdNote(NT_AMD_HSA_CODE_OBJECT_VERSION, &desc)) { return false; }
+      version.clear();
+      version +=
std::to_string(desc->major_version); + version += "."; + version += std::to_string(desc->minor_version); + return true; + } + + void AmdHsaCode::AddNoteHsail(uint32_t hsail_major, uint32_t hsail_minor, hsa_profile_t profile, hsa_machine_model_t machine_model, hsa_default_float_rounding_mode_t rounding_mode) + { + amdgpu_hsa_note_hsail_t desc; + memset(&desc, 0, sizeof(desc)); + desc.hsail_major_version = hsail_major; + desc.hsail_minor_version = hsail_minor; + desc.profile = uint8_t(profile); + desc.machine_model = uint8_t(machine_model); + desc.default_float_round = uint8_t(rounding_mode); + AddAmdNote(NT_AMD_HSA_HSAIL, &desc, sizeof(desc)); + } + + bool AmdHsaCode::GetNoteHsail(uint32_t* hsail_major, uint32_t* hsail_minor, hsa_profile_t* profile, hsa_machine_model_t* machine_model, hsa_default_float_rounding_mode_t* default_float_round) + { + amdgpu_hsa_note_hsail_t *desc; + if (!GetAmdNote(NT_AMD_HSA_HSAIL, &desc)) { return false; } + *hsail_major = desc->hsail_major_version; + *hsail_minor = desc->hsail_minor_version; + *profile = (hsa_profile_t) desc->profile; + *machine_model = (hsa_machine_model_t) desc->machine_model; + *default_float_round = (hsa_default_float_rounding_mode_t) desc->default_float_round; + return true; + } + + void AmdHsaCode::AddNoteIsa(const std::string& vendor_name, const std::string& architecture_name, uint32_t major, uint32_t minor, uint32_t stepping) + { + size_t size = sizeof(amdgpu_hsa_note_producer_t) + vendor_name.length() + architecture_name.length() + 2; + amdgpu_hsa_note_isa_t* desc = (amdgpu_hsa_note_isa_t*) _alloca(size); + memset(desc, 0, size); + desc->vendor_name_size = vendor_name.length()+1; + desc->architecture_name_size = architecture_name.length()+1; + desc->major = major; + desc->minor = minor; + desc->stepping = stepping; + memcpy(desc->vendor_and_architecture_name, vendor_name.c_str(), vendor_name.length() + 1); + memcpy(desc->vendor_and_architecture_name + desc->vendor_name_size, architecture_name.c_str(), 
architecture_name.length() + 1); + AddAmdNote(NT_AMD_HSA_ISA_VERSION, desc, size); + } + + bool AmdHsaCode::GetNoteIsa(std::string& vendor_name, std::string& architecture_name, uint32_t* major_version, uint32_t* minor_version, uint32_t* stepping) + { + amdgpu_hsa_note_isa_t *desc; + if (!GetAmdNote(NT_AMD_HSA_ISA_VERSION, &desc)) { return false; } + vendor_name = GetNoteString(desc->vendor_name_size, desc->vendor_and_architecture_name); + architecture_name = GetNoteString(desc->architecture_name_size, desc->vendor_and_architecture_name + vendor_name.length() + 1); + *major_version = desc->major; + *minor_version = desc->minor; + *stepping = desc->stepping; + return true; + } + + // TODO: Move isa registry into the loader. + static bool GetMachInfo(unsigned mach, std::string &name, bool &sramecc_supported, bool &xnack_supported) { + switch (mach) { + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: name = "gfx700"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: name = "gfx701"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: name = "gfx702"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: name = "gfx703"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: name = "gfx704"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: name = "gfx801"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: name = "gfx802"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: name = "gfx803"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: name = "gfx810"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: name = "gfx900"; xnack_supported = 
true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: name = "gfx902"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: name = "gfx904"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: name = "gfx906"; xnack_supported = true; sramecc_supported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: name = "gfx908"; xnack_supported = true; sramecc_supported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: name = "gfx90c"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1000: name = "gfx1000"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: name = "gfx1010"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: name = "gfx1011"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: name = "gfx1012"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: name = "gfx1030"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031: name = "gfx1031"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032: name = "gfx1032"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033: name = "gfx1033"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034: name = "gfx1034"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035: name = "gfx1035"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036: name = "gfx1036"; xnack_supported = false; sramecc_supported = false; break; +#if defined(GFX40_BUILD) + case 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX4000: name = "gfx4000"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX4010: name = "gfx4010"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX4020: name = "gfx4020"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX4030: name = "gfx4030"; xnack_supported = false; sramecc_supported = false; break; +#endif // GFX40_BUILD +#if defined(GFX11_BUILD) + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100: name = "gfx1100"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: name = "gfx1101"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: name = "gfx1102"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: name = "gfx1103"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: name = "gfx1150"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: name = "gfx1151"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: name = "gfx1152"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC: name = "gfx11-generic"; xnack_supported = false; sramecc_supported = false; break; +#endif // GFX11_BUILD +#if defined(GFX12_BUILD) + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: name = "gfx1200"; xnack_supported = false; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: name = "gfx1201"; xnack_supported = false; sramecc_supported = false; break; +#endif // GFX12_BUILD + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: name = "gfx9-generic"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC: name = 
"gfx10-1-generic"; xnack_supported = true; sramecc_supported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC: name = "gfx10-3-generic"; xnack_supported = false; sramecc_supported = false; break; + default: return false; + } + return true; + } + + // This function is also copied to the Code Object Manager library. + static std::string ConvertOldTargetNameToNew(const std::string &old_name, bool is_finalizer, uint32_t e_flags) { + assert(!old_name.empty() && "Expecting non-empty old name"); + + unsigned mach = 0; + if (old_name == "AMD:AMDGPU:7:0:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX700; + else if (old_name == "AMD:AMDGPU:7:0:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX701; + else if (old_name == "AMD:AMDGPU:7:0:2") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX702; + else if (old_name == "AMD:AMDGPU:7:0:3") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX703; + else if (old_name == "AMD:AMDGPU:7:0:4") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX704; + else if (old_name == "AMD:AMDGPU:8:0:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX801; + else if (old_name == "AMD:AMDGPU:8:0:0" || old_name == "AMD:AMDGPU:8:0:2") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX802; + else if (old_name == "AMD:AMDGPU:8:0:3" || old_name == "AMD:AMDGPU:8:0:4") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX803; + else if (old_name == "AMD:AMDGPU:8:1:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX810; + else if (old_name == "AMD:AMDGPU:9:0:0" || old_name == "AMD:AMDGPU:9:0:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX900; + else if (old_name == "AMD:AMDGPU:9:0:2" || old_name == "AMD:AMDGPU:9:0:3") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX902; + else if (old_name == "AMD:AMDGPU:9:0:4" || old_name == "AMD:AMDGPU:9:0:5") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; + else if (old_name == "AMD:AMDGPU:9:0:6" || old_name == "AMD:AMDGPU:9:0:7") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + else if (old_name == "AMD:AMDGPU:9:0:12" || old_name == "AMD:AMDGPU:9:0:13") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; + else if 
(is_finalizer && old_name == "AMD:AMDGPU:10:0:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1000; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:1:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:1:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:1:2") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1031; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:2") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1032; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:3") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1033; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:4") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1034; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:5") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1035; + else if (is_finalizer && old_name == "AMD:AMDGPU:10:3:6") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1036; +#if defined(GFX40_BUILD) + else if (is_finalizer && old_name == "AMD:AMDGPU:40:0:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX4000; + else if (is_finalizer && old_name == "AMD:AMDGPU:40:1:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX4010; + else if (is_finalizer && old_name == "AMD:AMDGPU:40:2:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX4020; + else if (is_finalizer && old_name == "AMD:AMDGPU:40:3:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX4030; +#endif // GFX40_BUILD +#if defined(GFX11_BUILD) + else if (is_finalizer && old_name == "AMD:AMDGPU:11:0:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100; + else if (is_finalizer && old_name == "AMD:AMDGPU:11:0:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; + else if (is_finalizer && old_name == "AMD:AMDGPU:11:0:2") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; + else if (is_finalizer && old_name == 
"AMD:AMDGPU:11:0:3") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; + else if (is_finalizer && old_name == "AMD:AMDGPU:11:5:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150; + else if (is_finalizer && old_name == "AMD:AMDGPU:11:5:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; +#endif // GFX11_BUILD +#if defined(GFX12_BUILD) + else if (is_finalizer && old_name == "AMD:AMDGPU:12:0:0") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; + else if (is_finalizer && old_name == "AMD:AMDGPU:12:0:1") + mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; +#endif // GFX12_BUILD + else { + // Code object v2 is still supported by the finalizer for GFX10+, but + // NOT the lightning compiler. + return ""; + } + std::string name; + bool sramecc_supported = false; + bool xnack_supported = false; + if (!GetMachInfo(mach, name, sramecc_supported, xnack_supported)) + return ""; + + // Only "AMD:AMDGPU:9:0:6" and "AMD:AMDGPU:9:0:7" supports SRAMECC for + // code object V2, and it must be OFF. + if (sramecc_supported) + name += ":sramecc-"; + + if (is_finalizer) { + if (e_flags & ELF::EF_AMDGPU_FEATURE_XNACK_V2) + name += ":xnack+"; + else if (xnack_supported) + name += ":xnack-"; + } else { + if (old_name == "AMD:AMDGPU:8:0:1") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:8:1:0") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:9:0:1") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:9:0:3") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:9:0:5") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:9:0:7") + name += ":xnack+"; + else if (old_name == "AMD:AMDGPU:9:0:13") + name += ":xnack+"; + else if (xnack_supported) + name += ":xnack-"; + } + + return name; + } + + bool AmdHsaCode::GetIsa(std::string& isa_name, unsigned *genericVersion) + { + isa_name.clear(); + + uint32_t code_object_major_version = 0; + uint32_t code_object_minor_version = 0; + + // Generic versioning starts at 1, so zero means no generic version. 
+ if (genericVersion) + *genericVersion = 0; + + if (!GetCodeObjectVersion(&code_object_major_version, &code_object_minor_version)) { + return false; + } + if (code_object_major_version >= 3) { + + switch (img->EClass()) { + case ELFCLASS64: + // There is no e_machine and/or OS ABI for R600 so rely on checking + // the ELFCLASS to determin if AMDGCN verses R600. AMDHSA always uses + // ELFCLASS64 and R600 always uses ELFCLASS32. + isa_name += "amdgcn"; + break; + default: + return false; + } + + if (img->Machine() != ELF::EM_AMDGPU) + return false; + isa_name += "-amd-"; + + switch (img->OsAbi()) { + case ELF::ELFOSABI_AMDGPU_HSA: + isa_name += "amdhsa"; + break; + default: + // Only support AMDHSA in the ROCm runtime. + return false; + } + + isa_name += "--"; + + unsigned mach = img->EFlags() & ELF::EF_AMDGPU_MACH; + std::string name = ""; + bool xnack_supported = false; + bool sramecc_supported = false; + + if (!GetMachInfo(mach, name, sramecc_supported, xnack_supported)) + return false; + + isa_name += name; + + if (code_object_major_version == 3) { + if (img->EFlags() & ELF::EF_AMDGPU_FEATURE_SRAMECC_V3) + isa_name += ":sramecc+"; + else if (sramecc_supported) + isa_name += ":sramecc-"; + + if (img->EFlags() & ELF::EF_AMDGPU_FEATURE_XNACK_V3) + isa_name += ":xnack+"; + else if (xnack_supported) + isa_name += ":xnack-"; + } else if (code_object_major_version >= 4) { + switch (img->EFlags() & ELF::EF_AMDGPU_FEATURE_SRAMECC_V4) { + case ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4: + isa_name += ":sramecc-"; + break; + case ELF::EF_AMDGPU_FEATURE_SRAMECC_ON_V4: + isa_name += ":sramecc+"; + break; + } + + switch (img->EFlags() & ELF::EF_AMDGPU_FEATURE_XNACK_V4) { + case ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4: + isa_name += ":xnack-"; + break; + case ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4: + isa_name += ":xnack+"; + break; + } + + // Generic version is not part of the ISA name. + // Only parse it when the caller wants it. 
+ if (genericVersion && code_object_major_version >= 6) { + *genericVersion = (img->EFlags() & ELF::EF_AMDGPU_GENERIC_VERSION) >> ELF::EF_AMDGPU_GENERIC_VERSION_OFFSET; + } + } else { + return false; + } + + return true; + } else { + std::string vendor_name, architecture_name; + uint32_t major_version, minor_version, stepping; + if (!GetNoteIsa(vendor_name, architecture_name, &major_version, &minor_version, &stepping)) { return false; } + isa_name += vendor_name; + isa_name += ":"; + isa_name += architecture_name; + isa_name += ":"; + isa_name += std::to_string(major_version); + isa_name += ":"; + isa_name += std::to_string(minor_version); + isa_name += ":"; + isa_name += std::to_string(stepping); + + amdgpu_hsa_note_hsail_t *hsailNote; + bool IsFinalizer = GetAmdNote(NT_AMD_HSA_HSAIL, &hsailNote); + isa_name = ConvertOldTargetNameToNew(isa_name, IsFinalizer, img->EFlags()); + isa_name = "amdgcn-amd-amdhsa--" + isa_name; + return !isa_name.empty(); + } + } + + void AmdHsaCode::AddNoteProducer(uint32_t major, uint32_t minor, const std::string& producer) + { + size_t size = sizeof(amdgpu_hsa_note_producer_t) + producer.length(); + amdgpu_hsa_note_producer_t* desc = (amdgpu_hsa_note_producer_t*) _alloca(size); + memset(desc, 0, size); + desc->producer_name_size = producer.length(); + desc->producer_major_version = major; + desc->producer_minor_version = minor; + memcpy(desc->producer_name, producer.c_str(), producer.length() + 1); + AddAmdNote(NT_AMD_HSA_PRODUCER, desc, size); + } + + bool AmdHsaCode::GetNoteProducer(uint32_t* major, uint32_t* minor, std::string& producer_name) + { + amdgpu_hsa_note_producer_t* desc; + if (!GetAmdNote(NT_AMD_HSA_PRODUCER, &desc)) { return false; } + *major = desc->producer_major_version; + *minor = desc->producer_minor_version; + producer_name = GetNoteString(desc->producer_name_size, desc->producer_name); + return true; + } + + void AmdHsaCode::AddNoteProducerOptions(const std::string& options) + { + size_t size = 
sizeof(amdgpu_hsa_note_producer_options_t) + options.length(); + amdgpu_hsa_note_producer_options_t *desc = (amdgpu_hsa_note_producer_options_t*) _alloca(size); + desc->producer_options_size = options.length(); + memcpy(desc->producer_options, options.c_str(), options.length() + 1); + AddAmdNote(NT_AMD_HSA_PRODUCER_OPTIONS, desc, size); + } + + void AmdHsaCode::AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& user_options) + { + using namespace code_options; + std::ostringstream ss; + ss << + space << "-hsa_call_convention=" << call_convention << + control_directives(user_directives); + if (!user_options.empty()) { + ss << space << user_options; + } + + AddNoteProducerOptions(ss.str()); + } + + bool AmdHsaCode::GetNoteProducerOptions(std::string& options) + { + amdgpu_hsa_note_producer_options_t* desc; + if (!GetAmdNote(NT_AMD_HSA_PRODUCER_OPTIONS, &desc)) { return false; } + options = GetNoteString(desc->producer_options_size, desc->producer_options); + return true; + } + + hsa_status_t AmdHsaCode::GetInfo(hsa_code_object_info_t attribute, void *value) + { + assert(value); + switch (attribute) { + case HSA_CODE_OBJECT_INFO_VERSION: { + std::string version; + if (!GetNoteCodeObjectVersion(version)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + char *svalue = (char*)value; + memset(svalue, 0x0, 64); + memcpy(svalue, version.c_str(), (std::min)(size_t(63), version.length())); + break; + } + case HSA_CODE_OBJECT_INFO_ISA: { + // TODO: Currently returns string representation instead of hsa_isa_t + // which is unavailable here. 
+ std::string isa; + if (!GetIsa(isa)) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + char *svalue = (char*)value; + memset(svalue, 0x0, 64); + memcpy(svalue, isa.c_str(), (std::min)(size_t(63), isa.length())); + break; + } + case HSA_CODE_OBJECT_INFO_MACHINE_MODEL: + case HSA_CODE_OBJECT_INFO_PROFILE: + case HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: { + uint32_t hsail_major, hsail_minor; + hsa_profile_t profile; + hsa_machine_model_t machine_model; + hsa_default_float_rounding_mode_t default_float_round; + if (!GetNoteHsail(&hsail_major, &hsail_minor, &profile, &machine_model, &default_float_round)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + switch (attribute) { + case HSA_CODE_OBJECT_INFO_MACHINE_MODEL: + *((hsa_machine_model_t*)value) = machine_model; break; + case HSA_CODE_OBJECT_INFO_PROFILE: + *((hsa_profile_t*)value) = profile; break; + case HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE: + *((hsa_default_float_rounding_mode_t*)value) = default_float_round; break; + default: break; + } + break; + } + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; + } + + hsa_status_t AmdHsaCode::GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *s) + { + std::string mname = MangleSymbolName(module_name ? 
module_name : "", symbol_name); + for (Symbol* sym : symbols) { + if (sym->Name() == mname) { + *s = Symbol::ToHandle(sym); + return HSA_STATUS_SUCCESS; + } + } + return HSA_STATUS_ERROR_INVALID_SYMBOL_NAME; + } + + hsa_status_t AmdHsaCode::IterateSymbols(hsa_code_object_t code_object, + hsa_status_t (*callback)( + hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void* data), + void* data) + { + for (Symbol* sym : symbols) { + hsa_code_symbol_t s = Symbol::ToHandle(sym); + hsa_status_t status = callback(code_object, s, data); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; + } + + Section* AmdHsaCode::ImageInitSection() + { + if (!imageInit) { + imageInit = img->addSection( + ".hsaimage_imageinit", + SHT_PROGBITS, + SHF_MERGE, + sizeof(amdgpu_hsa_image_descriptor_t)); + } + return imageInit; + } + + void AmdHsaCode::AddImageInitializer(Symbol* image, uint64_t destOffset, const amdgpu_hsa_image_descriptor_t& desc) + { + uint64_t offset = ImageInitSection()->addData(&desc, sizeof(desc), 8); + amd::elf::Symbol* imageInit = + img->symtab()->addSymbol(ImageInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); + image->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_IMAGE, imageInit, image->elfSym()->value() + destOffset, 0); + } + + void AmdHsaCode::AddImageInitializer( + Symbol* image, uint64_t destOffset, + amdgpu_hsa_metadata_kind16_t kind, + amdgpu_hsa_image_geometry8_t geometry, + amdgpu_hsa_image_channel_order8_t channel_order, amdgpu_hsa_image_channel_type8_t channel_type, + uint64_t width, uint64_t height, uint64_t depth, uint64_t array) + { + amdgpu_hsa_image_descriptor_t desc; + desc.size = (uint16_t) sizeof(amdgpu_hsa_image_descriptor_t); + desc.kind = kind; + desc.geometry = geometry; + desc.channel_order = channel_order; + desc.channel_type = channel_type; + desc.width = width; + desc.height = height; + desc.depth = depth; + desc.array = array; + 
AddImageInitializer(image, destOffset, desc); + } + + + Section* AmdHsaCode::SamplerInitSection() + { + if (!samplerInit) { + samplerInit = img->addSection( + ".hsaimage_samplerinit", + SHT_PROGBITS, + SHF_MERGE, + sizeof(amdgpu_hsa_sampler_descriptor_t)); + } + return samplerInit; + } + + void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, const amdgpu_hsa_sampler_descriptor_t& desc) + { + uint64_t offset = SamplerInitSection()->addData(&desc, sizeof(desc), 8); + amd::elf::Symbol* samplerInit = + img->symtab()->addSymbol(SamplerInitSection(), "", offset, 0, STT_AMDGPU_HSA_METADATA, STB_LOCAL); + sampler->elfSym()->section()->relocationSection()->addRelocation(R_AMDGPU_INIT_SAMPLER, samplerInit, sampler->elfSym()->value() + destOffset, 0); + } + + void AmdHsaCode::AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, + amdgpu_hsa_sampler_coord8_t coord, + amdgpu_hsa_sampler_filter8_t filter, + amdgpu_hsa_sampler_addressing8_t addressing) + { + amdgpu_hsa_sampler_descriptor_t desc; + desc.size = (uint16_t) sizeof(amdgpu_hsa_sampler_descriptor_t); + desc.kind = AMDGPU_HSA_METADATA_KIND_INIT_SAMP; + desc.coord = coord; + desc.filter = filter; + desc.addressing = addressing; + AddSamplerInitializer(sampler, destOffset, desc); + } + + void AmdHsaCode::AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend) + { + uint32_t rtype = large ? 
R_AMDGPU_64 : R_AMDGPU_32_LOW; + dest->elfSym()->section()->relocationSection()->addRelocation(rtype, addrOf->elfSym(), dest->elfSym()->value() + destOffset, addrAddend); + } + + uint64_t AmdHsaCode::NextKernelCodeOffset() const + { + return HsaText()->nextDataOffset(256); + } + + bool AmdHsaCode::AddKernelCode(KernelSymbol* sym, const void* code, size_t size) + { + assert(nullptr != sym); + + uint64_t offset = HsaText()->addData(code, size, 256); + sym->setValue(offset); + sym->setSize(size); + return true; + } + + Section* AmdHsaCode::AddEmptySection() + { + dataSections.push_back(nullptr); return nullptr; + } + + Section* AmdHsaCode::AddCodeSection(Segment* segment) + { + if (nullptr == img) { return nullptr; } + Section *sec = img->addSection( + ".hsatext", + SHT_PROGBITS, + SHF_ALLOC | SHF_EXECINSTR | SHF_WRITE | SHF_AMDGPU_HSA_CODE | SHF_AMDGPU_HSA_AGENT, + 0, + segment); + dataSections.push_back(sec); + hsatext = sec; + return sec; + } + + Section* AmdHsaCode::AddDataSection(const std::string &name, + uint32_t type, + uint64_t flags, + Segment* segment) + { + if (nullptr == img) { return nullptr; } + Section *sec = img->addSection(name, type, flags, 0, segment); + dataSections.push_back(sec); + return sec; + } + + void AmdHsaCode::InitHsaSectionSegment(amdgpu_hsa_elf_section_t section, bool combineSegments) + { + InitHsaSegment(AmdHsaElfSectionSegment(section), combineSegments || !IsAmdHsaElfSectionROData(section)); + } + + Section* AmdHsaCode::HsaDataSection(amdgpu_hsa_elf_section_t sec, bool combineSegments) + { + if (!hsaSections[sec]) { + bool writable = combineSegments || !IsAmdHsaElfSectionROData(sec); + Segment* segment = HsaSegment(AmdHsaElfSectionSegment(sec), writable); + assert(segment); // Expected to be init the segment via InitHsaSegment. 
+ Section* section; + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + section = AddDataSection(".hsarodata_global_program", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + section = AddDataSection(".hsarodata_global_agent", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_RODATA_READONLY_AGENT: + section = AddDataSection(".hsarodata_readonly_agent", SHT_PROGBITS, SHF_ALLOC | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_DATA_GLOBAL_PROGRAM: + section = AddDataSection(".hsadata_global_program", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_DATA_GLOBAL_AGENT: + section = AddDataSection(".hsadata_global_agent", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_DATA_READONLY_AGENT: + section = AddDataSection(".hsadata_readonly_agent", SHT_PROGBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_BSS_GLOBAL_PROGRAM: + section = AddDataSection(".hsabss_global_program", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL, segment); break; + case AMDGPU_HSA_BSS_GLOBAL_AGENT: + section = AddDataSection(".hsabss_global_agent", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_GLOBAL | SHF_AMDGPU_HSA_AGENT, segment); break; + case AMDGPU_HSA_BSS_READONLY_AGENT: + section = AddDataSection(".hsabss_readonly_agent", SHT_NOBITS, SHF_ALLOC | SHF_WRITE | SHF_AMDGPU_HSA_READONLY | SHF_AMDGPU_HSA_AGENT, segment); break; + default: + assert(false); return 0; + } + hsaSections[sec] = section; + } + return hsaSections[sec]; + } + + void AmdHsaCode::InitHsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable) + { + if (!hsaSegments[segment][writable]) { + uint32_t flags = PF_R; + if (writable) { flags |= PF_W; } + if (segment == 
AMDGPU_HSA_SEGMENT_CODE_AGENT) { flags |= PF_X; } + uint32_t type = PT_LOOS + segment; + assert(segment < AMDGPU_HSA_SEGMENT_LAST); + hsaSegments[segment][writable] = img->initSegment(type, flags); + } + } + + bool AmdHsaCode::AddHsaSegments() + { + if (!img->addSegments()) { return ElfImageError(); } + return true; + } + + Segment* AmdHsaCode::HsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable) + { + return hsaSegments[segment][writable]; + } + + Symbol* AmdHsaCode::AddExecutableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section) + { + if (nullptr == img) { return nullptr; } + if (!section) { section = HsaText(); } + symbols.push_back(new KernelSymbol(img->symtab()->addSymbol(section, name, 0, 0, type, binding, other), nullptr)); + return symbols.back(); + } + + Symbol* AmdHsaCode::AddVariableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section, + uint64_t value, + uint64_t size) + { + if (nullptr == img) { return nullptr; } + symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(section, name, value, size, type, binding, other))); + return symbols.back(); + } + + void AmdHsaCode::AddSectionSymbols() + { + if (nullptr == img) { return; } + for (size_t i = 0; i < dataSections.size(); ++i) { + if (dataSections[i] && dataSections[i]->flags() & SHF_ALLOC) { + symbols.push_back(new VariableSymbol(img->symtab()->addSymbol(dataSections[i], "__hsa_section" + dataSections[i]->Name(), 0, 0, STT_SECTION, STB_LOCAL))); + } + } + } + + Symbol* AmdHsaCode::GetSymbolByElfIndex(size_t index) + { + for (auto &s : symbols) { + if (s && index == s->Index()) { + return s; + } + } + return nullptr; + } + + Symbol* AmdHsaCode::FindSymbol(const std::string &n) + { + for (auto &s : symbols) { + if (s && n == s->Name()) { + return s; + } + } + return nullptr; + } + + void AmdHsaCode::AddData(amdgpu_hsa_elf_section_t s, const void* data, 
size_t size) + { +// getDataSection(s)->addData(data, size); + } + + Section* AmdHsaCode::DebugInfo() + { + if (!debugInfo) { + debugInfo = img->addSection(".debug_info", SHT_PROGBITS); + } + return debugInfo; + } + + Section* AmdHsaCode::DebugLine() + { + if (!debugLine) { + debugLine = img->addSection(".debug_line", SHT_PROGBITS); + } + return debugLine; + } + + Section* AmdHsaCode::DebugAbbrev() + { + if (!debugAbbrev) { + debugAbbrev = img->addSection(".debug_abbrev", SHT_PROGBITS); + } + return debugAbbrev; + } + + Section* AmdHsaCode::AddHsaHlDebug(const std::string& name, const void* data, size_t size) + { + Section* section = img->addSection(name, SHT_PROGBITS, SHF_OS_NONCONFORMING); + section->addData(data, size, 1); + return section; + } + + bool AmdHsaCode::PrintToFile(const std::string& filename) + { + std::ofstream out(filename); + if (out.fail()) { return false; } + Print(out); + return out.fail(); + } + + void AmdHsaCode::Print(std::ostream& out) + { + PrintNotes(out); + out << std::endl; + PrintSegments(out); + out << std::endl; + PrintSections(out); + out << std::endl; + PrintSymbols(out); + out << std::endl; + PrintMachineCode(out); + out << std::endl; + out << "AMD HSA Code Object End" << std::endl; + } + + void AmdHsaCode::PrintNotes(std::ostream& out) + { + { + uint32_t major_version, minor_version; + if (GetCodeObjectVersion(&major_version, &minor_version)) { + out << "AMD HSA Code Object" << std::endl + << " Version " << major_version << "." << minor_version << std::endl; + } + } + { + uint32_t hsail_major, hsail_minor; + hsa_profile_t profile; + hsa_machine_model_t machine_model; + hsa_default_float_rounding_mode_t rounding_mode; + if (GetNoteHsail(&hsail_major, &hsail_minor, &profile, &machine_model, &rounding_mode)) { + out << "HSAIL " << std::endl + << " Version: " << hsail_major << "." 
<< hsail_minor << std::endl + << " Profile: " << HsaProfileToString(profile) + << " Machine model: " << HsaMachineModelToString(machine_model) + << " Default float rounding: " << HsaFloatRoundingModeToString(rounding_mode) << std::endl; + } + } + { + std::string vendor_name, architecture_name; + uint32_t major_version, minor_version, stepping; + if (GetNoteIsa(vendor_name, architecture_name, &major_version, &minor_version, &stepping)) { + out << "ISA" << std::endl + << " Vendor " << vendor_name + << " Arch " << architecture_name + << " Version " << major_version << ":" << minor_version << ":" << stepping << std::endl; + } + } + { + std::string producer_name, producer_options; + uint32_t major, minor; + if (GetNoteProducer(&major, &minor, producer_name)) { + out << "Producer '" << producer_name << "' " << "Version " << major << ":" << minor << std::endl; + } + } + { + std::string producer_options; + if (GetNoteProducerOptions(producer_options)) { + out << "Producer options" << std::endl + << " '" << producer_options << "'" << std::endl; + } + } + } + + void AmdHsaCode::PrintSegments(std::ostream& out) + { + out << "Segments (total " << DataSegmentCount() << "):" << std::endl; + for (size_t i = 0; i < DataSegmentCount(); ++i) { + PrintSegment(out, DataSegment(i)); + } + } + + void AmdHsaCode::PrintSections(std::ostream& out) + { + out << "Data Sections (total " << DataSectionCount() << "):" << std::endl; + for (size_t i = 0; i < DataSectionCount(); ++i) { + PrintSection(out, DataSection(i)); + } + out << std::endl; + out << "Relocation Sections (total " << RelocationSectionCount() << "):" << std::endl; + for (size_t i = 0; i < RelocationSectionCount(); ++i) { + PrintSection(out, GetRelocationSection(i)); + } + } + + void AmdHsaCode::PrintSymbols(std::ostream& out) + { + out << "Symbols (total " << SymbolCount() << "):" << std::endl; + for (size_t i = 0; i < SymbolCount(); ++i) { + PrintSymbol(out, GetSymbol(i)); + } + } + + void 
AmdHsaCode::PrintMachineCode(std::ostream& out) + { + if (HasHsaText()) { + out << std::dec; + for (size_t i = 0; i < SymbolCount(); ++i) { + Symbol* sym = GetSymbol(i); + if (sym->IsKernelSymbol() && sym->IsDefinition()) { + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + out << "AMD Kernel Code for " << sym->Name() << ": " << std::endl << std::dec; + PrintAmdKernelCode(out, &kernel_code); + out << std::endl; + } + } + + std::vector isa(HsaText()->size(), 0); + HsaText()->getData(0, isa.data(), HsaText()->size()); + + out << "Disassembly:" << std::endl; + PrintDisassembly(out, isa.data(), HsaText()->size(), 0); + out << std::endl << std::dec; + } else { + out << "Machine code section is not present" << std::endl << std::endl; + } + } + + void AmdHsaCode::PrintSegment(std::ostream& out, Segment* segment) + { + out << " Segment (" << segment->getSegmentIndex() << ")" << std::endl; + out << " Type: " << AmdPTLoadToString(segment->type()) + << " " + << " Flags: " << "0x" << std::hex << std::setw(8) << std::setfill('0') << segment->flags() << std::dec + << std::endl + << " Image Size: " << segment->imageSize() + << " " + << " Memory Size: " << segment->memSize() + << " " + << " Align: " << segment->align() + << " " + << " VAddr: " << segment->vaddr() + << std::endl; + out << std::dec; + } + + void AmdHsaCode::PrintSection(std::ostream& out, Section* section) + { + out << " Section " << section->Name() << " (Index " << section->getSectionIndex() << ")" << std::endl; + out << " Type: " << section->type() + << " " + << " Flags: " << "0x" << std::hex << std::setw(8) << std::setfill('0') << section->flags() << std::dec + << std::endl + << " Size: " << section->size() + << " " + << " Address: " << section->addr() + << " " + << " Align: " << section->addralign() + << std::endl; + out << std::dec; + + if (section->flags() & SHF_AMDGPU_HSA_CODE) { + // Printed separately. 
+        return;
+      }
+
+      switch (section->type()) {
+      case SHT_NOBITS:
+        return;
+      case SHT_RELA:
+        PrintRelocationData(out, section->asRelocationSection());
+        return;
+      default:
+        PrintRawData(out, section);
+      }
+    }
+
+    // Copies the full contents of `section` into a temporary buffer and
+    // prints them as a hex dump under a " Data:" heading.
+    void AmdHsaCode::PrintRawData(std::ostream& out, Section* section)
+    {
+      out << " Data:" << std::endl;
+      // Heap-backed buffer: a section can be arbitrarily large, and alloca()
+      // gives no indication when the request does not fit on the stack.
+      std::vector<unsigned char> sdata(section->size());
+      section->getData(0, sdata.data(), sdata.size());
+      PrintRawData(out, sdata.data(), sdata.size());
+    }
+
+    // Hex-dumps `size` bytes starting at `data`, 16 bytes per row. Each row
+    // shows the row offset in hex, the bytes as two-digit hex values grouped
+    // in pairs (positions past the end of the data print as 00), and then the
+    // bytes as characters, with anything outside 32..126 shown as '.'.
+    // The stream is left in decimal, left-aligned mode.
+    void AmdHsaCode::PrintRawData(std::ostream& out, const unsigned char *data, size_t size)
+    {
+      out << std::hex << std::right << std::setfill('0');
+      for (size_t i = 0; i < size; i += 16) {
+        out << " " << std::setw(7) << i << ":";
+
+        for (size_t j = 0; j < 16; j += 1) {
+          uint32_t value = i + j < size ? (uint32_t)data[i + j] : 0;
+          if (j % 2 == 0) { out << ' '; }
+          out << std::setw(2) << value;
+        }
+        out << " ";
+
+        for (size_t j = 0; i + j < size && j < 16; j += 1) {
+          char value = (char)data[i + j] >= 32 && (char)data[i + j] <= 126 ? (char)data[i + j] : '.';
+          out << value;
+        }
+        out << std::endl;
+      }
+      out << std::dec << std::left;
+    }
+
+    // Prints a heading followed by one entry per relocation in `section`.
+    void AmdHsaCode::PrintRelocationData(std::ostream& out, RelocationSection* section)
+    {
+      if (section->targetSection()) {
+        out << " Relocation Entries for " << section->targetSection()->Name() << " Section (total " << section->relocationCount() << "):" << std::endl;
+      } else {
+        // Dynamic relocations do not have a target section, they work with
+        // virtual addresses.
+ out << " Dynamic Relocation Entries (total " << section->relocationCount() << "):" << std::endl; + } + for (size_t i = 0; i < section->relocationCount(); ++i) { + out << " Relocation (Index " << i << "):" << std::endl; + out << " Type: " << section->relocation(i)->type() << std::endl; + out << " Symbol: " << section->relocation(i)->symbol()->name() << std::endl; + out << " Offset: " << section->relocation(i)->offset() << " Addend: " << section->relocation(i)->addend() << std::endl; + } + out << std::dec; + } + + void AmdHsaCode::PrintSymbol(std::ostream& out, Symbol* sym) + { + out << " Symbol " << sym->Name() << " (Index " << sym->Index() << "):" << std::endl; + if (sym->IsKernelSymbol() || sym->IsVariableSymbol()) { + out << " Section: " << sym->GetSection()->Name() << " "; + out << " Section Offset: " << sym->SectionOffset() << std::endl; + out << " VAddr: " << sym->VAddr() << " "; + out << " Size: " << sym->Size() << " "; + out << " Alignment: " << sym->Alignment() << std::endl; + out << " Kind: " << HsaSymbolKindToString(sym->Kind()) << " "; + out << " Linkage: " << HsaSymbolLinkageToString(sym->Linkage()) << " "; + out << " Definition: " << (sym->IsDefinition() ? "TRUE" : "FALSE") << std::endl; + } + if (sym->IsVariableSymbol()) { + out << " Allocation: " << HsaVariableAllocationToString(sym->Allocation()) << " "; + out << " Segment: " << HsaVariableSegmentToString(sym->Segment()) << " "; + out << " Constant: " << (sym->IsConst() ? 
"TRUE" : "FALSE") << std::endl; + } + out << std::dec; + } + + void AmdHsaCode::PrintMachineCode(std::ostream& out, KernelSymbol* sym) + { + assert(HsaText()); + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + + out << "AMD Kernel Code for " << sym->Name() << ": " << std::endl << std::dec; + PrintAmdKernelCode(out, &kernel_code); + out << std::endl; + + uint64_t kernel_offset = sym->SectionOffset() + kernel_code.kernel_code_entry_byte_offset; + uint64_t kernel_size = sym->Size(); + uint64_t isa_chunk_size = kernel_offset + kernel_size; + std::vector isa_chunk(isa_chunk_size, 0); + HsaText()->getData(0, isa_chunk.data(), isa_chunk_size); + + out << "Disassembly for " << sym->Name() << ": " << std::endl; + PrintDisassembly(out, isa_chunk.data(), isa_chunk_size, kernel_offset); + out << std::endl << std::dec; + } + + void AmdHsaCode::PrintDisassembly(std::ostream& out, const unsigned char *isa, size_t size, uint32_t isa_offset) + { +#if !defined(NO_SI_SP3) + // Default asic is ci. 
+ std::string asic = "CI"; + std::string vendor_name, architecture_name; + uint32_t major_version, minor_version, stepping; + if (GetNoteIsa(vendor_name, architecture_name, &major_version, &minor_version, &stepping)) { + if (major_version == 7) { + asic = "CI"; + } else if (major_version == 8) { + asic = "VI"; + } else if (major_version == 9) { + asic = "GFX9"; + } + else if (major_version == 10) { + if (minor_version == 1) { + asic = "GFX10"; +#if defined(GFX102_BUILD) + } else if (minor_version == 2) { + asic = "GFX10_B"; +#endif + } else if (minor_version == 3) { + asic = "GFX10_3"; + } else if (minor_version == 4) { + asic = "GFX10_4"; + } else { + asic = "GFX10_A"; + } + } + else if (major_version == 40) { + if (minor_version == 1) { + asic = "MGFX1"; + } else if ( minor_version == 2 ) { + asic = "MGFX2"; + } else if ( minor_version == 3 ) { + asic = "MGFX3"; + } else { + asic = "GFX10_4"; + } + } +#if defined(GFX11_BUILD) + else if (major_version == 11) { + asic = "GFX11"; + } +#endif // GFX11_BUILD +#if defined(GFX12_BUILD) + else if (major_version == 12) { + asic = "GFX12"; + } +#endif // GFX12_BUILD + else { + assert(!"unknown compute capability"); + } + } + + struct sp3_context *dis_state = sp3_new(); + sp3_setasic(dis_state, asic.c_str()); + + sp3_vma *dis_vma = sp3_vm_new_ptr(0, size / 4, (const uint32_t*)isa); + + std::vector comments(HsaText()->size() / 4, 0); + for (size_t i = 0; i < SymbolCount(); ++i) { + Symbol* sym = GetSymbol(i); + if (sym->IsKernelSymbol() && sym->IsDefinition()) { + comments[sym->SectionOffset() / 4] = COMMENT_AMD_KERNEL_CODE_T_BEGIN; + comments[(sym->SectionOffset() + 252) / 4] = COMMENT_AMD_KERNEL_CODE_T_END; + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + comments[(kernel_code.kernel_code_entry_byte_offset + sym->SectionOffset()) / 4] = COMMENT_KERNEL_ISA_BEGIN; + } + } + sp3_vma *comment_vma = sp3_vm_new_ptr(0, comments.size(), (const 
uint32_t*)comments.data()); + sp3_setcomments(dis_state, comment_vma, CommentTopCallBack, CommentRightCallBack, this); + + // When isa_offset == 0 disassembly full hsatext section. + // Otherwise disassembly only from this offset till endpgm instruction. + char *text = sp3_disasm( + dis_state, + dis_vma, + isa_offset / 4, + nullptr, + SP3_SHTYPE_CS, + nullptr, + (unsigned)(size / 4), + SP3DIS_FORCEVALID | SP3DIS_COMMENTS); + + enum class IsaState { + UNKNOWN, + AMD_KERNEL_CODE_T_BEGIN, + AMD_KERNEL_CODE_T, + AMD_KERNEL_CODE_T_END, + ISA_BEGIN, + ISA, + PADDING, + }; + + std::string line; + char *text_ptr = text; + IsaState state = IsaState::UNKNOWN; + + uint32_t offset = 0; + uint32_t padding_end = 0; + std::string padding; + + while (text_ptr && text_ptr[0] != '\0') { + line.clear(); + while (text_ptr[0] != '\0' && text_ptr[0] != '\n') { + line.push_back(text_ptr[0]); + ++text_ptr; + } + ltrim(line); + if (text_ptr[0] == '\n') { + ++text_ptr; + } + switch (state) { + case IsaState::UNKNOWN: + assert(line != "// amd_kernel_code_t end"); + padding.clear(); + if (line == "// amd_kernel_code_t begin") { + state = IsaState::AMD_KERNEL_CODE_T_BEGIN; + } else if (line == "// isa begin") { + state = IsaState::ISA_BEGIN; + } else if (line == "end") { + out << line << std::endl; + } else if (line.find("v_cndmask_b32 v0, s0, v0, vcc") != std::string::npos) { + padding += " " + line + "\n"; + offset = ParseInstructionOffset(line); + padding_end = ParseInstructionOffset(line); + state = IsaState::PADDING; + } else if (line != "shader (null)") { + out << " " << line << std::endl; + } + break; + + case IsaState::AMD_KERNEL_CODE_T_BEGIN: + assert(line != "// amd_kernel_code_t begin"); + assert(line != "// amd_kernel_code_t end"); + assert(line != "// isa begin"); + assert(line != "end"); + padding.clear(); + offset = ParseInstructionOffset(line); + state = IsaState::AMD_KERNEL_CODE_T; + break; + + case IsaState::AMD_KERNEL_CODE_T: + assert(line != "// amd_kernel_code_t begin"); + 
assert(line != "// isa begin"); + assert(line != "end"); + assert(padding.empty()); + if (line == "// amd_kernel_code_t end") { + state = IsaState::AMD_KERNEL_CODE_T_END; + } + break; + + case IsaState::AMD_KERNEL_CODE_T_END: + assert(line != "// amd_kernel_code_t begin"); + assert(line != "// amd_kernel_code_t end"); + assert(line != "// isa begin"); + assert(line != "end"); + assert(padding.empty()); + for (size_t i = 0; i < SymbolCount(); ++i) { + Symbol* sym = GetSymbol(i); + if (sym->IsKernelSymbol() && sym->IsDefinition() && sym->SectionOffset() == offset) { + std::ostream::fmtflags flags = out.flags(); + char fill = out.fill(); + out << " //" << std::endl; + out << " // amd_kernel_code_t for " << sym->Name() + << " (" << std::hex << std::setw(12) << std::setfill('0') << std::right << offset + << " - " << std::setw(12) << (offset + 256) << ')' << std::endl; + out << " //" << std::endl; + out << std::setfill(fill); + out.flags(flags); + break; + } + } + state = IsaState::UNKNOWN; + break; + + case IsaState::ISA_BEGIN: + assert(line != "// amd_kernel_code_t begin"); + assert(line != "// amd_kernel_code_t end"); + assert(line != "// isa begin"); + padding.clear(); + offset = ParseInstructionOffset(line); + for (size_t i = 0; i < SymbolCount(); ++i) { + Symbol* sym = GetSymbol(i); + if (sym->IsKernelSymbol() && sym->IsDefinition()) { + amd_kernel_code_t kernel_code; + HsaText()->getData(sym->SectionOffset(), &kernel_code, sizeof(amd_kernel_code_t)); + if ((sym->SectionOffset() + kernel_code.kernel_code_entry_byte_offset) == offset) { + out << " //" << std::endl; + out << " // " << sym->Name() << ':' << std::endl; + out << " //" << std::endl; + break; + } + } + } + if (line == "end") { + out << line << std::endl; + state = IsaState::UNKNOWN; + } else { + out << " " << line << std::endl; + state = IsaState::ISA; + } + break; + + case IsaState::ISA: + assert(line != "// amd_kernel_code_t end"); + if (!padding.empty()) { + out << padding; + out.flush(); + 
padding.clear(); + } + if (line == "// amd_kernel_code_t begin") { + state = IsaState::AMD_KERNEL_CODE_T_BEGIN; + } else if (line == "// isa begin") { + state = IsaState::ISA_BEGIN; + } else if (line == "end") { + out << line << std::endl; + state = IsaState::UNKNOWN; + } else if (line.find("v_cndmask_b32 v0, s0, v0, vcc") != std::string::npos) { + padding += " " + line + "\n"; + offset = ParseInstructionOffset(line); + padding_end = offset; + state = IsaState::PADDING; + } else { + out << " " << line << std::endl; + } + break; + + case IsaState::PADDING: + assert(line != "// amd_kernel_code_t end"); + if (line.find("v_cndmask_b32 v0, s0, v0, vcc") != std::string::npos) { + padding += " " + line + "\n"; + padding_end = ParseInstructionOffset(line); + } else if (line == "// amd_kernel_code_t begin" || line == "// isa begin" || line == "end") { + padding.clear(); + std::ostream::fmtflags flags = out.flags(); + char fill = out.fill(); + out << " //" << std::endl; + out << " // padding (" + << std::hex << std::setw(12) << std::setfill('0') << std::right << offset + << " - " << std::setw(12) << (padding_end + 4) << ')' << std::endl; + out << " //" << std::endl; + out << std::setfill(fill); + out.flags(flags); + if (line == "// amd_kernel_code_t begin") { + state = IsaState::AMD_KERNEL_CODE_T_BEGIN; + } else if (line == "// isa begin") { + state = IsaState::ISA_BEGIN; + } else if (line == "end") { + out << line << std::endl; + state = IsaState::UNKNOWN; + } + } else { + padding += " " + line + "\n"; + state = IsaState::ISA; + } + break; + + default: + assert(false); + break; + } + } + + sp3_free(text); + sp3_close(dis_state); + sp3_vm_free(dis_vma); + sp3_vm_free(comment_vma); +#else + PrintRawData(out, isa, size); +#endif // #if !defined(NO_SI_SP3) + out << std::dec; + } + + std::string AmdHsaCode::MangleSymbolName(const std::string& module_name, const std::string symbol_name) + { + if (module_name.empty()) { + return symbol_name; + } else { + return module_name + "::" 
+ symbol_name; + } + } + + bool AmdHsaCode::ElfImageError() + { + out << img->output(); + return false; + } + + AmdHsaCode* AmdHsaCodeManager::FromHandle(hsa_code_object_t c) + { + CodeMap::iterator i = codeMap.find(c.handle); + if (i == codeMap.end()) { + AmdHsaCode* code = new AmdHsaCode(); + const void* buffer = reinterpret_cast(c.handle); + if (!code->InitAsBuffer(buffer, 0)) { + delete code; + return 0; + } + codeMap[c.handle] = code; + return code; + } + return i->second; + } + + bool AmdHsaCodeManager::Destroy(hsa_code_object_t c) + { + CodeMap::iterator i = codeMap.find(c.handle); + if (i == codeMap.end()) { + // Currently, we do not always create map entry for every code object buffer. + return true; + } + delete i->second; + codeMap.erase(i); + return true; + } + + bool AmdHsaCode::PullElfV2() + { + for (size_t i = 0; i < img->segmentCount(); ++i) { + Segment* s = img->segment(i); + if (s->type() == PT_LOAD) { + dataSegments.push_back(s); + } + } + for (size_t i = 0; i < img->sectionCount(); ++i) { + Section* sec = img->section(i); + if (!sec) { continue; } + if ((sec->type() == SHT_PROGBITS || sec->type() == SHT_NOBITS) && + !(sec->flags() & SHF_EXECINSTR)) { + dataSections.push_back(sec); + } else if (sec->type() == SHT_RELA) { + relocationSections.push_back(sec->asRelocationSection()); + } + if (sec->Name() == ".text") { + hsatext = sec; + } + } + for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + Symbol* sym = 0; + switch (elfsym->type()) { + case STT_AMDGPU_HSA_KERNEL: { + amd::elf::Section* sec = elfsym->section(); + amd_kernel_code_t akc; + if (!sec) { + out << "Failed to find section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!(sec->flags() & (SHF_ALLOC | SHF_EXECINSTR))) { + out << "Invalid code section for symbol " << elfsym->name() << std::endl; + return false; + } + if (!sec->getData(elfsym->value() - sec->addr(), &akc, sizeof(amd_kernel_code_t))) { + 
out << "Failed to get AMD Kernel Code for symbol " << elfsym->name() << std::endl; + return false; + } + sym = new KernelSymbolV2(elfsym, &akc); + break; + } + case STT_OBJECT: + case STT_FUNC: + case STT_COMMON: + sym = new VariableSymbolV2(elfsym); + break; + default: + break; // Skip unknown symbols. + } + if (sym) { symbols.push_back(sym); } + } + + return true; + } + + KernelSymbolV2::KernelSymbolV2(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc) : + KernelSymbol(elfsym_, akc) { } +} +} +} diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.cpp new file mode 100644 index 0000000000..23245ba990 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.cpp @@ -0,0 +1,1048 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "amd_hsa_code_util.hpp" +#include "libelf.h" +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#include +#include +#else // _WIN32 +#include +#include +#include +#include +#endif // _WIN32 +#include "Brig.h" + +namespace { +auto eq = " = "; + +std::ostream& attr1(std::ostream& out) +{ + out << " " << std::left << std::setw(60) << std::setfill(' '); + return out; +} + +std::ostream& attr2(std::ostream& out) +{ + out << " " << std::left << std::setw(58) << std::setfill(' '); + return out; +} +} // namespace anonymous + +namespace amd { +namespace hsa { +namespace common { + +bool IsAccessibleMemoryAddress(uint64_t address) +{ + if (0 == address) { + return false; + } +#if defined(_WIN32) || defined(_WIN64) + MEMORY_BASIC_INFORMATION memory_info; + if (!VirtualQuery(reinterpret_cast(address), &memory_info, sizeof(memory_info))) { + return false; + } + int32_t is_accessible = ((memory_info.Protect & PAGE_READONLY) || + 
(memory_info.Protect & PAGE_READWRITE) || + (memory_info.Protect & PAGE_WRITECOPY) || + (memory_info.Protect & PAGE_EXECUTE_READ) || + (memory_info.Protect & PAGE_EXECUTE_READWRITE) || + (memory_info.Protect & PAGE_EXECUTE_WRITECOPY)); + if (memory_info.Protect & PAGE_GUARD) { + is_accessible = 0; + } + if (memory_info.Protect & PAGE_NOACCESS) { + is_accessible = 0; + } + return is_accessible > 0; +#else + int32_t random_fd = 0; + ssize_t bytes_written = 0; + if (-1 == (random_fd = open("/dev/random", O_WRONLY))) { + return false; + } + bytes_written = write(random_fd, (void*)address, 1); + if (-1 == close(random_fd)) { + return false; + } + return bytes_written == 1; +#endif // _WIN32 || _WIN64 +} + +} + +std::string HsaSymbolKindToString(hsa_symbol_kind_t kind) +{ + switch (kind) { + case HSA_SYMBOL_KIND_VARIABLE: return "VARIABLE"; + case HSA_SYMBOL_KIND_INDIRECT_FUNCTION: return "INDIRECT_FUNCTION"; + case HSA_SYMBOL_KIND_KERNEL: return "KERNEL"; + default: return "UNKNOWN"; + } +} + +std::string HsaSymbolLinkageToString(hsa_symbol_linkage_t linkage) +{ + switch (linkage) { + case HSA_SYMBOL_LINKAGE_MODULE: return "MODULE"; + case HSA_SYMBOL_LINKAGE_PROGRAM: return "PROGRAM"; + default: return "UNKNOWN"; + } +} + +std::string HsaVariableAllocationToString(hsa_variable_allocation_t allocation) +{ + switch (allocation) { + case HSA_VARIABLE_ALLOCATION_AGENT: return "AGENT"; + case HSA_VARIABLE_ALLOCATION_PROGRAM: return "PROGRAM"; + default: return "UNKNOWN"; + } +} + +std::string HsaVariableSegmentToString(hsa_variable_segment_t segment) +{ + switch (segment) { + case HSA_VARIABLE_SEGMENT_GLOBAL: return "GLOBAL"; + case HSA_VARIABLE_SEGMENT_READONLY: return "READONLY"; + default: return "UNKNOWN"; + } +} + +std::string HsaProfileToString(hsa_profile_t profile) +{ + switch (profile) { + case HSA_PROFILE_BASE: return "BASE"; + case HSA_PROFILE_FULL: return "FULL"; + default: return "UNKNOWN"; + } +} + +std::string HsaMachineModelToString(hsa_machine_model_t model) 
+{ + switch (model) { + case HSA_MACHINE_MODEL_SMALL: return "SMALL"; + case HSA_MACHINE_MODEL_LARGE: return "LARGE"; + default: return "UNKNOWN"; + } +} + +std::string HsaFloatRoundingModeToString(hsa_default_float_rounding_mode_t mode) +{ + switch (mode) { + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT: return "DEFAULT"; + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO: return "ZERO"; + case HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR: return "NEAR"; + default: return "UNKNOWN"; + } +} + +std::string AmdMachineKindToString(amd_machine_kind16_t machine) +{ + switch (machine) { + case AMD_MACHINE_KIND_UNDEFINED: return "UNDEFINED"; + case AMD_MACHINE_KIND_AMDGPU: return "AMDGPU"; + default: return "UNKNOWN"; + } +} + +std::string AmdFloatRoundModeToString(amd_float_round_mode_t round_mode) +{ + switch (round_mode) { + case AMD_FLOAT_ROUND_MODE_NEAREST_EVEN: return "NEAREST_EVEN"; + case AMD_FLOAT_ROUND_MODE_PLUS_INFINITY: return "PLUS_INFINITY"; + case AMD_FLOAT_ROUND_MODE_MINUS_INFINITY: return "MINUS_INFINITY"; + case AMD_FLOAT_ROUND_MODE_ZERO: return "ZERO"; + default: return "UNKNOWN"; + } +} + +std::string AmdFloatDenormModeToString(amd_float_denorm_mode_t denorm_mode) +{ + switch (denorm_mode) { + case AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT: return "FLUSH_SOURCE_OUTPUT"; + case AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT: return "FLUSH_OUTPUT"; + case AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE: return "FLUSH_SOURCE"; + case AMD_FLOAT_DENORM_MODE_NO_FLUSH: return "FLUSH_NONE"; + default: return "UNKNOWN"; + } +} + +std::string AmdSystemVgprWorkitemIdToString(amd_system_vgpr_workitem_id_t system_vgpr_workitem_id) +{ + switch (system_vgpr_workitem_id) { + case AMD_SYSTEM_VGPR_WORKITEM_ID_X: return "X"; + case AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y: return "X, Y"; + case AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z: return "X, Y, Z"; + default: return "UNKNOWN"; + } +} + +std::string AmdElementByteSizeToString(amd_element_byte_size_t element_byte_size) +{ + switch (element_byte_size) { + case 
AMD_ELEMENT_BYTE_SIZE_2: return "WORD (2 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_4: return "DWORD (4 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_8: return "QWORD (8 bytes)"; + case AMD_ELEMENT_BYTE_SIZE_16: return "16 bytes"; + default: return "UNKNOWN"; + } +} + +std::string AmdExceptionKindToString(amd_exception_kind16_t exceptions) +{ + std::string e; + if (exceptions & AMD_EXCEPTION_KIND_INVALID_OPERATION) { + e += ", INVALID_OPERATON"; + exceptions &= ~AMD_EXCEPTION_KIND_INVALID_OPERATION; + } + if (exceptions & AMD_EXCEPTION_KIND_DIVISION_BY_ZERO) { + e += ", DIVISION_BY_ZERO"; + exceptions &= ~AMD_EXCEPTION_KIND_DIVISION_BY_ZERO; + } + if (exceptions & AMD_EXCEPTION_KIND_OVERFLOW) { + e += ", OVERFLOW"; + exceptions &= ~AMD_EXCEPTION_KIND_OVERFLOW; + } + if (exceptions & AMD_EXCEPTION_KIND_UNDERFLOW) { + e += ", UNDERFLOW"; + exceptions &= ~AMD_EXCEPTION_KIND_UNDERFLOW; + } + if (exceptions & AMD_EXCEPTION_KIND_INEXACT) { + e += ", INEXACT"; + exceptions &= ~AMD_EXCEPTION_KIND_INEXACT; + } + if (exceptions) { + e += ", UNKNOWN"; + } + if (!e.empty()) { + e = "[" + e.erase(0, 2) + "]"; + } + return e; +} + +std::string AmdPowerTwoToString(amd_powertwo8_t p) +{ + return std::to_string(1 << (unsigned) p); +} + +amdgpu_hsa_elf_segment_t AmdHsaElfSectionSegment(amdgpu_hsa_elf_section_t sec) +{ + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + case AMDGPU_HSA_DATA_GLOBAL_PROGRAM: + case AMDGPU_HSA_BSS_GLOBAL_PROGRAM: + return AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM; + case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + case AMDGPU_HSA_DATA_GLOBAL_AGENT: + case AMDGPU_HSA_BSS_GLOBAL_AGENT: + return AMDGPU_HSA_SEGMENT_GLOBAL_AGENT; + case AMDGPU_HSA_RODATA_READONLY_AGENT: + case AMDGPU_HSA_DATA_READONLY_AGENT: + case AMDGPU_HSA_BSS_READONLY_AGENT: + return AMDGPU_HSA_SEGMENT_READONLY_AGENT; + default: + assert(false); return AMDGPU_HSA_SEGMENT_LAST; + } +} + +bool IsAmdHsaElfSectionROData(amdgpu_hsa_elf_section_t sec) +{ + switch (sec) { + case AMDGPU_HSA_RODATA_GLOBAL_PROGRAM: + 
case AMDGPU_HSA_RODATA_GLOBAL_AGENT: + case AMDGPU_HSA_RODATA_READONLY_AGENT: + default: + return false; + } +} + +std::string AmdHsaElfSegmentToString(amdgpu_hsa_elf_segment_t seg) +{ + switch (seg) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: return "GLOBAL_PROGRAM"; + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: return "GLOBAL_AGENT"; + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: return "READONLY_AGENT"; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: return "CODE_AGENT"; + default: return "UNKNOWN"; + } +} + +std::string AmdPTLoadToString(uint64_t type) +{ + if (PT_LOOS <= type && type < PT_LOOS + AMDGPU_HSA_SEGMENT_LAST) { + return AmdHsaElfSegmentToString((amdgpu_hsa_elf_segment_t) (type - PT_LOOS)); + } else { + return "UNKNOWN (" + std::to_string(type) + ")"; + } +} + +void PrintAmdKernelCode(std::ostream& out, const amd_kernel_code_t *akc) +{ + uint32_t is_debug_enabled = AMD_HSA_BITS_GET(akc->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED); + + out << attr1 << "amd_kernel_code_version_major" << eq + << akc->amd_kernel_code_version_major + << std::endl; + out << attr1 << "amd_kernel_code_version_minor" << eq + << akc->amd_kernel_code_version_minor + << std::endl; + out << attr1 << "amd_machine_kind" << eq + << AmdMachineKindToString(akc->amd_machine_kind) + << std::endl; + out << attr1 << "amd_machine_version_major" << eq + << (uint32_t)akc->amd_machine_version_major + << std::endl; + out << attr1 << "amd_machine_version_minor" << eq + << (uint32_t)akc->amd_machine_version_minor + << std::endl; + out << attr1 << "amd_machine_version_stepping" << eq + << (uint32_t)akc->amd_machine_version_stepping + << std::endl; + out << attr1 << "kernel_code_entry_byte_offset" << eq + << akc->kernel_code_entry_byte_offset + << std::endl; + if (akc->kernel_code_prefetch_byte_offset) { + out << attr1 << "kernel_code_prefetch_byte_offset" << eq + << akc->kernel_code_prefetch_byte_offset + << std::endl; + } + if (akc->kernel_code_prefetch_byte_size) { + out << attr1 << 
"kernel_code_prefetch_byte_size" << eq + << akc->kernel_code_prefetch_byte_size + << std::endl; + } + out << attr1 << "max_scratch_backing_memory_byte_size" << eq + << akc->max_scratch_backing_memory_byte_size + << std::endl; + PrintAmdComputePgmRsrcOne(out, akc->compute_pgm_rsrc1); + PrintAmdComputePgmRsrcTwo(out, akc->compute_pgm_rsrc2); + PrintAmdKernelCodeProperties(out, akc->kernel_code_properties); + if (akc->workitem_private_segment_byte_size) { + out << attr1 << "workitem_private_segment_byte_size" << eq + << akc->workitem_private_segment_byte_size + << std::endl; + } + if (akc->workgroup_group_segment_byte_size) { + out << attr1 << "workgroup_group_segment_byte_size" << eq + << akc->workgroup_group_segment_byte_size + << std::endl; + } + if (akc->gds_segment_byte_size) { + out << attr1 << "gds_segment_byte_size" << eq + << akc->gds_segment_byte_size + << std::endl; + } + if (akc->kernarg_segment_byte_size) { + out << attr1 << "kernarg_segment_byte_size" << eq + << akc->kernarg_segment_byte_size + << std::endl; + } + if (akc->workgroup_fbarrier_count) { + out << attr1 << "workgroup_fbarrier_count" << eq + << akc->workgroup_fbarrier_count + << std::endl; + } + out << attr1 << "wavefront_sgpr_count" << eq + << (uint32_t)akc->wavefront_sgpr_count + << std::endl; + out << attr1 << "workitem_vgpr_count" << eq + << (uint32_t)akc->workitem_vgpr_count + << std::endl; + if (akc->reserved_vgpr_count > 0) { + out << attr1 << "reserved_vgpr_first" << eq + << (uint32_t)akc->reserved_vgpr_first + << std::endl; + out << attr1 << "reserved_vgpr_count" << eq + << (uint32_t)akc->reserved_vgpr_count + << std::endl; + } + if (akc->reserved_sgpr_count > 0) { + out << attr1 << "reserved_sgpr_first" << eq + << (uint32_t)akc->reserved_sgpr_first + << std::endl; + out << attr1 << "reserved_sgpr_count" << eq + << (uint32_t)akc->reserved_sgpr_count + << std::endl; + } + if (is_debug_enabled && (akc->debug_wavefront_private_segment_offset_sgpr != uint16_t(-1))) { + out << attr1 << 
"debug_wavefront_private_segment_offset_sgpr" << eq + << (uint32_t)akc->debug_wavefront_private_segment_offset_sgpr + << std::endl; + } + if (is_debug_enabled && (akc->debug_private_segment_buffer_sgpr != uint16_t(-1))) { + out << attr1 << "debug_private_segment_buffer_sgpr" << eq + << (uint32_t)akc->debug_private_segment_buffer_sgpr + << ":" + << (uint32_t)(akc->debug_private_segment_buffer_sgpr + 3) + << std::endl; + } + if (akc->kernarg_segment_alignment) { + out << attr1 << "kernarg_segment_alignment" << eq + << AmdPowerTwoToString(akc->kernarg_segment_alignment) + << " (" << (uint32_t) akc->kernarg_segment_alignment << ")" + << std::endl; + } + if (akc->group_segment_alignment) { + out << attr1 << "group_segment_alignment" << eq + << AmdPowerTwoToString(akc->group_segment_alignment) + << " (" << (uint32_t) akc->group_segment_alignment << ")" + << std::endl; + } + if (akc->private_segment_alignment) { + out << attr1 << "private_segment_alignment" << eq + << AmdPowerTwoToString(akc->private_segment_alignment) + << " (" << (uint32_t) akc->private_segment_alignment << ")" + << std::endl; + } + out << attr1 << "wavefront_size" << eq + << AmdPowerTwoToString(akc->wavefront_size) + << " (" << (uint32_t) akc->wavefront_size << ")" + << std::endl; + PrintAmdControlDirectives(out, akc->control_directives); +} + +void PrintAmdComputePgmRsrcOne(std::ostream& out, amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1) +{ + out << " COMPUTE_PGM_RSRC1 (0x" << std::hex << std::setw(8) << std::setfill('0') << compute_pgm_rsrc1 << "):" << std::endl; + out << std::dec; + + uint32_t granulated_workitem_vgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + out << attr2 << "granulated_workitem_vgpr_count" << eq + << granulated_workitem_vgpr_count + << std::endl; + uint32_t granulated_wavefront_sgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); + out << attr2 << 
"granulated_wavefront_sgpr_count" << eq + << granulated_wavefront_sgpr_count + << std::endl; + uint32_t priority = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY); + out << attr2 << "priority" << eq + << priority + << std::endl; + uint32_t float_round_mode_32 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32); + out << attr2 << "float_round_mode_32" << eq + << AmdFloatRoundModeToString((amd_float_round_mode_t)float_round_mode_32) + << std::endl; + uint32_t float_round_mode_16_64 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64); + out << attr2 << "float_round_mode_16_64" << eq + << AmdFloatRoundModeToString((amd_float_round_mode_t)float_round_mode_16_64) + << std::endl; + uint32_t float_denorm_mode_32 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32); + out << attr2 << "float_denorm_mode_32" << eq + << AmdFloatDenormModeToString((amd_float_denorm_mode_t)float_denorm_mode_32) + << std::endl; + uint32_t float_denorm_mode_16_64 = AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64); + out << attr2 << "float_denorm_mode_16_64" << eq + << AmdFloatDenormModeToString((amd_float_denorm_mode_t)float_denorm_mode_16_64) + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_PRIV)) { + out << attr2 << "priv" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP)) { + out << attr2 << "enable_dx10_clamp" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE)) { + out << attr2 << "debug_mode" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE)) { + out << attr2 << "enable_ieee_mode" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_BULKY)) { + out << 
attr2 << "bulky" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER)) { + out << attr2 << "cdbg_user" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdComputePgmRsrcTwo(std::ostream& out, amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2) +{ + out << " COMPUTE_PGM_RSRC2 (0x" << std::hex << std::setw(8) << std::setfill('0') << compute_pgm_rsrc2 << "):" << std::endl; + out << std::dec; + + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET)) { + out << attr2 << "enable_sgpr_private_segment_wave_byte_offset" << eq << "TRUE" + << std::endl; + } + uint32_t user_sgpr_count = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT); + out << attr2 << "user_sgpr_count" << eq + << user_sgpr_count + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER)) { + out << attr2 << "enable_trap_handler" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X)) { + out << attr2 << "enable_sgpr_workgroup_id_x" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y)) { + out << attr2 << "enable_sgpr_workgroup_id_y" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z)) { + out << attr2 << "enable_sgpr_workgroup_id_z" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO)) { + out << attr2 << "enable_sgpr_workgroup_info" << eq << "TRUE" + << std::endl; + } + uint32_t enable_vgpr_workitem_id = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID); + out << attr2 << "enable_vgpr_workitem_id" << eq + << 
AmdSystemVgprWorkitemIdToString((amd_system_vgpr_workitem_id_t)enable_vgpr_workitem_id) + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH)) { + out << attr2 << "enable_exception_address_watch" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION)) { + out << attr2 << "enable_exception_memory_violation" << eq << "TRUE" + << std::endl; + } + uint32_t granulated_lds_size = AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); + out << attr2 << "granulated_lds_size" << eq + << granulated_lds_size + << std::endl; + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION)) { + out << attr2 << "enable_exception_ieee_754_fp_invalid_operation" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE)) { + out << attr2 << "enable_exception_fp_denormal_source" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO)) { + out << attr2 << "enable_exception_ieee_754_fp_division_by_zero" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW)) { + out << attr2 << "enable_exception_ieee_754_fp_overflow" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW)) { + out << attr2 << "enable_exception_ieee_754_fp_underflow" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT)) { + out << attr2 << "enable_exception_ieee_754_fp_inexact" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(compute_pgm_rsrc2, 
AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO)) { + out << attr2 << "enable_exception_int_division_by_zero" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdKernelCodeProperties(std::ostream& out, amd_kernel_code_properties32_t kernel_code_properties) +{ + out << " KERNEL_CODE_PROPERTIES (0x" << std::hex << std::setw(8) << std::setfill('0') << kernel_code_properties << "):" << std::endl; + out << std::dec; + + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER)) { + out << attr2 << "enable_sgpr_private_segment_buffer" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR)) { + out << attr2 << "enable_sgpr_dispatch_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + out << attr2 << "enable_sgpr_queue_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR)) { + out << attr2 << "enable_sgpr_kernarg_segment_ptr" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID)) { + out << attr2 << "enable_sgpr_dispatch_id" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT)) { + out << attr2 << "enable_sgpr_flat_scratch_init" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)) { + out << attr2 << "enable_sgpr_private_segment_size" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_x" << eq << "TRUE" + << std::endl; + } + if 
(AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_y" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z)) { + out << attr2 << "enable_sgpr_grid_workgroup_count_z" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS)) { + out << attr2 << "enable_ordered_append_gds" << eq << "TRUE" + << std::endl; + } + uint32_t private_element_size = AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE); + out << attr2 << "private_element_size" << eq + << AmdElementByteSizeToString((amd_element_byte_size_t)private_element_size) + << std::endl; + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_PTR64)) { + out << attr2 << "is_ptr64" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK)) { + out << attr2 << "is_dynamic_callstack" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED)) { + out << attr2 << "is_debug_enabled" << eq << "TRUE" + << std::endl; + } + if (AMD_HSA_BITS_GET(kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED)) { + out << attr2 << "is_xnack_enabled" << eq << "TRUE" + << std::endl; + } +} + +void PrintAmdControlDirectives(std::ostream& out, const amd_control_directives_t &control_directives) +{ + if (!control_directives.enabled_control_directives) { + return; + } + + out << " CONTROL_DIRECTIVES:" << std::endl; + + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS) { + out << attr2 << "enable_break_exceptions" << eq + << AmdExceptionKindToString(control_directives.enable_break_exceptions).c_str() + << 
std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS) { + out << attr2 << "enable_detect_exceptions" << eq + << AmdExceptionKindToString(control_directives.enable_detect_exceptions).c_str() + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE) { + out << attr2 << "max_dynamic_group_size" << eq + << control_directives.max_dynamic_group_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE) { + out << attr2 << "max_flat_grid_size" << eq + << control_directives.max_flat_grid_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE) { + out << attr2 << "max_flat_workgroup_size" << eq + << control_directives.max_flat_workgroup_size + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM) { + out << attr2 << "required_dim" << eq + << (uint32_t)control_directives.required_dim + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE) { + out << attr2 << "required_grid_size" << eq + << "(" + << control_directives.required_grid_size[0] + << ", " + << control_directives.required_grid_size[1] + << ", " + << control_directives.required_grid_size[2] + << ")" + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE) { + out << attr2 << "required_workgroup_size" << eq + << "(" + << control_directives.required_workgroup_size[0] + << ", " + << control_directives.required_workgroup_size[1] + << ", " + << control_directives.required_workgroup_size[2] + << ")" + << std::endl; + } + if (control_directives.enabled_control_directives & AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS) { + out 
<< attr2 << "require_no_partial_workgroups" << eq << "TRUE" + << std::endl; + } +} + +namespace code_options { + + std::ostream& space(std::ostream& out) + { + if (out.tellp()) { out << " "; } + return out; + } + + std::ostream& operator<<(std::ostream& out, const control_directive& d) + { + out << space << + "-hsa_control_directive:" << d.name << "="; + return out; + } + + const char *BrigExceptionString(BrigExceptions32_t e) + { + switch (e) { + case BRIG_EXCEPTIONS_INVALID_OPERATION: return "INVALID_OPERATION"; + case BRIG_EXCEPTIONS_DIVIDE_BY_ZERO: return "DIVIDE_BY_ZERO"; + case BRIG_EXCEPTIONS_OVERFLOW: return "OVERFLOW"; + case BRIG_EXCEPTIONS_INEXACT: return "INEXACT"; + default: + assert(false); return ""; + } + } + + std::ostream& operator<<(std::ostream& out, const exceptions_mask& e) + { + bool first = true; + for (BrigExceptions32_t be = BRIG_EXCEPTIONS_INVALID_OPERATION; be < BRIG_EXCEPTIONS_FIRST_USER_DEFINED; ++be) { + if (e.mask & be) { + if (first) { first = false; } else { out << ","; } + out << BrigExceptionString(be); + } + } + return out; + } + + std::ostream& operator<<(std::ostream& out, const control_directives& cd) + { + const hsa_ext_control_directives_t& d = cd.d; + uint64_t mask = d.control_directives_mask; + if (!mask) { return out; } + + if (mask & BRIG_CONTROL_ENABLEBREAKEXCEPTIONS) { + out << + control_directive("ENABLEBREAKEXCEPTIONS") << + exceptions_mask(d.break_exceptions_mask); + } + if (mask & BRIG_CONTROL_ENABLEDETECTEXCEPTIONS) { + out << + control_directive("ENABLEDETECTEXCEPTIONS") << + exceptions_mask(d.detect_exceptions_mask); + } + if (mask & BRIG_CONTROL_MAXDYNAMICGROUPSIZE) { + out << + control_directive("MAXDYNAMICGROUPSIZE") << + d.max_dynamic_group_size; + } + if (mask & BRIG_CONTROL_MAXFLATGRIDSIZE) { + out << + control_directive("MAXFLATGRIDSIZE") << + d.max_flat_grid_size; + } + if (mask & BRIG_CONTROL_MAXFLATWORKGROUPSIZE) { + out << + control_directive("MAXFLATWORKGROUPSIZE") << + d.max_flat_workgroup_size; + 
} + if (mask & BRIG_CONTROL_REQUIREDDIM) { + out << + control_directive("REQUIREDDIM") << + d.required_dim; + } + if (mask & BRIG_CONTROL_REQUIREDGRIDSIZE) { + out << + control_directive("REQUIREDGRIDSIZE") << + d.required_grid_size[0] << "," << + d.required_grid_size[1] << "," << + d.required_grid_size[2]; + } + if (mask & BRIG_CONTROL_REQUIREDWORKGROUPSIZE) { + out << + control_directive("REQUIREDWORKGROUPSIZE") << + d.required_workgroup_size.x << "," << + d.required_workgroup_size.y << "," << + d.required_workgroup_size.z; + } + return out; + } +} + +const char* hsaerr2str(hsa_status_t status) { + switch ((unsigned) status) { + case HSA_STATUS_SUCCESS: + return + "HSA_STATUS_SUCCESS: The function has been executed successfully."; + case HSA_STATUS_INFO_BREAK: + return + "HSA_STATUS_INFO_BREAK: A traversal over a list of " + "elements has been interrupted by the application before " + "completing."; + case HSA_STATUS_ERROR: + return "HSA_STATUS_ERROR: A generic error has occurred."; + case HSA_STATUS_ERROR_INVALID_ARGUMENT: + return + "HSA_STATUS_ERROR_INVALID_ARGUMENT: One of the actual " + "arguments does not meet a precondition stated in the " + "documentation of the corresponding formal argument."; + case HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: + return + "HSA_STATUS_ERROR_INVALID_QUEUE_CREATION: The requested " + "queue creation is not valid."; + case HSA_STATUS_ERROR_INVALID_ALLOCATION: + return + "HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested " + "allocation is not valid."; + case HSA_STATUS_ERROR_INVALID_AGENT: + return + "HSA_STATUS_ERROR_INVALID_AGENT: The agent is invalid."; + case HSA_STATUS_ERROR_INVALID_REGION: + return + "HSA_STATUS_ERROR_INVALID_REGION: The memory region is invalid."; + case HSA_STATUS_ERROR_INVALID_SIGNAL: + return + "HSA_STATUS_ERROR_INVALID_SIGNAL: The signal is invalid."; + case HSA_STATUS_ERROR_INVALID_QUEUE: + return + "HSA_STATUS_ERROR_INVALID_QUEUE: The queue is invalid."; + case HSA_STATUS_ERROR_OUT_OF_RESOURCES: + 
return + "HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to " + "allocate the necessary resources. This error may also " + "occur when the core runtime library needs to spawn " + "threads or create internal OS-specific events."; + case HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: + return + "HSA_STATUS_ERROR_INVALID_PACKET_FORMAT: The AQL packet " + "is malformed."; + case HSA_STATUS_ERROR_RESOURCE_FREE: + return + "HSA_STATUS_ERROR_RESOURCE_FREE: An error has been " + "detected while releasing a resource."; + case HSA_STATUS_ERROR_NOT_INITIALIZED: + return + "HSA_STATUS_ERROR_NOT_INITIALIZED: An API other than " + "hsa_init has been invoked while the reference count of " + "the HSA runtime is zero."; + case HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: + return + "HSA_STATUS_ERROR_REFCOUNT_OVERFLOW: The maximum " + "reference count for the object has been reached."; + case HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: + return + "HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS: The arguments passed to " + "a functions are not compatible."; + case HSA_STATUS_ERROR_INVALID_INDEX: + return "The index is invalid."; + case HSA_STATUS_ERROR_INVALID_ISA: + return "The instruction set architecture is invalid."; + case HSA_STATUS_ERROR_INVALID_CODE_OBJECT: + return "The code object is invalid."; + case HSA_STATUS_ERROR_INVALID_EXECUTABLE: + return "The executable is invalid."; + case HSA_STATUS_ERROR_FROZEN_EXECUTABLE: + return "The executable is frozen."; + case HSA_STATUS_ERROR_INVALID_SYMBOL_NAME: + return "There is no symbol with the given name."; + case HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED: + return "The variable is already defined."; + case HSA_STATUS_ERROR_VARIABLE_UNDEFINED: + return "The variable is undefined."; + case HSA_EXT_STATUS_ERROR_INVALID_PROGRAM: + return + "HSA_EXT_STATUS_ERROR_INVALID_PROGRAM: Invalid program"; + case HSA_EXT_STATUS_ERROR_INVALID_MODULE: + return "HSA_EXT_STATUS_ERROR_INVALID_MODULE: Invalid module"; + case HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE: + 
return + "HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE: Incompatible module"; + case HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED: + return + "HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED: Module already " + "included"; + case HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH: + return + "HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH: Symbol mismatch"; + case HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED: + return + "HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED: Finalization failed"; + case HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH: + return + "HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH: Directive mismatch"; + default: + return + "Unknown HSA status"; + } +} + +bool ReadFileIntoBuffer(const std::string& filename, std::vector& buffer) +{ + std::ifstream file(filename, std::ios::binary); + if (!file) { return false; } + file.seekg(0, std::ios::end); + std::streamsize size = file.tellg(); + file.seekg(0, std::ios::beg); + + buffer.resize((size_t) size); + if (!file.read(buffer.data(), size)) { return false; } + return true; +} + +#ifndef _WIN32 +#define _close close +#define _open open +#endif // _WIN32 + +int OpenTempFile(const char* prefix) +{ + std::string tname; +#ifdef _WIN32 + tname += prefix; + tname += "_"; + tname += std::to_string(_getpid()); + tname += "_"; + unsigned c = 0; + while (c++ < 20) { // Loop because several threads can generate same filename. 
+ char dir[MAX_PATH+1]; + if (!GetTempPath(sizeof(dir), dir)) { return -1; } + char *name = _tempnam(dir, tname.c_str()); + if (!name) { return -1; } + HANDLE h = CreateFile( + name, + GENERIC_READ | GENERIC_WRITE, + 0, // No sharing + NULL, + CREATE_NEW, + FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, + NULL); + free(name); + if (h == INVALID_HANDLE_VALUE) { continue; } + return _open_osfhandle((intptr_t)h, 0); + } + return -1; +#else // _WIN32 +#if defined(P_tmpdir) // available in GLIBC and SVID + tname += P_tmpdir; + tname += "/"; +#endif + tname += prefix; + tname += "_XXXXXX"; + int d = mkstemp(&tname[0]); + if (d < 0) { return -1; } + if (unlink(tname.c_str()) < 0) { _close(d); return -1; } + return d; +#endif // _WIN32 +} + +void CloseTempFile(int fd) +{ + _close(fd); +} + +const char * CommentTopCallBack(void *ctx, int type) { + static const char* amd_kernel_code_t_begin = "amd_kernel_code_t begin"; + static const char* amd_kernel_code_t_end = "amd_kernel_code_t end"; + static const char* isa_begin = "isa begin"; + switch(type) { + case COMMENT_AMD_KERNEL_CODE_T_BEGIN: + return amd_kernel_code_t_begin; + case COMMENT_AMD_KERNEL_CODE_T_END: + return amd_kernel_code_t_end; + case COMMENT_KERNEL_ISA_BEGIN: + return isa_begin; + default: + assert(false); + return ""; + } +} +const char * CommentRightCallBack(void *ctx, int type) { + return nullptr; +} + +uint32_t ParseInstructionOffset(const std::string& instruction) { + // instruction format: opcode op1, op2 ... 
// offset: binopcode + std::string::size_type n = instruction.find("//"); + assert(n != std::string::npos); + std::string comment = instruction.substr(n); + n = comment.find(':'); + assert(n != std::string::npos); + comment.erase(n); + assert(comment.size() > 3); + comment.erase(0, 3); + return strtoul(comment.c_str(), nullptr, 16); +} + +bool IsNotSpace(char c) { + return !isspace(static_cast(c)); +} + +void ltrim(std::string &str) { + str.erase(str.begin(), std::find_if(str.begin(), str.end(), IsNotSpace)); +} + +std::string DumpFileName(const std::string& dir, const char* prefix, const char* ext, unsigned n, unsigned i) +{ + std::ostringstream ss; + if (!dir.empty()) { + ss << dir << "/"; + } + ss << + prefix << + std::setfill('0') << std::setw(3) << n; + if (i) { ss << "_" << i; } + if (ext) { ss << "." << ext; } + return ss.str(); +} + + +} +} diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.hpp new file mode 100644 index 0000000000..113a888c66 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_code_util.hpp @@ -0,0 +1,195 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_CODE_UTIL_HPP_ +#define AMD_HSA_CODE_UTIL_HPP_ + +#include +#include +#include +#include +#ifdef _WIN32 +#include +#else // _WIN32 +#include +#endif // _WIN32 +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_elf.h" +#include "hsa.h" +#include "hsa_ext_finalize.h" + +#define hsa_error(e) static_cast(e) + +#define release_assert(e) \ + if (!(e)) { \ + std::cerr << __FILE__ << ":"; \ + std::cerr << __LINE__ << ":"; \ + std::cerr << " Assertion `" << #e << "' failed." << std::endl; \ + std::abort(); \ + } \ + +namespace amd { +namespace hsa { + +std::string HsaSymbolKindToString(hsa_symbol_kind_t kind); +std::string HsaSymbolLinkageToString(hsa_symbol_linkage_t linkage); +std::string HsaVariableAllocationToString(hsa_variable_allocation_t allocation); +std::string HsaVariableSegmentToString(hsa_variable_segment_t segment); +std::string HsaProfileToString(hsa_profile_t profile); +std::string HsaMachineModelToString(hsa_machine_model_t model); +std::string HsaFloatRoundingModeToString(hsa_default_float_rounding_mode_t mode); +std::string AmdMachineKindToString(amd_machine_kind16_t machine); +std::string AmdFloatRoundModeToString(amd_float_round_mode_t round_mode); +std::string AmdFloatDenormModeToString(amd_float_denorm_mode_t denorm_mode); +std::string AmdSystemVgprWorkitemIdToString(amd_system_vgpr_workitem_id_t system_vgpr_workitem_id); +std::string AmdElementByteSizeToString(amd_element_byte_size_t element_byte_size); +std::string AmdExceptionKindToString(amd_exception_kind16_t exceptions); +std::string AmdPowerTwoToString(amd_powertwo8_t p); +amdgpu_hsa_elf_segment_t AmdHsaElfSectionSegment(amdgpu_hsa_elf_section_t sec); +bool IsAmdHsaElfSectionROData(amdgpu_hsa_elf_section_t sec); +std::string AmdHsaElfSegmentToString(amdgpu_hsa_elf_segment_t seg); +std::string AmdPTLoadToString(uint64_t type); + +void PrintAmdKernelCode(std::ostream& out, const 
amd_kernel_code_t *akc); +void PrintAmdComputePgmRsrcOne(std::ostream& out, amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1); +void PrintAmdComputePgmRsrcTwo(std::ostream& out, amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2); +void PrintAmdKernelCodeProperties(std::ostream& out, amd_kernel_code_properties32_t kernel_code_properties); +void PrintAmdControlDirectives(std::ostream& out, const amd_control_directives_t &control_directives); + +namespace code_options { + // Space between options (not at the beginning). + std::ostream& space(std::ostream& out); + + // Control directive option without value. + struct control_directive { + const char *name; + control_directive(const char* name_) : name(name_) { } + }; + std::ostream& operator<<(std::ostream& out, const control_directive& d); + + // Exceptions mask string. + struct exceptions_mask { + uint16_t mask; + exceptions_mask(uint16_t mask_) : mask(mask_) { } + }; + std::ostream& operator<<(std::ostream& out, const exceptions_mask& e); + + // Control directives options. + struct control_directives { + const hsa_ext_control_directives_t& d; + control_directives(const hsa_ext_control_directives_t& d_) : d(d_) { } + }; + std::ostream& operator<<(std::ostream& out, const control_directives& cd); +} + +const char* hsaerr2str(hsa_status_t status); +bool ReadFileIntoBuffer(const std::string& filename, std::vector& buffer); + +// Create new empty temporary file that will be deleted when closed. 
+int OpenTempFile(const char* prefix);
+void CloseTempFile(int fd);
+
+// Helper comment types for isa disassembler
+// Numbering starts at 1; the remaining enumerators take the next values.
+enum DumpIsaCommentType {
+  COMMENT_AMD_KERNEL_CODE_T_BEGIN = 1,
+  COMMENT_AMD_KERNEL_CODE_T_END,
+  COMMENT_KERNEL_ISA_BEGIN,
+};
+
+// Callbacks to create helper comments for isa disassembler
+const char * CommentTopCallBack(void *ctx, int type);
+const char * CommentRightCallBack(void *ctx, int type);
+
+// Parse disassembler instruction line to find offset
+uint32_t ParseInstructionOffset(const std::string& instruction);
+
+// Trim whitespaces from start of string
+void ltrim(std::string &str);
+
+
+// Helper function that allocates an aligned memory.
+//
+// With _WIN32 defined the request is forwarded unchanged to
+// ::_aligned_malloc. Otherwise the alignment is first raised to at least
+// sizeof(void*) and the block comes from ::posix_memalign; NULL is returned
+// when that call reports an error. Release the result with alignedFree.
+inline void*
+alignedMalloc(size_t size, size_t alignment)
+{
+#if defined(_WIN32)
+  return ::_aligned_malloc(size, alignment);
+#else
+  void * ptr = NULL;
+  // NOTE(review): std::max is parenthesized, presumably so that a
+  // function-like max() macro cannot expand here -- confirm.
+  alignment = (std::max)(alignment, sizeof(void*));
+  if (0 == ::posix_memalign(&ptr, alignment, size)) {
+    return ptr;
+  }
+  return NULL;
+#endif
+}
+
+// Helper function that frees an aligned memory.
+// Uses ::_aligned_free when _WIN32 is defined and free() otherwise, mirroring
+// the two allocation paths of alignedMalloc.
+inline void
+alignedFree(void *ptr)
+{
+#if defined(_WIN32)
+  ::_aligned_free(ptr);
+#else
+  free(ptr);
+#endif
+}
+
+// Rounds num up to the next multiple of align (64-bit overload).
+// align must be a non-zero power of two; both conditions are asserted.
+// The intermediate sum is unsigned, so it wraps for num close to the maximum.
+inline uint64_t alignUp(uint64_t num, uint64_t align)
+{
+  assert(align);
+  assert((align & (align - 1)) == 0);
+  return (num + align - 1) & ~(align - 1);
+}
+
+// Rounds num up to the next multiple of align (32-bit overload).
+// Same preconditions as the 64-bit overload.
+inline uint32_t alignUp(uint32_t num, uint32_t align)
+{
+  assert(align);
+  assert((align & (align - 1)) == 0);
+  return (num + align - 1) & ~(align - 1);
+}
+
+std::string DumpFileName(const std::string& dir, const char* prefix, const char* ext, unsigned n, unsigned i = 0);
+
+} // namespace hsa
+} // namespace amd
+
+#endif // AMD_HSA_CODE_UTIL_HPP_
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.cpp
new file mode 100644
index 0000000000..004c9cc626
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.cpp
@@ -0,0 +1,94 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "amd_hsa_locks.hpp"
+
+namespace amd {
+namespace hsa {
+namespace common {
+
+// Acquires the lock for reading.
+// Blocks while a writer holds the lock (writers_count_ > 0). Writers that are
+// merely waiting (writers_waiting_) do not hold new readers back.
+void ReaderWriterLock::ReaderLock()
+{
+  internal_lock_.lock();
+  // Loop rather than a single wait: the predicate is re-checked after every
+  // wake-up.
+  while (0 < writers_count_) {
+    readers_condition_.wait(internal_lock_);
+  }
+  readers_count_ += 1;
+  internal_lock_.unlock();
+}
+
+// Releases one reader. When the last reader leaves and at least one writer is
+// waiting, a single writer is woken.
+void ReaderWriterLock::ReaderUnlock()
+{
+  internal_lock_.lock();
+  readers_count_ -= 1;
+  if (0 == readers_count_ && 0 < writers_waiting_) {
+    writers_condition_.notify_one();
+  }
+  internal_lock_.unlock();
+}
+
+// Acquires the lock for writing.
+// Registers itself in writers_waiting_, then blocks until there are neither
+// active readers nor an active writer.
+void ReaderWriterLock::WriterLock()
+{
+  internal_lock_.lock();
+  writers_waiting_ += 1;
+  while (0 < readers_count_ || 0 < writers_count_) {
+    writers_condition_.wait(internal_lock_);
+  }
+  writers_count_ += 1;
+  writers_waiting_ -= 1;
+  internal_lock_.unlock();
+}
+
+// Releases the writer. One waiting writer (if any) and all waiting readers are
+// notified; each re-checks its own predicate under internal_lock_ before
+// proceeding.
+void ReaderWriterLock::WriterUnlock()
+{
+  internal_lock_.lock();
+  writers_count_ -= 1;
+  if (0 < writers_waiting_) {
+    writers_condition_.notify_one();
+  }
+  readers_condition_.notify_all();
+  internal_lock_.unlock();
+}
+
+} // namespace common
+} // namespace hsa
+} // namespace amd
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.hpp
new file mode 100644
index 0000000000..6bc7632d1f
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_hsa_locks.hpp
@@ -0,0 +1,127 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AMD_HSA_LOCKS_HPP
+#define AMD_HSA_LOCKS_HPP
+
+#include <condition_variable>
+#include <cstddef>
+#include <mutex>
+
+namespace amd {
+namespace hsa {
+namespace common {
+
+/// @brief Scoped reader lock: calls lock.ReaderLock() on construction and
+/// lock.ReaderUnlock() on destruction. LockType only needs those two members.
+template <typename LockType>
+class ReaderLockGuard final {
+public:
+  explicit ReaderLockGuard(LockType &lock):
+    lock_(lock)
+  {
+    lock_.ReaderLock();
+  }
+
+  ~ReaderLockGuard()
+  {
+    lock_.ReaderUnlock();
+  }
+
+private:
+  /// @brief Not copy-constructible.
+  ReaderLockGuard(const ReaderLockGuard&);
+  /// @brief Not copy-assignable.
+  ReaderLockGuard& operator=(const ReaderLockGuard&);
+
+  LockType &lock_;
+};
+
+/// @brief Scoped writer lock: calls lock.WriterLock() on construction and
+/// lock.WriterUnlock() on destruction. LockType only needs those two members.
+template <typename LockType>
+class WriterLockGuard final {
+public:
+  explicit WriterLockGuard(LockType &lock):
+    lock_(lock)
+  {
+    lock_.WriterLock();
+  }
+
+  ~WriterLockGuard()
+  {
+    lock_.WriterUnlock();
+  }
+
+private:
+  /// @brief Not copy-constructible.
+  WriterLockGuard(const WriterLockGuard&);
+  /// @brief Not copy-assignable.
+  WriterLockGuard& operator=(const WriterLockGuard&);
+
+  LockType &lock_;
+};
+
+/// @brief Reader/writer lock built from one mutex and two condition
+/// variables. All three counters start at zero.
+class ReaderWriterLock final {
+public:
+  ReaderWriterLock():
+    readers_count_(0), writers_count_(0), writers_waiting_(0) {}
+
+  ~ReaderWriterLock() {}
+
+  void ReaderLock();
+
+  void ReaderUnlock();
+
+  void WriterLock();
+
+  void WriterUnlock();
+
+private:
+  /// @brief Not copy-constructible.
+  ReaderWriterLock(const ReaderWriterLock&);
+  /// @brief Not copy-assignable.
+  ReaderWriterLock& operator=(const ReaderWriterLock&);
+
+  size_t readers_count_;    // readers currently holding the lock
+  size_t writers_count_;    // writers currently holding the lock
+  size_t writers_waiting_;  // writers blocked inside WriterLock()
+  std::mutex internal_lock_;  // protects the three counters
+  std::condition_variable_any readers_condition_;
+  std::condition_variable_any writers_condition_;
+};
+
+} // namespace common
+} // namespace hsa
+} // namespace amd
+
+#endif // AMD_HSA_LOCKS_HPP
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.cpp
new file mode 100644
index 0000000000..d40fe6d012
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.cpp
@@ -0,0 +1,381 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "amd_options.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace amd {
+namespace options {
+
+//===----------------------------------------------------------------------===//
+// StringFactory.                                                             //
+//===----------------------------------------------------------------------===//
+
+// Concatenates cstrs[0 .. cstrs_count) and appends `spacer` between
+// consecutive elements (not after the last one).
+// Returns an empty string when cstrs is NULL, cstrs_count is 0, or any element
+// is NULL.
+std::string StringFactory::Flatten(const char **cstrs,
+                                   const uint32_t &cstrs_count,
+                                   const char &spacer) {
+  if (NULL == cstrs || 0 == cstrs_count) {
+    return std::string();
+  }
+
+  std::string flattened;
+  for (uint32_t i = 0; i < cstrs_count; ++i) {
+    if (NULL == cstrs[i]) {
+      return std::string();
+    }
+    flattened += cstrs[i];
+    if (i != (cstrs_count - 1)) {
+      flattened += spacer;
+    }
+  }
+  return flattened;
+}
+
+// Splits cstr on every occurrence of delim and returns the non-empty pieces in
+// order. Runs of delimiters therefore produce no empty tokens. A NULL cstr
+// yields an empty list.
+std::list<std::string> StringFactory::Tokenize(const char *cstr,
+                                               const char &delim) {
+  if (NULL == cstr) {
+    return std::list<std::string>();
+  }
+
+  const std::string str = cstr;
+  size_t start = 0;
+  size_t end = 0;
+
+  std::list<std::string> tokens;
+  while ((end = str.find(delim, start)) != std::string::npos) {
+    if (start != end) {
+      tokens.push_back(str.substr(start, end - start));
+    }
+    start = end + 1;
+  }
+  if (str.size() > start) {
+    tokens.push_back(str.substr(start));
+  }
+  return tokens;
+}
+
+// Returns a copy of str with every character mapped through ::tolower.
+std::string StringFactory::ToLower(const std::string& str) {
+  std::string lower(str);
+  // The <cctype> functions are only defined for values representable as
+  // unsigned char (or EOF), so the character is converted before the call.
+  std::transform(lower.begin(), lower.end(), lower.begin(),
+                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+  return lower;
+}
+
+// Returns a copy of str with every character mapped through ::toupper.
+std::string StringFactory::ToUpper(const std::string& str) {
+  std::string upper(str);
+  // Same unsigned char conversion as in ToLower.
+  std::transform(upper.begin(), upper.end(), upper.begin(),
+                 [](unsigned char c) { return static_cast<char>(::toupper(c)); });
+  return upper;
+}
+
+//===----------------------------------------------------------------------===//
+// HelpPrinter, HelpStreambuf.                                                //
+//===----------------------------------------------------------------------===//
+
+// Installs this buffer on `stream`, remembering the buffer it replaces so that
+// output can be forwarded to it.
+HelpStreambuf::HelpStreambuf(std::ostream& stream)
+  : basicStream_(&stream),
+    basicBuf_(stream.rdbuf()),
+    wrapWidth_(0),
+    indentSize_(0),
+    atLineStart_(true),
+    lineWidth_(0)
+{
+  basicStream_->rdbuf(this);
+}
+
+// Forwards one character to the wrapped buffer, inserting indentSize_ spaces
+// at the start of every non-empty line and a line break plus indentation once
+// lineWidth_ reaches wrapWidth_ (wrapping is off while wrapWidth_ is 0).
+HelpStreambuf::int_type HelpStreambuf::overflow(HelpStreambuf::int_type ch) {
+  // overflow(eof) carries no character; report success without emitting a
+  // byte or disturbing the line bookkeeping.
+  if (traits_type::eq_int_type(ch, traits_type::eof())) {
+    return traits_type::not_eof(ch);
+  }
+
+  if (atLineStart_ && ch != '\n') {
+    std::string indent(indentSize_, ' ');
+    basicBuf_->sputn(indent.data(), indent.size());
+    lineWidth_ = indentSize_;
+    atLineStart_ = false;
+  } else if (ch == '\n') {
+    atLineStart_ = true;
+    lineWidth_ = 0;
+  }
+
+  if (wrapWidth_ > 0 && lineWidth_ == wrapWidth_) {
+    basicBuf_->sputc('\n');
+    std::string indent(indentSize_, ' ');
+    basicBuf_->sputn(indent.data(), indent.size());
+    lineWidth_ = indentSize_;
+    atLineStart_ = false;
+  }
+
+  lineWidth_++;
+  return basicBuf_->sputc(traits_type::to_char_type(ch));
+}
+
+// Writes the usage column: the text itself, padded with spaces up to
+// USAGE_WIDTH, followed by PADDING_WIDTH spaces. Indentation and wrapping are
+// switched off for this column.
+HelpPrinter& HelpPrinter::PrintUsage(const std::string& usage) {
+  sbuf_.IndentSize(0);
+  sbuf_.WrapWidth(0);
+  Stream() << usage;
+  if (usage.length() < USAGE_WIDTH) {
+    Stream() << std::string(USAGE_WIDTH - usage.length(), ' ');
+  }
+  Stream() << std::string(PADDING_WIDTH, ' ');
+  return *this;
+}
+
+// Writes the description column followed by a newline. Continuation lines are
+// indented to the description column and wrapped at its right edge; both
+// settings are reset to 0 afterwards.
+HelpPrinter& HelpPrinter::PrintDescription(const std::string& description) {
+  sbuf_.WrapWidth(USAGE_WIDTH + PADDING_WIDTH + DESCRIPTION_WIDTH);
+  sbuf_.IndentSize(USAGE_WIDTH + PADDING_WIDTH);
+  Stream() << description << std::endl;
+  sbuf_.IndentSize(0);
+  sbuf_.WrapWidth(0);
+  return *this;
+}
+
+//===----------------------------------------------------------------------===//
+// ChoiceOption.
//
+//===----------------------------------------------------------------------===//
+// Builds the option and records the accepted values. Duplicate entries in
+// `choices` collapse because they are stored in a set.
+ChoiceOption::ChoiceOption(const std::string& name,
+                           const std::vector<std::string>& choices,
+                           const std::string& help,
+                           std::ostream& error)
+  : OptionBase(name, help, error),
+    choices_(choices.begin(), choices.end()) {}
+
+// Expects exactly two tokens: the option name and one of the registered
+// choices. On success both tokens are consumed, the value is stored and the
+// option is marked as set. On failure an error is written and false returned.
+bool ChoiceOption::ProcessTokens(std::list<std::string> &tokens) {
+  assert(0 == name_.compare(tokens.front()) && "option name is mismatched");
+  if (2 != tokens.size()) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+
+  tokens.pop_front();
+
+  if (0 == choices_.count(tokens.front())) {
+    error() << "error: invalid option: \'" << name_ << '\'' << std::endl;
+    return false;
+  }
+
+  is_set_ = true;
+  value_ = tokens.front();
+  tokens.pop_front();
+  return true;
+}
+
+// Prints "-name=[a|b|...]" followed by the help text. The alternatives appear
+// in the iteration order of choices_.
+void ChoiceOption::PrintHelp(HelpPrinter& printer) const {
+  std::string usage = "-" + name_ + "=[";
+  bool first = true;
+  for (const auto& choice: choices_) {
+    if (!first) {
+      usage += '|';
+    } else {
+      first = false;
+    }
+    usage += choice;
+  }
+  usage += "]";
+  printer.PrintUsage(usage).PrintDescription(help_);
+}
+
+//===----------------------------------------------------------------------===//
+// PrefixOption.
//
+//===----------------------------------------------------------------------===//
+// A prefix option needs a non-empty name that does not itself contain ':'.
+bool PrefixOption::IsValid() const {
+  return (0 < name_.size()) && (name_.find(':') == std::string::npos);
+}
+
+// Returns the position of "<name>:" inside token, or std::string::npos.
+std::string::size_type PrefixOption::FindPrefix(const std::string& token) const {
+  auto prefix = name_ + ':';
+  return token.find(prefix);
+}
+
+// Accepts tokens that start with the option name immediately followed by ':'.
+bool PrefixOption::Accept(const std::string& token) const {
+  return
+    (token.compare(0, name_.length(), name_) == 0) &&
+    token.length() > name_.length() &&
+    token[name_.length()] == ':';
+}
+
+// Stores everything after "<name>:" as one value. Any further tokens are
+// appended to that value, each preceded by '='. All tokens are consumed and
+// the option is marked as set.
+bool PrefixOption::ProcessTokens(std::list<std::string> &tokens) {
+  assert(1 <= tokens.size());
+  assert(Accept(tokens.front()) && "option name is mismatched");
+
+  std::string value = tokens.front(); tokens.pop_front();
+  value = value.substr(name_.length() + 1);
+
+  for (const auto& token: tokens) {
+    value += '=';
+    value += token;
+  }
+  tokens.clear();
+
+  values_.push_back(value);
+  is_set_ = true;
+  return true;
+}
+
+// Prints "-name:[value]" followed by the help text.
+void PrefixOption::PrintHelp(HelpPrinter& printer) const {
+  printer.PrintUsage("-" + name_ + ":[value]").PrintDescription(help_);
+}
+
+//===----------------------------------------------------------------------===//
+// OptionParser.
//
+//===----------------------------------------------------------------------===//
+// Returns an iterator to the first registered option whose Accept() matches
+// `name`, or options_.end() when none does.
+std::vector<OptionBase*>::iterator
+OptionParser::FindOption(const std::string& name) {
+  std::vector<OptionBase*>::iterator it = options_.begin();
+  std::vector<OptionBase*>::iterator end = options_.end();
+  for (; it != end; ++it) {
+    if ((*it)->Accept(name)) {
+      return it;
+    }
+  }
+  return end;
+}
+
+// Registers `option`. Returns false, without registering, when the pointer is
+// NULL, the option reports itself invalid, or an already registered option
+// accepts its name. The parser stores the raw pointer only.
+bool OptionParser::AddOption(OptionBase *option) {
+  if (NULL == option || !option->IsValid()) {
+    return false;
+  }
+  if (FindOption(option->name()) != options_.end()) {
+    return false;
+  }
+  options_.push_back(option);
+  return true;
+}
+
+// Space-separated tokens that no option claimed. Only meaningful while
+// unknown-option collection is enabled (asserted).
+const std::string& OptionParser::Unknown() const {
+  assert(collectUnknown_);
+  return unknownOptions_;
+}
+
+// Parses a space-separated option string.
+//
+// A token starting with '-' opens an option; the tokens after it, up to the
+// next '-' token, are its arguments. An option without separate arguments is
+// split once more on '=' ("-name=value"). Each option is handed to the
+// registered OptionBase that accepts its name.
+//
+// Tokens that cannot be matched are appended to Unknown() when collection is
+// enabled; otherwise an error is written, every option is Reset() and false is
+// returned. NULL or blank input is accepted and returns true.
+bool OptionParser::ParseOptions(const char *options) {
+  std::list<std::string> tokens_l1 = StringFactory::Tokenize(options, ' ');
+  if (0 == tokens_l1.size()) {
+    return true;
+  }
+
+  std::list<std::string>::iterator tokens_l1i = tokens_l1.begin();
+  while (tokens_l1i != tokens_l1.end()) {
+    if ('-' == tokens_l1i->at(0)) {
+      std::list<std::string>::iterator option_begin = tokens_l1i;
+      std::list<std::string> tokens_l2;
+      do {
+        tokens_l2.push_back(*tokens_l1i);
+        tokens_l1i++;
+      } while (tokens_l1i != tokens_l1.end() && '-' != tokens_l1i->at(0));
+      std::list<std::string>::iterator option_end = tokens_l1i;
+      tokens_l2.front().erase(0, 1);
+
+      if (1 == tokens_l2.size()) {
+        tokens_l2 = StringFactory::Tokenize(tokens_l2.front().c_str(), '=');
+        // Empty: the token was just "-" or consisted of '=' only, so there is
+        // no option name to look up. More than two: too many '=' parts.
+        if (tokens_l2.empty() || 2 < tokens_l2.size()) {
+          if (collectUnknown_) {
+            // tokens_l1i already points past this option (possibly at end()),
+            // so the original spelling is taken from option_begin.
+            unknownOptions_ += *option_begin + " ";
+            continue;
+          } else {
+            error() << "error: invalid option format: \'"
+                    << (tokens_l2.empty() ? *option_begin : tokens_l2.front())
+                    << '\'' << std::endl;
+            Reset();
+            return false;
+          }
+        }
+      }
+
+      auto find_status = FindOption(tokens_l2.front());
+      if (find_status == options_.end()) {
+        if (collectUnknown_) {
+          for (; option_begin != option_end; ++option_begin) {
+            unknownOptions_ += *option_begin + " ";
+          }
+          continue;
+        } else {
+          error() << "error: unknown option: \'"
+                  << tokens_l2.front() << '\'' << std::endl;
+          Reset();
+          return false;
+        }
+      }
+
+      if (!(*find_status)->ProcessTokens(tokens_l2)) {
+        Reset();
+        return false;
+      }
+      assert(0 == tokens_l2.size());
+    } else {
+      // A bare word that no option owns (only possible before the first
+      // '-' token).
+      if (collectUnknown_) {
+        unknownOptions_ += *tokens_l1i + " ";
+        // Step past the word; without this the loop would never terminate.
+        ++tokens_l1i;
+      } else {
+        error() << "error: unknown option: \'"
+                << *tokens_l1i << '\'' << std::endl;
+        Reset();
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Prints the help entry of every registered option to `out`, followed by
+// `addition` and a newline.
+void OptionParser::PrintHelp(std::ostream& out, const std::string& addition) const {
+  HelpPrinter printer(out);
+  for (const auto& option: options_) {
+    option->PrintHelp(printer);
+  }
+  out << addition << std::endl;
+}
+
+// Forgets collected unknown tokens and calls Reset() on every registered
+// option.
+void OptionParser::Reset() {
+  unknownOptions_.clear();
+  for (auto &option : options_) {
+    option->Reset();
+  }
+}
+
+} // namespace options
+} // namespace amd
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.hpp
new file mode 100644
index 0000000000..c80d489a2e
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/libamdhsacode/amd_options.hpp
@@ -0,0 +1,476 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_OPTIONS_HPP +#define AMD_OPTIONS_HPP + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace amd { +namespace options { + +//===----------------------------------------------------------------------===// +// StringFactory. 
// +//===----------------------------------------------------------------------===// + +class StringFactory final { +public: + static std::string Flatten(const char **cstrs, + const uint32_t &cstrs_count, + const char &spacer = '\0'); + + static std::list Tokenize(const char *cstr, const char &delim); + + static std::string ToLower(const std::string& str); + static std::string ToUpper(const std::string& str); +}; + +//===----------------------------------------------------------------------===// +// HelpPrinter, HelpStreambuf. // +//===----------------------------------------------------------------------===// + +class HelpStreambuf : public std::streambuf { +public: + explicit HelpStreambuf(std::ostream& stream); + + virtual ~HelpStreambuf() { + basicStream_->rdbuf(basicBuf_); + } + + void IndentSize(unsigned indent) { + assert(wrapWidth_ == 0 || indentSize_ < wrapWidth_); + indentSize_ = indent; + } + + void WrapWidth(unsigned wrap) { + assert(wrapWidth_ == 0 || indentSize_ < wrapWidth_); + wrapWidth_ = wrap; + } + +protected: + virtual int_type overflow(int_type ch) override; + +private: + std::ostream* basicStream_; + std::streambuf* basicBuf_; + + unsigned wrapWidth_; + unsigned indentSize_; + + bool atLineStart_; + unsigned lineWidth_; +}; + + +class HelpPrinter { +private: + static const unsigned USAGE_WIDTH = 30; + static const unsigned PADDING_WIDTH = 2; + static const unsigned DESCRIPTION_WIDTH = 50; + +public: + HelpPrinter& PrintUsage(const std::string& usage); + HelpPrinter& PrintDescription(const std::string& description); + + std::ostream& Stream() { return *out_; } + +private: + explicit HelpPrinter(std::ostream& out = std::cout) : out_(&out), sbuf_(*out_) {} + + /// @brief Not copy-constructible. + HelpPrinter(const HelpPrinter&); + /// @brief Not copy-assignable. 
+ HelpPrinter& operator =(const HelpPrinter&); + + friend class OptionParser; + + std::ostream *out_; + HelpStreambuf sbuf_; +}; + +//===----------------------------------------------------------------------===// +// OptionBase. // +//===----------------------------------------------------------------------===// + +class OptionBase { +public: + virtual ~OptionBase() {} + + const std::string& name() const { + return name_; + } + const bool& is_set() const { + return is_set_; + } + + virtual bool IsValid() const { + return 0 < name_.size(); + } + +protected: + explicit OptionBase(const std::string& name, + const std::string& help = "", + std::ostream &error = std::cerr) + : name_(name), + help_(help), + is_set_(false), + error_(&error) {} + + virtual void PrintHelp(HelpPrinter& printer) const = 0; + virtual bool Accept(const std::string& name) const { return name_ == name; } + + const std::string name_; + const std::string help_; + bool is_set_; + + std::ostream &error() const { return *error_; } + +private: + /// @brief Not copy-constructible. + OptionBase(const OptionBase &ob); + /// @brief Not copy-assignable. + OptionBase& operator=(const OptionBase &ob); + + void Reset() { + is_set_ = false; + } + + virtual bool ProcessTokens(std::list &tokens) = 0; + + friend class OptionParser; + + mutable std::ostream *error_; +}; + + +//===----------------------------------------------------------------------===// +// Option. // +//===----------------------------------------------------------------------===// + +template +class Option final: public OptionBase { +public: + explicit Option(const std::string& name, + const std::string& help = "", + std::ostream& error = std::cerr): + OptionBase(name, help, error) {} + + ~Option() {} + + const std::list& values() const { + return values_; + } + +protected: + virtual void PrintHelp(HelpPrinter& printer) const override; + +private: + /// @brief Not copy-constructible. + Option(const Option &o); + /// @brief Not copy-assignable. 
+ Option& operator=(const Option &o); + + bool ProcessTokens(std::list &tokens); + + std::list values_; +}; + +template +bool Option::ProcessTokens(std::list &tokens) { + assert(0 == name_.compare(tokens.front()) && "option name is mismatched"); + if (2 > tokens.size()) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + + is_set_ = true; + tokens.pop_front(); + + while (!tokens.empty()) { + std::istringstream token_stream(tokens.front()); + if (!token_stream.good()) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + + T value; + token_stream >> value; + + values_.push_back(value); + tokens.pop_front(); + } + return true; +} + +template +void Option::PrintHelp(HelpPrinter& printer) const { + printer.PrintUsage("-" + name_ + " [" + StringFactory::ToUpper(name_) + "s]") + .PrintDescription(help_); +} + +//===----------------------------------------------------------------------===// +// ValueOption. // +//===----------------------------------------------------------------------===// + +template +class ValueOption final: public OptionBase { +public: + explicit ValueOption(const std::string& name, + const std::string& help = "", + std::ostream& error = std::cerr): + OptionBase(name, help, error) {} + + ~ValueOption() {} + + const T& value() const { + return value_; + } + +protected: + void PrintHelp(HelpPrinter& printer) const override; + +private: + /// @brief Not copy-constructible. + ValueOption(const ValueOption &o); + /// @brief Not copy-assignable. 
+ ValueOption& operator=(const ValueOption &o); + + bool ProcessTokens(std::list &tokens) override; + + T value_; +}; + +template +bool ValueOption::ProcessTokens(std::list &tokens) { + assert(0 == name_.compare(tokens.front()) && "option name is mismatched"); + if (2 != tokens.size()) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + + is_set_ = true; + tokens.pop_front(); + + std::istringstream token_stream(tokens.front()); + if (!token_stream.good()) { + error() << "error: invalid option: \'" << name_ << '\'' << std::endl; + return false; + } + token_stream >> value_; + tokens.pop_front(); + return true; +} + +template +void ValueOption::PrintHelp(HelpPrinter& printer) const { + printer.PrintUsage("-" + name_ + "=[VAL]") + .PrintDescription(help_); +} + +//===----------------------------------------------------------------------===// +// ChoiceOptioin. // +//===----------------------------------------------------------------------===// +class ChoiceOption final: public OptionBase { +public: + ChoiceOption(const std::string& name, + const std::vector& choices, + const std::string& help = "", + std::ostream& error = std::cerr); + + ~ChoiceOption() {} + + const std::string& value() const { + return value_; + } + +protected: + void PrintHelp(HelpPrinter& printer) const override; + +private: + /// @brief Not copy-constructible. + ChoiceOption(const ChoiceOption&); + /// @brief Not copy-assignable. + ChoiceOption& operator =(const ChoiceOption&); + + bool ProcessTokens(std::list &tokens) override; + + std::unordered_set choices_; + std::string value_; +}; + +//===----------------------------------------------------------------------===// +// Option. 
//
+//===----------------------------------------------------------------------===//
+
+/// @brief Boolean flag option: "-name" sets it, "-name" followed by "1" or
+/// "0" sets or clears it.
+class NoArgOption final: public OptionBase {
+public:
+  explicit NoArgOption(const std::string& name,
+                       const std::string& help = "",
+                       std::ostream& error = std::cerr):
+    OptionBase(name, help, error) {}
+
+  ~NoArgOption() {}
+
+protected:
+  /// @brief Prints the usage line "-name" followed by the help text.
+  void PrintHelp(HelpPrinter& printer) const override {
+    printer.PrintUsage("-" + name_).PrintDescription(help_);
+  }
+
+private:
+  /// @brief Not copy-constructible.
+  NoArgOption(const NoArgOption &o);
+  /// @brief Not copy-assignable.
+  NoArgOption& operator=(const NoArgOption &o);
+
+  /// @brief Parses the flag.
+  ///
+  /// Accepts the option name alone (flag becomes set) or the name followed
+  /// by "1" (set) or "0" (cleared). Any other shape reports an error through
+  /// error() and returns false. In the two-token error case the name token
+  /// has already been removed from @p tokens.
+  bool ProcessTokens(std::list &tokens) override {
+    assert(0 == name_.compare(tokens.front()) && "option name is mismatched");
+    if (1 == tokens.size()) {
+      tokens.pop_front();
+      is_set_ = true;
+      return true;
+    } else if (2 == tokens.size()) {
+      tokens.pop_front();
+      if (tokens.front() == "1") {
+        is_set_ = true;
+        tokens.pop_front();
+        return true;
+      } else if (tokens.front() == "0") {
+        is_set_ = false;
+        tokens.pop_front();
+        return true;
+      }
+    }
+    error() << "error: invalid option: '" << name_ << "'" << std::endl;
+    return false;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// PrefixOption.                                                              //
+//===----------------------------------------------------------------------===//
+/// @brief Option identified by a prefix (stored as the option name) that
+/// collects a vector of values. Matching and parsing are implemented outside
+/// this header.
+class PrefixOption final: public OptionBase {
+public:
+  PrefixOption(const std::string& prefix,
+               const std::string& help = "",
+               std::ostream& error = std::cerr)
+    : OptionBase(prefix, help, error) {}
+
+  ~PrefixOption() {}
+
+  const std::vector& values() const {
+    return values_;
+  }
+
+  bool IsValid() const override;
+
+protected:
+  void PrintHelp(HelpPrinter& printer) const override;
+  bool Accept(const std::string& token) const override;
+
+private:
+  /// @brief Not copy-constructible.
+  PrefixOption(const PrefixOption&);
+  /// @brief Not copy-assignable.
+  PrefixOption& operator =(const PrefixOption&);
+
+  // NOTE(review): the sibling option classes mark ProcessTokens `override`;
+  // this declaration does not - confirm whether that is intentional.
+  bool ProcessTokens(std::list &tokens);
+
+  std::string::size_type FindPrefix(const std::string& token) const;
+
+  std::vector values_;
+};
+
+//===----------------------------------------------------------------------===//
+// OptionParser.                                                              //
+//===----------------------------------------------------------------------===//
+
+/// @brief Holds a set of registered options and parses an option string
+/// against them. Errors are written to the stream given at construction.
+class OptionParser final {
+public:
+  /// @param collectUnknown Initial value of the collect-unknown flag (see
+  /// CollectUnknown()).
+  /// @param error Stream used for error reporting; only its address is kept,
+  /// so it must outlive the parser.
+  explicit OptionParser(bool collectUnknown = false, std::ostream& error = std::cerr)
+    : collectUnknown_(collectUnknown),
+      error_(&error) {}
+
+  ~OptionParser() {}
+
+  bool AddOption(OptionBase *option);
+
+  bool ParseOptions(const char *options);
+
+  const std::string& Unknown() const;
+  void CollectUnknown(bool b) { collectUnknown_ = b; }
+
+  void PrintHelp(std::ostream& out, const std::string& addition = "") const;
+
+  void Reset();
+
+private:
+  /// @brief Not copy-constructible.
+  OptionParser(const OptionParser &op);
+  /// @brief Not copy-assignable.
+  OptionParser& operator=(const OptionParser &op);
+
+  std::ostream& error() { return *error_; }
+
+  std::vector::iterator FindOption(const std::string& name);
+
+  std::vector options_;
+
+  std::string unknownOptions_;
+  bool collectUnknown_;
+
+  std::ostream *error_;
+};
+
+} // namespace options
+} // namespace amd
+
+#endif // AMD_OPTIONS_HPP
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/AMDHSAKernelDescriptor.h b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/AMDHSAKernelDescriptor.h
new file mode 100644
index 0000000000..13ad395be1
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/AMDHSAKernelDescriptor.h
@@ -0,0 +1,263 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
+// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H +#define LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H + +#include +#include + +// Gets offset of specified member in specified type. 
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE*)0)->MEMBER)
+#endif // offsetof
+
+// Creates enumeration entries used for packing bits into integers. Enumeration
+// entries include bit shift amount, bit width, and bit mask.
+#ifndef AMDHSA_BITS_ENUM_ENTRY
+#define AMDHSA_BITS_ENUM_ENTRY(NAME, SHIFT, WIDTH) \
+  NAME ## _SHIFT = (SHIFT),                        \
+  NAME ## _WIDTH = (WIDTH),                        \
+  NAME = (((1 << (WIDTH)) - 1) << (SHIFT))
+#endif // AMDHSA_BITS_ENUM_ENTRY
+
+// Gets bits for specified bit mask from specified source.
+// MSK must be the bare name of a mask enumerator, because it is also pasted
+// into MSK_SHIFT. SRC is parenthesized so that an expression argument such as
+// `a | b` is masked as a whole.
+#ifndef AMDHSA_BITS_GET
+#define AMDHSA_BITS_GET(SRC, MSK) (((SRC) & (MSK)) >> MSK ## _SHIFT)
+#endif // AMDHSA_BITS_GET
+
+// Sets bits for specified bit mask in specified destination.
+// The expansion is wrapped in do { } while (0) so that it behaves as a single
+// statement (for example as the body of an unbraced `if`), and the arguments
+// are parenthesized so that expression arguments keep their meaning. DST and
+// VAL should be free of side effects: DST is expanded twice.
+#ifndef AMDHSA_BITS_SET
+#define AMDHSA_BITS_SET(DST, MSK, VAL)             \
+  do {                                             \
+    (DST) &= ~(MSK);                               \
+    (DST) |= (((VAL) << MSK ## _SHIFT) & (MSK));   \
+  } while (0)
+#endif // AMDHSA_BITS_SET
+
+namespace llvm {
+namespace amdhsa {
+
+// Floating point rounding modes. Must match hardware definition.
+enum : uint8_t {
+  FLOAT_ROUND_MODE_NEAR_EVEN = 0,
+  FLOAT_ROUND_MODE_PLUS_INFINITY = 1,
+  FLOAT_ROUND_MODE_MINUS_INFINITY = 2,
+  FLOAT_ROUND_MODE_ZERO = 3,
+};
+
+// Floating point denorm modes. Must match hardware definition.
+enum : uint8_t {
+  FLOAT_DENORM_MODE_FLUSH_SRC_DST = 0,
+  FLOAT_DENORM_MODE_FLUSH_DST = 1,
+  FLOAT_DENORM_MODE_FLUSH_SRC = 2,
+  FLOAT_DENORM_MODE_FLUSH_NONE = 3,
+};
+
+// System VGPR workitem IDs. Must match hardware definition.
+enum : uint8_t {
+  SYSTEM_VGPR_WORKITEM_ID_X = 0,
+  SYSTEM_VGPR_WORKITEM_ID_X_Y = 1,
+  SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2,
+  SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3,
+};
+
+// Compute program resource register 1. Must match hardware definition.
+// Each COMPUTE_PGM_RSRC1(NAME, SHIFT, WIDTH) entry expands, through
+// AMDHSA_BITS_ENUM_ENTRY, to the enumerators COMPUTE_PGM_RSRC1_NAME_SHIFT,
+// COMPUTE_PGM_RSRC1_NAME_WIDTH and COMPUTE_PGM_RSRC1_NAME (the bit mask).
+// The helper macro is local to this table and is undefined right after it.
+#define COMPUTE_PGM_RSRC1(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC1_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  COMPUTE_PGM_RSRC1(GRANULATED_WORKITEM_VGPR_COUNT, 0, 6),
+  COMPUTE_PGM_RSRC1(GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4),
+  COMPUTE_PGM_RSRC1(PRIORITY, 10, 2),
+  COMPUTE_PGM_RSRC1(FLOAT_ROUND_MODE_32, 12, 2),
+  COMPUTE_PGM_RSRC1(FLOAT_ROUND_MODE_16_64, 14, 2),
+  COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_32, 16, 2),
+  COMPUTE_PGM_RSRC1(FLOAT_DENORM_MODE_16_64, 18, 2),
+  COMPUTE_PGM_RSRC1(PRIV, 20, 1),
+  COMPUTE_PGM_RSRC1(ENABLE_DX10_CLAMP, 21, 1),
+  COMPUTE_PGM_RSRC1(DEBUG_MODE, 22, 1),
+  COMPUTE_PGM_RSRC1(ENABLE_IEEE_MODE, 23, 1),
+  COMPUTE_PGM_RSRC1(BULKY, 24, 1),
+  COMPUTE_PGM_RSRC1(CDBG_USER, 25, 1),
+  COMPUTE_PGM_RSRC1(FP16_OVFL, 26, 1), // GFX9+
+  COMPUTE_PGM_RSRC1(RESERVED0, 27, 2),
+  COMPUTE_PGM_RSRC1(WGP_MODE, 29, 1), // GFX10+
+  COMPUTE_PGM_RSRC1(MEM_ORDERED, 30, 1), // GFX10+
+  COMPUTE_PGM_RSRC1(FWD_PROGRESS, 31, 1), // GFX10+
+};
+#undef COMPUTE_PGM_RSRC1
+
+// Compute program resource register 2. Must match hardware definition.
+// Each COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) entry expands, through
+// AMDHSA_BITS_ENUM_ENTRY, to the enumerators COMPUTE_PGM_RSRC2_NAME_SHIFT,
+// COMPUTE_PGM_RSRC2_NAME_WIDTH and COMPUTE_PGM_RSRC2_NAME (the bit mask).
+#define COMPUTE_PGM_RSRC2(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC2_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  COMPUTE_PGM_RSRC2(ENABLE_PRIVATE_SEGMENT, 0, 1),
+  COMPUTE_PGM_RSRC2(USER_SGPR_COUNT, 1, 5),
+  COMPUTE_PGM_RSRC2(ENABLE_TRAP_HANDLER, 6, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_X, 7, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_SGPR_WORKGROUP_INFO, 10, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_VGPR_WORKITEM_ID, 11, 2),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_MEMORY, 14, 1),
+  COMPUTE_PGM_RSRC2(GRANULATED_LDS_SIZE, 15, 9),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1),
+  COMPUTE_PGM_RSRC2(ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, 30, 1),
+  COMPUTE_PGM_RSRC2(RESERVED0, 31, 1),
+};
+#undef COMPUTE_PGM_RSRC2
+
+// Compute program resource register 3 for GFX90A+. Must match hardware
+// definition.
+// The entries use the same NAME/SHIFT/WIDTH scheme as above, with the
+// COMPUTE_PGM_RSRC3_GFX90A_ prefix.
+#define COMPUTE_PGM_RSRC3_GFX90A(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX90A_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  COMPUTE_PGM_RSRC3_GFX90A(ACCUM_OFFSET, 0, 6),
+  COMPUTE_PGM_RSRC3_GFX90A(RESERVED0, 6, 10),
+  COMPUTE_PGM_RSRC3_GFX90A(TG_SPLIT, 16, 1),
+  COMPUTE_PGM_RSRC3_GFX90A(RESERVED1, 17, 15),
+};
+#undef COMPUTE_PGM_RSRC3_GFX90A
+
+// Compute program resource register 3 for GFX10+. Must match hardware
+// definition.
+// Each entry expands, through AMDHSA_BITS_ENUM_ENTRY, to _SHIFT, _WIDTH and
+// mask enumerators carrying the COMPUTE_PGM_RSRC3_GFX10_PLUS_ prefix.
+#define COMPUTE_PGM_RSRC3_GFX10_PLUS(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_PLUS_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(SHARED_VGPR_COUNT, 0, 4), // GFX10+
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(INST_PREF_SIZE, 4, 6),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_START, 10, 1),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(TRAP_ON_END, 11, 1),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(RESERVED0, 12, 19),
+  COMPUTE_PGM_RSRC3_GFX10_PLUS(IMAGE_OP, 31, 1),
+};
+#undef COMPUTE_PGM_RSRC3_GFX10_PLUS
+
+// Kernel code properties. Must be kept backwards compatible.
+// The entries describe bits of the 16-bit kernel_code_properties field and
+// carry the KERNEL_CODE_PROPERTY_ prefix.
+#define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \
+  AMDHSA_BITS_ENUM_ENTRY(KERNEL_CODE_PROPERTY_ ## NAME, SHIFT, WIDTH)
+enum : int32_t {
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_PTR, 1, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_QUEUE_PTR, 2, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1),
+  KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1),
+  KERNEL_CODE_PROPERTY(RESERVED0, 7, 3),
+  KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+
+  KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1),
+  KERNEL_CODE_PROPERTY(RESERVED1, 12, 4),
+};
+#undef KERNEL_CODE_PROPERTY
+
+// Kernel descriptor. Must be kept backwards compatible.
+// The layout is fixed at 64 bytes; the static_asserts below check the total
+// size and the byte offset of every member against the *_OFFSET constants.
+struct kernel_descriptor_t {
+  uint32_t group_segment_fixed_size;
+  uint32_t private_segment_fixed_size;
+  uint32_t kernarg_size;
+  uint8_t reserved0[4];
+  int64_t kernel_code_entry_byte_offset;
+  uint8_t reserved1[20];
+  uint32_t compute_pgm_rsrc3; // GFX10+ and GFX90A+
+  uint32_t compute_pgm_rsrc1;
+  uint32_t compute_pgm_rsrc2;
+  uint16_t kernel_code_properties;
+  uint8_t reserved2[6];
+};
+
+// Byte offsets of the kernel_descriptor_t members, in declaration order.
+enum : uint32_t {
+  GROUP_SEGMENT_FIXED_SIZE_OFFSET = 0,
+  PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = 4,
+  KERNARG_SIZE_OFFSET = 8,
+  RESERVED0_OFFSET = 12,
+  KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = 16,
+  RESERVED1_OFFSET = 24,
+  COMPUTE_PGM_RSRC3_OFFSET = 44,
+  COMPUTE_PGM_RSRC1_OFFSET = 48,
+  COMPUTE_PGM_RSRC2_OFFSET = 52,
+  KERNEL_CODE_PROPERTIES_OFFSET = 56,
+  RESERVED2_OFFSET = 58,
+};
+
+// Compile-time layout checks: a change to the struct that moves a member or
+// alters the total size fails the build here.
+static_assert(
+    sizeof(kernel_descriptor_t) == 64,
+    "invalid size for kernel_descriptor_t");
+static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) ==
+    GROUP_SEGMENT_FIXED_SIZE_OFFSET,
+    "invalid offset for group_segment_fixed_size");
+static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) ==
+    PRIVATE_SEGMENT_FIXED_SIZE_OFFSET,
+    "invalid offset for private_segment_fixed_size");
+static_assert(offsetof(kernel_descriptor_t, kernarg_size) ==
+    KERNARG_SIZE_OFFSET,
+    "invalid offset for kernarg_size");
+static_assert(offsetof(kernel_descriptor_t, reserved0) == RESERVED0_OFFSET,
+    "invalid offset for reserved0");
+static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) ==
+    KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET,
+    "invalid offset for kernel_code_entry_byte_offset");
+static_assert(offsetof(kernel_descriptor_t, reserved1) == RESERVED1_OFFSET,
+    "invalid offset for reserved1");
+static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) ==
+    COMPUTE_PGM_RSRC3_OFFSET,
+    "invalid offset for compute_pgm_rsrc3");
+static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) ==
+    COMPUTE_PGM_RSRC1_OFFSET,
+    "invalid offset for compute_pgm_rsrc1");
+static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) ==
+    COMPUTE_PGM_RSRC2_OFFSET,
+    "invalid offset for compute_pgm_rsrc2");
+static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) ==
+    KERNEL_CODE_PROPERTIES_OFFSET,
+    "invalid offset for kernel_code_properties");
+static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET,
+    "invalid offset for reserved2");
+
+} // end namespace amdhsa
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_AMDHSAKERNELDESCRIPTOR_H
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/CMakeLists.txt b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/CMakeLists.txt
new file mode 100644
index 0000000000..5bea74d5eb
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/CMakeLists.txt
@@ -0,0 +1,31 @@
+#-----------------------------------------------------------------------------
+# Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+#-----------------------------------------------------------------------------
+
+# loader library
+#
+# This file is expected to be included from top-level CMakeLists.txt.
+# +# Dependencies: +# - Compiler definitions +# - amdhsacode library +# +# Defines: +# - amdhsaloader library and target include directories + +file(GLOB sources *.cpp *.hpp) +add_library(amdhsaloader STATIC ${sources}) +set_target_properties(amdhsaloader PROPERTIES + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" + POSITION_INDEPENDENT_CODE ON +) + +if(CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|(Apple)?Clang)$") + target_compile_options(amdhsaloader PRIVATE + -Werror + -Wno-inconsistent-missing-override + ) +endif() + +target_include_directories(amdhsaloader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(amdhsaloader amdhsacode) diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.cpp new file mode 100644 index 0000000000..74f62bd98b --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.cpp @@ -0,0 +1,1983 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#include "executable.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "amd_hsa_elf.h" +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_code.hpp" +#include "amd_hsa_code_util.hpp" +#include "amd_options.hpp" +#include "AMDHSAKernelDescriptor.h" + +#include + +#if defined(__clang__) || defined(__GNUC__) + #define __loader_attribute__(attr) __attribute__((attr)) + + #define loader_strdup(str1) strdup(str1) +#elif defined(_MSC_VER) + #define __loader_attribute__(attr) __declspec(attr) + + #define loader_strdup(str1) _strdup(str1) +#else + #error "Unknown compiler." 
+#endif
+
+#if !defined(__loader_noinline__)
+  #define __loader_noinline__ __loader_attribute__(noinline)
+#endif
+
+// Minimal store/fence helpers used when publishing the debugger state.
+namespace atomic {
+
+// Stores val to *ptr. GCC/Clang use a relaxed __atomic_store_n.
+// NOTE(review): the MSVC branch is a plain assignment, not an atomic
+// operation - confirm this is sufficient for the types it is used with.
+template
+inline void store_relaxed(T *ptr, T val) {
+#if defined(__clang__) || defined(__GNUC__)
+  __atomic_store_n(ptr, val, __ATOMIC_RELAXED);
+#elif defined(_MSC_VER)
+  *ptr = val;
+#else
+  #error "Unknown compiler."
+#endif
+}
+
+// Stores val to *ptr with release ordering. GCC/Clang use __atomic_store_n
+// with __ATOMIC_RELEASE; the MSVC branch issues an acq_rel thread fence and
+// then performs a plain assignment.
+template
+inline void store_release(T *ptr, T val) {
+#if defined(__clang__) || defined(__GNUC__)
+  __atomic_store_n(ptr, val, __ATOMIC_RELEASE);
+#elif defined(_MSC_VER)
+  std::atomic_thread_fence(std::memory_order_acq_rel);
+  *ptr = val;
+#else
+  #error "Unknown compiler."
+#endif
+}
+
+// Issues a std::atomic_thread_fence with acquire-release ordering.
+inline void thread_fence_acquire_release() {
+  std::atomic_thread_fence(std::memory_order_acq_rel);
+}
+
+} // namespace atomic
+
+using namespace amd::hsa;
+using namespace amd::hsa::common;
+
+// Having a side effect prevents call site optimization that allows removal of a noinline function call
+// with no side effect.
+// The function's address is stored in _amdgpu_r_debug below, and the loader
+// calls it around every update of the debug map.
+__loader_noinline__ static void _loader_debug_state() {
+  static volatile int function_needs_a_side_effect = 0;
+  function_needs_a_side_effect ^= 1;
+}
+// r_version history:
+// 1: Initial debug protocol
+// 2: New trap handler ABI. The reason for halting a wave is recorded in ttmp11[8:7].
+// 3: New trap handler ABI. A wave halted at S_ENDPGM rewinds its PC by 8 bytes, and sets ttmp11[9]=1.
+// 4: New trap handler ABI. Save the trap id in ttmp11[16:9]
+// 5: New trap handler ABI. Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped
+// 6: New trap handler ABI. ttmp6[25:0] contains dispatch index modulo queue size
+// 7: New trap handler ABI. Send interrupts as a bitmask, coalescing concurrent exceptions.
+// 8: New trap handler ABI for gfx940: Initialize ttmp[4:5] if ttmp11[31] == 0.
+HSA_API r_debug _amdgpu_r_debug = {8,
+                                   nullptr,
+                                   reinterpret_cast(&_loader_debug_state),
+                                   r_debug::RT_CONSISTENT,
+                                   0};
+HSA_API r_debug *_amdgpu_r_debug_ptr = &_amdgpu_r_debug;
+// Last entry of the list rooted at _amdgpu_r_debug.r_map, or nullptr when the
+// list is empty.
+static link_map* r_debug_tail = nullptr;
+
+namespace amd {
+namespace hsa {
+namespace loader {
+
+/// @brief Bundles the loader's command-line style options with the parser
+/// they are registered in.
+class LoaderOptions {
+public:
+  explicit LoaderOptions(std::ostream &error = std::cerr);
+
+  const amd::options::NoArgOption* Help() const { return &help; }
+  const amd::options::NoArgOption* DumpCode() const { return &dump_code; }
+  const amd::options::NoArgOption* DumpIsa() const { return &dump_isa; }
+  const amd::options::NoArgOption* DumpExec() const { return &dump_exec; }
+  const amd::options::NoArgOption* DumpAll() const { return &dump_all; }
+  const amd::options::ValueOption* DumpDir() const { return &dump_dir; }
+  const amd::options::PrefixOption* Substitute() const { return &substitute; }
+
+  bool ParseOptions(const std::string& options);
+  void Reset();
+  void PrintHelp(std::ostream& out) const;
+
+private:
+  /// @brief Copy constructor - not available.
+  LoaderOptions(const LoaderOptions&);
+
+  /// @brief Assignment operator - not available.
+  LoaderOptions& operator=(const LoaderOptions&);
+
+  amd::options::NoArgOption help;
+  amd::options::NoArgOption dump_code;
+  amd::options::NoArgOption dump_isa;
+  amd::options::NoArgOption dump_exec;
+  amd::options::NoArgOption dump_all;
+  amd::options::ValueOption dump_dir;
+  amd::options::PrefixOption substitute;
+  amd::options::OptionParser option_parser;
+};
+
+/// @brief Creates the options and registers each of them with the parser.
+///
+/// @param error Stream handed to the option parser. The individual options
+/// are constructed without it and therefore use their default error stream.
+LoaderOptions::LoaderOptions(std::ostream& error) :
+  help("help", "print help"),
+  dump_code("dump-code", "Dump finalizer output code object"),
+  dump_isa("dump-isa", "Dump finalizer output to ISA text file"),
+  dump_exec("dump-exec", "Dump executable to text file"),
+  dump_all("dump-all", "Dump all finalizer input and output (as above)"),
+  dump_dir("dump-dir", "Dump directory"),
+  substitute("substitute", "Substitute code object with given index or index range on loading from file"),
+  option_parser(false, error)
+{
+  option_parser.AddOption(&help);
+  option_parser.AddOption(&dump_code);
+  option_parser.AddOption(&dump_isa);
+  option_parser.AddOption(&dump_exec);
+  option_parser.AddOption(&dump_all);
+  option_parser.AddOption(&dump_dir);
+  option_parser.AddOption(&substitute);
+}
+
+/// @brief Forwards the option string to the parser.
+bool LoaderOptions::ParseOptions(const std::string& options)
+{
+  return option_parser.ParseOptions(options.c_str());
+}
+
+/// @brief Forwards to OptionParser::Reset().
+void LoaderOptions::Reset()
+{
+  option_parser.Reset();
+}
+
+/// @brief Writes the parser's help text to @p out.
+void LoaderOptions::PrintHelp(std::ostream& out) const
+{
+  option_parser.PrintHelp(out);
+}
+
+static const char *LOADER_DUMP_PREFIX = "amdcode";
+
+/// @brief Allocates a new AmdHsaCodeLoader for @p context. Release it with
+/// Loader::Destroy().
+Loader* Loader::Create(Context* context)
+{
+  return new AmdHsaCodeLoader(context);
+}
+
+/// @brief Clears the global debug map state and deletes @p loader.
+void Loader::Destroy(Loader *loader)
+{
+  // Loader resets the link_map, but the executables and loaded code objects are not deleted.
+  _amdgpu_r_debug.r_map = nullptr;
+  _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT;
+  r_debug_tail = nullptr;
+  delete loader;
+}
+
+/// @brief Creates an executable and appends it to the loader's list.
+///
+/// The new executable's id is its index in `executables`. @p options is not
+/// used by this function.
+Executable* AmdHsaCodeLoader::CreateExecutable(
+  hsa_profile_t profile, const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode)
+{
+  WriterLockGuard writer_lock(rw_lock_);
+
+  executables.push_back(new ExecutableImpl(profile, context, executables.size(), default_float_rounding_mode));
+  return executables.back();
+}
+
+// Appends map to the end of the doubly linked list rooted at
+// _amdgpu_r_debug.r_map and makes it the new tail.
+static void AddCodeObjectInfoIntoDebugMap(link_map* map) {
+  if (r_debug_tail) {
+    r_debug_tail->l_next = map;
+    map->l_prev = r_debug_tail;
+    map->l_next = nullptr;
+  } else {
+    _amdgpu_r_debug.r_map = map;
+    map->l_prev = nullptr;
+    map->l_next = nullptr;
+  }
+  r_debug_tail = map;
+}
+
+// Unlinks map from the debug list, fixing up head and tail as needed, then
+// frees its name and zeroes the entry.
+static void RemoveCodeObjectInfoFromDebugMap(link_map* map) {
+  if (r_debug_tail == map) {
+    r_debug_tail = map->l_prev;
+  }
+  if (_amdgpu_r_debug.r_map == map) {
+    _amdgpu_r_debug.r_map = map->l_next;
+  }
+
+  if (map->l_prev) {
+    map->l_prev->l_next = map->l_next;
+  }
+  if (map->l_next) {
+    map->l_next->l_prev = map->l_prev;
+  }
+
+  free(map->l_name);
+  memset(map, 0, sizeof(link_map));
+}
+
+/// @brief Freezes @p executable and publishes its loaded code objects in the
+/// debug map.
+///
+/// r_state is set to RT_ADD while the list is modified and back to
+/// RT_CONSISTENT afterwards; _loader_debug_state() is called after each state
+/// change.
+///
+/// @returns The status of Executable::Freeze() if it fails, otherwise
+/// HSA_STATUS_SUCCESS.
+hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const char *options) {
+  hsa_status_t status = executable->Freeze(options);
+  if (status != HSA_STATUS_SUCCESS) {
+    return status;
+  }
+
+  // Assuming runtime atomic implements C++ std::memory_order
+  WriterLockGuard writer_lock(rw_lock_);
+  atomic::store_relaxed(&_amdgpu_r_debug.r_state, r_debug::RT_ADD);
+  atomic::thread_fence_acquire_release();
+  _loader_debug_state();
+  atomic::thread_fence_acquire_release();
+  for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) {
+    AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info));
+  }
+  atomic::store_release(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT);
+  _loader_debug_state();
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Removes the executable's code objects from the debug map, then
+/// deletes the executable.
+///
+/// The executable's slot in `executables` is set to nullptr rather than
+/// erased, so the ids (indices) of the other executables stay valid.
+void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) {
+  // Assuming runtime atomic implements C++ std::memory_order
+  WriterLockGuard writer_lock(rw_lock_);
+  atomic::store_relaxed(&_amdgpu_r_debug.r_state, r_debug::RT_DELETE);
+  atomic::thread_fence_acquire_release();
+  _loader_debug_state();
+  atomic::thread_fence_acquire_release();
+  for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) {
+    RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info));
+  }
+  atomic::store_release(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT);
+  _loader_debug_state();
+
+  executables[((ExecutableImpl*)executable)->id()] = nullptr;
+  delete executable;
+}
+
+/// @brief Invokes @p callback for every live executable.
+///
+/// Iteration stops at the first callback that does not return
+/// HSA_STATUS_SUCCESS, and that status is returned.
+///
+/// @param callback Function called with each executable handle and @p data.
+/// @param data Opaque pointer passed through to @p callback.
+hsa_status_t AmdHsaCodeLoader::IterateExecutables(
+  hsa_status_t (*callback)(
+    hsa_executable_t executable,
+    void *data),
+  void *data)
+{
+  WriterLockGuard writer_lock(rw_lock_);
+  assert(callback);
+
+  for (auto &exec : executables) {
+    // DestroyExecutable() leaves a null slot behind; skip it so the callback
+    // is only invoked for executables that still exist.
+    if (!exec) {
+      continue;
+    }
+    hsa_status_t status = callback(Executable::Handle(exec), data);
+    if (status != HSA_STATUS_SUCCESS) {
+      return status;
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Reports the segment descriptors of all live executables.
+///
+/// Two-call protocol: with *num_segment_descriptors == 0 and a null array the
+/// required count is written back; otherwise the count must match exactly and
+/// the array is filled.
+///
+/// @returns HSA_STATUS_ERROR_INVALID_ARGUMENT for a null count pointer or an
+/// inconsistent count/array pair, HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS
+/// when the supplied count differs from the actual one, otherwise
+/// HSA_STATUS_SUCCESS.
+hsa_status_t AmdHsaCodeLoader::QuerySegmentDescriptors(
+  hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors,
+  size_t *num_segment_descriptors)
+{
+  if (!num_segment_descriptors) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  if (*num_segment_descriptors == 0 && segment_descriptors) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+  if (*num_segment_descriptors != 0 && !segment_descriptors) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  this->EnableReadOnlyMode();
+
+  size_t actual_num_segment_descriptors = 0;
+  for (auto &executable : executables) {
+    if (executable) {
+      actual_num_segment_descriptors += executable->GetNumSegmentDescriptors();
+    }
+  }
+
+  if (*num_segment_descriptors == 0) {
+    *num_segment_descriptors = actual_num_segment_descriptors;
+    this->DisableReadOnlyMode();
+    return HSA_STATUS_SUCCESS;
+  }
+  if (*num_segment_descriptors != actual_num_segment_descriptors) {
+    this->DisableReadOnlyMode();
+    return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS;
+  }
+
+  size_t i = 0;
+  for (auto &executable : executables) {
+    if (executable) {
+      i += executable->QuerySegmentDescriptors(segment_descriptors, actual_num_segment_descriptors, i);
+    }
+  }
+
+  this->DisableReadOnlyMode();
+  return HSA_STATUS_SUCCESS;
+}
+
+/// @brief Asks each live executable to translate @p device_address and
+/// returns the first non-zero result, or 0 when none matches (or the address
+/// is 0).
+uint64_t AmdHsaCodeLoader::FindHostAddress(uint64_t device_address)
+{
+  ReaderLockGuard reader_lock(rw_lock_);
+  if (device_address == 0) {
+    return 0;
+  }
+
+  for (auto &exec : executables) {
+    if (exec != nullptr) {
+      uint64_t host_address = exec->FindHostAddress(device_address);
+      if (host_address != 0) {
+        return host_address;
+      }
+    }
+  }
+  return 0;
+}
+
+/// @brief Prints the help text of a default-constructed LoaderOptions.
+void AmdHsaCodeLoader::PrintHelp(std::ostream& out)
+{
+  LoaderOptions().PrintHelp(out);
+}
+
+/// @brief Takes the loader's reader lock, then enables read-only mode on
+/// every live executable.
+void AmdHsaCodeLoader::EnableReadOnlyMode()
+{
+  rw_lock_.ReaderLock();
+  for (auto &executable : executables) {
+    if (executable) {
+      ((ExecutableImpl*)executable)->EnableReadOnlyMode();
+    }
+  }
+}
+
+/// @brief Releases the loader's reader lock, then disables read-only mode on
+/// every live executable.
+void AmdHsaCodeLoader::DisableReadOnlyMode()
+{
+  rw_lock_.ReaderUnlock();
+  for (auto &executable : executables) {
+    if (executable) {
+      ((ExecutableImpl*)executable)->DisableReadOnlyMode();
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SymbolImpl.
//
+//===----------------------------------------------------------------------===//
+
+/// @brief Writes the requested symbol attribute into @p value.
+///
+/// @param symbol_info Attribute selector. The static_asserts below check that
+/// the code-symbol and executable-symbol selectors for the shared attributes
+/// have the same numeric value, so either spelling may be passed.
+/// @param value Destination buffer; must not be null and must be large enough
+/// for the attribute. For the NAME and MODULE_NAME attributes exactly
+/// size() bytes are copied and no terminating NUL is written.
+/// @returns true when the attribute was written; false for an unknown
+/// selector, or for the address/agent attributes when the symbol is not
+/// loaded.
+bool SymbolImpl::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_TYPE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_TYPE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_NAME_LENGTH) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_NAME) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_NAME)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_MODULE_NAME) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_LINKAGE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_IS_DEFINITION) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION)),
+    "attributes are not compatible"
+  );
+
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_TYPE: {
+      *((hsa_symbol_kind_t*)value) = kind;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_NAME_LENGTH: {
+      *((uint32_t*)value) = symbol_name.size();
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_NAME: {
+      // memcpy overwrites the whole range, so no prior zero-fill is needed.
+      memcpy(value, symbol_name.c_str(), symbol_name.size());
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH: {
+      *((uint32_t*)value) = module_name.size();
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_MODULE_NAME: {
+      // memcpy overwrites the whole range, so no prior zero-fill is needed.
+      memcpy(value, module_name.c_str(), module_name.size());
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_LINKAGE: {
+      *((hsa_symbol_linkage_t*)value) = linkage;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_IS_DEFINITION: {
+      *((bool*)value) = is_definition;
+      break;
+    }
+    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION: {
+      *((uint32_t*)value) = 0;
+      break;
+    }
+    case HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT:
+    case HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS: {
+      // The address is only meaningful once the symbol has been loaded.
+      if (!is_loaded) {
+        return false;
+      }
+      *((uint64_t*)value) = address;
+      break;
+    }
+    case HSA_EXECUTABLE_SYMBOL_INFO_AGENT: {
+      if (!is_loaded) {
+        return false;
+      }
+      *((hsa_agent_t*)value) = agent;
+      break;
+    }
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// KernelSymbol.
//
+//===----------------------------------------------------------------------===//
+
+// Writes a kernel-specific attribute into |value|; any attribute not listed
+// here is forwarded to SymbolImpl::GetInfo. Returns true when the attribute
+// was written, otherwise whatever the base implementation returns.
+bool KernelSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  // Code-symbol and executable-symbol kernel attributes share this switch, so
+  // their numeric values must agree.
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK)),
+    "attributes are not compatible"
+  );
+
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE: {
+      *((uint32_t*)value) = kernarg_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT: {
+      *((uint32_t*)value) = kernarg_segment_alignment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE: {
+      *((uint32_t*)value) = group_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE: {
+      *((uint32_t*)value) = private_segment_size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK: {
+      *((bool*)value) = is_dynamic_callstack;
+      break;
+    }
+    case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE: {
+      *((uint32_t*)value) = size;
+      break;
+    }
+    case HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN: {
+      *((uint32_t*)value) = alignment;
+      break;
+    }
+    default: {
+      // Not a kernel-specific attribute: let the base class handle it.
+      return SymbolImpl::GetInfo(symbol_info, value);
+    }
+  }
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// VariableSymbol.                                                            //
+//===----------------------------------------------------------------------===//
+
+// Writes a variable-specific attribute into |value|; any attribute not listed
+// here is forwarded to SymbolImpl::GetInfo. Returns true when the attribute
+// was written, otherwise whatever the base implementation returns.
+bool VariableSymbol::GetInfo(hsa_symbol_info32_t symbol_info, void *value) {
+  // Code-symbol and executable-symbol variable attributes share this switch,
+  // so their numeric values must agree.
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE)),
+    "attributes are not compatible"
+  );
+  static_assert(
+    (symbol_attribute32_t(HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST) ==
+     symbol_attribute32_t(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST)),
+    "attributes are not compatible"
+  );
+
+  // Same precondition as SymbolImpl::GetInfo and KernelSymbol::GetInfo: the
+  // cases below dereference |value| before the base class gets to check it.
+  assert(value);
+
+  switch (symbol_info) {
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION: {
+      *((hsa_variable_allocation_t*)value) = allocation;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT: {
+      *((hsa_variable_segment_t*)value) = segment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT: {
+      *((uint32_t*)value) = alignment;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE: {
+      *((uint32_t*)value) = size;
+      break;
+    }
+    case HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST: {
+      *((bool*)value) = is_constant;
+      break;
+    }
+    default: {
+      // Not a variable-specific attribute: let the base class handle it.
+      return
SymbolImpl::GetInfo(symbol_info, value);
+    }
+  }
+
+  return true;
+}
+
+// Writes the requested loaded-code-object attribute into |value|. Returns
+// false for attributes this implementation does not handle.
+bool LoadedCodeObjectImpl::GetInfo(amd_loaded_code_object_info_t attribute, void *value)
+{
+  assert(value);
+
+  switch (attribute) {
+    case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE:
+      // The ELF image pointer is handed out as an opaque integer handle.
+      ((hsa_code_object_t*)value)->handle = reinterpret_cast<uint64_t>(elf_data);
+      break;
+    case AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE:
+      *((size_t*)value) = elf_size;
+      break;
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Invokes |callback| once per loaded segment, passing |data| through.
+// Iteration stops at, and returns, the first status other than
+// HSA_STATUS_SUCCESS.
+hsa_status_t LoadedCodeObjectImpl::IterateLoadedSegments(
+  hsa_status_t (*callback)(
+    amd_loaded_segment_t loaded_segment,
+    void *data),
+  void *data)
+{
+  assert(callback);
+
+  for (auto &loaded_segment : loaded_segments) {
+    hsa_status_t status = callback(LoadedSegment::Handle(loaded_segment), data);
+    if (status != HSA_STATUS_SUCCESS) {
+      return status;
+    }
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Writes a one-line description of this object to |out|.
+void LoadedCodeObjectImpl::Print(std::ostream& out)
+{
+  out << "Code Object" << std::endl;
+}
+
+// Writes the requested segment attribute into |value|. Returns false for
+// attributes this implementation does not handle.
+bool Segment::GetInfo(amd_loaded_segment_info_t attribute, void *value)
+{
+  assert(value);
+
+  switch (attribute) {
+    case AMD_LOADED_SEGMENT_INFO_TYPE: {
+      *((amdgpu_hsa_elf_segment_t*)value) = segment;
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_ELF_BASE_ADDRESS: {
+      *((uint64_t*)value) = vaddr;
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS: {
+      // Address of the segment's first byte as resolved by the context.
+      *((uint64_t*)value) = reinterpret_cast<uint64_t>(this->Address(this->VAddr()));
+      break;
+    }
+    case AMD_LOADED_SEGMENT_INFO_SIZE: {
+      *((size_t*)value) = size;
+      break;
+    }
+    default: {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Converts an ELF virtual address inside this segment into a byte offset
+// from the segment's base virtual address.
+uint64_t Segment::Offset(uint64_t addr)
+{
+  assert(IsAddressInSegment(addr));
+  return addr - vaddr;
+}
+
+// Asks the owning executable's context for the address that corresponds to
+// ELF virtual address |addr| within this segment.
+void* Segment::Address(uint64_t addr)
+{
+  return owner->context()->SegmentAddress(segment, agent, ptr, Offset(addr));
+}
+
+// Freezes the segment through the context the first time it is called and
+// caches the result in |frozen|; returns true if the segment is frozen.
+bool Segment::Freeze()
+{
+  return !frozen ? (frozen = owner->context()->SegmentFreeze(segment, agent, ptr, size)) : true;
+}
+
+// Returns true when |addr| lies in [vaddr, vaddr + size).
+bool Segment::IsAddressInSegment(uint64_t addr)
+{
+  // Compare the offset rather than computing vaddr + size, which could wrap
+  // around for segments placed near the top of the address space.
+  return vaddr <= addr && addr - vaddr < size;
+}
+
+// Copies |size| bytes from |src| into the segment at ELF virtual address
+// |addr|. A zero-length copy is a no-op.
+void Segment::Copy(uint64_t addr, const void* src, size_t size)
+{
+  // loader must do copies before freezing.
+  assert(!frozen);
+
+  if (size > 0) {
+    owner->context()->SegmentCopy(segment, agent, ptr, Offset(addr), src, size);
+  }
+}
+
+// Writes a short human-readable description of the segment to |out|.
+void Segment::Print(std::ostream& out)
+{
+  out << "Segment" << std::endl
+      << " Type: " << AmdHsaElfSegmentToString(segment)
+      << " Size: " << size
+      << " VAddr: " << vaddr << std::endl
+      << " Ptr: " << std::hex << ptr << std::dec
+      << std::endl;
+}
+
+// Returns the segment's memory to the context.
+void Segment::Destroy()
+{
+  owner->context()->SegmentFree(segment, agent, ptr, size);
+}
+
+//===----------------------------------------------------------------------===//
+// ExecutableImpl.                                                            //
+//===----------------------------------------------------------------------===//
+
+// Creates an empty, unfrozen executable bound to |context|.
+ExecutableImpl::ExecutableImpl(
+    const hsa_profile_t &_profile,
+    Context *context,
+    size_t id,
+    hsa_default_float_rounding_mode_t default_float_rounding_mode)
+  : Executable()
+  , profile_(_profile)
+  , context_(context)
+  , id_(id)
+  , default_float_rounding_mode_(default_float_rounding_mode)
+  , state_(HSA_EXECUTABLE_STATE_UNFROZEN)
+  , program_allocation_segment(nullptr)
+{
+}
+
+// Destroys and deletes every object the executable owns, then deletes all
+// program and agent symbols.
+ExecutableImpl::~ExecutableImpl() {
+  for (ExecutableObject* o : objects) {
+    o->Destroy();
+    delete o;
+  }
+  objects.clear();
+
+  for (auto &symbol_entry : program_symbols_) {
+    delete symbol_entry.second;
+  }
+  for (auto &symbol_entry : agent_symbols_) {
+    delete symbol_entry.second;
+  }
+}
+
+// Registers an externally allocated program-scope variable named |name| at
+// |address|. Fails if the executable is frozen or the name is already defined.
+hsa_status_t ExecutableImpl::DefineProgramExternalVariable(
+  const char *name, void *address)
+{
+  WriterLockGuard writer_lock(rw_lock_);
+  assert(name);
+
+  if (HSA_EXECUTABLE_STATE_FROZEN == state_) {
+    return HSA_STATUS_ERROR_FROZEN_EXECUTABLE;
+  }
+
+  auto symbol_entry = program_symbols_.find(std::string(name));
+  if (symbol_entry !=
program_symbols_.end()) { + return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED; + } + + program_symbols_.insert( + std::make_pair(std::string(name), + new VariableSymbol(true, + "", // Only program linkage symbols can be + // defined. + std::string(name), + HSA_SYMBOL_LINKAGE_PROGRAM, + true, + HSA_VARIABLE_ALLOCATION_PROGRAM, + HSA_VARIABLE_SEGMENT_GLOBAL, + 0, // TODO: size. + 0, // TODO: align. + false, // TODO: const. + true, + reinterpret_cast(address)))); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::DefineAgentExternalVariable( + const char *name, + hsa_agent_t agent, + hsa_variable_segment_t segment, + void *address) +{ + WriterLockGuard writer_lock(rw_lock_); + assert(name); + + if (HSA_EXECUTABLE_STATE_FROZEN == state_) { + return HSA_STATUS_ERROR_FROZEN_EXECUTABLE; + } + + auto symbol_entry = agent_symbols_.find(std::make_pair(std::string(name), agent)); + if (symbol_entry != agent_symbols_.end()) { + return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED; + } + + auto insert_status = agent_symbols_.insert( + std::make_pair(std::make_pair(std::string(name), agent), + new VariableSymbol(true, + "", // Only program linkage symbols can be + // defined. + std::string(name), + HSA_SYMBOL_LINKAGE_PROGRAM, + true, + HSA_VARIABLE_ALLOCATION_AGENT, + segment, + 0, // TODO: size. + 0, // TODO: align. + false, // TODO: const. 
+ true, + reinterpret_cast(address)))); + assert(insert_status.second); + insert_status.first->second->agent = agent; + + return HSA_STATUS_SUCCESS; +} + +bool ExecutableImpl::IsProgramSymbol(const char *symbol_name) { + assert(symbol_name); + + ReaderLockGuard reader_lock(rw_lock_); + return program_symbols_.find(std::string(symbol_name)) != program_symbols_.end(); +} + +Symbol* ExecutableImpl::GetSymbol( + const char *symbol_name, + const hsa_agent_t *agent) +{ + ReaderLockGuard reader_lock(rw_lock_); + return this->GetSymbolInternal(symbol_name, agent); +} + +Symbol* ExecutableImpl::GetSymbolInternal( + const char *symbol_name, + const hsa_agent_t *agent) +{ + assert(symbol_name); + + std::string mangled_name = std::string(symbol_name); + if (mangled_name.empty()) { + return nullptr; + } + + if (!agent) { + auto program_symbol = program_symbols_.find(mangled_name); + if (program_symbol != program_symbols_.end()) { + return program_symbol->second; + } + return nullptr; + } + + auto agent_symbol = agent_symbols_.find(std::make_pair(mangled_name, *agent)); + if (agent_symbol != agent_symbols_.end()) { + return agent_symbol->second; + } + return nullptr; +} + +hsa_status_t ExecutableImpl::IterateSymbols( + iterate_symbols_f callback, void *data) +{ + ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &symbol_entry : program_symbols_) { + hsa_status_t hsc = + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + if (HSA_STATUS_SUCCESS != hsc) { + return hsc; + } + } + for (auto &symbol_entry : agent_symbols_) { + hsa_status_t hsc = + callback(Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + if (HSA_STATUS_SUCCESS != hsc) { + return hsc; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::IterateAgentSymbols( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data) { + 
ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &symbol_entry : agent_symbols_) { + if (symbol_entry.second->GetAgent().handle != agent.handle) { + continue; + } + + hsa_status_t status = callback( + Executable::Handle(this), agent, Symbol::Handle(symbol_entry.second), + data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::IterateProgramSymbols( + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data) { + ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &symbol_entry : program_symbols_) { + hsa_status_t status = callback( + Executable::Handle(this), Symbol::Handle(symbol_entry.second), data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::IterateLoadedCodeObjects( + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data) +{ + ReaderLockGuard reader_lock(rw_lock_); + assert(callback); + + for (auto &loaded_code_object : loaded_code_objects) { + hsa_status_t status = callback( + Executable::Handle(this), + LoadedCodeObject::Handle(loaded_code_object), + data); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + } + + return HSA_STATUS_SUCCESS; +} + +size_t ExecutableImpl::GetNumSegmentDescriptors() +{ + // assuming we are in readonly mode. + size_t actual_num_segment_descriptors = 0; + for (auto &obj : loaded_code_objects) { + actual_num_segment_descriptors += obj->LoadedSegments().size(); + } + return actual_num_segment_descriptors; +} + +size_t ExecutableImpl::QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) +{ + // assuming we are in readonly mode. 
+ assert(segment_descriptors); + assert(first_empty_segment_descriptor < total_num_segment_descriptors); + + size_t i = first_empty_segment_descriptor; + for (auto &obj : loaded_code_objects) { + assert(i < total_num_segment_descriptors); + for (auto &seg : obj->LoadedSegments()) { + segment_descriptors[i].agent = seg->Agent(); + segment_descriptors[i].executable = Executable::Handle(seg->Owner()); + segment_descriptors[i].code_object_storage_type = HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY; + segment_descriptors[i].code_object_storage_base = obj->ElfData(); + segment_descriptors[i].code_object_storage_size = obj->ElfSize(); + segment_descriptors[i].code_object_storage_offset = seg->StorageOffset(); + segment_descriptors[i].segment_base = seg->Address(seg->VAddr()); + segment_descriptors[i].segment_size = seg->Size(); + ++i; + } + } + + return i - first_empty_segment_descriptor; +} + +hsa_agent_t LoadedCodeObjectImpl::getAgent() const { + assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return loaded_segments.front()->Agent(); +} +hsa_executable_t LoadedCodeObjectImpl::getExecutable() const { + assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return Executable::Handle(loaded_segments.front()->Owner()); +} +uint64_t LoadedCodeObjectImpl::getElfData() const { + return reinterpret_cast(elf_data); +} +uint64_t LoadedCodeObjectImpl::getElfSize() const { + return (uint64_t)elf_size; +} +uint64_t LoadedCodeObjectImpl::getStorageOffset() const { + assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return (uint64_t)loaded_segments.front()->StorageOffset(); +} +uint64_t LoadedCodeObjectImpl::getLoadBase() const { + // TODO Add support for code objects with 0 segments. 
+ assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return reinterpret_cast(loaded_segments.front()->Address(0)); +} +uint64_t LoadedCodeObjectImpl::getLoadSize() const { + // TODO Add support for code objects with 0 or >1 segments. + assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return (uint64_t)loaded_segments.front()->Size(); +} +int64_t LoadedCodeObjectImpl::getDelta() const { + // TODO Add support for code objects with 0 segments. + assert(loaded_segments.size() == 1 && "Only supports code objects v2+"); + return getLoadBase() - loaded_segments.front()->VAddr(); +} + +std::string LoadedCodeObjectImpl::getUri() const { + return std::string(r_debug_info.l_name); +} + +hsa_executable_t AmdHsaCodeLoader::FindExecutable(uint64_t device_address) +{ + hsa_executable_t execHandle = {0}; + ReaderLockGuard reader_lock(rw_lock_); + if (device_address == 0) { + return execHandle; + } + + for (auto &exec : executables) { + if (exec != nullptr) { + uint64_t host_address = exec->FindHostAddress(device_address); + if (host_address != 0) { + return Executable::Handle(exec); + } + } + } + return execHandle; +} + +uint64_t ExecutableImpl::FindHostAddress(uint64_t device_address) +{ + for (auto &obj : loaded_code_objects) { + assert(obj); + for (auto &seg : obj->LoadedSegments()) { + assert(seg); + uint64_t paddr = (uint64_t)(uintptr_t)seg->Address(seg->VAddr()); + if (paddr <= device_address && device_address < paddr + seg->Size()) { + void *haddr = context_->SegmentHostAddress( + seg->ElfSegment(), seg->Agent(), seg->Ptr(), device_address - paddr); + return nullptr == haddr ? 
0 : (uint64_t)(uintptr_t)haddr; + } + } + } + return 0; +} + +void ExecutableImpl::EnableReadOnlyMode() +{ + rw_lock_.ReaderLock(); +} + +void ExecutableImpl::DisableReadOnlyMode() +{ + rw_lock_.ReaderUnlock(); +} + +#define HSAERRCHECK(hsc) \ + if (hsc != HSA_STATUS_SUCCESS) { \ + assert(false); \ + return hsc; \ + } \ + + +hsa_status_t ExecutableImpl::GetInfo( + hsa_executable_info_t executable_info, void *value) +{ + ReaderLockGuard reader_lock(rw_lock_); + + assert(value); + + switch (executable_info) { + case HSA_EXECUTABLE_INFO_PROFILE: { + *((hsa_profile_t*)value) = profile_;; + break; + } + case HSA_EXECUTABLE_INFO_STATE: { + *((hsa_executable_state_t*)value) = state_; + break; + } + case HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE: { + *((hsa_default_float_rounding_mode_t*)value) = + default_float_rounding_mode_; + break; + } + default: { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + } + + return HSA_STATUS_SUCCESS; +} + +static uint32_t NextCodeObjectNum() +{ + static std::atomic_uint_fast32_t dumpN(1); + return dumpN++; +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + hsa_loaded_code_object_t *loaded_code_object) +{ + return LoadCodeObject(agent, code_object, 0, options, "", loaded_code_object); +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + hsa_loaded_code_object_t *loaded_code_object) +{ + return LoadCodeObject(agent, code_object, code_object_size, options, "", loaded_code_object); +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object) +{ + return LoadCodeObject(agent, code_object, 0, options, uri, loaded_code_object); +} + +hsa_status_t ExecutableImpl::LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t 
code_object, + size_t code_object_size, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object) +{ + WriterLockGuard writer_lock(rw_lock_); + if (HSA_EXECUTABLE_STATE_FROZEN == state_) { + logger_ << "LoaderError: executable is already frozen\n"; + return HSA_STATUS_ERROR_FROZEN_EXECUTABLE; + } + + LoaderOptions loaderOptions; + if (options && !loaderOptions.ParseOptions(options)) { + return HSA_STATUS_ERROR; + } + + const char *options_append = getenv("LOADER_OPTIONS_APPEND"); + if (options_append && !loaderOptions.ParseOptions(options_append)) { + return HSA_STATUS_ERROR; + } + + typedef std::tuple Substitute; + std::vector substitutes; + + for (const std::string& s : loaderOptions.Substitute()->values()) { + std::string::size_type vi = s.find('='); + if (vi == std::string::npos) { return HSA_STATUS_ERROR; } + std::string value = s.substr(vi + 1); + std::string range = s.substr(0, vi); + std::string::size_type mi = range.find('-'); + uint32_t n1 = UINT32_MAX, n2 = UINT32_MAX; + if (mi != std::string::npos) { + std::string s1, s2; + s1 = range.substr(0, mi - 1); + s2 = range.substr(mi + 1); + std::istringstream is1(s1); is1 >> n1; + std::istringstream is2(s2); is2 >> n2; + } else { + std::istringstream is(range); is >> n1; + n2 = n1; + } + substitutes.push_back(std::make_tuple(n1, n2, value)); + } + + uint32_t codeNum = NextCodeObjectNum(); + + code.reset(new code::AmdHsaCode()); + + std::string substituteFileName; + for (const Substitute& ss : substitutes) { + if (codeNum >= std::get<0>(ss) && codeNum <= std::get<1>(ss)) { + substituteFileName = std::get<2>(ss); + break; + } + } + std::vector buffer; + if (substituteFileName.empty()) { + if (!code->InitAsHandle(code_object)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + } else { + if (!ReadFileIntoBuffer(substituteFileName, buffer)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (!code->InitAsBuffer(&buffer[0], buffer.size())) { + return 
HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + } + + if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpCode()->is_set()) { + if (!code->SaveToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "hsaco", codeNum))) { + // Ignore error. + } + } + if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpIsa()->is_set()) { + if (!code->PrintToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "isa", codeNum))) { + // Ignore error. + } + } + + std::string codeIsa; + unsigned genericVersion; + if (!code->GetIsa(codeIsa, &genericVersion)) { + logger_ << "LoaderError: failed to determine code object's ISA\n"; + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + uint32_t majorVersion, minorVersion; + if (!code->GetCodeObjectVersion(&majorVersion, &minorVersion)) { + logger_ << "LoaderError: failed to determine code object's version\n"; + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + if (majorVersion < 1 || majorVersion > 6) { + logger_ << "LoaderError: unsupported code object version: " << majorVersion << "\n"; + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (agent.handle == 0 && majorVersion == 1) { + logger_ << "LoaderError: code object v1 requires non-null agent\n"; + return HSA_STATUS_ERROR_INVALID_AGENT; + } + + uint32_t codeHsailMajor; + uint32_t codeHsailMinor; + hsa_profile_t codeProfile; + hsa_machine_model_t codeMachineModel; + hsa_default_float_rounding_mode_t codeRoundingMode; + if (!code->GetNoteHsail(&codeHsailMajor, &codeHsailMinor, &codeProfile, &codeMachineModel, &codeRoundingMode)) { + codeProfile = profile_; + } + if (profile_ != codeProfile) { + logger_ << "LoaderError: mismatched profiles\n"; + return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + } + + hsa_isa_t objectsIsa = context_->IsaFromName(codeIsa.c_str()); + if (!objectsIsa.handle) { + logger_ << "LoaderError: code object's ISA (" << codeIsa.c_str() << ") is invalid\n"; + return HSA_STATUS_ERROR_INVALID_ISA_NAME; 
+ } + + if (agent.handle != 0 && !context_->IsaSupportedByAgent(agent, objectsIsa, genericVersion)) { + logger_ << "LoaderError: code object's ISA (" << codeIsa.c_str() << ") is not supported by the agent\n"; + return HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS; + } + + hsa_status_t status; + + objects.push_back(new LoadedCodeObjectImpl(this, agent, code->ElfData(), code->ElfSize())); + loaded_code_objects.push_back((LoadedCodeObjectImpl*)objects.back()); + + status = LoadSegments(agent, code.get(), majorVersion); + if (status != HSA_STATUS_SUCCESS) return status; + + for (size_t i = 0; i < code->SymbolCount(); ++i) { + if (majorVersion >= 2 && + code->GetSymbol(i)->elfSym()->type() != STT_AMDGPU_HSA_KERNEL && + code->GetSymbol(i)->elfSym()->binding() == STB_LOCAL) + continue; + + status = LoadSymbol(agent, code->GetSymbol(i), majorVersion); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + + status = ApplyRelocations(agent, code.get()); + if (status != HSA_STATUS_SUCCESS) { return status; } + + code.reset(); + + if (loaderOptions.DumpAll()->is_set() || loaderOptions.DumpExec()->is_set()) { + if (!PrintToFile(amd::hsa::DumpFileName(loaderOptions.DumpDir()->value(), LOADER_DUMP_PREFIX, "exec", codeNum))) { + // Ignore error. 
+ } + } + + if (majorVersion >= 2) { + loaded_code_objects.back()->r_debug_info.l_addr = loaded_code_objects.back()->getDelta(); + loaded_code_objects.back()->r_debug_info.l_name = loader_strdup(uri.c_str()); + loaded_code_objects.back()->r_debug_info.l_prev = nullptr; + loaded_code_objects.back()->r_debug_info.l_next = nullptr; + } + + if (nullptr != loaded_code_object) { *loaded_code_object = LoadedCodeObject::Handle(loaded_code_objects.back()); } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSegments(hsa_agent_t agent, + const code::AmdHsaCode *c, + uint32_t majorVersion) { + if (majorVersion < 2) + return LoadSegmentsV1(agent, c); + else + return LoadSegmentsV2(agent, c); +} + +hsa_status_t ExecutableImpl::LoadSegmentsV1(hsa_agent_t agent, + const code::AmdHsaCode *c) { + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < c->DataSegmentCount(); ++i) { + status = LoadSegmentV1(agent, c->DataSegment(i)); + if (status != HSA_STATUS_SUCCESS) return status; + } + + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::LoadSegmentsV2(hsa_agent_t agent, + const code::AmdHsaCode *c) { + assert(c->Machine() == ELF::EM_AMDGPU && "Program code objects are not supported"); + + if (!c->DataSegmentCount()) return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + + uint64_t vaddr = c->DataSegment(0)->vaddr(); + uint64_t size = c->DataSegment(c->DataSegmentCount() - 1)->vaddr() + + c->DataSegment(c->DataSegmentCount() - 1)->memSize(); + + void *ptr = context_->SegmentAlloc(AMDGPU_HSA_SEGMENT_CODE_AGENT, agent, size, + AMD_ISA_ALIGN_BYTES, true); + if (!ptr) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + Segment *load_segment = new Segment(this, agent, AMDGPU_HSA_SEGMENT_CODE_AGENT, + ptr, size, vaddr, c->DataSegment(0)->offset()); + if (!load_segment) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < c->DataSegmentCount(); ++i) { + status = LoadSegmentV2(c->DataSegment(i), 
load_segment);
+    if (status != HSA_STATUS_SUCCESS) return status;
+  }
+
+  // Register the segment with the executable (which owns it) and with the
+  // code object currently being loaded.
+  objects.push_back(load_segment);
+  loaded_code_objects.back()->LoadedSegments().push_back(load_segment);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Loads one v1 data segment. Zero-sized segments are skipped. A
+// GLOBAL_PROGRAM segment is allocated only once per executable and reused by
+// later code objects; every other segment gets a fresh allocation that is
+// filled from the ELF image. The resulting segment is appended to the code
+// object currently being loaded.
+hsa_status_t ExecutableImpl::LoadSegmentV1(hsa_agent_t agent,
+                                           const code::Segment *s) {
+  assert(s->type() < PT_LOOS + AMDGPU_HSA_SEGMENT_LAST);
+  if (s->memSize() == 0)
+    return HSA_STATUS_SUCCESS;
+  // The segment kind is encoded as an offset from PT_LOOS in the ELF type.
+  amdgpu_hsa_elf_segment_t segment = (amdgpu_hsa_elf_segment_t)(s->type() - PT_LOOS);
+  Segment *new_seg = nullptr;
+  bool need_alloc = true;
+  if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM && nullptr != program_allocation_segment) {
+    new_seg = program_allocation_segment;
+    need_alloc = false;
+  }
+  if (need_alloc) {
+    void* ptr = context_->SegmentAlloc(segment, agent, s->memSize(), s->align(), true);
+    if (!ptr) { return HSA_STATUS_ERROR_OUT_OF_RESOURCES; }
+    new_seg = new Segment(this, agent, segment, ptr, s->memSize(), s->vaddr(), s->offset());
+    new_seg->Copy(s->vaddr(), s->data(), s->imageSize());
+    objects.push_back(new_seg);
+
+    if (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) {
+      program_allocation_segment = new_seg;
+    }
+  }
+  assert(new_seg);
+  loaded_code_objects.back()->LoadedSegments().push_back(new_seg);
+  return HSA_STATUS_SUCCESS;
+}
+
+// Copies the file image of |data_segment| into |load_segment| at the data
+// segment's virtual address. Always returns HSA_STATUS_SUCCESS.
+hsa_status_t ExecutableImpl::LoadSegmentV2(const code::Segment *data_segment,
+                                           loader::Segment *load_segment) {
+  assert(data_segment && load_segment);
+  load_segment->Copy(data_segment->vaddr(), data_segment->data(),
+                     data_segment->imageSize());
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Dispatches to the declaration or definition loader depending on whether
+// |sym| is a declaration.
+hsa_status_t ExecutableImpl::LoadSymbol(hsa_agent_t agent,
+                                        code::Symbol* sym,
+                                        uint32_t majorVersion)
+{
+  if (sym->IsDeclaration()) {
+    return LoadDeclarationSymbol(agent, sym, majorVersion);
+  } else {
+    return LoadDefinitionSymbol(agent, sym, majorVersion);
+  }
+}
+
+namespace {
+
+// Returns true when |str| ends with |suf|.
+bool string_ends_with(const std::string &str, const std::string &suf) {
+  return str.size() >= suf.size() ? str.compare(str.size() - suf.size(), suf.size(), suf) == 0 : false;
+}
+
+}
+
+// Creates the executable-side symbol for a symbol defined in the code object
+// and registers it as an agent symbol or a program symbol.
+//
+// Three kinds are recognised, in this order: a symbol whose name ends in
+// ".kd" (kernel descriptor), a variable symbol, and a kernel symbol described
+// by amd_kernel_code_t. Any other symbol is an error.
+//
+// Returns HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED if the name is already
+// registered in the map it would be inserted into.
+hsa_status_t ExecutableImpl::LoadDefinitionSymbol(hsa_agent_t agent,
+                                                  code::Symbol* sym,
+                                                  uint32_t majorVersion)
+{
+  // For v2+ code objects, agent scope is decided by whether an agent was
+  // supplied rather than by the symbol itself.
+  bool isAgent = sym->IsAgent();
+  if (majorVersion >= 2) {
+    isAgent = agent.handle != 0;
+  }
+  if (isAgent) {
+    auto agent_symbol = agent_symbols_.find(std::make_pair(sym->Name(), agent));
+    if (agent_symbol != agent_symbols_.end()) {
+      // TODO(spec): this is not spec compliant.
+      return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED;
+    }
+  } else {
+    auto program_symbol = program_symbols_.find(sym->Name());
+    if (program_symbol != program_symbols_.end()) {
+      // TODO(spec): this is not spec compliant.
+      return HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED;
+    }
+  }
+
+  uint64_t address = SymbolAddress(agent, sym);
+  SymbolImpl *symbol = nullptr;
+  if (string_ends_with(sym->GetSymbolName(), ".kd")) {
+    // V3+.
+    // Read the kernel descriptor out of the symbol's section.
+    llvm::amdhsa::kernel_descriptor_t kd;
+    sym->GetSection()->getData(sym->SectionOffset(), &kd, sizeof(kd));
+
+    uint32_t kernarg_segment_size = kd.kernarg_size; // FIXME: If 0 then the compiler is not specifying the size.
+    uint32_t kernarg_segment_alignment = 16;         // FIXME: Use the minimum HSA required alignment.
+    uint32_t group_segment_size = kd.group_segment_fixed_size;
+    uint32_t private_segment_size = kd.private_segment_fixed_size;
+    bool is_dynamic_callstack = AMDHSA_BITS_GET(
+        kd.kernel_code_properties,
+        llvm::amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK) ? true : false;
+
+    uint64_t size = sym->Size();
+
+    KernelSymbol *kernel_symbol = new KernelSymbol(true,
+                                    sym->GetModuleName(),
+                                    sym->GetSymbolName(),
+                                    sym->Linkage(),
+                                    true, // sym->IsDefinition()
+                                    kernarg_segment_size,
+                                    kernarg_segment_alignment,
+                                    group_segment_size,
+                                    private_segment_size,
+                                    is_dynamic_callstack,
+                                    size,
+                                    64,
+                                    address);
+    symbol = kernel_symbol;
+  } else if (sym->IsVariableSymbol()) {
+    symbol = new VariableSymbol(true,
+                                sym->GetModuleName(),
+                                sym->GetSymbolName(),
+                                sym->Linkage(),
+                                true, // sym->IsDefinition()
+                                sym->Allocation(),
+                                sym->Segment(),
+                                sym->Size(),
+                                sym->Alignment(),
+                                sym->IsConst(),
+                                false,
+                                address);
+  } else if (sym->IsKernelSymbol()) {
+    // Read the amd_kernel_code_t header out of the symbol's section.
+    amd_kernel_code_t akc;
+    sym->GetSection()->getData(sym->SectionOffset(), &akc, sizeof(akc));
+
+    uint32_t kernarg_segment_size =
+      uint32_t(akc.kernarg_segment_byte_size);
+    // The header field is used as a shift count, i.e. as a power-of-two
+    // exponent.
+    uint32_t kernarg_segment_alignment =
+      uint32_t(1 << akc.kernarg_segment_alignment);
+    uint32_t group_segment_size =
+      uint32_t(akc.workgroup_group_segment_byte_size);
+    uint32_t private_segment_size =
+      uint32_t(akc.workitem_private_segment_byte_size);
+    bool is_dynamic_callstack =
+      AMD_HSA_BITS_GET(akc.kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK) ? true : false;
+
+    uint64_t size = sym->Size();
+
+    if (!size && sym->SectionOffset() < sym->GetSection()->size()) {
+      // ORCA Runtime relies on symbol size equal to size of kernel ISA. If symbol size is 0 in ELF,
+      // calculate end of segment - symbol value.
+      size = sym->GetSection()->size() - sym->SectionOffset();
+    }
+    KernelSymbol *kernel_symbol = new KernelSymbol(true,
+                                    sym->GetModuleName(),
+                                    sym->GetSymbolName(),
+                                    sym->Linkage(),
+                                    true, // sym->IsDefinition()
+                                    kernarg_segment_size,
+                                    kernarg_segment_alignment,
+                                    group_segment_size,
+                                    private_segment_size,
+                                    is_dynamic_callstack,
+                                    size,
+                                    256,
+                                    address);
+    kernel_symbol->debug_info.elf_raw = code->ElfData();
+    kernel_symbol->debug_info.elf_size = code->ElfSize();
+    kernel_symbol->debug_info.kernel_name = kernel_symbol->full_name.c_str();
+    kernel_symbol->debug_info.owning_segment = (void*)SymbolSegment(agent, sym)->Address(sym->GetSection()->addr());
+    symbol = kernel_symbol;
+
+    // \todo kzhuravl 10/15/15 This is a debugger backdoor: needs to be
+    // removed.
+    // Writes the host address of this symbol's debug_info into the
+    // runtime_loader_kernel_symbol field of the loaded amd_kernel_code_t.
+    uint64_t target_address = sym->GetSection()->addr() + sym->SectionOffset() + ((size_t)(&((amd_kernel_code_t*)0)->runtime_loader_kernel_symbol));
+    uint64_t source_value = (uint64_t) (uintptr_t) &kernel_symbol->debug_info;
+    SymbolSegment(agent, sym)->Copy(target_address, &source_value, sizeof(source_value));
+  } else {
+    assert(!"Unexpected symbol type in LoadDefinitionSymbol");
+    return HSA_STATUS_ERROR;
+  }
+
+  assert(symbol);
+  if (isAgent) {
+    symbol->agent = agent;
+    agent_symbols_.insert(std::make_pair(std::make_pair(sym->Name(), agent), symbol));
+  } else {
+    program_symbols_.insert(std::make_pair(sym->Name(), symbol));
+  }
+  return HSA_STATUS_SUCCESS;
+}
+
+// Checks that a symbol the code object only declares is already defined in
+// this executable, either as a program symbol or as an agent symbol for
+// |agent|.
+hsa_status_t ExecutableImpl::LoadDeclarationSymbol(hsa_agent_t agent,
+                                                   code::Symbol* sym,
+                                                   uint32_t majorVersion)
+{
+  auto program_symbol = program_symbols_.find(sym->Name());
+  if (program_symbol == program_symbols_.end()) {
+    auto agent_symbol = agent_symbols_.find(std::make_pair(sym->Name(), agent));
+    if (agent_symbol == agent_symbols_.end()) {
+      logger_ << "LoaderError: symbol \"" << sym->Name() << "\" is undefined\n";
+
+      // TODO(spec): this is not spec compliant.
+ return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + } + return HSA_STATUS_SUCCESS; +} + +Segment* ExecutableImpl::VirtualAddressSegment(uint64_t vaddr) +{ + for (auto &seg : loaded_code_objects.back()->LoadedSegments()) { + if (seg->IsAddressInSegment(vaddr)) { + return seg; + } + } + return 0; +} + +uint64_t ExecutableImpl::SymbolAddress(hsa_agent_t agent, code::Symbol* sym) +{ + code::Section* sec = sym->GetSection(); + Segment* seg = SectionSegment(agent, sec); + return nullptr == seg ? 0 : (uint64_t) (uintptr_t) seg->Address(sym->VAddr()); +} + +uint64_t ExecutableImpl::SymbolAddress(hsa_agent_t agent, elf::Symbol* sym) +{ + elf::Section* sec = sym->section(); + Segment* seg = SectionSegment(agent, sec); + uint64_t vaddr = sec->addr() + sym->value(); + return nullptr == seg ? 0 : (uint64_t) (uintptr_t) seg->Address(vaddr); +} + +Segment* ExecutableImpl::SymbolSegment(hsa_agent_t agent, code::Symbol* sym) +{ + return SectionSegment(agent, sym->GetSection()); +} + +Segment* ExecutableImpl::SectionSegment(hsa_agent_t agent, code::Section* sec) +{ + for (Segment* seg : loaded_code_objects.back()->LoadedSegments()) { + if (seg->IsAddressInSegment(sec->addr())) { + return seg; + } + } + return 0; +} + +hsa_status_t ExecutableImpl::ApplyRelocations(hsa_agent_t agent, amd::hsa::code::AmdHsaCode *c) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < c->RelocationSectionCount(); ++i) { + if (c->GetRelocationSection(i)->targetSection()) { + status = ApplyStaticRelocationSection(agent, c->GetRelocationSection(i)); + } else { + // Dynamic relocations are supported starting code object v2.1. 
+ uint32_t majorVersion, minorVersion; + if (!c->GetCodeObjectVersion(&majorVersion, &minorVersion)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (majorVersion < 2) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (majorVersion == 2 && minorVersion < 1) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + status = ApplyDynamicRelocationSection(agent, c->GetRelocationSection(i)); + } + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::ApplyStaticRelocationSection(hsa_agent_t agent, amd::hsa::code::RelocationSection* sec) +{ + // Skip link-time relocations (if any). + if (!(sec->targetSection()->flags() & SHF_ALLOC)) { return HSA_STATUS_SUCCESS; } + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < sec->relocationCount(); ++i) { + status = ApplyStaticRelocation(agent, sec->relocation(i)); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::ApplyStaticRelocation(hsa_agent_t agent, amd::hsa::code::Relocation *rel) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + amd::elf::Symbol* sym = rel->symbol(); + code::RelocationSection* rsec = rel->section(); + code::Section* sec = rsec->targetSection(); + Segment* rseg = SectionSegment(agent, sec); + size_t reladdr = sec->addr() + rel->offset(); + switch (rel->type()) { + case R_AMDGPU_32_LOW: + case R_AMDGPU_32_HIGH: + case R_AMDGPU_64: + { + uint64_t addr; + switch (sym->type()) { + case STT_OBJECT: + case STT_SECTION: + case STT_AMDGPU_HSA_KERNEL: + case STT_AMDGPU_HSA_INDIRECT_FUNCTION: + addr = SymbolAddress(agent, sym); + if (!addr) { return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; } + break; + case STT_COMMON: { + hsa_agent_t *sagent = &agent; + if (STA_AMDGPU_HSA_GLOBAL_PROGRAM == ELF64_ST_AMDGPU_ALLOCATION(sym->other())) { + sagent = nullptr; + } + SymbolImpl* esym = (SymbolImpl*) GetSymbolInternal(sym->name().c_str(), sagent); + if (!esym) { 
+ logger_ << "LoaderError: symbol \"" << sym->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + addr = esym->address; + break; + } + default: + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + addr += rel->addend(); + + uint32_t addr32 = 0; + switch (rel->type()) { + case R_AMDGPU_32_HIGH: + addr32 = uint32_t((addr >> 32) & 0xFFFFFFFF); + rseg->Copy(reladdr, &addr32, sizeof(addr32)); + break; + case R_AMDGPU_32_LOW: + addr32 = uint32_t(addr & 0xFFFFFFFF); + rseg->Copy(reladdr, &addr32, sizeof(addr32)); + break; + case R_AMDGPU_64: + rseg->Copy(reladdr, &addr, sizeof(addr)); + break; + default: + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + break; + } + + case R_AMDGPU_INIT_SAMPLER: + { + if (STT_AMDGPU_HSA_METADATA != sym->type() || + SHT_PROGBITS != sym->section()->type() || + !(sym->section()->flags() & SHF_MERGE)) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + amdgpu_hsa_sampler_descriptor_t desc; + if (!sym->section()->getData(sym->value(), &desc, sizeof(desc))) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (AMDGPU_HSA_METADATA_KIND_INIT_SAMP != desc.kind) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + hsa_ext_sampler_descriptor_t hsa_sampler_descriptor; + hsa_sampler_descriptor.coordinate_mode = + hsa_ext_sampler_coordinate_mode_t(desc.coord); + hsa_sampler_descriptor.filter_mode = + hsa_ext_sampler_filter_mode_t(desc.filter); + hsa_sampler_descriptor.address_mode = + hsa_ext_sampler_addressing_mode_t(desc.addressing); + + hsa_ext_sampler_t hsa_sampler = {0}; + status = context_->SamplerCreate(agent, &hsa_sampler_descriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { return status; } + assert(hsa_sampler.handle); + rseg->Copy(reladdr, &hsa_sampler, sizeof(hsa_sampler)); + break; + } + + case R_AMDGPU_INIT_IMAGE: + { + if (STT_AMDGPU_HSA_METADATA != sym->type() || + SHT_PROGBITS != sym->section()->type() || + !(sym->section()->flags() & SHF_MERGE)) { + return 
HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + amdgpu_hsa_image_descriptor_t desc; + if (!sym->section()->getData(sym->value(), &desc, sizeof(desc))) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + if (AMDGPU_HSA_METADATA_KIND_INIT_ROIMG != desc.kind && + AMDGPU_HSA_METADATA_KIND_INIT_WOIMG != desc.kind && + AMDGPU_HSA_METADATA_KIND_INIT_RWIMG != desc.kind) { + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + + hsa_ext_image_format_t hsa_image_format; + hsa_image_format.channel_order = + hsa_ext_image_channel_order_t(desc.channel_order); + hsa_image_format.channel_type = + hsa_ext_image_channel_type_t(desc.channel_type); + + hsa_ext_image_descriptor_t hsa_image_descriptor; + hsa_image_descriptor.geometry = + hsa_ext_image_geometry_t(desc.geometry); + hsa_image_descriptor.width = size_t(desc.width); + hsa_image_descriptor.height = size_t(desc.height); + hsa_image_descriptor.depth = size_t(desc.depth); + hsa_image_descriptor.array_size = size_t(desc.array); + hsa_image_descriptor.format = hsa_image_format; + + hsa_access_permission_t hsa_image_permission = HSA_ACCESS_PERMISSION_RO; + switch (desc.kind) { + case AMDGPU_HSA_METADATA_KIND_INIT_ROIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_RO; + break; + } + case AMDGPU_HSA_METADATA_KIND_INIT_WOIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_WO; + break; + } + case AMDGPU_HSA_METADATA_KIND_INIT_RWIMG: { + hsa_image_permission = HSA_ACCESS_PERMISSION_RW; + break; + } + default: { + assert(false); + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + } + + hsa_ext_image_t hsa_image = {0}; + status = context_->ImageCreate(agent, hsa_image_permission, + &hsa_image_descriptor, + NULL, // TODO: image_data? + &hsa_image); + if (status != HSA_STATUS_SUCCESS) { return status; } + rseg->Copy(reladdr, &hsa_image, sizeof(hsa_image)); + break; + } + + default: + // Ignore. 
+ break; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::ApplyDynamicRelocationSection(hsa_agent_t agent, amd::hsa::code::RelocationSection* sec) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + for (size_t i = 0; i < sec->relocationCount(); ++i) { + status = ApplyDynamicRelocation(agent, sec->relocation(i)); + if (status != HSA_STATUS_SUCCESS) { return status; } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa::code::Relocation *rel) +{ + Segment* relSeg = VirtualAddressSegment(rel->offset()); + uint64_t symAddr = 0; + switch (rel->symbol()->type()) { + case STT_OBJECT: + case STT_AMDGPU_HSA_KERNEL: + case STT_FUNC: + { + Segment* symSeg = VirtualAddressSegment(rel->symbol()->value()); + symAddr = reinterpret_cast(symSeg->Address(rel->symbol()->value())); + break; + } + + // External symbols, they must be defined prior loading. + case STT_NOTYPE: + { + // TODO: Only agent allocation variables are supported in v2.1. How will + // we distinguish between program allocation and agent allocation + // variables? + auto agent_symbol = agent_symbols_.find(std::make_pair(rel->symbol()->name(), agent)); + if (agent_symbol != agent_symbols_.end()) + symAddr = agent_symbol->second->address; + break; + } + + default: + // Only objects and kernels are supported in v2.1. 
+ return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + symAddr += rel->addend(); + + switch (rel->type()) { + case R_AMDGPU_32_HIGH: + { + if (!symAddr) { + logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + + uint32_t symAddr32 = uint32_t((symAddr >> 32) & 0xFFFFFFFF); + relSeg->Copy(rel->offset(), &symAddr32, sizeof(symAddr32)); + break; + } + + case R_AMDGPU_32_LOW: + { + if (!symAddr) { + logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + + uint32_t symAddr32 = uint32_t(symAddr & 0xFFFFFFFF); + relSeg->Copy(rel->offset(), &symAddr32, sizeof(symAddr32)); + break; + } + + case R_AMDGPU_64: + { + if (!symAddr) { + logger_ << "LoaderError: symbol \"" << rel->symbol()->name() << "\" is undefined\n"; + return HSA_STATUS_ERROR_VARIABLE_UNDEFINED; + } + + relSeg->Copy(rel->offset(), &symAddr, sizeof(symAddr)); + break; + } + + case R_AMDGPU_RELATIVE64: + { + int64_t baseDelta = reinterpret_cast(relSeg->Address(0)) - relSeg->VAddr(); + uint64_t relocatedAddr = baseDelta + rel->addend(); + relSeg->Copy(rel->offset(), &relocatedAddr, sizeof(relocatedAddr)); + break; + } + + default: + return HSA_STATUS_ERROR_INVALID_CODE_OBJECT; + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ExecutableImpl::Freeze(const char *options) { + amd::hsa::common::WriterLockGuard writer_lock(rw_lock_); + if (HSA_EXECUTABLE_STATE_FROZEN == state_) { + return HSA_STATUS_ERROR_FROZEN_EXECUTABLE; + } + + for (auto &lco : loaded_code_objects) { + for (auto &ls : lco->LoadedSegments()) { + ls->Freeze(); + } + } + + state_ = HSA_EXECUTABLE_STATE_FROZEN; + return HSA_STATUS_SUCCESS; +} + +void ExecutableImpl::Print(std::ostream& out) +{ + out << "AMD Executable" << std::endl; + out << " Id: " << id() + << " Profile: " << HsaProfileToString(profile()) + << std::endl << std::endl; + out << "Loaded Objects (total " << objects.size() << 
")" << std::endl; + size_t i = 0; + for (ExecutableObject* o : objects) { + out << "Loaded Object " << i++ << ": "; + o->Print(out); + out << std::endl; + } + out << "End AMD Executable" << std::endl; +} + +bool ExecutableImpl::PrintToFile(const std::string& filename) +{ + std::ofstream out(filename); + if (out.fail()) { return false; } + Print(out); + return out.fail(); +} + +} // namespace loader +} // namespace hsa +} // namespace amd diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.hpp new file mode 100644 index 0000000000..d0df967b95 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/executable.hpp @@ -0,0 +1,686 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ +#define HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hsa.h" +#include "hsa_ext_image.h" +#include "amd_hsa_loader.hpp" +#include "amd_hsa_code.hpp" +#include "amd_hsa_kernel_code.h" +#include "amd_hsa_locks.hpp" + +#if defined(_WIN32) || defined(_WIN64) +#if _WIN64 +#define __WORDSIZE 64 +#else +#define __WORDSIZE 32 +#endif +#endif + +#define __ELF_NATIVE_CLASS __WORDSIZE + +/* We use this macro to refer to ELF types independent of the native wordsize. + `ElfW(TYPE)' is used in place of `Elf32_TYPE' or `Elf64_TYPE'. */ +#define ElfW(type) _ElfW (Elf, __ELF_NATIVE_CLASS, type) +#define _ElfW(e,w,t) _ElfW_1 (e, w, _##t) +#define _ElfW_1(e,w,t) e##w##t + +/* Structure describing a loaded shared object. The `l_next' and `l_prev' + members form a chain of all the shared objects loaded at startup. + + These data structures exist in space used by the run-time dynamic linker; + modifying them may have disastrous results. 
*/ + +struct link_map + { + /* These first few members are part of the protocol with the debugger. + This is the same format used in SVR4. */ + + ElfW(Addr) l_addr; /* Difference between the address in the ELF + file and the addresses in memory. */ + char *l_name; /* Absolute file name object was found in. */ + ElfW(Dyn) *l_ld; /* Dynamic section of the shared object. */ + struct link_map *l_next, *l_prev; /* Chain of loaded objects. */ + }; + +/* The legacy rendezvous structure used by the run-time dynamic linker to + communicate details of shared object loading to the debugger. */ + +struct r_debug + { + /* Version number for this protocol. It should be greater than 0. */ + int r_version; + + struct link_map *r_map; /* Head of the chain of loaded objects. */ + + /* This is the address of a function internal to the run-time linker, + that will always be called when the linker begins to map in a + library or unmap it, and again when the mapping change is complete. + The debugger can set a breakpoint at this address if it wants to + notice shared object mapping changes. */ + ElfW(Addr) r_brk; + enum RT + { + /* This state value describes the mapping change taking place when + the `r_brk' address is called. */ + RT_CONSISTENT, /* Mapping change is complete. */ + RT_ADD, /* Beginning to add a new object. */ + RT_DELETE /* Beginning to remove an object mapping. */ + } r_state; + + ElfW(Addr) r_ldbase; /* Base address the linker is loaded at. */ + }; + +/* This is the symbol of that structure provided by the dynamic linker. */ +extern struct r_debug _r_debug; + +namespace amd { +namespace hsa { +namespace loader { + +class MemoryAddress; +class SymbolImpl; +class KernelSymbol; +class VariableSymbol; +class ExecutableImpl; + +//===----------------------------------------------------------------------===// +// SymbolImpl. 
//
+//===----------------------------------------------------------------------===//
+
+typedef uint32_t symbol_attribute32_t;
+
+// Base class for the symbols tracked by the loader. It stores the attributes
+// common to kernel and variable symbols; the concrete kind is recorded in
+// `kind` and queried through IsKernel() / IsVariable().
+class SymbolImpl: public Symbol {
+public:
+  virtual ~SymbolImpl() {}
+
+  // True when this symbol was created as a kernel symbol.
+  bool IsKernel() const {
+    return HSA_SYMBOL_KIND_KERNEL == kind;
+  }
+  // True when this symbol was created as a variable symbol.
+  bool IsVariable() const {
+    return HSA_SYMBOL_KIND_VARIABLE == kind;
+  }
+
+  bool is_loaded;
+  hsa_symbol_kind_t kind;
+  std::string module_name;
+  std::string symbol_name;
+  hsa_symbol_linkage_t linkage;
+  bool is_definition;
+  uint64_t address;
+  // Assigned by the loader only for agent-scope symbols; value-initialized
+  // by the constructor so it is well defined for program-scope symbols too.
+  hsa_agent_t agent;
+
+  // Returns the agent recorded for this symbol.
+  hsa_agent_t GetAgent() override {
+    return agent;
+  }
+
+protected:
+  // Only derived symbol classes may construct a SymbolImpl. `_address`
+  // defaults to 0 when the symbol has no address yet.
+  SymbolImpl(const bool &_is_loaded,
+             const hsa_symbol_kind_t &_kind,
+             const std::string &_module_name,
+             const std::string &_symbol_name,
+             const hsa_symbol_linkage_t &_linkage,
+             const bool &_is_definition,
+             const uint64_t &_address = 0)
+    : is_loaded(_is_loaded)
+    , kind(_kind)
+    , module_name(_module_name)
+    , symbol_name(_symbol_name)
+    , linkage(_linkage)
+    , is_definition(_is_definition)
+    , address(_address)
+    , agent() {}
+
+  virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void* value) override;
+
+private:
+  // Symbols are not copyable.
+  SymbolImpl(const SymbolImpl &s) = delete;
+  SymbolImpl& operator=(const SymbolImpl &s) = delete;
+};
+
+//===----------------------------------------------------------------------===//
+// KernelSymbol.
//
+//===----------------------------------------------------------------------===//
+
+// Symbol describing a kernel: segment sizes, kernarg alignment, code size and
+// alignment, plus the debug-info record the loader attaches to the kernel.
+class KernelSymbol final: public SymbolImpl {
+public:
+  // NOTE(review): `_size` is 32-bit while LoadDefinitionSymbol passes a
+  // 64-bit value, so sizes of 4 GiB or more are truncated - confirm whether
+  // widening is wanted.
+  KernelSymbol(const bool &_is_loaded,
+               const std::string &_module_name,
+               const std::string &_symbol_name,
+               const hsa_symbol_linkage_t &_linkage,
+               const bool &_is_definition,
+               const uint32_t &_kernarg_segment_size,
+               const uint32_t &_kernarg_segment_alignment,
+               const uint32_t &_group_segment_size,
+               const uint32_t &_private_segment_size,
+               const bool &_is_dynamic_callstack,
+               const uint32_t &_size,
+               const uint32_t &_alignment,
+               const uint64_t &_address = 0)
+    : SymbolImpl(_is_loaded,
+                 HSA_SYMBOL_KIND_KERNEL,
+                 _module_name,
+                 _symbol_name,
+                 _linkage,
+                 _is_definition,
+                 _address)
+    // "module::symbol", or just the symbol name when there is no module.
+    , full_name(_module_name.empty() ? _symbol_name : _module_name + "::" + _symbol_name)
+    , kernarg_segment_size(_kernarg_segment_size)
+    , kernarg_segment_alignment(_kernarg_segment_alignment)
+    , group_segment_size(_group_segment_size)
+    , private_segment_size(_private_segment_size)
+    , is_dynamic_callstack(_is_dynamic_callstack)
+    , size(_size)
+    , alignment(_alignment)
+    // Zero the record so it is well defined even when the loader never
+    // fills it in.
+    , debug_info() {}
+
+  ~KernelSymbol() {}
+
+  bool GetInfo(hsa_symbol_info32_t symbol_info, void *value) override;
+
+  std::string full_name;
+  uint32_t kernarg_segment_size;
+  uint32_t kernarg_segment_alignment;
+  uint32_t group_segment_size;
+  uint32_t private_segment_size;
+  bool is_dynamic_callstack;
+  uint32_t size;
+  uint32_t alignment;
+  amd_runtime_loader_debug_info_t debug_info;
+
+private:
+  // Kernel symbols are not copyable.
+  KernelSymbol(const KernelSymbol &ks) = delete;
+  KernelSymbol& operator=(const KernelSymbol &ks) = delete;
+};
+
+//===----------------------------------------------------------------------===//
+// VariableSymbol.
//
+//===----------------------------------------------------------------------===//
+
+// Symbol describing a variable: its allocation, segment, size, alignment and
+// the constant / external flags.
+class VariableSymbol final: public SymbolImpl {
+public:
+  // `_is_external` defaults to false and `_address` to 0.
+  VariableSymbol(const bool &_is_loaded,
+                 const std::string &_module_name,
+                 const std::string &_symbol_name,
+                 const hsa_symbol_linkage_t &_linkage,
+                 const bool &_is_definition,
+                 const hsa_variable_allocation_t &_allocation,
+                 const hsa_variable_segment_t &_segment,
+                 const uint32_t &_size,
+                 const uint32_t &_alignment,
+                 const bool &_is_constant,
+                 const bool &_is_external = false,
+                 const uint64_t &_address = 0)
+    : SymbolImpl(_is_loaded,
+                 HSA_SYMBOL_KIND_VARIABLE,
+                 _module_name,
+                 _symbol_name,
+                 _linkage,
+                 _is_definition,
+                 _address)
+    , allocation(_allocation)
+    , segment(_segment)
+    , size(_size)
+    , alignment(_alignment)
+    , is_constant(_is_constant)
+    , is_external(_is_external) {}
+
+  ~VariableSymbol() {}
+
+  // Marked override so a signature drift from SymbolImpl::GetInfo is a
+  // compile error instead of a silent new overload.
+  bool GetInfo(hsa_symbol_info32_t symbol_info, void *value) override;
+
+  hsa_variable_allocation_t allocation;
+  hsa_variable_segment_t segment;
+  uint32_t size;
+  uint32_t alignment;
+  bool is_constant;
+  bool is_external;
+
+private:
+  // Variable symbols are not copyable.
+  VariableSymbol(const VariableSymbol &vs) = delete;
+  VariableSymbol& operator=(const VariableSymbol &vs) = delete;
+};
+
+//===----------------------------------------------------------------------===//
+// Logger.
// +//===----------------------------------------------------------------------===// + +class Logger final { +public: + Logger(std::ostream &Stream = std::cerr) : OutStream(Stream) {} + + template + Logger &operator<<(const T &Data) { + if (!IsLoggingEnabled()) + return *this; + OutStream << Data; + std::stringstream ss; + ss << Data; + printf("=================== %s\n", ss.str().c_str()); + + return *this; + } + +private: + Logger(const Logger &L); + Logger& operator=(const Logger &L); + + bool IsLoggingEnabled() const { + const char *enable_logging = getenv("LOADER_ENABLE_LOGGING"); + if (!enable_logging) + return false; + if (std::string(enable_logging) == "0") + return false; + return true; + } + + std::ostream &OutStream; +}; + +//===----------------------------------------------------------------------===// +// Executable. // +//===----------------------------------------------------------------------===// + +class ExecutableImpl; +class LoadedCodeObjectImpl; +class Segment; + +class ExecutableObject { +protected: + ExecutableImpl *owner; + hsa_agent_t agent; + +public: + ExecutableObject(ExecutableImpl *owner_, hsa_agent_t agent_) + : owner(owner_), agent(agent_) { } + + ExecutableImpl* Owner() const { return owner; } + hsa_agent_t Agent() const { return agent; } + virtual void Print(std::ostream& out) = 0; + virtual void Destroy() = 0; + + virtual ~ExecutableObject() { } +}; + +class LoadedCodeObjectImpl : public LoadedCodeObject, public ExecutableObject { +friend class AmdHsaCodeLoader; +private: + LoadedCodeObjectImpl(const LoadedCodeObjectImpl&); + LoadedCodeObjectImpl& operator=(const LoadedCodeObjectImpl&); + + const void *elf_data; + const size_t elf_size; + std::vector loaded_segments; + +public: + LoadedCodeObjectImpl(ExecutableImpl *owner_, hsa_agent_t agent_, const void *elf_data_, size_t elf_size_) + : ExecutableObject(owner_, agent_), elf_data(elf_data_), elf_size(elf_size_) { + memset(&r_debug_info, 0, sizeof(r_debug_info)); + } + + const void* 
ElfData() const { return elf_data; } + size_t ElfSize() const { return elf_size; } + std::vector& LoadedSegments() { return loaded_segments; } + + bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) override; + + hsa_status_t IterateLoadedSegments( + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data) override; + + void Print(std::ostream& out) override; + + void Destroy() override {} + + hsa_agent_t getAgent() const override; + hsa_executable_t getExecutable() const override; + uint64_t getElfData() const override; + uint64_t getElfSize() const override; + uint64_t getStorageOffset() const override; + uint64_t getLoadBase() const override; + uint64_t getLoadSize() const override; + int64_t getDelta() const override; + std::string getUri() const override; + + link_map r_debug_info; +}; + +class Segment : public LoadedSegment, public ExecutableObject { +private: + amdgpu_hsa_elf_segment_t segment; + void *ptr; + size_t size; + uint64_t vaddr; + bool frozen; + size_t storage_offset; + +public: + Segment(ExecutableImpl *owner_, hsa_agent_t agent_, amdgpu_hsa_elf_segment_t segment_, void* ptr_, size_t size_, uint64_t vaddr_, size_t storage_offset_) + : ExecutableObject(owner_, agent_), segment(segment_), + ptr(ptr_), size(size_), vaddr(vaddr_), frozen(false), storage_offset(storage_offset_) { } + + amdgpu_hsa_elf_segment_t ElfSegment() const { return segment; } + void* Ptr() const { return ptr; } + size_t Size() const { return size; } + uint64_t VAddr() const { return vaddr; } + size_t StorageOffset() const { return storage_offset; } + + bool GetInfo(amd_loaded_segment_info_t attribute, void *value) override; + + uint64_t Offset(uint64_t addr); // Offset within segment. Used together with ptr with loader context functions. + + void* Address(uint64_t addr); // Address in segment. Used for relocations and valid on agent. 
+ + bool Freeze(); + + bool IsAddressInSegment(uint64_t addr); + void Copy(uint64_t addr, const void* src, size_t size); + void Print(std::ostream& out) override; + void Destroy() override; +}; + +class Sampler : public ExecutableObject { +private: + hsa_ext_sampler_t samp; + +public: + Sampler(ExecutableImpl *owner, hsa_agent_t agent, hsa_ext_sampler_t samp_) + : ExecutableObject(owner, agent), samp(samp_) { } + void Print(std::ostream& out) override; + void Destroy() override; +}; + +class Image : public ExecutableObject { +private: + hsa_ext_image_t img; + +public: + Image(ExecutableImpl *owner, hsa_agent_t agent, hsa_ext_image_t img_) + : ExecutableObject(owner, agent), img(img_) { } + void Print(std::ostream& out) override; + void Destroy() override; +}; + +typedef std::string ProgramSymbol; +typedef std::unordered_map ProgramSymbolMap; + +typedef std::pair AgentSymbol; +struct ASC { + bool operator()(const AgentSymbol &las, const AgentSymbol &ras) const { + return las.first == ras.first && las.second.handle == ras.second.handle; + } +}; +struct ASH { + size_t operator()(const AgentSymbol &as) const { + size_t h = std::hash()(as.first); + size_t i = std::hash()(as.second.handle); + return h ^ (i << 1); + } +}; +typedef std::unordered_map AgentSymbolMap; + +class ExecutableImpl final: public Executable { +friend class AmdHsaCodeLoader; +public: + const hsa_profile_t& profile() const { + return profile_; + } + const hsa_executable_state_t& state() const { + return state_; + } + + ExecutableImpl( + const hsa_profile_t &_profile, + Context *context, + size_t id, + hsa_default_float_rounding_mode_t default_float_rounding_mode); + + ~ExecutableImpl(); + + hsa_status_t GetInfo(hsa_executable_info_t executable_info, void *value) override; + + hsa_status_t DefineProgramExternalVariable( + const char *name, void *address) override; + + hsa_status_t DefineAgentExternalVariable( + const char *name, + hsa_agent_t agent, + hsa_variable_segment_t segment, + void *address) 
override; + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + hsa_loaded_code_object_t *loaded_code_object) override; + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + hsa_loaded_code_object_t *loaded_code_object) override; + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object) override; + + hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object) override; + + hsa_status_t Freeze(const char *options) override; + + hsa_status_t Validate(uint32_t *result) override { + amd::hsa::common::ReaderLockGuard reader_lock(rw_lock_); + assert(result); + *result = 0; + return HSA_STATUS_SUCCESS; + } + + /// @note needed for hsa v1.0. + /// @todo remove during loader refactoring. + bool IsProgramSymbol(const char *symbol_name) override; + + Symbol* GetSymbol( + const char *symbol_name, + const hsa_agent_t *agent) override; + + hsa_status_t IterateSymbols( + iterate_symbols_f callback, void *data) override; + + /// @since hsa v1.1. + hsa_status_t IterateAgentSymbols( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data) override; + + /// @since hsa v1.1. 
+ hsa_status_t IterateProgramSymbols( + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data) override; + + hsa_status_t IterateLoadedCodeObjects( + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data) override; + + size_t GetNumSegmentDescriptors() override; + + size_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) override; + + uint64_t FindHostAddress(uint64_t device_address) override; + + void EnableReadOnlyMode(); + void DisableReadOnlyMode(); + + void Print(std::ostream& out) override; + bool PrintToFile(const std::string& filename) override; + + Context* context() { return context_; } + size_t id() { return id_; } + +private: + ExecutableImpl(const ExecutableImpl &e); + ExecutableImpl& operator=(const ExecutableImpl &e); + + std::unique_ptr code; + + Symbol* GetSymbolInternal( + const char *symbol_name, + const hsa_agent_t *agent); + + hsa_status_t LoadSegments(hsa_agent_t agent, const code::AmdHsaCode *c, + uint32_t majorVersion); + hsa_status_t LoadSegmentsV1(hsa_agent_t agent, const code::AmdHsaCode *c); + hsa_status_t LoadSegmentsV2(hsa_agent_t agent, const code::AmdHsaCode *c); + hsa_status_t LoadSegmentV1(hsa_agent_t agent, const code::Segment *s); + hsa_status_t LoadSegmentV2(const code::Segment *data_segment, + loader::Segment *load_segment); + + hsa_status_t LoadSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); + hsa_status_t LoadDefinitionSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); + hsa_status_t LoadDeclarationSymbol(hsa_agent_t agent, amd::hsa::code::Symbol* sym, uint32_t majorVersion); + + hsa_status_t ApplyRelocations(hsa_agent_t agent, amd::hsa::code::AmdHsaCode *c); + hsa_status_t ApplyStaticRelocationSection(hsa_agent_t 
agent, amd::hsa::code::RelocationSection* sec); + hsa_status_t ApplyStaticRelocation(hsa_agent_t agent, amd::hsa::code::Relocation *rel); + hsa_status_t ApplyDynamicRelocationSection(hsa_agent_t agent, amd::hsa::code::RelocationSection* sec); + hsa_status_t ApplyDynamicRelocation(hsa_agent_t agent, amd::hsa::code::Relocation *rel); + + Segment* VirtualAddressSegment(uint64_t vaddr); + uint64_t SymbolAddress(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + uint64_t SymbolAddress(hsa_agent_t agent, amd::elf::Symbol* sym); + Segment* SymbolSegment(hsa_agent_t agent, amd::hsa::code::Symbol* sym); + Segment* SectionSegment(hsa_agent_t agent, amd::hsa::code::Section* sec); + + amd::hsa::common::ReaderWriterLock rw_lock_; + hsa_profile_t profile_; + Context *context_; + Logger logger_; + const size_t id_; + hsa_default_float_rounding_mode_t default_float_rounding_mode_; + hsa_executable_state_t state_; + + ProgramSymbolMap program_symbols_; + AgentSymbolMap agent_symbols_; + std::vector objects; + Segment *program_allocation_segment; + std::vector loaded_code_objects; +}; + +class AmdHsaCodeLoader : public Loader { +private: + Context* context; + std::vector executables; + amd::hsa::common::ReaderWriterLock rw_lock_; + +public: + AmdHsaCodeLoader(Context* context_) + : context(context_) { assert(context); } + + Context* GetContext() const override { return context; } + + Executable* CreateExecutable( + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + + hsa_status_t FreezeExecutable(Executable *executable, const char *options) override; + void DestroyExecutable(Executable *executable) override; + + hsa_status_t IterateExecutables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) override; + + hsa_status_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t 
*num_segment_descriptors) override; + + hsa_executable_t FindExecutable(uint64_t device_address) override; + + uint64_t FindHostAddress(uint64_t device_address) override; + + void PrintHelp(std::ostream& out) override; + + void EnableReadOnlyMode(); + void DisableReadOnlyMode(); +}; + +} // namespace loader +} // namespace hsa +} // namespace amd + +#endif // HSA_RUNTIME_CORE_LOADER_EXECUTABLE_HPP_ diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.cpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.cpp new file mode 100644 index 0000000000..5ef39a2734 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.cpp @@ -0,0 +1,307 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// NOTE(review): the next two directives have no header name in this copy of
+// the patch (the text between angle brackets appears to have been dropped).
+// Restore them from the upstream loaders.cpp before applying.
+#include
+#include
+#include "loaders.hpp"
+
+namespace amd {
+namespace hsa {
+namespace loader {
+
+  // Returns a block of `size` bytes obtained from the platform's aligned
+  // allocator, or NULL when that allocator reports failure.
+  // On the non-Windows path `alignment` is first raised to sizeof(void*) if it
+  // is smaller. Blocks obtained here are released with alignedFree().
+  static inline void*
+  alignedMalloc(size_t size, size_t alignment)
+  {
+  #if defined(_WIN32)
+    return ::_aligned_malloc(size, alignment);
+  #else
+    if (alignment < sizeof(void*)) {
+      alignment = sizeof(void*);
+    }
+    void *block = NULL;
+    const int rc = ::posix_memalign(&block, alignment, size);
+    return (0 == rc) ? block : NULL;
+  #endif
+  }
+
+  // Helper function that frees an aligned memory.
+  static inline void
+  alignedFree(void *ptr)
+  {
+  #if defined(_WIN32)
+    ::_aligned_free(ptr);
+  #else
+    free(ptr);
+  #endif
+  }
+
+  // Gives every ISA member a handle equal to its gfx number (gfx803 -> 803);
+  // `invalid` gets handle 0. Log output is bound to std::cout.
+  OfflineLoaderContext::OfflineLoaderContext()
+    : out(std::cout)
+  {
+    invalid.handle = 0;
+    gfx700.handle = 700;
+    gfx701.handle = 701;
+    gfx800.handle = 800;
+    gfx801.handle = 801;
+    gfx802.handle = 802;
+    gfx803.handle = 803;
+    gfx804.handle = 804;
+    gfx810.handle = 810;
+    gfx900.handle = 900;
+    gfx901.handle = 901;
+    gfx902.handle = 902;
+    gfx903.handle = 903;
+    gfx904.handle = 904;
+    gfx905.handle = 905;
+    gfx906.handle = 906;
+    gfx907.handle = 907;
+    gfx1000.handle = 1000;
+    gfx1001.handle = 1001;
+    gfx1010.handle = 1010;
+    gfx1011.handle = 1011;
+    gfx1012.handle = 1012;
+    gfx1030.handle = 1030;
+    gfx4000.handle = 4000;
+#if defined(GFX11_BUILD)
+    gfx1100.handle = 1100;
+    gfx1101.handle = 1101;
+    gfx1102.handle = 1102;
+    gfx1103.handle = 1103;
+    gfx1150.handle = 1150;
+    gfx1151.handle = 1151;
+#endif // GFX11_BUILD
+  }
+
+  // Maps an ISA name "AMD:AMDGPU:X:Y:Z" to the matching gfxXYZ member.
+  // An unrecognized name trips assert(0) in debug builds and yields `invalid`
+  // otherwise. A null `name` is treated as unrecognized instead of being handed
+  // to the std::string constructor, which would be undefined behaviour.
+  hsa_isa_t OfflineLoaderContext::IsaFromName(const char *name)
+  {
+    std::string sname(name ? name : "");
+    if (sname == "AMD:AMDGPU:7:0:0") {
+      return gfx700;
+    } else if (sname == "AMD:AMDGPU:7:0:1") {
+      return gfx701;
+    } else if (sname == "AMD:AMDGPU:8:0:0") {
+      return gfx800;
+    } else if (sname == "AMD:AMDGPU:8:0:1") {
+      return gfx801;
+    } else if (sname == "AMD:AMDGPU:8:0:2") {
+      return gfx802;
+    } else if (sname == "AMD:AMDGPU:8:0:3") {
+      return gfx803;
+    } else if (sname == "AMD:AMDGPU:8:0:4") {
+      return gfx804;
+    } else if (sname == "AMD:AMDGPU:8:1:0") {
+      return gfx810;
+    } else if (sname == "AMD:AMDGPU:9:0:0") {
+      return gfx900;
+    } else if (sname == "AMD:AMDGPU:9:0:1") {
+      return gfx901;
+    } else if (sname == "AMD:AMDGPU:9:0:2") {
+      return gfx902;
+    } else if (sname == "AMD:AMDGPU:9:0:3") {
+      return gfx903;
+    } else if (sname == "AMD:AMDGPU:9:0:4") {
+      return gfx904;
+    } else if (sname == "AMD:AMDGPU:9:0:5") {
+      return gfx905;
+    } else if (sname == "AMD:AMDGPU:9:0:6") {
+      return gfx906;
+    } else if (sname == "AMD:AMDGPU:9:0:7") {
+      return gfx907;
+    } else if (sname == "AMD:AMDGPU:10:0:0") {
+      return gfx1000;
+    } else if (sname == "AMD:AMDGPU:10:0:1") {
+      return gfx1001;
+    } else if (sname == "AMD:AMDGPU:10:1:0") {
+      return gfx1010;
+    } else if (sname == "AMD:AMDGPU:10:1:1") {
+      return gfx1011;
+    } else if (sname == "AMD:AMDGPU:10:1:2") {
+      return gfx1012;
+    } else if (sname == "AMD:AMDGPU:10:3:0") {
+      return gfx1030;
+    } else if (sname == "AMD:AMDGPU:40:0:0") {
+      return gfx4000;
+    }
+#if defined(GFX11_BUILD)
+    else if (sname == "AMD:AMDGPU:11:0:0") {
+      return gfx1100;
+    } else if (sname == "AMD:AMDGPU:11:0:1") {
+      return gfx1101;
+    } else if (sname == "AMD:AMDGPU:11:0:2") {
+      return gfx1102;
+    } else if (sname == "AMD:AMDGPU:11:0:3") {
+      return gfx1103;
+    } else if (sname == "AMD:AMDGPU:11:5:0") {
+      return gfx1150;
+    } else if (sname == "AMD:AMDGPU:11:5:1") {
+      return gfx1151;
+    }
+#endif // GFX11_BUILD
+
+    assert(0);
+    return invalid;
+  }
+
+  // Reports every ISA as supported; neither argument is inspected.
+  bool OfflineLoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa)
+  {
+    return true;
+  }
+
+  // Allocates `size` bytes of host memory aligned to `align`, zero-filled when
+  // `zero` is set, logs the request and records the pointer in `pointers`.
+  // Returns NULL if the allocation fails; in that case nothing is zeroed or
+  // recorded (the previous code passed the NULL pointer straight to memset).
+  void* OfflineLoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero)
+  {
+    void* ptr = alignedMalloc(size, align);
+    if (ptr && zero) { memset(ptr, 0, size); }
+    out << "SegmentAlloc: " << segment << ": " << "size=" << size << " align=" << align << " zero=" << zero << " result=" << ptr << std::endl;
+    if (ptr) { pointers.insert(ptr); }
+    return ptr;
+  }
+
+  // Logs the request, then copies `size` bytes from `src` to `dst + offset`.
+  // Returns false when either pointer is null or both are the same address;
+  // a zero-length copy succeeds without touching memory.
+  bool OfflineLoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size)
+  {
+    out << "SegmentCopy: " << segment << ": " << "dst=" << dst << " offset=" << offset << " src=" << src << " size=" << size << std::endl;
+    if (!dst || !src || dst == src) {
+      return false;
+    }
+    if (0 == size) {
+      return true;
+    }
+    memcpy((char *) dst + offset, src, size);
+    return true;
+  }
+
+  // Logs the request, drops `seg` from `pointers` and releases it.
+  void OfflineLoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
+  {
+    out << "SegmentFree: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl;
+    pointers.erase(seg);
+    alignedFree(seg);
+  }
+
+  // Logs the request and returns `seg + offset`.
+  void* OfflineLoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
+  {
+    out << "SegmentAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl;
+    return (char*) seg + offset;
+  }
+
+  // Logs the request and returns `seg + offset`, the same value
+  // SegmentAddress computes.
+  void* OfflineLoaderContext::SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset)
+  {
+    out << "SegmentHostAddress: " << segment << ": " << " ptr=" << seg << " offset=" << offset << std::endl;
+    return (char*) seg + offset;
+  }
+
+  // Logs the request; nothing else is done and the call always succeeds.
+  bool OfflineLoaderContext::SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size)
+  {
+    out << "SegmentFreeze: " << segment << ": " << " ptr=" << seg << " size=" << size << std::endl;
+    return true;
+  }
+
+  // Always reports the image extension as available.
+  bool OfflineLoaderContext::ImageExtensionSupported()
+  {
+    return true;
+  }
+
+  // Creates a stand-in image: logs the descriptor and hands back the address
+  // of a freshly allocated 256-byte, 8-byte-aligned block as the handle.
+  // Returns HSA_STATUS_ERROR_INVALID_ARGUMENT when `image_descriptor` or
+  // `image_handle` is null and HSA_STATUS_ERROR_OUT_OF_RESOURCES when the
+  // block cannot be allocated (both cases previously crashed or reported
+  // success with a zero handle).
+  hsa_status_t OfflineLoaderContext::ImageCreate(
+    hsa_agent_t agent,
+    hsa_access_permission_t image_permission,
+    const hsa_ext_image_descriptor_t *image_descriptor,
+    const void *image_data,
+    hsa_ext_image_t *image_handle)
+  {
+    if (!image_descriptor || !image_handle) {
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    void* ptr = alignedMalloc(256, 8);
+    if (!ptr) {
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    out << "ImageCreate" << ":" <<
+      " permission=" << image_permission <<
+      " geometry=" << image_descriptor->geometry <<
+      " width=" << image_descriptor->width <<
+      " height=" << image_descriptor->height <<
+      " depth=" << image_descriptor->depth <<
+      " array_size=" << image_descriptor->array_size <<
+      " channel_type=" << image_descriptor->format.channel_type <<
+      " channel_order=" << image_descriptor->format.channel_order<<
+      " data=" << image_data <<
+      std::endl;
+    pointers.insert(ptr);
+    // The cast's target type was missing in this copy of the patch; decltype
+    // ties it to the handle field's declared type.
+    image_handle->handle = reinterpret_cast<decltype(image_handle->handle)>(ptr);
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Releases the block whose address is stored in `image_handle`.
+  hsa_status_t OfflineLoaderContext::ImageDestroy(
+    hsa_agent_t agent, hsa_ext_image_t image_handle)
+  {
+    void* ptr = reinterpret_cast<void*>(image_handle.handle);
+    pointers.erase(ptr);
+    alignedFree(ptr);
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Creates a stand-in sampler: logs the descriptor and hands back the address
+  // of a freshly allocated 256-byte, 8-byte-aligned block as the handle.
+  // Error returns mirror ImageCreate: INVALID_ARGUMENT for a null
+  // `sampler_descriptor` / `sampler_handle`, OUT_OF_RESOURCES when the block
+  // cannot be allocated.
+  hsa_status_t OfflineLoaderContext::SamplerCreate(
+    hsa_agent_t agent,
+    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+    hsa_ext_sampler_t *sampler_handle)
+  {
+    if (!sampler_descriptor || !sampler_handle) {
+      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+    }
+    void* ptr = alignedMalloc(256, 8);
+    if (!ptr) {
+      return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
+    }
+    out << "SamplerCreate" << ":" <<
+      " coordinate_mode=" << sampler_descriptor->coordinate_mode <<
+      " filter_mode=" << sampler_descriptor->filter_mode <<
+      " address_mode=" << sampler_descriptor->address_mode <<
+      std::endl;
+    pointers.insert(ptr);
+    sampler_handle->handle = reinterpret_cast<decltype(sampler_handle->handle)>(ptr);
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Releases the block whose address is stored in `sampler_handle`.
+  hsa_status_t OfflineLoaderContext::SamplerDestroy(
+    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle)
+  {
+    void* ptr = reinterpret_cast<void*>(sampler_handle.handle);
+    pointers.erase(ptr);
+    alignedFree(ptr);
+    return HSA_STATUS_SUCCESS;
+  }
+
+}
+}
+}
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.hpp
new file mode 100644
index 0000000000..1956440d73
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/ext/loader/loaders.hpp
@@ -0,0 +1,114 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LOADERS_HPP_
+#define LOADERS_HPP_
+
+#include "amd_hsa_loader.hpp"
+// NOTE(review): the next two directives have no header name in this copy of
+// the patch. The class below uses std::set and std::ostream, so presumably
+// they named <set> and <iostream>/<ostream> -- confirm against upstream.
+#include
+#include
+
+namespace amd {
+namespace hsa {
+namespace loader {
+
+  // Loader Context implementation that works without a device: segments,
+  // images and samplers are backed by plain host allocations and every
+  // Segment*/Image*/Sampler* request that logs does so on `out`.
+  class OfflineLoaderContext : public amd::hsa::loader::Context {
+  private:
+    // Handle returned by IsaFromName for a name it does not recognize.
+    hsa_isa_t invalid;
+    // One handle per ISA name that IsaFromName recognizes.
+    hsa_isa_t gfx700, gfx701, gfx800, gfx801, gfx802, gfx803, gfx804, gfx810;
+    hsa_isa_t gfx900, gfx901, gfx902, gfx903;
+    hsa_isa_t gfx904, gfx905;
+    hsa_isa_t gfx906, gfx907;
+    hsa_isa_t gfx1000, gfx1001, gfx1010, gfx1011, gfx1012;
+    hsa_isa_t gfx1030;
+    hsa_isa_t gfx4000;
+#if defined(GFX11_BUILD)
+    hsa_isa_t gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151;
+#endif // GFX11_BUILD
+    // Stream that receives the log lines; the constructor binds it to std::cout.
+    std::ostream& out;
+    // NOTE(review): the template argument is missing in this copy of the
+    // patch; the set stores the void* blocks handed out, so presumably
+    // std::set<void*> -- confirm against upstream.
+    typedef std::set PointerSet;
+    // Blocks currently handed out by SegmentAlloc / ImageCreate / SamplerCreate.
+    PointerSet pointers;
+
+  public:
+    OfflineLoaderContext();
+
+    // Maps an "AMD:AMDGPU:X:Y:Z" name to the matching gfxXYZ handle.
+    hsa_isa_t IsaFromName(const char *name) override;
+
+    // Always true in this context.
+    bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
+
+    // Host allocation of `size` bytes aligned to `align`, zero-filled on request.
+    void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override;
+
+    // Copies `size` bytes from `src` to `dst + offset`; false on null or identical pointers.
+    bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) override;
+
+    // Releases a block obtained from SegmentAlloc; `size` is only logged.
+    void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size = 0) override;
+
+    // Returns `seg + offset`.
+    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override;
+
+    // Returns `seg + offset`, the same value as SegmentAddress.
+    void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) override;
+
+    // Logs the request and reports success; the memory is left as it is.
+    bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) override;
+
+    // Always true in this context.
+    bool ImageExtensionSupported() override;
+
+    // Logs the descriptor and returns a handle backed by a small host block.
+    hsa_status_t ImageCreate(
+      hsa_agent_t agent,
+      hsa_access_permission_t image_permission,
+      const hsa_ext_image_descriptor_t *image_descriptor,
+      const void *image_data,
+      hsa_ext_image_t *image_handle) override;
+
+    // Releases the host block behind `image_handle`.
+    hsa_status_t ImageDestroy(
+      hsa_agent_t agent, hsa_ext_image_t image_handle) override;
+
+    // Logs the descriptor and returns a handle backed by a small host block.
+    hsa_status_t SamplerCreate(
+      hsa_agent_t agent,
+      const hsa_ext_sampler_descriptor_t *sampler_descriptor,
+      hsa_ext_sampler_t *sampler_handle) override;
+
+    // Releases the host block behind `sampler_handle`.
+    hsa_status_t SamplerDestroy(
+      hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
+  };
+}
+}
+}
+
+#endif // LOADERS_HPP_
diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/hsail-tools/libHSAIL/Brig.h b/shared/amdgpu-windows-interop/sc/HSAIL/hsail-tools/libHSAIL/Brig.h
new file mode 100644
index 0000000000..4f34bd1d50
--- /dev/null
+++ b/shared/amdgpu-windows-interop/sc/HSAIL/hsail-tools/libHSAIL/Brig.h
@@ -0,0 +1,1131 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stddef.h> /* size_t */
+#include <stdint.h> /* uintXX_t */
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/*========================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+/* =======================================================================================*/
+
+/*
+ * Each enum below is paired with a fixed-width typedef named <Enum><bits>_t.
+ * NOTE(review): presumably the typedef is the storage type used where the
+ * value is serialized -- confirm against the struct definitions later in
+ * this header.
+ */
+
+/* 32-bit offset types; the three list/string variants alias the data offset. */
+typedef uint32_t BrigCodeOffset32_t;
+typedef uint32_t BrigOperandOffset32_t;
+typedef uint32_t BrigDataOffset32_t;
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t;
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t;
+typedef BrigDataOffset32_t BrigDataOffsetString32_t;
+
+/* HSAIL and BRIG major/minor version numbers. */
+typedef uint32_t BrigVersion32_t;
+enum BrigVersion {
+    BRIG_VERSION_HSAIL_MAJOR = 1,
+    BRIG_VERSION_HSAIL_MINOR = 0,
+    BRIG_VERSION_BRIG_MAJOR = 1,
+    BRIG_VERSION_BRIG_MINOR = 0
+};
+
+/*
+ * Kind tags in three families (directive 0x1xxx, instruction 0x2xxx,
+ * operand 0x3xxx). In each family *_BEGIN equals the first member and *_END
+ * is one past the last, so BEGIN <= k < END tests family membership.
+ */
+typedef uint16_t BrigKind16_t;
+enum BrigKind {
+    BRIG_KIND_NONE = 0x0000,
+
+    BRIG_KIND_DIRECTIVE_BEGIN = 0x1000,
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000,
+    BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001,
+    BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+    BRIG_KIND_DIRECTIVE_CONTROL = 0x1003,
+    BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004,
+    BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+    BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006,
+    BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007,
+    BRIG_KIND_DIRECTIVE_KERNEL = 0x1008,
+    BRIG_KIND_DIRECTIVE_LABEL = 0x1009,
+    BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+    BRIG_KIND_DIRECTIVE_MODULE = 0x100b,
+    BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+    BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d,
+    BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+    BRIG_KIND_DIRECTIVE_END = 0x100f,
+
+    BRIG_KIND_INST_BEGIN = 0x2000,
+    BRIG_KIND_INST_ADDR = 0x2000,
+    BRIG_KIND_INST_ATOMIC = 0x2001,
+    BRIG_KIND_INST_BASIC = 0x2002,
+    BRIG_KIND_INST_BR = 0x2003,
+    BRIG_KIND_INST_CMP = 0x2004,
+    BRIG_KIND_INST_CVT = 0x2005,
+    BRIG_KIND_INST_IMAGE = 0x2006,
+    BRIG_KIND_INST_LANE = 0x2007,
+    BRIG_KIND_INST_MEM = 0x2008,
+    BRIG_KIND_INST_MEM_FENCE = 0x2009,
+    BRIG_KIND_INST_MOD = 0x200a,
+    BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+    BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+    BRIG_KIND_INST_QUEUE = 0x200d,
+    BRIG_KIND_INST_SEG = 0x200e,
+    BRIG_KIND_INST_SEG_CVT = 0x200f,
+    BRIG_KIND_INST_SIGNAL = 0x2010,
+    BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+    BRIG_KIND_INST_END = 0x2012,
+
+    BRIG_KIND_OPERAND_BEGIN = 0x3000,
+    BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+    BRIG_KIND_OPERAND_ALIGN = 0x3001,
+    BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+    BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+    BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+    BRIG_KIND_OPERAND_RESERVED = 0x3005,
+    BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+    BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+    BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+    BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+    BRIG_KIND_OPERAND_REGISTER = 0x300a,
+    BRIG_KIND_OPERAND_STRING = 0x300b,
+    BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+    BRIG_KIND_OPERAND_END = 0x300d
+};
+
+/* Alignment codes: BRIG_ALIGNMENT_<N> has the value log2(N) + 1; 0 is "none". */
+typedef uint8_t BrigAlignment8_t;
+enum BrigAlignment {
+    BRIG_ALIGNMENT_NONE = 0,
+    BRIG_ALIGNMENT_1 = 1,
+    BRIG_ALIGNMENT_2 = 2,
+    BRIG_ALIGNMENT_4 = 3,
+    BRIG_ALIGNMENT_8 = 4,
+    BRIG_ALIGNMENT_16 = 5,
+    BRIG_ALIGNMENT_32 = 6,
+    BRIG_ALIGNMENT_64 = 7,
+    BRIG_ALIGNMENT_128 = 8,
+    BRIG_ALIGNMENT_256 = 9,
+    BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_256
+};
+
+typedef uint8_t BrigAllocation8_t;
+enum BrigAllocation {
+    BRIG_ALLOCATION_NONE = 0,
+    BRIG_ALLOCATION_PROGRAM = 1,
+    BRIG_ALLOCATION_AGENT = 2,
+    BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+/* Bit mask values (single flag defined). */
+typedef uint8_t BrigAluModifier8_t;
+enum BrigAluModifierMask {
+    BRIG_ALU_FTZ = 1
+};
+
+typedef uint8_t BrigAtomicOperation8_t;
+enum BrigAtomicOperation {
+    BRIG_ATOMIC_ADD = 0,
+    BRIG_ATOMIC_AND = 1,
+    BRIG_ATOMIC_CAS = 2,
+    BRIG_ATOMIC_EXCH = 3,
+    BRIG_ATOMIC_LD = 4,
+    BRIG_ATOMIC_MAX = 5,
+    BRIG_ATOMIC_MIN = 6,
+    BRIG_ATOMIC_OR = 7,
+    BRIG_ATOMIC_ST = 8,
+    BRIG_ATOMIC_SUB = 9,
+    BRIG_ATOMIC_WRAPDEC = 10,
+    BRIG_ATOMIC_WRAPINC = 11,
+    BRIG_ATOMIC_XOR = 12,
+    BRIG_ATOMIC_WAIT_EQ = 13,
+    BRIG_ATOMIC_WAIT_NE = 14,
+    BRIG_ATOMIC_WAIT_LT = 15,
+    BRIG_ATOMIC_WAIT_GTE = 16,
+    BRIG_ATOMIC_WAITTIMEOUT_EQ = 17,
+    BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+    BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+    BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+typedef uint8_t BrigCompareOperation8_t;
+enum BrigCompareOperation {
+    BRIG_COMPARE_EQ = 0,
+    BRIG_COMPARE_NE = 1,
+    BRIG_COMPARE_LT = 2,
+    BRIG_COMPARE_LE = 3,
+    BRIG_COMPARE_GT = 4,
+    BRIG_COMPARE_GE = 5,
+    BRIG_COMPARE_EQU = 6,
+    BRIG_COMPARE_NEU = 7,
+    BRIG_COMPARE_LTU = 8,
+    BRIG_COMPARE_LEU = 9,
+    BRIG_COMPARE_GTU = 10,
+    BRIG_COMPARE_GEU = 11,
+    BRIG_COMPARE_NUM = 12,
+    BRIG_COMPARE_NAN = 13,
+    BRIG_COMPARE_SEQ = 14,
+    BRIG_COMPARE_SNE = 15,
+    BRIG_COMPARE_SLT = 16,
+    BRIG_COMPARE_SLE = 17,
+    BRIG_COMPARE_SGT = 18,
+    BRIG_COMPARE_SGE = 19,
+    BRIG_COMPARE_SGEU = 20,
+    BRIG_COMPARE_SEQU = 21,
+    BRIG_COMPARE_SNEU = 22,
+    BRIG_COMPARE_SLTU = 23,
+    BRIG_COMPARE_SLEU = 24,
+    BRIG_COMPARE_SNUM = 25,
+    BRIG_COMPARE_SNAN = 26,
+    BRIG_COMPARE_SGTU = 27
+};
+
+typedef uint16_t BrigControlDirective16_t;
+enum BrigControlDirective {
+    BRIG_CONTROL_NONE = 0,
+    BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+    BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+    BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+    BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+    BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+    BRIG_CONTROL_REQUIREDDIM = 6,
+    BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+    BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+    BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+/* Bit mask values (single flag defined). */
+typedef uint8_t BrigExecutableModifier8_t;
+enum BrigExecutableModifierMask {
+    BRIG_EXECUTABLE_DEFINITION = 1
+};
+
+/* Standard orders occupy 0..19; *_FIRST_USER_DEFINED marks 128 as the first non-standard value. */
+typedef uint8_t BrigImageChannelOrder8_t;
+enum BrigImageChannelOrder {
+    BRIG_CHANNEL_ORDER_A = 0,
+    BRIG_CHANNEL_ORDER_R = 1,
+    BRIG_CHANNEL_ORDER_RX = 2,
+    BRIG_CHANNEL_ORDER_RG = 3,
+    BRIG_CHANNEL_ORDER_RGX = 4,
+    BRIG_CHANNEL_ORDER_RA = 5,
+    BRIG_CHANNEL_ORDER_RGB = 6,
+    BRIG_CHANNEL_ORDER_RGBX = 7,
+    BRIG_CHANNEL_ORDER_RGBA = 8,
+    BRIG_CHANNEL_ORDER_BGRA = 9,
+    BRIG_CHANNEL_ORDER_ARGB = 10,
+    BRIG_CHANNEL_ORDER_ABGR = 11,
+    BRIG_CHANNEL_ORDER_SRGB = 12,
+    BRIG_CHANNEL_ORDER_SRGBX = 13,
+    BRIG_CHANNEL_ORDER_SRGBA = 14,
+    BRIG_CHANNEL_ORDER_SBGRA = 15,
+    BRIG_CHANNEL_ORDER_INTENSITY = 16,
+    BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+    BRIG_CHANNEL_ORDER_DEPTH = 18,
+    BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+    BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigImageChannelType8_t;
+enum BrigImageChannelType {
+    BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+    BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+    BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+    BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+    BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+    BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+    BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+    BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+    BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+    BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+    BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+    BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+    BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+    BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128
+};
+
+typedef uint8_t BrigImageGeometry8_t;
+enum BrigImageGeometry {
+    BRIG_GEOMETRY_1D = 0,
+    BRIG_GEOMETRY_2D = 1,
+    BRIG_GEOMETRY_3D = 2,
+    BRIG_GEOMETRY_1DA = 3,
+    BRIG_GEOMETRY_2DA = 4,
+    BRIG_GEOMETRY_1DB = 5,
+    BRIG_GEOMETRY_2DDEPTH = 6,
+    BRIG_GEOMETRY_2DADEPTH = 7,
+
+    BRIG_GEOMETRY_FIRST_USER_DEFINED = 128
+};
+
+/* Unlike the image enums above, the user-defined range starts right after the last standard value. */
+typedef uint8_t BrigImageQuery8_t;
+enum BrigImageQuery {
+    BRIG_IMAGE_QUERY_WIDTH = 0,
+    BRIG_IMAGE_QUERY_HEIGHT = 1,
+    BRIG_IMAGE_QUERY_DEPTH = 2,
+    BRIG_IMAGE_QUERY_ARRAY = 3,
+    BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+    BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+
+    BRIG_IMAGE_QUERY_FIRST_USER_DEFINED = 6
+};
+
+typedef uint8_t BrigLinkage8_t;
+enum BrigLinkage {
+    BRIG_LINKAGE_NONE = 0,
+    BRIG_LINKAGE_PROGRAM = 1,
+    BRIG_LINKAGE_MODULE = 2,
+    BRIG_LINKAGE_FUNCTION = 3,
+    BRIG_LINKAGE_ARG = 4
+};
+
+typedef uint8_t BrigMachineModel8_t;
+enum BrigMachineModel {
+    BRIG_MACHINE_SMALL = 0,
+    BRIG_MACHINE_LARGE = 1,
+};
+
+/* Bit mask values (single flag defined). */
+typedef uint8_t BrigMemoryModifier8_t;
+enum BrigMemoryModifierMask {
+    BRIG_MEMORY_CONST = 1
+};
+
+typedef uint8_t BrigMemoryOrder8_t;
+enum BrigMemoryOrder {
+    BRIG_MEMORY_ORDER_NONE = 0,
+    BRIG_MEMORY_ORDER_RELAXED = 1,
+    BRIG_MEMORY_ORDER_SC_ACQUIRE = 2,
+    BRIG_MEMORY_ORDER_SC_RELEASE = 3,
+    BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4,
+};
+
+typedef uint8_t BrigMemoryScope8_t;
+enum BrigMemoryScope {
+    BRIG_MEMORY_SCOPE_NONE = 0,
+    BRIG_MEMORY_SCOPE_WORKITEM = 1,
+    BRIG_MEMORY_SCOPE_WAVEFRONT = 2,
+    BRIG_MEMORY_SCOPE_WORKGROUP = 3,
+    BRIG_MEMORY_SCOPE_AGENT = 4,
+    BRIG_MEMORY_SCOPE_SYSTEM = 5,
+};
+
+/* Standard opcodes are numbered densely 0..136; 32768 marks the first non-standard value. */
+typedef uint16_t BrigOpcode16_t;
+enum BrigOpcode {
+    BRIG_OPCODE_NOP = 0,
+    BRIG_OPCODE_ABS = 1,
+    BRIG_OPCODE_ADD = 2,
+    BRIG_OPCODE_BORROW = 3,
+    BRIG_OPCODE_CARRY = 4,
+    BRIG_OPCODE_CEIL = 5,
+    BRIG_OPCODE_COPYSIGN = 6,
+    BRIG_OPCODE_DIV = 7,
+    BRIG_OPCODE_FLOOR = 8,
+    BRIG_OPCODE_FMA = 9,
+    BRIG_OPCODE_FRACT = 10,
+    BRIG_OPCODE_MAD = 11,
+    BRIG_OPCODE_MAX = 12,
+    BRIG_OPCODE_MIN = 13,
+    BRIG_OPCODE_MUL = 14,
+    BRIG_OPCODE_MULHI = 15,
+    BRIG_OPCODE_NEG = 16,
+    BRIG_OPCODE_REM = 17,
+    BRIG_OPCODE_RINT = 18,
+    BRIG_OPCODE_SQRT = 19,
+    BRIG_OPCODE_SUB = 20,
+    BRIG_OPCODE_TRUNC = 21,
+    BRIG_OPCODE_MAD24 = 22,
+    BRIG_OPCODE_MAD24HI = 23,
+    BRIG_OPCODE_MUL24 = 24,
+    BRIG_OPCODE_MUL24HI = 25,
+    BRIG_OPCODE_SHL = 26,
+    BRIG_OPCODE_SHR = 27,
+    BRIG_OPCODE_AND = 28,
+    BRIG_OPCODE_NOT = 29,
+    BRIG_OPCODE_OR = 30,
+    BRIG_OPCODE_POPCOUNT = 31,
+    BRIG_OPCODE_XOR = 32,
+    BRIG_OPCODE_BITEXTRACT = 33,
+    BRIG_OPCODE_BITINSERT = 34,
+    BRIG_OPCODE_BITMASK = 35,
+    BRIG_OPCODE_BITREV = 36,
+    BRIG_OPCODE_BITSELECT = 37,
+    BRIG_OPCODE_FIRSTBIT = 38,
+    BRIG_OPCODE_LASTBIT = 39,
+    BRIG_OPCODE_COMBINE = 40,
+    BRIG_OPCODE_EXPAND = 41,
+    BRIG_OPCODE_LDA = 42,
+    BRIG_OPCODE_MOV = 43,
+    BRIG_OPCODE_SHUFFLE = 44,
+    BRIG_OPCODE_UNPACKHI = 45,
+    BRIG_OPCODE_UNPACKLO = 46,
+    BRIG_OPCODE_PACK = 47,
+    BRIG_OPCODE_UNPACK = 48,
+    BRIG_OPCODE_CMOV = 49,
+    BRIG_OPCODE_CLASS = 50,
+    BRIG_OPCODE_NCOS = 51,
+    BRIG_OPCODE_NEXP2 = 52,
+    BRIG_OPCODE_NFMA = 53,
+    BRIG_OPCODE_NLOG2 = 54,
+    BRIG_OPCODE_NRCP = 55,
+    BRIG_OPCODE_NRSQRT = 56,
+    BRIG_OPCODE_NSIN = 57,
+    BRIG_OPCODE_NSQRT = 58,
+    BRIG_OPCODE_BITALIGN = 59,
+    BRIG_OPCODE_BYTEALIGN = 60,
+    BRIG_OPCODE_PACKCVT = 61,
+    BRIG_OPCODE_UNPACKCVT = 62,
+    BRIG_OPCODE_LERP = 63,
+    BRIG_OPCODE_SAD = 64,
+    BRIG_OPCODE_SADHI = 65,
+    BRIG_OPCODE_SEGMENTP = 66,
+    BRIG_OPCODE_FTOS = 67,
+    BRIG_OPCODE_STOF = 68,
+    BRIG_OPCODE_CMP = 69,
+    BRIG_OPCODE_CVT = 70,
+    BRIG_OPCODE_LD = 71,
+    BRIG_OPCODE_ST = 72,
+    BRIG_OPCODE_ATOMIC = 73,
+    BRIG_OPCODE_ATOMICNORET = 74,
+    BRIG_OPCODE_SIGNAL = 75,
+    BRIG_OPCODE_SIGNALNORET = 76,
+    BRIG_OPCODE_MEMFENCE = 77,
+    BRIG_OPCODE_RDIMAGE = 78,
+    BRIG_OPCODE_LDIMAGE = 79,
+    BRIG_OPCODE_STIMAGE = 80,
+    BRIG_OPCODE_IMAGEFENCE = 81,
+    BRIG_OPCODE_QUERYIMAGE = 82,
+    BRIG_OPCODE_QUERYSAMPLER = 83,
+    BRIG_OPCODE_CBR = 84,
+    BRIG_OPCODE_BR = 85,
+    BRIG_OPCODE_SBR = 86,
+    BRIG_OPCODE_BARRIER = 87,
+    BRIG_OPCODE_WAVEBARRIER = 88,
+    BRIG_OPCODE_ARRIVEFBAR = 89,
+    BRIG_OPCODE_INITFBAR = 90,
+    BRIG_OPCODE_JOINFBAR = 91,
+    BRIG_OPCODE_LEAVEFBAR = 92,
+    BRIG_OPCODE_RELEASEFBAR = 93,
+    BRIG_OPCODE_WAITFBAR = 94,
+    BRIG_OPCODE_LDF = 95,
+    BRIG_OPCODE_ACTIVELANECOUNT = 96,
+    BRIG_OPCODE_ACTIVELANEID = 97,
+    BRIG_OPCODE_ACTIVELANEMASK = 98,
+    BRIG_OPCODE_ACTIVELANEPERMUTE = 99,
+    BRIG_OPCODE_CALL = 100,
+    BRIG_OPCODE_SCALL = 101,
+    BRIG_OPCODE_ICALL = 102,
+    BRIG_OPCODE_RET = 103,
+    BRIG_OPCODE_ALLOCA = 104,
+    BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+    BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+    BRIG_OPCODE_DIM = 107,
+    BRIG_OPCODE_GRIDGROUPS = 108,
+    BRIG_OPCODE_GRIDSIZE = 109,
+    BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+    BRIG_OPCODE_PACKETID = 111,
+    BRIG_OPCODE_WORKGROUPID = 112,
+    BRIG_OPCODE_WORKGROUPSIZE = 113,
+    BRIG_OPCODE_WORKITEMABSID = 114,
+    BRIG_OPCODE_WORKITEMFLATABSID = 115,
+    BRIG_OPCODE_WORKITEMFLATID = 116,
+    BRIG_OPCODE_WORKITEMID = 117,
+    BRIG_OPCODE_CLEARDETECTEXCEPT = 118,
+    BRIG_OPCODE_GETDETECTEXCEPT = 119,
+    BRIG_OPCODE_SETDETECTEXCEPT = 120,
+    BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121,
+    BRIG_OPCODE_CASQUEUEWRITEINDEX = 122,
+    BRIG_OPCODE_LDQUEUEREADINDEX = 123,
+    BRIG_OPCODE_LDQUEUEWRITEINDEX = 124,
+    BRIG_OPCODE_STQUEUEREADINDEX = 125,
+    BRIG_OPCODE_STQUEUEWRITEINDEX = 126,
+    BRIG_OPCODE_CLOCK = 127,
+    BRIG_OPCODE_CUID = 128,
+    BRIG_OPCODE_DEBUGTRAP = 129,
+    BRIG_OPCODE_GROUPBASEPTR = 130,
+    BRIG_OPCODE_KERNARGBASEPTR = 131,
+    BRIG_OPCODE_LANEID = 132,
+    BRIG_OPCODE_MAXCUID = 133,
+    BRIG_OPCODE_MAXWAVEID = 134,
+    BRIG_OPCODE_NULLPTR = 135,
+    BRIG_OPCODE_WAVEID = 136,
+
+    BRIG_OPCODE_FIRST_USER_DEFINED = 32768,
+};
+
+typedef uint8_t BrigPack8_t;
+enum BrigPack {
+    BRIG_PACK_NONE = 0,
+    BRIG_PACK_PP = 1,
+    BRIG_PACK_PS = 2,
+    BRIG_PACK_SP = 3,
+    BRIG_PACK_SS = 4,
+    BRIG_PACK_S = 5,
+    BRIG_PACK_P = 6,
+    BRIG_PACK_PPSAT = 7,
+    BRIG_PACK_PSSAT = 8,
+    BRIG_PACK_SPSAT = 9,
+    BRIG_PACK_SSSAT = 10,
+    BRIG_PACK_SSAT = 11,
+    BRIG_PACK_PSAT = 12
+};
+
+typedef uint8_t BrigProfile8_t;
+enum BrigProfile {
+    BRIG_PROFILE_BASE = 0,
+    BRIG_PROFILE_FULL = 1,
+};
+
+typedef uint16_t BrigRegisterKind16_t;
+enum BrigRegisterKind {
+    BRIG_REGISTER_KIND_CONTROL = 0,
+    BRIG_REGISTER_KIND_SINGLE = 1,
+    BRIG_REGISTER_KIND_DOUBLE = 2,
+    BRIG_REGISTER_KIND_QUAD = 3
+};
+
+typedef uint8_t BrigRound8_t;
+enum BrigRound { /* Rounding modes: 1-5 float, 6-9 integer, 10-13 integer saturating, 14-17 signaling integer, 18-21 signaling integer saturating. */ + BRIG_ROUND_NONE = 0, + BRIG_ROUND_FLOAT_DEFAULT = 1, + BRIG_ROUND_FLOAT_NEAR_EVEN = 2, + BRIG_ROUND_FLOAT_ZERO = 3, + BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, + BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, + BRIG_ROUND_INTEGER_NEAR_EVEN = 6, + BRIG_ROUND_INTEGER_ZERO = 7, + BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, + BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, + BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, + BRIG_ROUND_INTEGER_ZERO_SAT = 11, + BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, + BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, + BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, + BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, + BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, + BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, + BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 +}; + +typedef uint8_t BrigSamplerAddressing8_t; +enum BrigSamplerAddressing { /* Sampler addressing modes; 128 is the first user-defined value. */ + BRIG_ADDRESSING_UNDEFINED = 0, + BRIG_ADDRESSING_CLAMP_TO_EDGE = 1, + BRIG_ADDRESSING_CLAMP_TO_BORDER = 2, + BRIG_ADDRESSING_REPEAT = 3, + BRIG_ADDRESSING_MIRRORED_REPEAT = 4, + + BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerCoordNormalization8_t; +enum BrigSamplerCoordNormalization { + BRIG_COORD_UNNORMALIZED = 0, + BRIG_COORD_NORMALIZED = 1 +}; + +typedef uint8_t BrigSamplerFilter8_t; +enum BrigSamplerFilter { /* Sampler filter modes; 128 is the first user-defined value. */ + BRIG_FILTER_NEAREST = 0, + BRIG_FILTER_LINEAR = 1, + + BRIG_FILTER_FIRST_USER_DEFINED = 128 +}; + +typedef uint8_t BrigSamplerQuery8_t; +enum BrigSamplerQuery { + BRIG_SAMPLER_QUERY_ADDRESSING = 0, + BRIG_SAMPLER_QUERY_COORD = 1, + BRIG_SAMPLER_QUERY_FILTER = 2 +}; + +typedef uint32_t BrigSectionIndex32_t; +enum BrigSectionIndex { /* Indices of the three standard sections; implementation-defined sections start at 3. */ + BRIG_SECTION_INDEX_DATA = 0, + BRIG_SECTION_INDEX_CODE = 1, + BRIG_SECTION_INDEX_OPERAND = 2, + + BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3, +}; + +typedef uint8_t BrigSegCvtModifier8_t; +enum
BrigSegCvtModifierMask { + BRIG_SEG_CVT_NONULL = 1 +}; + +typedef uint8_t BrigSegment8_t; +enum BrigSegment { /* Memory segment codes; 128 is the first user-defined value. */ + BRIG_SEGMENT_NONE = 0, + BRIG_SEGMENT_FLAT = 1, + BRIG_SEGMENT_GLOBAL = 2, + BRIG_SEGMENT_READONLY = 3, + BRIG_SEGMENT_KERNARG = 4, + BRIG_SEGMENT_GROUP = 5, + BRIG_SEGMENT_PRIVATE = 6, + BRIG_SEGMENT_SPILL = 7, + BRIG_SEGMENT_ARG = 8, + + BRIG_SEGMENT_FIRST_USER_DEFINED = 128 +}; + +enum { /* Bit layout used by the BRIG_TYPE_* values below: base type in bits 0-4, packing in bits 5-6, array flag in bit 7. */ + BRIG_TYPE_BASE_SIZE = 5, + BRIG_TYPE_PACK_SIZE = 2, + BRIG_TYPE_ARRAY_SIZE = 1, + + BRIG_TYPE_BASE_SHIFT = 0, + BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE, + BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE, + + BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT, + BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT, + + BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT, + BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT, + + BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT +}; + +typedef uint16_t BrigType16_t; +enum BrigType { /* Base types 0-23, then packed variants (base | BRIG_TYPE_PACK_*) and array variants (type | BRIG_TYPE_ARRAY). */ + BRIG_TYPE_NONE = 0, + BRIG_TYPE_U8 = 1, + BRIG_TYPE_U16 = 2, + BRIG_TYPE_U32 = 3, + BRIG_TYPE_U64 = 4, + BRIG_TYPE_S8 = 5, + BRIG_TYPE_S16 = 6, + BRIG_TYPE_S32 = 7, + BRIG_TYPE_S64 = 8, + BRIG_TYPE_F16 = 9, + BRIG_TYPE_F32 = 10, + BRIG_TYPE_F64 = 11, + BRIG_TYPE_B1 = 12, + BRIG_TYPE_B8 = 13, + BRIG_TYPE_B16 = 14, + BRIG_TYPE_B32 = 15, + BRIG_TYPE_B64 = 16, + BRIG_TYPE_B128 = 17, + BRIG_TYPE_SAMP = 18, + BRIG_TYPE_ROIMG = 19, + BRIG_TYPE_WOIMG = 20, + BRIG_TYPE_RWIMG = 21, + BRIG_TYPE_SIG32 = 22, + BRIG_TYPE_SIG64 = 23, + + BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_U16X4 = BRIG_TYPE_U16 |
BRIG_TYPE_PACK_64, + BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, + BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, + BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, + BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, + + BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, +
BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, + BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, + BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, + BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, +}; + +typedef uint8_t BrigVariableModifier8_t; +enum BrigVariableModifierMask { /* Bit flags (1, 2). */ + BRIG_VARIABLE_DEFINITION = 1, + BRIG_VARIABLE_CONST = 2 +}; + +typedef uint8_t BrigWidth8_t; +enum
BrigWidth { /* BRIG_WIDTH_<2^k> is encoded as k + 1 (1 -> 1, 2 -> 2, 4 -> 3, ...); WAVESIZE and ALL follow at 33 and 34. */ + BRIG_WIDTH_NONE = 0, + BRIG_WIDTH_1 = 1, + BRIG_WIDTH_2 = 2, + BRIG_WIDTH_4 = 3, + BRIG_WIDTH_8 = 4, + BRIG_WIDTH_16 = 5, + BRIG_WIDTH_32 = 6, + BRIG_WIDTH_64 = 7, + BRIG_WIDTH_128 = 8, + BRIG_WIDTH_256 = 9, + BRIG_WIDTH_512 = 10, + BRIG_WIDTH_1024 = 11, + BRIG_WIDTH_2048 = 12, + BRIG_WIDTH_4096 = 13, + BRIG_WIDTH_8192 = 14, + BRIG_WIDTH_16384 = 15, + BRIG_WIDTH_32768 = 16, + BRIG_WIDTH_65536 = 17, + BRIG_WIDTH_131072 = 18, + BRIG_WIDTH_262144 = 19, + BRIG_WIDTH_524288 = 20, + BRIG_WIDTH_1048576 = 21, + BRIG_WIDTH_2097152 = 22, + BRIG_WIDTH_4194304 = 23, + BRIG_WIDTH_8388608 = 24, + BRIG_WIDTH_16777216 = 25, + BRIG_WIDTH_33554432 = 26, + BRIG_WIDTH_67108864 = 27, + BRIG_WIDTH_134217728 = 28, + BRIG_WIDTH_268435456 = 29, + BRIG_WIDTH_536870912 = 30, + BRIG_WIDTH_1073741824 = 31, + BRIG_WIDTH_2147483648 = 32, + BRIG_WIDTH_WAVESIZE = 33, + BRIG_WIDTH_ALL = 34, +}; + +struct BrigUInt64 { /* 64-bit value stored as two 32-bit halves. */ + uint32_t lo; + uint32_t hi; +}; + +struct BrigBase { /* Common header: every directive and operand struct below starts with it, and every instruction struct starts with it via BrigInstBase. */ + uint16_t byteCount; + BrigKind16_t kind; +}; + +struct BrigData { /* bytes[1] is a one-element placeholder; presumably byteCount bytes follow - confirm against the BRIG specification. */ + uint32_t byteCount; + uint8_t bytes[1]; +}; + +struct BrigDirectiveArgBlock { + BrigBase base; +}; + +struct BrigDirectiveComment { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveControl { + BrigBase base; + BrigControlDirective16_t control; + uint16_t reserved; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveExecutable { + BrigBase base; + BrigDataOffsetString32_t name; + uint16_t outArgCount; + uint16_t inArgCount; + BrigCodeOffset32_t firstInArg; + BrigCodeOffset32_t firstCodeBlockEntry; + BrigCodeOffset32_t nextModuleEntry; + BrigExecutableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveExtension { + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveFbarrier { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + uint16_t reserved; +}; + +struct BrigDirectiveLabel
{ + BrigBase base; + BrigDataOffsetString32_t name; +}; + +struct BrigDirectiveLoc { + BrigBase base; + BrigDataOffsetString32_t filename; + uint32_t line; + uint32_t column; +}; + +struct BrigDirectiveNone { + BrigBase base; +}; + +struct BrigDirectivePragma { + BrigBase base; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigDirectiveVariable { + BrigBase base; + BrigDataOffsetString32_t name; + BrigOperandOffset32_t init; + BrigType16_t type; + BrigSegment8_t segment; + BrigAlignment8_t align; + BrigUInt64 dim; + BrigVariableModifier8_t modifier; + BrigLinkage8_t linkage; + BrigAllocation8_t allocation; + uint8_t reserved; +}; + +struct BrigDirectiveModule { + BrigBase base; + BrigDataOffsetString32_t name; + BrigVersion32_t hsailMajor; + BrigVersion32_t hsailMinor; + BrigProfile8_t profile; + BrigMachineModel8_t machineModel; + BrigRound8_t defaultFloatRound; + uint8_t reserved; +}; + +struct BrigInstBase { /* Common prefix of every BrigInst* struct below: header, opcode, type and operand-list offset. */ + BrigBase base; + BrigOpcode16_t opcode; + BrigType16_t type; + BrigDataOffsetOperandList32_t operands; +}; + +struct BrigInstAddr { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstAtomic { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t memoryScope; + BrigAtomicOperation8_t atomicOperation; + uint8_t equivClass; + uint8_t reserved[3]; +}; + +struct BrigInstBasic { + BrigInstBase base; +}; + +struct BrigInstBr { + BrigInstBase base; + BrigWidth8_t width; + uint8_t reserved[3]; +}; + +struct BrigInstCmp { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigCompareOperation8_t compare; + BrigPack8_t pack; + uint8_t reserved[3]; +}; + +struct BrigInstCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigAluModifier8_t modifier; + BrigRound8_t round; +}; + +struct BrigInstImage { + BrigInstBase base; + BrigType16_t imageType; + BrigType16_t coordType; + BrigImageGeometry8_t geometry; + uint8_t equivClass; + uint16_t
reserved; +}; + +struct BrigInstLane { + BrigInstBase base; + BrigType16_t sourceType; + BrigWidth8_t width; + uint8_t reserved; +}; + +struct BrigInstMem { + BrigInstBase base; + BrigSegment8_t segment; + BrigAlignment8_t align; + uint8_t equivClass; + BrigWidth8_t width; + BrigMemoryModifier8_t modifier; + uint8_t reserved[3]; +}; + +struct BrigInstMemFence { + BrigInstBase base; + BrigMemoryOrder8_t memoryOrder; + BrigMemoryScope8_t globalSegmentMemoryScope; + BrigMemoryScope8_t groupSegmentMemoryScope; + BrigMemoryScope8_t imageSegmentMemoryScope; +}; + +struct BrigInstMod { + BrigInstBase base; + BrigAluModifier8_t modifier; + BrigRound8_t round; + BrigPack8_t pack; + uint8_t reserved; +}; + +struct BrigInstQueryImage { + BrigInstBase base; + BrigType16_t imageType; + BrigImageGeometry8_t geometry; + BrigImageQuery8_t query; +}; + +struct BrigInstQuerySampler { + BrigInstBase base; + BrigSamplerQuery8_t query; + uint8_t reserved[3]; +}; + +struct BrigInstQueue { + BrigInstBase base; + BrigSegment8_t segment; + BrigMemoryOrder8_t memoryOrder; + uint16_t reserved; +}; + +struct BrigInstSeg { + BrigInstBase base; + BrigSegment8_t segment; + uint8_t reserved[3]; +}; + +struct BrigInstSegCvt { + BrigInstBase base; + BrigType16_t sourceType; + BrigSegment8_t segment; + BrigSegCvtModifier8_t modifier; +}; + +struct BrigInstSignal { + BrigInstBase base; + BrigType16_t signalType; + BrigMemoryOrder8_t memoryOrder; + BrigAtomicOperation8_t signalOperation; +}; + +struct BrigInstSourceType { + BrigInstBase base; + BrigType16_t sourceType; + uint16_t reserved; +}; + +struct BrigOperandAddress { + BrigBase base; + BrigCodeOffset32_t symbol; + BrigOperandOffset32_t reg; + BrigUInt64 offset; +}; + +struct BrigOperandAlign { + BrigBase base; + BrigAlignment8_t align; + uint8_t reserved[3]; +}; + +struct BrigOperandCodeList { + BrigBase base; + BrigDataOffsetCodeList32_t elements; +}; + +struct BrigOperandCodeRef { + BrigBase base; + BrigCodeOffset32_t ref; +}; + +struct
BrigOperandConstantBytes { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetString32_t bytes; +}; + +struct BrigOperandConstantOperandList { + BrigBase base; + BrigType16_t type; + uint16_t reserved; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandConstantImage { + BrigBase base; + BrigType16_t type; + BrigImageGeometry8_t geometry; + BrigImageChannelOrder8_t channelOrder; + BrigImageChannelType8_t channelType; + uint8_t reserved[3]; + BrigUInt64 width; + BrigUInt64 height; + BrigUInt64 depth; + BrigUInt64 array; +}; + +struct BrigOperandOperandList { + BrigBase base; + BrigDataOffsetOperandList32_t elements; +}; + +struct BrigOperandRegister { + BrigBase base; + BrigRegisterKind16_t regKind; + uint16_t regNum; +}; + +struct BrigOperandConstantSampler { + BrigBase base; + BrigType16_t type; + BrigSamplerCoordNormalization8_t coord; + BrigSamplerFilter8_t filter; + BrigSamplerAddressing8_t addressing; + uint8_t reserved[3]; +}; + +struct BrigOperandString { + BrigBase base; + BrigDataOffsetString32_t string; +}; + +struct BrigOperandWavesize { + BrigBase base; +}; + +typedef uint32_t BrigExceptions32_t; +enum BrigExceptionsMask { /* One bit per exception kind (bits 0-4); bit 16 is the first user-defined bit. */ + BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0, + BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1, + BRIG_EXCEPTIONS_OVERFLOW = 1 << 2, + BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3, + BRIG_EXCEPTIONS_INEXACT = 1 << 4, + + BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16 +}; + +struct BrigSectionHeader { /* name[1] is a one-element placeholder; presumably nameLength bytes follow - confirm against the BRIG specification. */ + uint64_t byteCount; + uint32_t headerByteCount; + uint32_t nameLength; + uint8_t name[1]; +}; + +struct BrigModuleHeader { + char identification[8]; + BrigVersion32_t brigMajor; + BrigVersion32_t brigMinor; + uint64_t byteCount; + uint8_t hash[64]; + uint32_t reserved; + uint32_t sectionCount; + uint64_t sectionIndex; +}; + +typedef BrigModuleHeader* BrigModule_t; + +#ifdef __cplusplus +} +#endif /*__cplusplus*/ + +#endif // defined(INCLUDED_BRIG_H) diff --git
a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_elf_image.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_elf_image.hpp new file mode 100644 index 0000000000..87553408b3 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_elf_image.hpp @@ -0,0 +1,266 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/****************************************************************************** +* University of Illinois / NCSA +* Open Source License +* +* Copyright(c) 2011 - 2015 Advanced Micro Devices, Inc. +* All rights reserved. +* +* Developed by: +* Advanced Micro Devices, Inc. +* www.amd.com +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files(the "Software"), to deal +* with the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and / +* or sell copies of the Software, and to permit persons to whom the Software +* is furnished to do so, subject to the following conditions: +* +* Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimers. +* +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimers in the documentation +* and / or other materials provided with the distribution. +* +* Neither the names of Advanced Micro Devices, Inc, nor the +* names of its +* contributors may be used to endorse or promote products derived from this +* Software without specific prior written permission.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +* THE SOFTWARE. +******************************************************************************/ + +#ifndef AMD_ELF_IMAGE_HPP_ +#define AMD_ELF_IMAGE_HPP_ + +#include <stddef.h> /* NOTE(review): the four header names were missing; restored from what this header uses (size_t, uintN_t, std::ostringstream, std::string) - confirm against upstream. */ +#include <stdint.h> +#include <sstream> +#include <string> + +namespace amd { + namespace elf { + class Symbol; + class SymbolTable; + class Section; + class RelocationSection; + + class Segment { /* Pure-virtual interface to one segment of an ELF image. */ + public: + virtual ~Segment() { } + virtual uint64_t type() const = 0; + virtual uint64_t memSize() const = 0; + virtual uint64_t align() const = 0; + virtual uint64_t imageSize() const = 0; + virtual uint64_t vaddr() const = 0; + virtual uint64_t flags() const = 0; + virtual uint64_t offset() const = 0; + virtual const char* data() const = 0; + virtual uint16_t getSegmentIndex() = 0; + virtual bool updateAddSection(Section *section) = 0; + }; + + class Section { /* Pure-virtual interface to one section; the table classes below derive from it virtually. */ + public: + virtual ~Section() { } + virtual uint16_t getSectionIndex() const = 0; + virtual uint32_t type() const = 0; + virtual std::string Name() const = 0; + virtual uint64_t offset() const = 0; + virtual uint64_t addr() const = 0; + virtual bool updateAddr(uint64_t addr) = 0; + virtual uint64_t addralign() const = 0; + virtual uint64_t flags() const = 0; + virtual uint64_t size() const = 0; + virtual uint64_t nextDataOffset(uint64_t align) const = 0; + virtual uint64_t addData(const void *src, uint64_t size, uint64_t align) = 0; + virtual bool getData(uint64_t offset, void* dest, uint64_t size) = 0; + virtual Segment* segment() = 0; + virtual RelocationSection* asRelocationSection() = 0; + virtual bool
hasRelocationSection() const = 0; + virtual RelocationSection* relocationSection(SymbolTable* symtab = 0) = 0; + virtual bool setMemSize(uint64_t s) = 0; + virtual uint64_t memSize() const = 0; + virtual bool setAlign(uint64_t a) = 0; + virtual uint64_t memAlign() const = 0; + }; + + class Relocation { /* Pure-virtual interface to one relocation entry. */ + public: + virtual ~Relocation() { } + virtual RelocationSection* section() = 0; + virtual uint32_t type() = 0; + virtual uint32_t symbolIndex() = 0; + virtual Symbol* symbol() = 0; + virtual uint64_t offset() = 0; + virtual int64_t addend() = 0; + }; + + class RelocationSection : public virtual Section { + public: + virtual Relocation* addRelocation(uint32_t type, Symbol* symbol, uint64_t offset, int64_t addend) = 0; + virtual size_t relocationCount() const = 0; + virtual Relocation* relocation(size_t i) = 0; + virtual Section* targetSection() = 0; + }; + + class StringTable : public virtual Section { + public: + virtual const char* addString(const std::string& s) = 0; + virtual size_t addString1(const std::string& s) = 0; + virtual const char* getString(size_t ndx) = 0; + virtual size_t getStringIndex(const char* name) = 0; + }; + + class Symbol { /* Pure-virtual interface to one symbol-table entry. */ + public: + virtual ~Symbol() { } + virtual uint32_t index() = 0; + virtual uint32_t type() = 0; + virtual uint32_t binding() = 0; + virtual uint64_t size() = 0; + virtual uint64_t value() = 0; + virtual unsigned char other() = 0; + virtual std::string name() = 0; + virtual Section* section() = 0; + virtual void setValue(uint64_t value) = 0; + virtual void setSize(uint64_t size) = 0; + }; + + class SymbolTable : public virtual Section { + public: + virtual Symbol* addSymbol(Section* section, const std::string& name, uint64_t value, uint64_t size, unsigned char type, unsigned char binding, unsigned char other = 0) = 0; + virtual size_t symbolCount() = 0; + virtual Symbol* symbol(size_t i) = 0; + }; + + class NoteSection : public virtual Section { + public: + virtual bool addNote(const std::string& name, uint32_t type,
const void* desc = 0, uint32_t desc_size = 0) = 0; + virtual bool getNote(const std::string& name, uint32_t type, void** desc, uint32_t* desc_size) = 0; + }; + + class Image { /* Pure-virtual interface to a whole ELF image; the protected stream `out` holds the text returned by output(). */ + public: + virtual ~Image() { } + + virtual bool initNew(uint16_t machine, uint16_t type, uint8_t os_abi = 0, uint8_t abi_version = 0, uint32_t e_flags = 0) = 0; + virtual bool loadFromFile(const std::string& filename) = 0; + virtual bool saveToFile(const std::string& filename) = 0; + virtual bool initFromBuffer(const void* buffer, size_t size) = 0; + virtual bool initAsBuffer(const void* buffer, size_t size) = 0; + virtual bool writeTo(const std::string& filename) = 0; + virtual bool copyToBuffer(void** buf, size_t* size = 0) = 0; // Copy to new buffer allocated with malloc + virtual bool copyToBuffer(void* buf, size_t size) = 0; // Copy to existing buffer of given size. + + virtual const char* data() = 0; + virtual uint64_t size() = 0; + + virtual uint16_t Machine() = 0; + virtual uint16_t Type() = 0; + virtual uint32_t EFlags() = 0; + virtual uint32_t ABIVersion() = 0; + virtual uint32_t EClass() = 0; + virtual uint32_t OsAbi() = 0; + + std::string output() { return out.str(); } + + virtual bool Freeze() = 0; + virtual bool Validate() = 0; + + virtual StringTable* shstrtab() = 0; + virtual StringTable* strtab() = 0; + virtual SymbolTable* symtab() = 0; + virtual SymbolTable* getSymtab(uint16_t index) = 0; + + virtual StringTable* addStringTable(const std::string& name) = 0; + virtual StringTable* getStringTable(uint16_t index) = 0; + + virtual SymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) = 0; + + virtual size_t segmentCount() = 0; + virtual Segment* segment(size_t i) = 0; + virtual Segment* segmentByVAddr(uint64_t vaddr) = 0; + + virtual size_t sectionCount() = 0; + virtual Section* section(size_t i) = 0; + virtual Section* sectionByVAddr(uint64_t vaddr) = 0; + + virtual NoteSection* note() = 0; + virtual NoteSection* addNoteSection(const std::string& name)
= 0; + + virtual Segment* initSegment(uint32_t type, uint32_t flags, uint64_t paddr = 0) = 0; + virtual bool addSegments() = 0; + + virtual Section* addSection(const std::string &name, + uint32_t type, + uint64_t flags = 0, + uint64_t entsize = 0, + Segment* segment = 0) = 0; + + virtual RelocationSection* relocationSection(Section* sec, SymbolTable* symtab = 0) = 0; + + protected: + std::ostringstream out; + }; + + Image* NewElf32Image(); /* NOTE(review): ownership of the returned Image is not visible here - presumably the caller deletes it; confirm. */ + Image* NewElf64Image(); + + uint64_t ElfSize(const void* buffer); + + std::string GetNoteString(uint32_t s_size, const char* s); + + } +} + +#endif // AMD_ELF_IMAGE_HPP_ diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_code.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_code.hpp new file mode 100644 index 0000000000..5b43ae0a7e --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_code.hpp @@ -0,0 +1,432 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +/****************************************************************************** +* University of Illinois / NCSA +* Open Source License +* +* Copyright(c) 2011 - 2015 Advanced Micro Devices, Inc. +* All rights reserved. +* +* Developed by: +* Advanced Micro Devices, Inc. +* www.amd.com +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files(the "Software"), to deal +* with the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and / +* or sell copies of the Software, and to permit persons to whom the Software +* is furnished to do so, subject to the following conditions: +* +* Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimers. 
+* +* Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimers in the documentation +* and / or other materials provided with the distribution. +* +* Neither the names of Advanced Micro Devices, Inc, nor the +* names of its +* contributors may be used to endorse or promote products derived from this +* Software without specific prior written permission. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH +* THE SOFTWARE. +******************************************************************************/ + +#ifndef AMD_HSA_CODE_HPP_ +#define AMD_HSA_CODE_HPP_ + +#include "amd_elf_image.hpp" +#include "amd_hsa_elf.h" +#include "amd_hsa_kernel_code.h" +#include "hsa.h" +#include "hsa_ext_finalize.h" +#include <cassert> /* NOTE(review): the four header names were missing; restored from what this header uses (assert, std::unique_ptr, std::ostringstream, std::vector) - confirm against upstream. */ +#include <memory> +#include <sstream> +#include <vector> + +namespace amd { +namespace hsa { +namespace common { + +template <uint64_t signature> /* Base class that stamps each object with a signature: CT_SIGNATURE is the per-type constant, RT_SIGNATURE the per-object copy checked by ObjectAt(). */ +class Signed { +public: + static const uint64_t CT_SIGNATURE; + const uint64_t RT_SIGNATURE; + +protected: + Signed(): RT_SIGNATURE(signature) {} + virtual ~Signed() {} +}; + +template <uint64_t signature> +const uint64_t Signed<signature>::CT_SIGNATURE = signature; + +bool IsAccessibleMemoryAddress(uint64_t address); + +template <typename class_type, typename member_type> /* Byte offset of `member` within class_type, computed from a null object pointer. */ +size_t OffsetOf(member_type class_type::*member) +{ + return (char*)&((class_type*)nullptr->*member) - (char*)nullptr; +} + +template <typename class_type> /* Returns `address` as a class_type* when IsAccessibleMemoryAddress() accepts it and the stored RT_SIGNATURE equals class_type::CT_SIGNATURE; otherwise nullptr. */ +class_type* ObjectAt(uint64_t address) +{ + if (!IsAccessibleMemoryAddress(address)) { + return nullptr; + } + + const uint64_t *rt_signature = + (const uint64_t*)(address + OffsetOf(&class_type::RT_SIGNATURE)); + if (nullptr == rt_signature) { + return
nullptr; + } + if (class_type::CT_SIGNATURE != *rt_signature) { + return nullptr; + } + + return (class_type*)address; +} + +} + +namespace code { + + typedef amd::elf::Segment Segment; + typedef amd::elf::Section Section; + typedef amd::elf::RelocationSection RelocationSection; + typedef amd::elf::Relocation Relocation; + + class KernelSymbol; + class VariableSymbol; + + class Symbol { /* Wrapper around an amd::elf::Symbol. Name() and Index() tolerate a null elfsym; GetSection(), SectionOffset(), VAddr(), setValue() and setSize() dereference it unchecked. */ + protected: + amd::elf::Symbol* elfsym; + + public: + explicit Symbol(amd::elf::Symbol* elfsym_) + : elfsym(elfsym_) { } + virtual ~Symbol() { } + virtual bool IsKernelSymbol() const { return false; } + virtual KernelSymbol* AsKernelSymbol() { assert(false); return 0; } + virtual bool IsVariableSymbol() const { return false; } + virtual VariableSymbol* AsVariableSymbol() { assert(false); return 0; } + amd::elf::Symbol* elfSym() { return elfsym; } + std::string Name() const { return elfsym ? elfsym->name() : ""; } + Section* GetSection() { return elfsym->section(); } + virtual uint64_t SectionOffset() const { return elfsym->value(); } + virtual uint64_t VAddr() const { return elfsym->section()->addr() + elfsym->value(); } + uint32_t Index() const { return elfsym ?
elfsym->index() : 0; } + bool IsDeclaration() const; + bool IsDefinition() const; + virtual bool IsAgent() const; + virtual hsa_symbol_kind_t Kind() const = 0; + hsa_symbol_linkage_t Linkage() const; + hsa_variable_allocation_t Allocation() const; + hsa_variable_segment_t Segment() const; + uint64_t Size() const; + uint32_t Size32() const; + uint32_t Alignment() const; + bool IsConst() const; + virtual hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, void *value); + static hsa_code_symbol_t ToHandle(Symbol* sym); + static Symbol* FromHandle(hsa_code_symbol_t handle); + void setValue(uint64_t value) { elfsym->setValue(value); } + void setSize(uint32_t size) { elfsym->setSize(size); } /* Takes a 32-bit size although amd::elf::Symbol::setSize() accepts uint64_t. */ + + std::string GetModuleName() const; + std::string GetSymbolName() const; + }; + + class KernelSymbol : public Symbol { /* Symbol of kind HSA_SYMBOL_KIND_KERNEL; constructed from an ELF symbol and an amd_kernel_code_t. */ + private: + uint32_t kernarg_segment_size, kernarg_segment_alignment; + uint32_t group_segment_size, private_segment_size; + bool is_dynamic_callstack; + + public: + explicit KernelSymbol(amd::elf::Symbol* elfsym_, const amd_kernel_code_t* akc); + bool IsKernelSymbol() const override { return true; } + KernelSymbol* AsKernelSymbol() override { return this; } + hsa_symbol_kind_t Kind() const override { return HSA_SYMBOL_KIND_KERNEL; } + hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, void *value) override; + }; + + class VariableSymbol : public Symbol { /* Symbol of kind HSA_SYMBOL_KIND_VARIABLE. */ + public: + explicit VariableSymbol(amd::elf::Symbol* elfsym_) + : Symbol(elfsym_) { } + bool IsVariableSymbol() const override { return true; } + VariableSymbol* AsVariableSymbol() override { return this; } + hsa_symbol_kind_t Kind() const override { return HSA_SYMBOL_KIND_VARIABLE; } + hsa_status_t GetInfo(hsa_code_symbol_info_t attribute, void *value) override; + }; + + class AmdHsaCode { + private: + std::ostringstream out; + std::unique_ptr img; + std::vector dataSegments; + std::vector dataSections; + std::vector relocationSections; + std::vector symbols; + bool combineDataSegments; + Segment*
hsaSegments[AMDGPU_HSA_SEGMENT_LAST][2]; + Section* hsaSections[AMDGPU_HSA_SECTION_LAST]; + + amd::elf::Section* hsatext; + amd::elf::Section* imageInit; + amd::elf::Section* samplerInit; + amd::elf::Section* debugInfo; + amd::elf::Section* debugLine; + amd::elf::Section* debugAbbrev; + + bool PullElf(); + bool PullElfV1(); + bool PullElfV2(); + + void AddAmdNote(uint32_t type, const void* desc, uint32_t desc_size); + template + bool GetAmdNote(uint32_t type, S** desc) + { + uint32_t desc_size; + if (!img->note()->getNote("AMD", type, (void**) desc, &desc_size)) { + out << "Failed to find note, type: " << type << std::endl; + return false; + } + if (desc_size < sizeof(S)) { + out << "Note size mismatch, type: " << type << " size: " << desc_size << " expected at least " << sizeof(S) << std::endl; + return false; + } + return true; + } + + void PrintSegment(std::ostream& out, Segment* segment); + void PrintSection(std::ostream& out, Section* section); + void PrintRawData(std::ostream& out, Section* section); + void PrintRawData(std::ostream& out, const unsigned char *data, size_t size); + void PrintRelocationData(std::ostream& out, RelocationSection* section); + void PrintSymbol(std::ostream& out, Symbol* sym); + void PrintDisassembly(std::ostream& out, const unsigned char *isa, size_t size, uint32_t isa_offset = 0); + std::string MangleSymbolName(const std::string& module_name, const std::string symbol_name); + bool ElfImageError(); + + public: + bool HasHsaText() const { return hsatext != 0; } + amd::elf::Section* HsaText() { assert(hsatext); return hsatext; } + const amd::elf::Section* HsaText() const { assert(hsatext); return hsatext; } + amd::elf::SymbolTable* Symtab() { assert(img); return img->symtab(); } + uint16_t Machine() const { return img->Machine(); } + uint32_t EFlags() const { return img->EFlags(); } + uint32_t EClass() const { return img->EClass(); } + uint32_t OsAbi() const { return img->OsAbi(); } + + AmdHsaCode(bool combineDataSegments = true); + 
virtual ~AmdHsaCode(); + + std::string output() { return out.str(); } + bool LoadFromFile(const std::string& filename); + bool SaveToFile(const std::string& filename); + bool WriteToBuffer(void* buffer); + bool InitFromBuffer(const void* buffer, size_t size); + bool InitAsBuffer(const void* buffer, size_t size); + bool InitAsHandle(hsa_code_object_t code_handle); + bool InitNew(bool xnack = false); + bool Freeze(); + hsa_code_object_t GetHandle(); + const char* ElfData(); + uint64_t ElfSize(); + bool Validate(); + void Print(std::ostream& out); + void PrintNotes(std::ostream& out); + void PrintSegments(std::ostream& out); + void PrintSections(std::ostream& out); + void PrintSymbols(std::ostream& out); + void PrintMachineCode(std::ostream& out); + void PrintMachineCode(std::ostream& out, KernelSymbol* sym); + bool PrintToFile(const std::string& filename); + + void AddNoteCodeObjectVersion(uint32_t major, uint32_t minor); + bool GetNoteCodeObjectVersion(std::string& version); + void AddNoteHsail(uint32_t hsail_major, uint32_t hsail_minor, hsa_profile_t profile, hsa_machine_model_t machine_model, hsa_default_float_rounding_mode_t rounding_mode); + bool GetNoteHsail(uint32_t* hsail_major, uint32_t* hsail_minor, hsa_profile_t* profile, hsa_machine_model_t* machine_model, hsa_default_float_rounding_mode_t* default_float_round); + void AddNoteIsa(const std::string& vendor_name, const std::string& architecture_name, uint32_t major, uint32_t minor, uint32_t stepping); + bool GetNoteIsa(std::string& vendor_name, std::string& architecture_name, uint32_t* major_version, uint32_t* minor_version, uint32_t* stepping); + void AddNoteProducer(uint32_t major, uint32_t minor, const std::string& producer); + bool GetNoteProducer(uint32_t* major, uint32_t* minor, std::string& producer_name); + void AddNoteProducerOptions(const std::string& options); + void AddNoteProducerOptions(int32_t call_convention, const hsa_ext_control_directives_t& user_directives, const std::string& 
user_options); + bool GetNoteProducerOptions(std::string& options); + + bool GetIsa(std::string& isaName, unsigned *genericVersion = nullptr); + bool GetCodeObjectVersion(uint32_t* major, uint32_t* minor); + hsa_status_t GetInfo(hsa_code_object_info_t attribute, void *value); + hsa_status_t GetSymbol(const char *module_name, const char *symbol_name, hsa_code_symbol_t *sym); + hsa_status_t IterateSymbols(hsa_code_object_t code_object, + hsa_status_t (*callback)( + hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void* data), + void* data); + + void AddHsaTextData(const void* buffer, size_t size); + uint64_t NextKernelCodeOffset() const; + bool AddKernelCode(KernelSymbol* sym, const void* code, size_t size); + + Symbol* AddKernelDefinition(const std::string& name, const void* isa, size_t isa_size); + + size_t DataSegmentCount() const { return dataSegments.size(); } + Segment* DataSegment(size_t i) const { return dataSegments[i]; } + + size_t DataSectionCount() { return dataSections.size(); } + Section* DataSection(size_t i) { return dataSections[i]; } + + Section* AddEmptySection(); + Section* AddCodeSection(Segment* segment); + Section* AddDataSection(const std::string &name, + uint32_t type, + uint64_t flags, + Segment* segment); + + bool HasImageInitSection() const { return imageInit != 0; } + Section* ImageInitSection(); + void AddImageInitializer(Symbol* image, uint64_t destOffset, const amdgpu_hsa_image_descriptor_t& init); + void AddImageInitializer(Symbol* image, uint64_t destOffset, + amdgpu_hsa_metadata_kind16_t kind, + amdgpu_hsa_image_geometry8_t geometry, + amdgpu_hsa_image_channel_order8_t channel_order, amdgpu_hsa_image_channel_type8_t channel_type, + uint64_t width, uint64_t height, uint64_t depth, uint64_t array); + + + bool HasSamplerInitSection() const { return samplerInit != 0; } + amd::elf::Section* SamplerInitSection(); + amd::elf::Section* AddSamplerInit(); + void AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, const 
amdgpu_hsa_sampler_descriptor_t& init); + void AddSamplerInitializer(Symbol* sampler, uint64_t destOffset, + amdgpu_hsa_sampler_coord8_t coord, + amdgpu_hsa_sampler_filter8_t filter, + amdgpu_hsa_sampler_addressing8_t addressing); + + void AddInitVarWithAddress(bool large, Symbol* dest, uint64_t destOffset, Symbol* addrOf, uint64_t addrAddend); + + void InitHsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable); + bool AddHsaSegments(); + Segment* HsaSegment(amdgpu_hsa_elf_segment_t segment, bool writable); + + void InitHsaSectionSegment(amdgpu_hsa_elf_section_t section, bool combineSegments = true); + Section* HsaDataSection(amdgpu_hsa_elf_section_t section, bool combineSegments = true); + + Symbol* AddExecutableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section = 0); + + Symbol* AddVariableSymbol(const std::string &name, + unsigned char type, + unsigned char binding, + unsigned char other, + Section *section, + uint64_t value, + uint64_t size); + void AddSectionSymbols(); + + size_t RelocationSectionCount() { return relocationSections.size(); } + RelocationSection* GetRelocationSection(size_t i) { return relocationSections[i]; } + + size_t SymbolCount() { return symbols.size(); } + Symbol* GetSymbol(size_t i) { return symbols[i]; } + Symbol* GetSymbolByElfIndex(size_t index); + Symbol* FindSymbol(const std::string &n); + + void AddData(amdgpu_hsa_elf_section_t section, const void* data = 0, size_t size = 0); + + Section* DebugInfo(); + Section* DebugLine(); + Section* DebugAbbrev(); + + Section* AddHsaHlDebug(const std::string& name, const void* data, size_t size); + }; + + class AmdHsaCodeManager { + private: + typedef std::unordered_map CodeMap; + CodeMap codeMap; + + public: + AmdHsaCode* FromHandle(hsa_code_object_t handle); + bool Destroy(hsa_code_object_t handle); + }; + + class KernelSymbolV2 : public KernelSymbol { + private: + public: + explicit KernelSymbolV2(amd::elf::Symbol* 
elfsym_, const amd_kernel_code_t* akc); + bool IsAgent() const override { return true; } + uint64_t SectionOffset() const override { return elfsym->value() - elfsym->section()->addr(); } + uint64_t VAddr() const override { return elfsym->value(); } + }; + + class VariableSymbolV2 : public VariableSymbol { + private: + public: + explicit VariableSymbolV2(amd::elf::Symbol* elfsym_) : VariableSymbol(elfsym_) { } + bool IsAgent() const override { return false; } + uint64_t SectionOffset() const override { return elfsym->value() - elfsym->section()->addr(); } + uint64_t VAddr() const override { return elfsym->value(); } + }; +} +} +} + +#endif // AMD_HSA_CODE_HPP_ diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_common.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_common.h new file mode 100644 index 0000000000..bfb613ec47 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_common.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// The following set of header files provides definitions for AMD GPU +// Architecture: +// - amd_hsa_common.h +// - amd_hsa_elf.h +// - amd_hsa_kernel_code.h +// - amd_hsa_queue.h +// - amd_hsa_signal.h +// +// Refer to "HSA Application Binary Interface: AMD GPU Architecture" for more +// information. + +#ifndef AMD_HSA_COMMON_H +#define AMD_HSA_COMMON_H + +#include +#include + +// Descriptive version of the HSA Application Binary Interface. +#define AMD_HSA_ABI_VERSION "AMD GPU Architecture v0.35 (June 25, 2015)" + +// Alignment attribute that specifies a minimum alignment (in bytes) for +// variables of the specified type. +#if defined(__GNUC__) +# define __ALIGNED__(x) __attribute__((aligned(x))) +#elif defined(_MSC_VER) +# define __ALIGNED__(x) __declspec(align(x)) +#elif defined(RC_INVOKED) +# define __ALIGNED__(x) +#else +# error +#endif + +// Creates enumeration entries for packed types. 
Enumeration entries include +// bit shift amount, bit width, and bit mask. +#define AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ + name##_SHIFT = (shift), \ + name##_WIDTH = (width), \ + name = (((1 << (width)) - 1) << (shift)) + +// Gets bits for specified mask from specified src packed instance. src may be any expression; mask must be an identifier with a matching *_SHIFT constant. +#define AMD_HSA_BITS_GET(src, mask) \ + (((src) & (mask)) >> mask ## _SHIFT) + +// Sets val bits for specified mask in specified dst packed instance. Expands to a single statement, so it is safe as an unbraced if/else body; dst is evaluated twice. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + do { (dst) &= ~(mask); \ + (dst) |= (((val) << mask##_SHIFT) & (mask)); } while (0) + +#endif // AMD_HSA_COMMON_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_elf.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_elf.h new file mode 100644 index 0000000000..10063ef7b0 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_elf.h @@ -0,0 +1,476 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// Undefine the macro in case it is defined in the system elf.h. +#undef EM_AMDGPU + +#ifndef AMD_HSA_ELF_H +#define AMD_HSA_ELF_H + +#include + +// AMD GPU Specific ELF Header Enumeration Values. +// +// Values are copied from LLVM BinaryFormat/ELF.h . This file also contains +// code object V1 defintions which are not part of the LLVM header. Code object +// V1 was only supported by the Finalizer which is now deprecated and removed. +// +// TODO: Deprecate and remove V1 support and replace this header with using the +// LLVM header. +namespace ELF { + +// Machine architectures +// See current registered ELF machine architectures at: +// http://www.uxsglobal.com/developers/gabi/latest/ch4.eheader.html +enum { + EM_AMDGPU = 224, // AMD GPU architecture +}; + +// OS ABI identification. +enum { + ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime +}; + +// AMDGPU OS ABI Version identification. 
+enum { + // ELFABIVERSION_AMDGPU_HSA_V1 does not exist because OS ABI identification + // was never defined for V1. + ELFABIVERSION_AMDGPU_HSA_V2 = 0, + ELFABIVERSION_AMDGPU_HSA_V3 = 1, + ELFABIVERSION_AMDGPU_HSA_V4 = 2, + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, +}; + +// AMDGPU specific e_flags. +enum : unsigned { + // Processor selection mask for EF_AMDGPU_MACH_* values. + EF_AMDGPU_MACH = 0x0ff, + // Not specified processor. + EF_AMDGPU_MACH_NONE = 0x000, + // R600-based processors. + // Radeon HD 2000/3000 Series (R600). + EF_AMDGPU_MACH_R600_R600 = 0x001, + EF_AMDGPU_MACH_R600_R630 = 0x002, + EF_AMDGPU_MACH_R600_RS880 = 0x003, + EF_AMDGPU_MACH_R600_RV670 = 0x004, + // Radeon HD 4000 Series (R700). + EF_AMDGPU_MACH_R600_RV710 = 0x005, + EF_AMDGPU_MACH_R600_RV730 = 0x006, + EF_AMDGPU_MACH_R600_RV770 = 0x007, + // Radeon HD 5000 Series (Evergreen). + EF_AMDGPU_MACH_R600_CEDAR = 0x008, + EF_AMDGPU_MACH_R600_CYPRESS = 0x009, + EF_AMDGPU_MACH_R600_JUNIPER = 0x00a, + EF_AMDGPU_MACH_R600_REDWOOD = 0x00b, + EF_AMDGPU_MACH_R600_SUMO = 0x00c, + // Radeon HD 6000 Series (Northern Islands). + EF_AMDGPU_MACH_R600_BARTS = 0x00d, + EF_AMDGPU_MACH_R600_CAICOS = 0x00e, + EF_AMDGPU_MACH_R600_CAYMAN = 0x00f, + EF_AMDGPU_MACH_R600_TURKS = 0x010, + // Reserved for R600-based processors. + EF_AMDGPU_MACH_R600_RESERVED_FIRST = 0x011, + EF_AMDGPU_MACH_R600_RESERVED_LAST = 0x01f, + // First/last R600-based processors. + EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600, + EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS, + // AMDGCN-based processors. 
+ EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, + EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, + EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, + EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023, + EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024, + EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025, + EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026, + EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028, + EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029, + EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a, + EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b, + EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c, + EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d, + EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, + EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, + EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, + EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, + EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, + EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035, + EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036, + EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037, + EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038, + EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039, + EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a, + EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b, + EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c, + EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, + EF_AMDGPU_MACH_AMDGCN_GFX1000 = 0x0f1, +#if defined(GFX40_BUILD) + EF_AMDGPU_MACH_AMDGCN_GFX4000 = 0x0f8, + EF_AMDGPU_MACH_AMDGCN_GFX4010 = 0x0f9, + EF_AMDGPU_MACH_AMDGCN_GFX4020 = 0x0fe, + EF_AMDGPU_MACH_AMDGCN_GFX4030 = 0x0f6, +#endif // GFX40_BUILD + EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, +#if defined(GFX11_BUILD) + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, + EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, + EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, +#endif // GFX11_BUILD +#if defined(GFX12_BUILD) + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, 
+#endif // GFX12_BUILD + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + // Reserved for AMDGCN-based processors. + EF_AMDGPU_MACH_AMDGCN_RESERVED_LAST = 0x0ff, + // First/last AMDGCN-based processors. + EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_RESERVED_LAST, + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_XNACK_V2 = 0x01, + // Indicates if the trap handler is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V2. + EF_AMDGPU_FEATURE_TRAP_HANDLER_V2 = 0x02, + // Indicates if the "xnack" target feature is enabled for all code contained + // in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_XNACK_V3 = 0x100, + // Indicates if the "sramecc" target feature is enabled for all code + // contained in the object. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V3. + EF_AMDGPU_FEATURE_SRAMECC_V3 = 0x200, + // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, + // ELFABIVERSION_AMDGPU_HSA_V5. + EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, + // XNACK is not supported. + EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, + // XNACK is any/default/unspecified. + EF_AMDGPU_FEATURE_XNACK_ANY_V4 = 0x100, + // XNACK is off. + EF_AMDGPU_FEATURE_XNACK_OFF_V4 = 0x200, + // XNACK is on. + EF_AMDGPU_FEATURE_XNACK_ON_V4 = 0x300, + // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. + // + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, + // ELFABIVERSION_AMDGPU_HSA_V5. 
+ EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, + // SRAMECC is not supported. + EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, + // SRAMECC is any/default/unspecified. + EF_AMDGPU_FEATURE_SRAMECC_ANY_V4 = 0x400, + // SRAMECC is off. + EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, + // SRAMECC is on. + EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the list byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +} // end namespace ELF + +// ELF Section Header Flag Enumeration Values. +#define SHF_AMDGPU_HSA_GLOBAL (0x00100000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_READONLY (0x00200000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_CODE (0x00400000 & SHF_MASKOS) +#define SHF_AMDGPU_HSA_AGENT (0x00800000 & SHF_MASKOS) + +// +typedef enum { + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT = 1, + AMDGPU_HSA_SEGMENT_READONLY_AGENT = 2, + AMDGPU_HSA_SEGMENT_CODE_AGENT = 3, + AMDGPU_HSA_SEGMENT_LAST, +} amdgpu_hsa_elf_segment_t; + +// ELF Program Header Type Enumeration Values. +#define PT_AMDGPU_HSA_LOAD_GLOBAL_PROGRAM (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) +#define PT_AMDGPU_HSA_LOAD_GLOBAL_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) +#define PT_AMDGPU_HSA_LOAD_READONLY_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_READONLY_AGENT) +#define PT_AMDGPU_HSA_LOAD_CODE_AGENT (PT_LOOS + AMDGPU_HSA_SEGMENT_CODE_AGENT) + +// ELF Symbol Type Enumeration Values. +#define STT_AMDGPU_HSA_KERNEL (STT_LOOS + 0) +#define STT_AMDGPU_HSA_INDIRECT_FUNCTION (STT_LOOS + 1) +#define STT_AMDGPU_HSA_METADATA (STT_LOOS + 2) + +// ELF Symbol Binding Enumeration Values. +#define STB_AMDGPU_HSA_EXTERNAL (STB_LOOS + 0) + +// ELF Symbol Other Information Creation/Retrieval. 
+#define ELF64_ST_AMDGPU_ALLOCATION(o) (((o) >> 2) & 0x3) +#define ELF64_ST_AMDGPU_FLAGS(o) ((o) >> 4) +#define ELF64_ST_AMDGPU_OTHER(f, a, v) (((f) << 4) + (((a) & 0x3) << 2) + ((v) & 0x3)) + +typedef enum { + AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM = 1, + AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT = 2, + AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT = 3, + AMDGPU_HSA_SYMBOL_ALLOCATION_LAST, +} amdgpu_hsa_symbol_allocation_t; + +// ELF Symbol Allocation Enumeration Values. +#define STA_AMDGPU_HSA_DEFAULT AMDGPU_HSA_SYMBOL_ALLOCATION_DEFAULT +#define STA_AMDGPU_HSA_GLOBAL_PROGRAM AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_PROGRAM +#define STA_AMDGPU_HSA_GLOBAL_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_GLOBAL_AGENT +#define STA_AMDGPU_HSA_READONLY_AGENT AMDGPU_HSA_SYMBOL_ALLOCATION_READONLY_AGENT + +typedef enum { + AMDGPU_HSA_SYMBOL_FLAG_DEFAULT = 0, + AMDGPU_HSA_SYMBOL_FLAG_CONST = 1, + AMDGPU_HSA_SYMBOL_FLAG_LAST, +} amdgpu_hsa_symbol_flag_t; + +// ELF Symbol Flag Enumeration Values. +#define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST + +// AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_NONE 0 +#define R_AMDGPU_32_LOW 1 +#define R_AMDGPU_32_HIGH 2 +#define R_AMDGPU_64 3 +#define R_AMDGPU_INIT_SAMPLER 4 +#define R_AMDGPU_INIT_IMAGE 5 +#define R_AMDGPU_RELATIVE64 13 + +// AMD GPU Note Type Enumeration Values. +#define NT_AMD_HSA_CODE_OBJECT_VERSION 1 +#define NT_AMD_HSA_HSAIL 2 +#define NT_AMD_HSA_ISA_VERSION 3 +#define NT_AMD_HSA_PRODUCER 4 +#define NT_AMD_HSA_PRODUCER_OPTIONS 5 +#define NT_AMD_HSA_EXTENSION 6 +#define NT_AMD_HSA_ISA_NAME 11 +#define NT_AMD_HSA_HLDEBUG_DEBUG 101 +#define NT_AMD_HSA_HLDEBUG_TARGET 102 + +// AMD GPU Metadata Kind Enumeration Values. 
+typedef uint16_t amdgpu_hsa_metadata_kind16_t; +typedef enum { + AMDGPU_HSA_METADATA_KIND_NONE = 0, + AMDGPU_HSA_METADATA_KIND_INIT_SAMP = 1, + AMDGPU_HSA_METADATA_KIND_INIT_ROIMG = 2, + AMDGPU_HSA_METADATA_KIND_INIT_WOIMG = 3, + AMDGPU_HSA_METADATA_KIND_INIT_RWIMG = 4 +} amdgpu_hsa_metadata_kind_t; + +// AMD GPU Sampler Coordinate Normalization Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_coord8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_COORD_UNNORMALIZED = 0, + AMDGPU_HSA_SAMPLER_COORD_NORMALIZED = 1 +} amdgpu_hsa_sampler_coord_t; + +// AMD GPU Sampler Filter Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_filter8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_FILTER_NEAREST = 0, + AMDGPU_HSA_SAMPLER_FILTER_LINEAR = 1 +} amdgpu_hsa_sampler_filter_t; + +// AMD GPU Sampler Addressing Enumeration Values. +typedef uint8_t amdgpu_hsa_sampler_addressing8_t; +typedef enum { + AMDGPU_HSA_SAMPLER_ADDRESSING_UNDEFINED = 0, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_EDGE = 1, + AMDGPU_HSA_SAMPLER_ADDRESSING_CLAMP_TO_BORDER = 2, + AMDGPU_HSA_SAMPLER_ADDRESSING_REPEAT = 3, + AMDGPU_HSA_SAMPLER_ADDRESSING_MIRRORED_REPEAT = 4 +} amdgpu_hsa_sampler_addressing_t; + +// AMD GPU Sampler Descriptor. +typedef struct amdgpu_hsa_sampler_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_sampler_coord8_t coord; + amdgpu_hsa_sampler_filter8_t filter; + amdgpu_hsa_sampler_addressing8_t addressing; + uint8_t reserved1; +} amdgpu_hsa_sampler_descriptor_t; + +// AMD GPU Image Geometry Enumeration Values. +typedef uint8_t amdgpu_hsa_image_geometry8_t; +typedef enum { + AMDGPU_HSA_IMAGE_GEOMETRY_1D = 0, + AMDGPU_HSA_IMAGE_GEOMETRY_2D = 1, + AMDGPU_HSA_IMAGE_GEOMETRY_3D = 2, + AMDGPU_HSA_IMAGE_GEOMETRY_1DA = 3, + AMDGPU_HSA_IMAGE_GEOMETRY_2DA = 4, + AMDGPU_HSA_IMAGE_GEOMETRY_1DB = 5, + AMDGPU_HSA_IMAGE_GEOMETRY_2DDEPTH = 6, + AMDGPU_HSA_IMAGE_GEOMETRY_2DADEPTH = 7 +} amdgpu_hsa_image_geometry_t; + +// AMD GPU Image Channel Order Enumeration Values. 
+typedef uint8_t amdgpu_hsa_image_channel_order8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_A = 0, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_R = 1, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RX = 2, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RG = 3, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGX = 4, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RA = 5, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGB = 6, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBX = 7, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_RGBA = 8, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_BGRA = 9, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ARGB = 10, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_ABGR = 11, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGB = 12, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBX = 13, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SRGBA = 14, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_SBGRA = 15, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH = 18, + AMDGPU_HSA_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} amdgpu_hsa_image_channel_order_t; + +// AMD GPU Image Channel Type Enumeration Values. +typedef uint8_t amdgpu_hsa_image_channel_type8_t; +typedef enum { + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_555 = 5, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SHORT_565 = 6, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_INT_101010 = 7, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + AMDGPU_HSA_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} amdgpu_hsa_image_channel_type_t; + +// AMD GPU Image Descriptor. 
+typedef struct amdgpu_hsa_image_descriptor_s { + uint16_t size; + amdgpu_hsa_metadata_kind16_t kind; + amdgpu_hsa_image_geometry8_t geometry; + amdgpu_hsa_image_channel_order8_t channel_order; + amdgpu_hsa_image_channel_type8_t channel_type; + uint8_t reserved1; + uint64_t width; + uint64_t height; + uint64_t depth; + uint64_t array; +} amdgpu_hsa_image_descriptor_t; + +typedef struct amdgpu_hsa_note_code_object_version_s { + uint32_t major_version; + uint32_t minor_version; +} amdgpu_hsa_note_code_object_version_t; + +typedef struct amdgpu_hsa_note_hsail_s { + uint32_t hsail_major_version; + uint32_t hsail_minor_version; + uint8_t profile; + uint8_t machine_model; + uint8_t default_float_round; +} amdgpu_hsa_note_hsail_t; + +typedef struct amdgpu_hsa_note_isa_s { + uint16_t vendor_name_size; + uint16_t architecture_name_size; + uint32_t major; + uint32_t minor; + uint32_t stepping; + char vendor_and_architecture_name[1]; +} amdgpu_hsa_note_isa_t; + +typedef struct amdgpu_hsa_note_producer_s { + uint16_t producer_name_size; + uint16_t reserved; + uint32_t producer_major_version; + uint32_t producer_minor_version; + char producer_name[1]; +} amdgpu_hsa_note_producer_t; + +typedef struct amdgpu_hsa_note_producer_options_s { + uint16_t producer_options_size; + char producer_options[1]; +} amdgpu_hsa_note_producer_options_t; + +typedef enum { + AMDGPU_HSA_RODATA_GLOBAL_PROGRAM = 0, + AMDGPU_HSA_RODATA_GLOBAL_AGENT, + AMDGPU_HSA_RODATA_READONLY_AGENT, + AMDGPU_HSA_DATA_GLOBAL_PROGRAM, + AMDGPU_HSA_DATA_GLOBAL_AGENT, + AMDGPU_HSA_DATA_READONLY_AGENT, + AMDGPU_HSA_BSS_GLOBAL_PROGRAM, + AMDGPU_HSA_BSS_GLOBAL_AGENT, + AMDGPU_HSA_BSS_READONLY_AGENT, + AMDGPU_HSA_SECTION_LAST, +} amdgpu_hsa_elf_section_t; + +#endif // AMD_HSA_ELF_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_kernel_code.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_kernel_code.h new file mode 100644 index 0000000000..66bac64e16 --- /dev/null +++ 
b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_kernel_code.h @@ -0,0 +1,274 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_KERNEL_CODE_H +#define AMD_HSA_KERNEL_CODE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Kernel Code Version Enumeration Values. +typedef uint32_t amd_kernel_code_version32_t; /* storage type of the amd_kernel_code_version_major/minor fields of amd_kernel_code_t */ +enum amd_kernel_code_version_t { + AMD_KERNEL_CODE_VERSION_MAJOR = 1, + AMD_KERNEL_CODE_VERSION_MINOR = 1 +}; + +// AMD Machine Kind Enumeration Values. +typedef uint16_t amd_machine_kind16_t; /* storage type of the amd_machine_kind field of amd_kernel_code_t */ +enum amd_machine_kind_t { + AMD_MACHINE_KIND_UNDEFINED = 0, + AMD_MACHINE_KIND_AMDGPU = 1 +}; + +// AMD Machine Version. +typedef uint16_t amd_machine_version16_t; /* storage type of the amd_machine_version_major/minor/stepping fields of amd_kernel_code_t */ + +// AMD Float Round Mode Enumeration Values. +enum amd_float_round_mode_t { /* NOTE(review): values 0-3 fit the 2-bit FLOAT_ROUND_MODE_32 and FLOAT_ROUND_MODE_16_64 fields of compute_pgm_rsrc1 -- presumably their encoding; confirm */ + AMD_FLOAT_ROUND_MODE_NEAREST_EVEN = 0, + AMD_FLOAT_ROUND_MODE_PLUS_INFINITY = 1, + AMD_FLOAT_ROUND_MODE_MINUS_INFINITY = 2, + AMD_FLOAT_ROUND_MODE_ZERO = 3 +}; + +// AMD Float Denorm Mode Enumeration Values. +enum amd_float_denorm_mode_t { /* NOTE(review): values 0-3 fit the 2-bit FLOAT_DENORM_MODE_32 and FLOAT_DENORM_MODE_16_64 fields of compute_pgm_rsrc1 -- presumably their encoding; confirm */ + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE_OUTPUT = 0, + AMD_FLOAT_DENORM_MODE_FLUSH_OUTPUT = 1, + AMD_FLOAT_DENORM_MODE_FLUSH_SOURCE = 2, + AMD_FLOAT_DENORM_MODE_NO_FLUSH = 3 +}; + +// AMD Compute Program Resource Register One. 
+typedef uint32_t amd_compute_pgm_rsrc_one32_t; /* storage type of the compute_pgm_rsrc1 field of amd_kernel_code_t */ +enum amd_compute_pgm_rsrc_one_t { /* NOTE(review): the two numeric arguments read as (bit offset, bit width); taken that way the entries tile bits 0-31 with no gaps or overlaps. The macro is defined outside this chunk -- confirm */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT, 0, 6), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT, 6, 4), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIORITY, 10, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_32, 12, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_ROUND_MODE_16_64, 14, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32, 16, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64, 18, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_PRIV, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP, 21, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_DEBUG_MODE, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE, 23, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_BULKY, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_CDBG_USER, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FP16_OVFL, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_RESERVED0, 27, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_WGP_MODE, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_MEM_ORDERED, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_ONE_FWD_PROGRESS, 31, 1), +}; + +// AMD System VGPR Workitem ID Enumeration Values. +enum amd_system_vgpr_workitem_id_t { /* NOTE(review): values 0-3 fit the 2-bit ENABLE_VGPR_WORKITEM_ID field of compute_pgm_rsrc2 -- presumably its encoding; confirm */ + AMD_SYSTEM_VGPR_WORKITEM_ID_X = 0, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y = 1, + AMD_SYSTEM_VGPR_WORKITEM_ID_X_Y_Z = 2, + AMD_SYSTEM_VGPR_WORKITEM_ID_UNDEFINED = 3 +}; + +// AMD Compute Program Resource Register Two. 
+typedef uint32_t amd_compute_pgm_rsrc_two32_t; /* storage type of the compute_pgm_rsrc2 field of amd_kernel_code_t */ +enum amd_compute_pgm_rsrc_two_t { /* NOTE(review): the two numeric arguments read as (bit offset, bit width); taken that way the entries tile bits 0-31 with no gaps or overlaps. The macro is defined outside this chunk -- confirm */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_PRIVATE_SEGMENT_WAVE_BYTE_OFFSET, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT, 1, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_TRAP_HANDLER, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_INFO, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_VGPR_WORKITEM_ID, 11, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_ADDRESS_WATCH, 13, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_MEMORY_VIOLATION, 14, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE, 15, 9), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, 24, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, 25, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, 26, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, 27, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, 28, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, 29, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_EXCEPTION_INT_DIVISION_BY_ZERO, 30, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_TWO_RESERVED1, 31, 1) +}; + +// AMD Element 
Byte Size Enumeration Values. +enum amd_element_byte_size_t { /* enumerator value v names an element size of (2 << v) bytes: 0 -> 2, 1 -> 4, 2 -> 8, 3 -> 16 */ + AMD_ELEMENT_BYTE_SIZE_2 = 0, + AMD_ELEMENT_BYTE_SIZE_4 = 1, + AMD_ELEMENT_BYTE_SIZE_8 = 2, + AMD_ELEMENT_BYTE_SIZE_16 = 3 +}; + +// AMD Kernel Code Properties. +typedef uint32_t amd_kernel_code_properties32_t; /* storage type of the kernel_code_properties field of amd_kernel_code_t */ +enum amd_kernel_code_properties_t { /* NOTE(review): the two numeric arguments read as (bit offset, bit width); taken that way the entries tile bits 0-31 with no gaps or overlaps. The macro is defined outside this chunk -- confirm */ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_ID, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X, 7, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y, 8, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z, 9, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_WAVEFRONT_SIZE32, 10, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED1, 11, 5), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_ENABLE_ORDERED_APPEND_GDS, 16, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_PRIVATE_ELEMENT_SIZE, 17, 2), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_PTR64, 19, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DYNAMIC_CALLSTACK, 20, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_DEBUG_ENABLED, 21, 1), + 
AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_IS_XNACK_ENABLED, 22, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTIES_RESERVED2, 23, 9) +}; + +// AMD Power Of Two Enumeration Values. +typedef uint8_t amd_powertwo8_t; /* storage type of the *_segment_alignment and wavefront_size fields of amd_kernel_code_t */ +enum amd_powertwo_t { /* enumerator value is log2 of the named power of two (AMD_POWERTWO_1 == 0 ... AMD_POWERTWO_256 == 8) */ + AMD_POWERTWO_1 = 0, + AMD_POWERTWO_2 = 1, + AMD_POWERTWO_4 = 2, + AMD_POWERTWO_8 = 3, + AMD_POWERTWO_16 = 4, + AMD_POWERTWO_32 = 5, + AMD_POWERTWO_64 = 6, + AMD_POWERTWO_128 = 7, + AMD_POWERTWO_256 = 8 +}; + +// AMD Enabled Control Directive Enumeration Values. +typedef uint64_t amd_enabled_control_directive64_t; /* storage type of amd_control_directives_t's enabled_control_directives field */ +enum amd_enabled_control_directive_t { /* each enumerator is a distinct single-bit value (1 << 0 through 1 << 8); NOTE(review): presumably OR-ed together in enabled_control_directives -- confirm */ + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_BREAK_EXCEPTIONS = 1, + AMD_ENABLED_CONTROL_DIRECTIVE_ENABLE_DETECT_EXCEPTIONS = 2, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_DYNAMIC_GROUP_SIZE = 4, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_GRID_SIZE = 8, + AMD_ENABLED_CONTROL_DIRECTIVE_MAX_FLAT_WORKGROUP_SIZE = 16, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_DIM = 32, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_GRID_SIZE = 64, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRED_WORKGROUP_SIZE = 128, + AMD_ENABLED_CONTROL_DIRECTIVE_REQUIRE_NO_PARTIAL_WORKGROUPS = 256 +}; + +// AMD Exception Kind Enumeration Values. +typedef uint16_t amd_exception_kind16_t; +enum amd_exception_kind_t { /* each enumerator is a distinct single-bit value (1 << 0 through 1 << 4) */ + AMD_EXCEPTION_KIND_INVALID_OPERATION = 1, + AMD_EXCEPTION_KIND_DIVISION_BY_ZERO = 2, + AMD_EXCEPTION_KIND_OVERFLOW = 4, + AMD_EXCEPTION_KIND_UNDERFLOW = 8, + AMD_EXCEPTION_KIND_INEXACT = 16 +}; + +// AMD Control Directives. 
+#define AMD_CONTROL_DIRECTIVES_ALIGN_BYTES 64 +#define AMD_CONTROL_DIRECTIVES_ALIGN __ALIGNED__(AMD_CONTROL_DIRECTIVES_ALIGN_BYTES) /* NOTE(review): __ALIGNED__ is defined outside this chunk; presumably a compiler alignment attribute -- confirm */ +typedef AMD_CONTROL_DIRECTIVES_ALIGN struct amd_control_directives_s { /* the members below sum to 128 bytes, and each falls on its natural alignment when laid out in order */ + amd_enabled_control_directive64_t enabled_control_directives; + uint16_t enable_break_exceptions; + uint16_t enable_detect_exceptions; + uint32_t max_dynamic_group_size; + uint64_t max_flat_grid_size; + uint32_t max_flat_workgroup_size; + uint8_t required_dim; + uint8_t reserved1[3]; + uint64_t required_grid_size[3]; + uint32_t required_workgroup_size[3]; + uint8_t reserved2[60]; +} amd_control_directives_t; + +// AMD Kernel Code. +#define AMD_ISA_ALIGN_BYTES 256 +#define AMD_KERNEL_CODE_ALIGN_BYTES 64 +#define AMD_KERNEL_CODE_ALIGN __ALIGNED__(AMD_KERNEL_CODE_ALIGN_BYTES) +typedef AMD_KERNEL_CODE_ALIGN struct amd_kernel_code_s { /* the members before control_directives sum to 128 bytes, so control_directives starts at byte offset 128 and the members total 256 bytes */ + amd_kernel_code_version32_t amd_kernel_code_version_major; + amd_kernel_code_version32_t amd_kernel_code_version_minor; + amd_machine_kind16_t amd_machine_kind; + amd_machine_version16_t amd_machine_version_major; + amd_machine_version16_t amd_machine_version_minor; + amd_machine_version16_t amd_machine_version_stepping; + int64_t kernel_code_entry_byte_offset; + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + uint64_t max_scratch_backing_memory_byte_size; + amd_compute_pgm_rsrc_one32_t compute_pgm_rsrc1; + amd_compute_pgm_rsrc_two32_t compute_pgm_rsrc2; + amd_kernel_code_properties32_t kernel_code_properties; + uint32_t workitem_private_segment_byte_size; + uint32_t workgroup_group_segment_byte_size; + uint32_t gds_segment_byte_size; + uint64_t kernarg_segment_byte_size; + uint32_t workgroup_fbarrier_count; + uint16_t wavefront_sgpr_count; + uint16_t workitem_vgpr_count; + uint16_t reserved_vgpr_first; + uint16_t reserved_vgpr_count; + uint16_t reserved_sgpr_first; + uint16_t reserved_sgpr_count; + uint16_t debug_wavefront_private_segment_offset_sgpr; + uint16_t 
debug_private_segment_buffer_sgpr; + amd_powertwo8_t kernarg_segment_alignment; + amd_powertwo8_t group_segment_alignment; + amd_powertwo8_t private_segment_alignment; + amd_powertwo8_t wavefront_size; + int32_t call_convention; + uint8_t reserved1[12]; + uint64_t runtime_loader_kernel_symbol; + amd_control_directives_t control_directives; +} amd_kernel_code_t; + +// TODO: this struct should be completely gone once debugger designs/implements +// Debugger APIs. +typedef struct amd_runtime_loader_debug_info_s { + const void* elf_raw; + size_t elf_size; + const char *kernel_name; + const void *owning_segment; +} amd_runtime_loader_debug_info_t; + +#endif // AMD_HSA_KERNEL_CODE_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_loader.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_loader.hpp new file mode 100644 index 0000000000..7185efb96e --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_loader.hpp @@ -0,0 +1,520 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_LOADER_HPP +#define AMD_HSA_LOADER_HPP + +#include +#include +#include "hsa.h" +#include "hsa_ext_image.h" +#include "hsa_ven_amd_loader.h" +#include "amd_hsa_elf.h" +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#define __read__ _read +#define __lseek__ _lseek +#else +#include +#define __read__ read +#define __lseek__ lseek +#endif // _WIN32 || _WIN64 + +/// @brief Major version of the AMD HSA Loader. Major versions are not backwards +/// compatible. +#define AMD_HSA_LOADER_VERSION_MAJOR 0 + +/// @brief Minor version of the AMD HSA Loader. Minor versions are backwards +/// compatible. +#define AMD_HSA_LOADER_VERSION_MINOR 5 + +/// @brief Descriptive version of the AMD HSA Loader. 
+#define AMD_HSA_LOADER_VERSION "AMD HSA Loader v0.05 (June 16, 2015)" + +enum hsa_ext_symbol_info_t { + HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE = 100, + HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN = 101, +}; + +typedef uint32_t hsa_symbol_info32_t; +typedef hsa_executable_symbol_t hsa_symbol_t; +typedef hsa_executable_symbol_info_t hsa_symbol_info_t; + +/// @brief Loaded code object attributes. +enum amd_loaded_code_object_info_t { + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE = 0, + AMD_LOADED_CODE_OBJECT_INFO_ELF_IMAGE_SIZE = 1 +}; + +/// @brief Loaded segment handle. +typedef struct amd_loaded_segment_s { + uint64_t handle; +} amd_loaded_segment_t; + +/// @brief Loaded segment attributes. +enum amd_loaded_segment_info_t { + AMD_LOADED_SEGMENT_INFO_TYPE = 0, + AMD_LOADED_SEGMENT_INFO_ELF_BASE_ADDRESS = 1, + AMD_LOADED_SEGMENT_INFO_LOAD_BASE_ADDRESS = 2, + AMD_LOADED_SEGMENT_INFO_SIZE = 3 +}; + +namespace amd { +namespace hsa { +namespace loader { + +/// @class CodeObjectReaderImpl. +/// @brief Code Object Reader Wrapper. +struct CodeObjectReaderImpl final { + public: + /// @returns Handle equivalent of @p object. + static hsa_code_object_reader_t Handle( + const CodeObjectReaderImpl *object) { + hsa_code_object_reader_t handle = {reinterpret_cast(object)}; + return handle; + } + + /// @returns Object equivalent of @p handle. + static CodeObjectReaderImpl *Object( + const hsa_code_object_reader_t &handle) { + CodeObjectReaderImpl *object = + reinterpret_cast(handle.handle); + return object; + } + + /// @brief Default constructor. + CodeObjectReaderImpl() {} + + /// @brief Default destructor. 
+ ~CodeObjectReaderImpl(); + + hsa_status_t SetFile( + hsa_file_t _code_object_file_descriptor, + size_t _code_object_offset = 0, + size_t _code_object_size = 0); + + hsa_status_t SetMemory( + const void *_code_object_memory, + size_t _code_object_size); + + const void *GetCodeObjectMemory() const { return code_object_memory; }; + + std::string GetUri() const { return uri; }; + + private: + const void *code_object_memory{nullptr}; + size_t code_object_size{0}; + std::string uri{}; + bool is_mmap{false}; +}; + +//===----------------------------------------------------------------------===// +// Context. // +//===----------------------------------------------------------------------===// + +class Context { +public: + virtual ~Context() {} + + virtual hsa_isa_t IsaFromName(const char *name) = 0; + + // This function will be deleted in a future patch. Use the overload + // that takes a generic version instead. + virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) = 0; + + virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa, unsigned genericVersion) { return IsaSupportedByAgent(agent, isa); } /* default implementation ignores genericVersion and forwards to the two-argument overload */ + + virtual void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) = 0; + + virtual bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) = 0; + + virtual void SegmentFree(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) = 0; + + virtual void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) = 0; + + virtual void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t offset) = 0; + + virtual bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, void* seg, size_t size) = 0; + + virtual bool ImageExtensionSupported() = 0; + + virtual hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t 
image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle) = 0; + + virtual hsa_status_t ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle) = 0; + + virtual hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) = 0; + + virtual hsa_status_t SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) = 0; + +protected: + Context() {} + +private: + Context(const Context &c); + Context& operator=(const Context &c); +}; + +//===----------------------------------------------------------------------===// +// Symbol. // +//===----------------------------------------------------------------------===// + +class Symbol { +public: + static hsa_symbol_t Handle(Symbol *symbol) { + hsa_symbol_t symbol_handle = + {reinterpret_cast(symbol)}; + return symbol_handle; + } + + static Symbol* Object(hsa_symbol_t symbol_handle) { + Symbol *symbol = + reinterpret_cast(symbol_handle.handle); + return symbol; + } + + virtual ~Symbol() {} + + virtual bool GetInfo(hsa_symbol_info32_t symbol_info, void *value) = 0; + + virtual hsa_agent_t GetAgent() = 0; + +protected: + Symbol() {} + +private: + Symbol(const Symbol &s); + Symbol& operator=(const Symbol &s); +}; + +//===----------------------------------------------------------------------===// +// LoadedCodeObject. 
// +//===----------------------------------------------------------------------===// + +class LoadedCodeObject { +public: + static hsa_loaded_code_object_t Handle(LoadedCodeObject *object) { + hsa_loaded_code_object_t handle = + {reinterpret_cast(object)}; + return handle; + } + + static LoadedCodeObject* Object(hsa_loaded_code_object_t handle) { + LoadedCodeObject *object = + reinterpret_cast(handle.handle); + return object; + } + + virtual ~LoadedCodeObject() {} + + virtual bool GetInfo(amd_loaded_code_object_info_t attribute, void *value) = 0; + + virtual hsa_status_t IterateLoadedSegments( + hsa_status_t (*callback)( + amd_loaded_segment_t loaded_segment, + void *data), + void *data) = 0; + + virtual hsa_agent_t getAgent() const = 0; + virtual hsa_executable_t getExecutable() const = 0; + virtual uint64_t getElfData() const = 0; + virtual uint64_t getElfSize() const = 0; + virtual uint64_t getStorageOffset() const = 0; + virtual uint64_t getLoadBase() const = 0; + virtual uint64_t getLoadSize() const = 0; + virtual int64_t getDelta() const = 0; + virtual std::string getUri() const = 0; + +protected: + LoadedCodeObject() {} + +private: + LoadedCodeObject(const LoadedCodeObject&); + LoadedCodeObject& operator=(const LoadedCodeObject&); +}; + +//===----------------------------------------------------------------------===// +// LoadedSegment. 
// +//===----------------------------------------------------------------------===// + +class LoadedSegment { +public: + static amd_loaded_segment_t Handle(LoadedSegment *object) { + amd_loaded_segment_t handle = + {reinterpret_cast(object)}; + return handle; + } + + static LoadedSegment* Object(amd_loaded_segment_t handle) { + LoadedSegment *object = + reinterpret_cast(handle.handle); + return object; + } + + virtual ~LoadedSegment() {} + + virtual bool GetInfo(amd_loaded_segment_info_t attribute, void *value) = 0; + +protected: + LoadedSegment() {} + +private: + LoadedSegment(const LoadedSegment&); + LoadedSegment& operator=(const LoadedSegment&); +}; + +//===----------------------------------------------------------------------===// +// Executable. // +//===----------------------------------------------------------------------===// + +class Executable { +public: + static hsa_executable_t Handle(Executable *executable) { + hsa_executable_t executable_handle = + {reinterpret_cast(executable)}; + return executable_handle; + } + + static Executable* Object(hsa_executable_t executable_handle) { + Executable *executable = + reinterpret_cast(executable_handle.handle); + return executable; + } + + virtual ~Executable() {} + + virtual hsa_status_t GetInfo( + hsa_executable_info_t executable_info, void *value) = 0; + + virtual hsa_status_t DefineProgramExternalVariable( + const char *name, void *address) = 0; + + virtual hsa_status_t DefineAgentExternalVariable( + const char *name, + hsa_agent_t agent, + hsa_variable_segment_t segment, + void *address) = 0; + + virtual hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t 
LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t LoadCodeObject( + hsa_agent_t agent, + hsa_code_object_t code_object, + size_t code_object_size, + const char *options, + const std::string &uri, + hsa_loaded_code_object_t *loaded_code_object = nullptr) = 0; + + virtual hsa_status_t Freeze(const char *options) = 0; + + virtual hsa_status_t Validate(uint32_t *result) = 0; + + /// @note needed for hsa v1.0. + /// @todo remove during loader refactoring. + virtual bool IsProgramSymbol(const char *symbol_name) = 0; + + virtual Symbol* GetSymbol( + const char *symbol_name, + const hsa_agent_t *agent) = 0; + + typedef hsa_status_t (*iterate_symbols_f)( + hsa_executable_t executable, + hsa_symbol_t symbol_handle, + void *data); + + virtual hsa_status_t IterateSymbols( + iterate_symbols_f callback, void *data) = 0; + + /// @since hsa v1.1. + virtual hsa_status_t IterateAgentSymbols( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data) = 0; + + /// @since hsa v1.1. 
+ virtual hsa_status_t IterateProgramSymbols( + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data) = 0; + + virtual hsa_status_t IterateLoadedCodeObjects( + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data) = 0; + + virtual size_t GetNumSegmentDescriptors() = 0; + + virtual size_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t total_num_segment_descriptors, + size_t first_empty_segment_descriptor) = 0; + + virtual uint64_t FindHostAddress(uint64_t device_address) = 0; + + virtual void Print(std::ostream& out) = 0; + virtual bool PrintToFile(const std::string& filename) = 0; + +protected: + Executable() {} + +private: + Executable(const Executable &e); + Executable& operator=(const Executable &e); + + static std::vector executables; + static std::mutex executables_mutex; +}; + +/// @class Loader +class Loader { +public: + /// @brief Destructor. + virtual ~Loader() {} + + /// @brief Creates AMD HSA Loader with specified @p context. + /// + /// @param[in] context Context. Must not be null. + /// + /// @returns AMD HSA Loader on success, null on failure. + static Loader* Create(Context* context); + + /// @brief Destroys AMD HSA Loader @p loader. + /// + /// @param[in] loader AMD HSA Loader to destroy. Must not be null. + static void Destroy(Loader *loader); + + /// @returns Context associated with Loader. 
+ virtual Context* GetContext() const = 0; + + /// @brief Creates empty AMD HSA Executable with specified @p profile, + /// @p options + virtual Executable* CreateExecutable( + hsa_profile_t profile, + const char *options, + hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; + + + /// @brief Freezes @p executable + virtual hsa_status_t FreezeExecutable(Executable *executable, const char *options) = 0; + + /// @brief Destroys @p executable + virtual void DestroyExecutable(Executable *executable) = 0; + + /// @brief Invokes @p callback for each created executable + virtual hsa_status_t IterateExecutables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data) = 0; + + /// @brief same as hsa_ven_amd_loader_query_segment_descriptors. + virtual hsa_status_t QuerySegmentDescriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors) = 0; + + /// @brief Finds the handle of executable to which @p device_address + /// belongs. Return NULL handle if device address is invalid. + virtual hsa_executable_t FindExecutable(uint64_t device_address) = 0; + + /// @brief Returns host address given @p device_address. If @p device_address + /// is already host address, returns null pointer. If @p device_address is + /// invalid address, returns null pointer. + virtual uint64_t FindHostAddress(uint64_t device_address) = 0; + + /// @brief Print loader help. + virtual void PrintHelp(std::ostream& out) = 0; + +protected: + /// @brief Default constructor. + Loader() {} + +private: + /// @brief Copy constructor - not available. + Loader(const Loader&); + + /// @brief Assignment operator - not available. 
+ Loader& operator=(const Loader&); +}; + + +} // namespace loader +} // namespace hsa +} // namespace amd + +#endif // AMD_HSA_LOADER_HPP diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_program.hpp b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_program.hpp new file mode 100644 index 0000000000..e33a14f0aa --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_program.hpp @@ -0,0 +1,347 @@ +//----------------------------------------------------------------------------- +// Copyright (c) 2011 - 2015 Advanced Micro Devices, Inc. All rights reserved. +//----------------------------------------------------------------------------- + +/// @file amd_hsa_program.hpp +/// @author AMD HSA Finalizer Team +/// +/// @brief Public AMD HSA Program Interfaces. +#ifndef AMD_HSA_PROGRAM_HPP +#define AMD_HSA_PROGRAM_HPP + +#include +#include +#include +#include +#include "amd_hsa_code.hpp" +#include "Brig.h" +#include "hsa.h" +#include "hsa_ext_finalize.h" + +/// @brief Descriptive version of AMD HSA Program. +#define AMD_HSA_PROGRAM_VERSION "AMD HSA Program v1.04 (August 3, 2015)" +#define AMD_HSA_PROGRAM_VERSION_MAJOR 1 +#define AMD_HSA_PROGRAM_VERSION_MINOR 4 + +/// @brief Environment variable. If set, overrides options parameter from +/// Program::Create with contents of this environment variable. +#define ENVVAR_AMD_HSA_PROGRAM_CREATE_OPTIONS "AMD_HSA_PROGRAM_CREATE_OPTIONS" + +/// @brief Environment variable. If set, concatenates options parameter from +/// Program::Create with contents of this environment variable. +#define ENVVAR_AMD_HSA_PROGRAM_CREATE_OPTIONS_APPEND "AMD_HSA_PROGRAM_CREATE_OPTIONS_APPEND" + +/// @brief Environment variable. If set, overrides options parameter from +/// Program::Finalize with contents of this environment variable. +#define ENVVAR_AMD_HSA_PROGRAM_FINALIZE_OPTIONS "AMD_HSA_PROGRAM_FINALIZE_OPTIONS" + +/// @brief Environment variable. 
If set, concatenates options parameter from +/// Program::Finalize with contents of this environment variable. +#define ENVVAR_AMD_HSA_PROGRAM_FINALIZE_OPTIONS_APPEND "AMD_HSA_PROGRAM_FINALIZE_OPTIONS_APPEND" + +/// @brief AMD HSA Program attributes (in addition to hsa_ext_program_info_t, +/// which is defined in HSA Runtime Specification), enumeration values below +/// must be negative. +typedef int32_t amd_hsa_program_info32_t; +enum amd_hsa_program_info_t { + /// @brief Major version of BRIG specified when AMD HSA Program was created. + /// The type of this attribute is BrigVersion32_t. + AMD_HSA_PROGRAM_INFO_BRIG_VERSION_MAJOR = -1, + /// @brief Minor version of BRIG specified when AMD HSA Program was created. + /// The type of this attribute is BrigVersion32_t. + AMD_HSA_PROGRAM_INFO_BRIG_VERSION_MINOR = -2, + /// @brief Indicates whether or not AMD HSA Program was created with debugging + /// enabled. The type of this attribute is bool. + AMD_HSA_PROGRAM_INFO_IS_DEBUGGING_ENABLED = -3 +}; + +namespace amd { +namespace hsa { +namespace program { + +/// @class Context +class Context { +public: + /// @brief Default destructor. + virtual ~Context() {} + + /// @brief Invoked when AMD HSA Program needs to allocate @p size bytes of + /// code object memory whose alignment is specified by @p align. + /// + /// @param[in] size Requested allocation size in bytes. + /// @param[in] align Requested alignment. + /// + /// @returns Pointer to allocated code object memory on success, null pointer + /// on failure. + virtual void* CodeObjectAlloc(size_t size, size_t align) = 0; + + /// @brief Invoked when AMD HSA Program needs to copy @p size bytes from + /// memory pointed to by @p src to code object memory pointed to by @p dst. + /// + /// @param[in] dst Pointer to code object memory to copy to. + /// @param[in] src Pointer to memory to copy from. + /// @param[in] size Requested copy size in bytes. + /// + /// @returns True on success, false on failure. 
+ virtual bool CodeObjectCopy(void *dst, const void *src, size_t size) = 0; + + /// @brief Invoked when AMD HSA Program needs to deallocate @p size bytes of + /// code object memory pointed to by @p ptr. + /// + /// @param[in] ptr Pointer to code object memory to deallocate. + /// @param[in] size Requested deallocation size in bytes. + virtual void CodeObjectFree(void *ptr, size_t size) = 0; + + /// @brief Invoked when AMD HSA Finalizer and Program need to report a message or an error. + /// + /// @param[in] str Message to report. + virtual void ReportMessage(const std::string& str) = 0; + +protected: + /// @brief Default constructor. + Context() {} + +private: + /// @brief Copy constructor - not available. + Context(const Context&); + + /// @brief Assignment operator - not available. + Context& operator=(const Context&); +}; + +class Finalizer; + +/// @class Program +class Program: public amd::hsa::common::Signed<0x71BB0A093D69DA92> { +public: + /// @brief Constant BRIG Module iterator. + typedef std::vector::const_iterator const_module_iterator; + + /// @brief BRIG Module iterator. + typedef std::vector::iterator module_iterator; + + /// @brief Invalid HSA Program Handle. + static const uint64_t INVALID_HANDLE = 0; + + /// @brief Destructor. + virtual ~Program() {} + + /// @brief Converts AMD HSA Program @p program_object to HSA Program Handle. + /// + /// @param[in] program_object AMD HSA Program to convert. Can be null. + /// + /// @returns HSA Program Handle on success, invalid handle on failure. + static hsa_ext_program_t Handle(Program *program_object); + + /// @brief Converts HSA Program Handle @p program_handle to AMD HSA Program. + /// + /// @param[in] program_handle HSA Program Handle to convert. Can be invalid. + /// + /// @returns AMD HSA Program on success, null on failure. + static Program* Object(hsa_ext_program_t program_handle); + + /// @returns Constant iterator to first BRIG Module in AMD HSA Program. 
If + /// AMD HSA Program does not contain any BRIG Modules, returned constant + /// iterator will be equal to Program::module_end(). + virtual const_module_iterator module_begin() const = 0; + + /// @returns Constant iterator to entity following last BRIG Module in AMD + /// HSA Program. + virtual const_module_iterator module_end() const = 0; + + /// @returns Iterator to first BRIG Module in AMD HSA Program. If + /// AMD HSA Program does not contain any BRIG Modules, returned + /// iterator will be equal to Program::module_end(). + virtual module_iterator module_begin() = 0; + + /// @returns Iterator to entity following last BRIG Module in AMD + /// HSA Program. + virtual module_iterator module_end() = 0; + + /// @returns Context associated with AMD HSA Program. + virtual Context* GetContext() const = 0; + + /// @returns Finalizer associated with this AMD HSA Program. + virtual Finalizer* GetFinalizer() const = 0; + + /// @brief Retrieves current value of specified AMD HSA Program's + /// @p attribute. + /// + /// @param[in] attribute AMD HSA Program's attribute to retrieve. Can be + /// invalid. + /// @param[out] value Pointer to client-allocated memory to store attribute's + /// value in. Must not be null. If client-allocated memory is not large enough + /// to hold attribute's value, behaviour is undefined. + /// + /// @retval HSA_STATUS_SUCCESS Function executed successfully. + /// @retval HSA_STATUS_ERROR_INVALID_ARGUMENT Specified @p attribute is + /// invalid AMD HSA Program's attribute. + /// + /// @note If function failed to execute successfully, details of failure + /// can be retrieved using Program::GetLog. + virtual hsa_status_t GetInfo(amd_hsa_program_info32_t attribute, void *value) const = 0; + + /// @brief Adds specified BRIG Module @p module to AMD HSA Program. + /// + /// @details AMD HSA Program does not perform deep copy of BRIG Module + /// upon addition, it stores pointer to BRIG Module. 
BRIG Module is owned by + /// the client, which has to ensure that the lifetime of BRIG Module is + /// greater than the lifetime of AMD HSA Program. + /// + /// @param[in] module BRIG Module to add. Must not be null. + /// + /// @retval HSA_STATUS_SUCCESS Function executed successfully. + /// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES Function failed to allocate + /// resources. + /// @retval HSA_EXT_STATUS_ERROR_INVALID_MODULE Specified @p module is invalid + /// BRIG module. + /// @retval HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED Specified @p module + /// is already included in AMD HSA Program. + /// @retval HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE Specified @p module is + /// incompatible with AMD HSA Program. + /// @retval HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol in specified @p module + /// is incompatible with symbol in AMD HSA Program. + /// + /// @note If function failed to execute successfully, details of failure + /// can be retrieved using Program::GetLog. + virtual hsa_status_t AddModule(hsa_ext_module_t module) = 0; + + /// @brief Finalizes AMD HSA Program with specified @p target, + /// @p call_convention, @p options, @p control_directives, and + /// @p code_object_type. + /// + /// @details Finalizes all kernels and indirect functions that belong to + /// AMD HSA Program for specified @p target, @p call_convention, + /// @p options, @p control_directives, and @p code_object_type. Transitive + /// closure of all functions specified by call or scall must be defined. + /// All kernels and indirect functions that belong to AMD HSA Program must + /// be defined. Kernels and indirect functions that are referenced in kernels + /// and indirect functions that belong to AMD HSA Program may or may not be + /// defined, but must be declared. All global and readonly variables that + /// belong to AMD HSA Program, or referenced in kernels and indirect functions + /// that belong to AMD HSA Program may or may not be defined, but must be + /// declared. 
+ /// + /// @param[in] target Target to finalize for. Must not be null. + /// @param[in] call_convention Call convention to finalize for. Must be valid. + /// @param[in] options Options to finalize for. Can be null. + /// @param[in] control_directives Control directives to finalize for. Can be + /// invalid. + /// @param[in] code_object_type Code object type to create. Must be valid. + /// @param[out] code_object Code object generated by AMD HSA Program. Must + /// not be null. + /// + /// @retval HSA_STATUS_SUCCESS Function executed successfully. + /// @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES Function failed to allocate + /// resources. + /// @retval HSA_STATUS_ERROR_INVALID_ISA Specified @p target is invalid. + /// @retval HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH Specified + /// @p control_directives does not match control directives in + /// one of kernels or indirect functions that belong to AMD HSA Program. + /// @retval HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED AMD HSA Program failed + /// to finalize. + /// + /// @note If function failed to execute successfully, details of failure + /// can be retrieved using Program::GetLog. + /// + /// @deprecated @p control_directives will be included in @p options starting + /// AMD HSA Program v2.0. + virtual hsa_status_t Finalize( + const char *target, + int32_t call_convention, + const char *options, + hsa_ext_control_directives_t control_directives, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object) = 0; + +protected: + /// @brief Default constructor. + Program() {} + +private: + /// @brief Copy constructor - not available. + Program(const Program&); + + /// @brief Assignment operator - not available. + Program& operator=(const Program&); +}; + + +/// @class Finalizer +class Finalizer { +public: + /// @brief Destructor. + virtual ~Finalizer() {} + + /// @brief Creates AMD HSA Finalizer with specified @p context. + /// + /// @param[in] context Context. Must not be null. 
+ /// + /// @returns AMD HSA Finalizer on success, null on failure. + static Finalizer* CreateFinalizer(Context* context); + + /// @brief Destroys AMD HSA Finalizer @p finalizer_object. + /// + /// @param[in] finalizer_object AMD HSA Finalizer to destroy. Must not be null. + static void DestroyFinalizer(Finalizer *finalizer_object); + + /// @brief Creates empty AMD HSA Program with specified @p profile, + /// @p machine_model, @p rounding_mode, @p options, and @p brig_major and + /// @p brig_minor BRIG versions. + /// + /// @param[in] profile HSA profile. Must be valid. + /// @param[in] machine_model HSA machine model. Must be valid. + /// @param[in] rounding_mode HSA rounding mode. Must be valid. + /// @param[in] options User options. Can be null. + /// @param[in] brig_major Major BRIG version. Must be valid. + /// @param[in] brig_minor Minor BRIG version. Must be valid. + /// + /// @returns AMD HSA Program on success, null on failure. + virtual Program* CreateProgram( + hsa_profile_t profile, + hsa_machine_model_t machine_model, + hsa_default_float_rounding_mode_t rounding_mode, + const char *options, + BrigVersion32_t brig_major = BRIG_VERSION_BRIG_MAJOR, + BrigVersion32_t brig_minor = BRIG_VERSION_BRIG_MINOR) = 0; + + /// @brief Destroys AMD HSA Program @p program_object. + /// + /// @param[in] program_object AMD HSA Program to destroy. Must not be null. + virtual void DestroyProgram(Program *program_object) = 0; + + /// @brief Prints available finalizer options as error and exits. + virtual void PrintFinalizerOptions() const = 0; + + /// @returns Context associated with Finalizer. + virtual Context* GetContext() const = 0; + + /// @brief Enables code cache optimization. + virtual void EnableCodeCache() = 0; + + /// @brief Disables code cache optimization. + virtual void DisableCodeCache() = 0; + + /// @returns True if code cache is enabled, false otherwise. + virtual bool IsCodeCacheEnabled() const = 0; + + /// @returns List of names for supported targets.
+ virtual const std::vector<std::string>& GetSupportedTargets() const = 0; + +protected: + /// @brief Default constructor. + Finalizer() {} + +private: + /// @brief Copy constructor - not available. + Finalizer(const Finalizer&); + + /// @brief Assignment operator - not available. + Finalizer& operator=(const Finalizer&); +}; + +} // namespace program +} // namespace hsa +} // namespace amd + +#endif // AMD_HSA_PROGRAM_HPP diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_queue.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_queue.h new file mode 100644 index 0000000000..2176e84706 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_queue.h @@ -0,0 +1,87 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_QUEUE_H +#define AMD_HSA_QUEUE_H + +#include "amd_hsa_common.h" +#include "hsa.h" + +// AMD Queue Properties. +typedef uint32_t amd_queue_properties32_t; +enum amd_queue_properties_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_IS_PTR64, 1, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE, 4, 1), + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_QUEUE_PROPERTIES_RESERVED1, 5, 27) +}; + +// AMD Queue. 
+#define AMD_QUEUE_ALIGN_BYTES 64 +#define AMD_QUEUE_ALIGN __ALIGNED__(AMD_QUEUE_ALIGN_BYTES) +typedef struct AMD_QUEUE_ALIGN amd_queue_s { + hsa_queue_t hsa_queue; + uint32_t reserved1[4]; + volatile uint64_t write_dispatch_id; + uint32_t group_segment_aperture_base_hi; + uint32_t private_segment_aperture_base_hi; + uint32_t max_cu_id; + uint32_t max_wave_id; + volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; + volatile uint32_t legacy_doorbell_lock; + uint32_t reserved2[9]; + volatile uint64_t read_dispatch_id; + uint32_t read_dispatch_id_field_base_byte_offset; + uint32_t compute_tmpring_size; + uint32_t scratch_resource_descriptor[4]; + uint64_t scratch_backing_memory_location; + uint64_t scratch_backing_memory_byte_size; + uint32_t scratch_workitem_byte_size; + amd_queue_properties32_t queue_properties; + uint32_t reserved3[2]; + hsa_signal_t queue_inactive_signal; + uint32_t reserved4[14]; +} amd_queue_t; + +#endif // AMD_HSA_QUEUE_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_signal.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_signal.h new file mode 100644 index 0000000000..57aa1adcdc --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/amd_hsa_signal.h @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef AMD_HSA_SIGNAL_H +#define AMD_HSA_SIGNAL_H + +#include "amd_hsa_common.h" +#include "amd_hsa_queue.h" + +// AMD Signal Kind Enumeration Values. 
+typedef int64_t amd_signal_kind64_t; +enum amd_signal_kind_t { + AMD_SIGNAL_KIND_INVALID = 0, + AMD_SIGNAL_KIND_USER = 1, + AMD_SIGNAL_KIND_DOORBELL = -1, + AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 +}; + +// AMD Signal. +#define AMD_SIGNAL_ALIGN_BYTES 64 +#define AMD_SIGNAL_ALIGN __ALIGNED__(AMD_SIGNAL_ALIGN_BYTES) +typedef struct AMD_SIGNAL_ALIGN amd_signal_s { + amd_signal_kind64_t kind; + union { + volatile int64_t value; + volatile uint32_t* legacy_hardware_doorbell_ptr; + volatile uint64_t* hardware_doorbell_ptr; + }; + uint64_t event_mailbox_ptr; + uint32_t event_id; + uint32_t reserved1; + uint64_t start_ts; + uint64_t end_ts; + union { + amd_queue_t* queue_ptr; + uint64_t reserved2; + }; + uint32_t reserved3[2]; +} amd_signal_t; + +#endif // AMD_HSA_SIGNAL_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa.h new file mode 100644 index 0000000000..3c0db5d52b --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa.h @@ -0,0 +1,5692 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_H_ +#define HSA_RUNTIME_INC_HSA_H_ + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintXX_t */ + +#ifndef __cplusplus +#include <stdbool.h> /* bool */ +#endif /* __cplusplus */ + +// Placeholder for calling convention and import/export macros +#ifndef HSA_CALL +#define HSA_CALL +#endif + +#ifndef HSA_EXPORT_DECORATOR +#ifdef __GNUC__ +#define HSA_EXPORT_DECORATOR __attribute__ ((visibility ("default"))) +#else +#define HSA_EXPORT_DECORATOR +#endif +#endif +#define HSA_API_EXPORT HSA_EXPORT_DECORATOR HSA_CALL +#define HSA_API_IMPORT HSA_CALL + +#if !defined(HSA_API) && defined(HSA_EXPORT) +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +// Detect and set large model builds. +#undef HSA_LARGE_MODEL +#if defined(__LP64__) || defined(_M_X64) +#define HSA_LARGE_MODEL +#endif + +// Try to detect CPU endianness +#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) +#define LITTLEENDIAN_CPU +#endif +#endif + +#undef HSA_LITTLE_ENDIAN +#if defined(LITTLEENDIAN_CPU) +#define HSA_LITTLE_ENDIAN +#elif defined(BIGENDIAN_CPU) +#else +#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined" +#endif + +#ifndef HSA_DEPRECATED +#define HSA_DEPRECATED +//#ifdef __GNUC__ +//#define HSA_DEPRECATED __attribute__((deprecated)) +//#else +//#define HSA_DEPRECATED __declspec(deprecated) +//#endif +#endif + +#define HSA_VERSION_1_0 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \defgroup status Runtime Notifications + * @{ + */ + +/** + * @brief Status codes. + */ +typedef enum { + /** + * The function has been executed successfully. + */ + HSA_STATUS_SUCCESS = 0x0, + /** + * A traversal over a list of elements has been interrupted by the + * application before completing. + */ + HSA_STATUS_INFO_BREAK = 0x1, + /** + * A generic error has occurred.
+ */ + HSA_STATUS_ERROR = 0x1000, + /** + * One of the actual arguments does not meet a precondition stated in the + * documentation of the corresponding formal argument. + */ + HSA_STATUS_ERROR_INVALID_ARGUMENT = 0x1001, + /** + * The requested queue creation is not valid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE_CREATION = 0x1002, + /** + * The requested allocation is not valid. + */ + HSA_STATUS_ERROR_INVALID_ALLOCATION = 0x1003, + /** + * The agent is invalid. + */ + HSA_STATUS_ERROR_INVALID_AGENT = 0x1004, + /** + * The memory region is invalid. + */ + HSA_STATUS_ERROR_INVALID_REGION = 0x1005, + /** + * The signal is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL = 0x1006, + /** + * The queue is invalid. + */ + HSA_STATUS_ERROR_INVALID_QUEUE = 0x1007, + /** + * The HSA runtime failed to allocate the necessary resources. This error + * may also occur when the HSA runtime needs to spawn threads or create + * internal OS-specific events. + */ + HSA_STATUS_ERROR_OUT_OF_RESOURCES = 0x1008, + /** + * The AQL packet is malformed. + */ + HSA_STATUS_ERROR_INVALID_PACKET_FORMAT = 0x1009, + /** + * An error has been detected while releasing a resource. + */ + HSA_STATUS_ERROR_RESOURCE_FREE = 0x100A, + /** + * An API other than ::hsa_init has been invoked while the reference count + * of the HSA runtime is 0. + */ + HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B, + /** + * The maximum reference count for the object has been reached. + */ + HSA_STATUS_ERROR_REFCOUNT_OVERFLOW = 0x100C, + /** + * The arguments passed to a function are not compatible. + */ + HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS = 0x100D, + /** + * The index is invalid. + */ + HSA_STATUS_ERROR_INVALID_INDEX = 0x100E, + /** + * The instruction set architecture is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA = 0x100F, + /** + * The instruction set architecture name is invalid. + */ + HSA_STATUS_ERROR_INVALID_ISA_NAME = 0x1017, + /** + * The code object is invalid.
+ */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010, + /** + * The executable is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE = 0x1011, + /** + * The executable is frozen. + */ + HSA_STATUS_ERROR_FROZEN_EXECUTABLE = 0x1012, + /** + * There is no symbol with the given name. + */ + HSA_STATUS_ERROR_INVALID_SYMBOL_NAME = 0x1013, + /** + * The variable is already defined. + */ + HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED = 0x1014, + /** + * The variable is undefined. + */ + HSA_STATUS_ERROR_VARIABLE_UNDEFINED = 0x1015, + /** + * An HSAIL operation resulted in a hardware exception. + */ + HSA_STATUS_ERROR_EXCEPTION = 0x1016, + /** + * The code object symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_SYMBOL = 0x1018, + /** + * The executable symbol is invalid. + */ + HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL = 0x1019, + /** + * The file descriptor is invalid. + */ + HSA_STATUS_ERROR_INVALID_FILE = 0x1020, + /** + * The code object reader is invalid. + */ + HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER = 0x1021, + /** + * The cache is invalid. + */ + HSA_STATUS_ERROR_INVALID_CACHE = 0x1022, + /** + * The wavefront is invalid. + */ + HSA_STATUS_ERROR_INVALID_WAVEFRONT = 0x1023, + /** + * The signal group is invalid. + */ + HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP = 0x1024, + /** + * The HSA runtime is not in the configuration state. + */ + HSA_STATUS_ERROR_INVALID_RUNTIME_STATE = 0x1025, + /** + * The queue received an error that may require process termination. + */ + HSA_STATUS_ERROR_FATAL = 0x1026 +} hsa_status_t; + +/** + * @brief Query additional information about a status code. + * + * @param[in] status Status code. + * + * @param[out] status_string A NUL-terminated string that describes the error + * status. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p status is an invalid + * status code, or @p status_string is NULL. + */ +hsa_status_t HSA_API hsa_status_string( + hsa_status_t status, + const char ** status_string); + +/** @} */ + +/** \defgroup common Common Definitions + * @{ + */ + +/** + * @brief Three-dimensional coordinate. + */ +typedef struct hsa_dim3_s { + /** + * X dimension. + */ + uint32_t x; + + /** + * Y dimension. + */ + uint32_t y; + + /** + * Z dimension. + */ + uint32_t z; +} hsa_dim3_t; + +/** + * @brief Access permissions. + */ +typedef enum { + /** + * Read-only access. + */ + HSA_ACCESS_PERMISSION_RO = 1, + /** + * Write-only access. + */ + HSA_ACCESS_PERMISSION_WO = 2, + /** + * Read and write access. + */ + HSA_ACCESS_PERMISSION_RW = 3 +} hsa_access_permission_t; + +/** + * @brief POSIX file descriptor. + */ +typedef int hsa_file_t; + +/** @} **/ + + +/** \defgroup initshutdown Initialization and Shut Down + * @{ + */ + +/** + * @brief Initialize the HSA runtime. + * + * @details Initializes the HSA runtime if it is not already initialized, and + * increases the reference counter associated with the HSA runtime for the + * current process. Invocation of any HSA function other than ::hsa_init results + * in undefined behavior if the current HSA runtime reference counter is less + * than one. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_REFCOUNT_OVERFLOW The HSA runtime reference + * count reaches INT32_MAX. + */ +hsa_status_t HSA_API hsa_init(); + +/** + * @brief Shut down the HSA runtime. + * + * @details Decreases the reference count of the HSA runtime instance. When the + * reference count reaches 0, the HSA runtime is no longer considered valid + * but the application might call ::hsa_init to initialize the HSA runtime + * again. 
+ * + * Once the reference count of the HSA runtime reaches 0, all the resources + * associated with it (queues, signals, agent information, etc.) are + * considered invalid and any attempt to reference them in subsequent API calls + * results in undefined behavior. When the reference count reaches 0, the HSA + * runtime may release resources associated with it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_shut_down(); + +/** @} **/ + +/** \defgroup agentinfo System and Agent Information + * @{ + */ + +/** + * @brief Endianness. A convention used to interpret the bytes making up a data + * word. + */ +typedef enum { + /** + * The least significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_LITTLE = 0, + /** + * The most significant byte is stored in the smallest address. + */ + HSA_ENDIANNESS_BIG = 1 +} hsa_endianness_t; + +/** + * @brief Machine model. A machine model determines the size of certain data + * types in HSA runtime and an agent. + */ +typedef enum { + /** + * Small machine model. Addresses use 32 bits. + */ + HSA_MACHINE_MODEL_SMALL = 0, + /** + * Large machine model. Addresses use 64 bits. + */ + HSA_MACHINE_MODEL_LARGE = 1 +} hsa_machine_model_t; + +/** + * @brief Profile. A profile indicates a particular level of feature + * support. For example, in the base profile the application must use the HSA + * runtime allocator to reserve shared virtual memory, while in the full profile + * any host pointer can be shared across all the agents. + */ +typedef enum { + /** + * Base profile. + */ + HSA_PROFILE_BASE = 0, + /** + * Full profile. + */ + HSA_PROFILE_FULL = 1 +} hsa_profile_t; + +/** + * @brief System attributes. + */ +typedef enum { + /** + * Major version of the HSA runtime specification supported by the + * implementation. 
The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MAJOR = 0, + /** + * Minor version of the HSA runtime specification supported by the + * implementation. The type of this attribute is uint16_t. + */ + HSA_SYSTEM_INFO_VERSION_MINOR = 1, + /** + * Current timestamp. The value of this attribute monotonically increases at a + * constant rate. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP = 2, + /** + * Timestamp value increase rate, in Hz. The timestamp (clock) frequency is + * in the range 1-400MHz. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3, + /** + * Maximum duration of a signal wait operation. Expressed as a count based on + * the timestamp frequency. The type of this attribute is uint64_t. + */ + HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT = 4, + /** + * Endianness of the system. The type of this attribute is ::hsa_endianness_t. + */ + HSA_SYSTEM_INFO_ENDIANNESS = 5, + /** + * Machine model supported by the HSA runtime. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_SYSTEM_INFO_MACHINE_MODEL = 6, + /** + * Bit-mask indicating which extensions are supported by the + * implementation. An extension with an ID of @p i is supported if the bit at + * position @p i is set. The type of this attribute is uint8_t[128]. + */ + HSA_SYSTEM_INFO_EXTENSIONS = 7, + /** + * String containing the ROCr build identifier. + */ + HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. 
+ * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202, + /** + * Returns true if mwaitx is enabled on this system. + * The type of this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_MWAITX_ENABLED = 0x203, + /** + * Returns true if DMABUF APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_DMABUF_SUPPORTED = 0x204 +} hsa_system_info_t; + +/** + * @brief Get the current value of a system attribute. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * system attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_system_get_info( + hsa_system_info_t attribute, + void* value); + +/** + * @brief HSA extensions. + */ +typedef enum { + /** + * Finalizer extension. + */ + HSA_EXTENSION_FINALIZER = 0, + /** + * Images extension. + */ + HSA_EXTENSION_IMAGES = 1, + + /** + * Performance counter extension. + */ + HSA_EXTENSION_PERFORMANCE_COUNTERS = 2, + + /** + * Profiling events extension. + */ + HSA_EXTENSION_PROFILING_EVENTS = 3, + /** + * Last standard extension. + */ + HSA_EXTENSION_STD_LAST = 3, + /** + * First AMD extension number. + */ + HSA_AMD_FIRST_EXTENSION = 0x200, + /** + * Profiler extension. + */ + HSA_EXTENSION_AMD_PROFILER = 0x200, + /** + * Loader extension. + */ + HSA_EXTENSION_AMD_LOADER = 0x201, + /** + * AqlProfile extension. + */ + HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * Last AMD extension.
+ */ + HSA_AMD_LAST_EXTENSION = 0x202 +} hsa_extension_t; + +/** + * @brief Query the name of a given extension. + * + * @param[in] extension Extension identifier. If the extension is not supported + * by the implementation (see ::HSA_SYSTEM_INFO_EXTENSIONS), the behavior + * is undefined. + * + * @param[out] name Pointer to a memory location where the HSA runtime stores + * the extension name. The extension name is a NUL-terminated string. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p name is NULL. + */ +hsa_status_t HSA_API hsa_extension_get_name( + uint16_t extension, + const char **name); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by the HSA + * implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by the HSA + * implementation. 
All minor versions from 0 up to the returned @p version_minor + * must be supported by the implementation. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. + */ +hsa_status_t HSA_API hsa_system_major_extension_supported( + uint16_t extension, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** + * @deprecated + * + * @brief Retrieve the function pointers corresponding to a given version of an + * extension. Portable applications are expected to invoke the extension API + * using the returned function pointers + * + * @details The application is responsible for verifying that the given version + * of the extension is supported by the HSA implementation (see + * ::hsa_system_extension_supported). If the given combination of extension, + * major version, and minor version is not supported by the implementation, the + * behavior is undefined. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] version_minor Minor version number for which to retrieve the + * function pointer table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. 
The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_system_get_extension_table( + uint16_t extension, + uint16_t version_major, + uint16_t version_minor, + void *table); + +/** + * @brief Retrieve the function pointers corresponding to a given major version + * of an extension. Portable applications are expected to invoke the extension + * API using the returned function pointers. + * + * @details The application is responsible for verifying that the given major + * version of the extension is supported by the HSA implementation (see + * ::hsa_system_major_extension_supported). If the given combination of extension + * and major version is not supported by the implementation, the behavior is + * undefined. Additionally if the length doesn't allow space for a full minor + * version, it is implementation defined if only some of the function pointers for + * that minor version get written. + * + * @param[in] extension Extension identifier. + * + * @param[in] version_major Major version number for which to retrieve the + * function pointer table. + * + * @param[in] table_length Size in bytes of the function pointer table to be + * populated. The implementation will not write more than this many bytes to the + * table. + * + * @param[out] table Pointer to an application-allocated function pointer table + * that is populated by the HSA runtime. Must not be NULL. The memory associated + * with table can be reused or freed after the function returns. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p table is NULL. + */ +hsa_status_t HSA_API hsa_system_get_major_extension_table( + uint16_t extension, + uint16_t version_major, + size_t table_length, + void *table); + +/** + * @brief Struct containing an opaque handle to an agent, a device that participates in + * the HSA memory model. An agent can submit AQL packets for execution, and + * may also accept AQL packets for execution (agent dispatch packets or kernel + * dispatch packets launching HSAIL-derived binaries). + */ +typedef struct hsa_agent_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_agent_t; + +/** + * @brief Agent features. + */ +typedef enum { + /** + * The agent supports AQL packets of kernel dispatch type. If this + * feature is enabled, the agent is also a kernel agent. + */ + HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1, + /** + * The agent supports AQL packets of agent dispatch type. + */ + HSA_AGENT_FEATURE_AGENT_DISPATCH = 2 +} hsa_agent_feature_t; + +/** + * @brief Hardware device type. + */ +typedef enum { + /** + * CPU device. + */ + HSA_DEVICE_TYPE_CPU = 0, + /** + * GPU device. + */ + HSA_DEVICE_TYPE_GPU = 1, + /** + * DSP device. + */ + HSA_DEVICE_TYPE_DSP = 2 +} hsa_device_type_t; + +/** + * @brief Default floating-point rounding mode. + */ +typedef enum { + /** + * Use a default floating-point rounding mode specified elsewhere. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0, + /** + * Operations that specify the default floating-point mode are rounded to zero + * by default. 
+ */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1, + /** + * Operations that specify the default floating-point mode are rounded to the + * nearest representable number and that ties should be broken by selecting + * the value with an even least significant bit. + */ + HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2 +} hsa_default_float_rounding_mode_t; + +/** + * @brief Agent attributes. + */ +typedef enum { + /** + * Agent name. The type of this attribute is a NUL-terminated char[64]. The + * name must be at most 63 characters long (not including the NUL terminator) + * and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_NAME = 0, + /** + * Name of vendor. The type of this attribute is a NUL-terminated char[64]. + * The name must be at most 63 characters long (not including the NUL + * terminator) and all array elements not used for the name must be NUL. + */ + HSA_AGENT_INFO_VENDOR_NAME = 1, + /** + * Agent capability. The type of this attribute is ::hsa_agent_feature_t. + */ + HSA_AGENT_INFO_FEATURE = 2, + /** + * @deprecated Query ::HSA_ISA_INFO_MACHINE_MODELS for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Machine model supported by the agent. The type of this attribute is + * ::hsa_machine_model_t. + */ + HSA_AGENT_INFO_MACHINE_MODEL = 3, + /** + * @deprecated Query ::HSA_ISA_INFO_PROFILES for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Profile supported by the agent. The type of this attribute is + * ::hsa_profile_t.
+ */ + HSA_AGENT_INFO_PROFILE = 4, + /** + * @deprecated Query ::HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES for a given + * instruction set architecture supported by the agent instead. If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas. + * + * Default floating-point rounding mode. The type of this attribute is + * ::hsa_default_float_rounding_mode_t, but the value + * ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT is not allowed. + */ + HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5, + /** + * @deprecated Query ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES + * for a given instruction set architecture supported by the agent instead. If + * more than one ISA is supported by the agent, the returned value corresponds + * to the first ISA enumerated by ::hsa_agent_iterate_isas. + * + * A bit-mask of ::hsa_default_float_rounding_mode_t values, representing the + * default floating-point rounding modes supported by the agent in the Base + * profile. The type of this attribute is uint32_t. The default floating-point + * rounding mode (::HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE) bit must not + * be set. + */ + HSA_AGENT_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 23, + /** + * @deprecated Query ::HSA_ISA_INFO_FAST_F16_OPERATION for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the current agent. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is bool. + */ + HSA_AGENT_INFO_FAST_F16_OPERATION = 24, + /** + * @deprecated Query ::HSA_WAVEFRONT_INFO_SIZE for a given wavefront and + * instruction set architecture supported by the agent instead.
If more than + * one ISA is supported by the agent, the returned value corresponds to the + * first ISA enumerated by ::hsa_agent_iterate_isas and the first wavefront + * enumerated by ::hsa_isa_iterate_wavefronts for that ISA. + * + * Number of work-items in a wavefront. Must be a power of 2 in the range + * [1,256]. The value of this attribute is undefined if the agent is not + * a kernel agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WAVEFRONT_SIZE = 6, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_DIM for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_AGENT_INFO_WORKGROUP_MAX_SIZE. The value of this attribute is + * undefined if the agent is not a kernel agent. The type of this + * attribute is uint16_t[3]. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7, + /** + * @deprecated Query ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a work-group. The value of this + * attribute is undefined if the agent is not a kernel agent. The type + * of this attribute is uint32_t. + */ + HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_DIM for a given instruction set + * architecture supported by the agent instead. + * + * Maximum number of work-items of each dimension of a grid. Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_AGENT_INFO_WORKGROUP_MAX_DIM.
No maximum can exceed the value of + * ::HSA_AGENT_INFO_GRID_MAX_SIZE. The value of this attribute is undefined + * if the agent is not a kernel agent. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_AGENT_INFO_GRID_MAX_DIM = 9, + /** + * @deprecated Query ::HSA_ISA_INFO_GRID_MAX_SIZE for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum total number of work-items in a grid. The value of this attribute + * is undefined if the agent is not a kernel agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_GRID_MAX_SIZE = 10, + /** + * @deprecated Query ::HSA_ISA_INFO_FBARRIER_MAX_SIZE for a given instruction + * set architecture supported by the agent instead. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Maximum number of fbarriers per work-group. Must be at least 32. The value + * of this attribute is undefined if the agent is not a kernel agent. The + * type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11, + /** + * @deprecated The maximum number of queues is not statically determined. + * + * Maximum number of queues that can be active (created but not destroyed) at + * one time in the agent. The type of this attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUES_MAX = 12, + /** + * Minimum number of packets that a queue created in the agent + * can hold. Must be a power of 2 greater than 0. Must not exceed + * the value of ::HSA_AGENT_INFO_QUEUE_MAX_SIZE. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13, + /** + * Maximum number of packets that a queue created in the agent can + * hold. Must be a power of 2 greater than 0. The type of this attribute + * is uint32_t.
+ */ + HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14, + /** + * Type of a queue created in the agent. The type of this attribute is + * ::hsa_queue_type32_t. + */ + HSA_AGENT_INFO_QUEUE_TYPE = 15, + /** + * @deprecated NUMA information is not exposed anywhere else in the API. + * + * Identifier of the NUMA node associated with the agent. The type of this + * attribute is uint32_t. + */ + HSA_AGENT_INFO_NODE = 16, + /** + * Type of hardware device associated with the agent. The type of this + * attribute is ::hsa_device_type_t. + */ + HSA_AGENT_INFO_DEVICE = 17, + /** + * @deprecated Query ::hsa_agent_iterate_caches to retrieve information about + * the caches present in a given agent. + * + * Array of data cache sizes (L1..L4). Each size is expressed in bytes. A size + * of 0 for a particular level indicates that there is no cache information + * for that level. The type of this attribute is uint32_t[4]. + */ + HSA_AGENT_INFO_CACHE_SIZE = 18, + /** + * @deprecated An agent may support multiple instruction set + * architectures. See ::hsa_agent_iterate_isas. If more than one ISA is + * supported by the agent, the returned value corresponds to the first ISA + * enumerated by ::hsa_agent_iterate_isas. + * + * Instruction set architecture of the agent. The type of this attribute + * is ::hsa_isa_t. + */ + HSA_AGENT_INFO_ISA = 19, + /** + * Bit-mask indicating which extensions are supported by the agent. An + * extension with an ID of @p i is supported if the bit at position @p i is + * set. The type of this attribute is uint8_t[128]. + */ + HSA_AGENT_INFO_EXTENSIONS = 20, + /** + * Major version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. + */ + HSA_AGENT_INFO_VERSION_MAJOR = 21, + /** + * Minor version of the HSA runtime specification supported by the + * agent. The type of this attribute is uint16_t. 
+ */ + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX +} hsa_agent_info_t; + +/** + * @brief Get the current value of an attribute for a given agent. + * + * @param[in] agent A valid agent. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * agent attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_agent_get_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +/** + * @brief Iterate over the available agents, and invoke an + * application-defined callback on every iteration. + * + * @param[in] callback Callback to be invoked once per agent. The HSA + * runtime passes two arguments to the callback: the agent and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_iterate_agents returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t HSA_API hsa_iterate_agents( + hsa_status_t (*callback)(hsa_agent_t agent, void* data), + void* data); + +/* + +// If we do not know the size of an attribute, we need to query it first +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_get_info_size( + hsa_agent_t agent, + hsa_agent_info_t attribute, + size_t* size); + +// Set the value of an agent's attribute +// Note: this API will not be in the spec unless needed +hsa_status_t HSA_API hsa_agent_set_info( + hsa_agent_t agent, + hsa_agent_info_t attribute, + void* value); + +*/ + +/** + * @brief Exception policies applied in the presence of hardware exceptions. + */ +typedef enum { + /** + * If a hardware exception is detected, a work-item signals an exception. + */ + HSA_EXCEPTION_POLICY_BREAK = 1, + /** + * If a hardware exception is detected, a hardware status bit is set. + */ + HSA_EXCEPTION_POLICY_DETECT = 2 +} hsa_exception_policy_t; + +/** + * @deprecated Use ::hsa_isa_get_exception_policies for a given instruction set + * architecture supported by the agent instead. If more than one ISA is + * supported by the agent, this function uses the first value returned by + * ::hsa_agent_iterate_isas. + * + * @brief Retrieve the exception policy support for a given combination of + * agent and profile. + * + * @param[in] agent Agent. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + * + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_get_exception_policies( + hsa_agent_t agent, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Cache handle. + */ +typedef struct hsa_cache_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_cache_t; + +/** + * @brief Cache attributes. + */ +typedef enum { + /** + * The length of the cache name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is a NUL-terminated + * character array with the length equal to the value of + * ::HSA_CACHE_INFO_NAME_LENGTH attribute. + */ + HSA_CACHE_INFO_NAME = 1, + /** + * Cache level. A L1 cache must return a value of 1, a L2 must return a value + * of 2, and so on. The type of this attribute is uint8_t. + */ + HSA_CACHE_INFO_LEVEL = 2, + /** + * Cache size, in bytes. A value of 0 indicates that there is no size + * information available. The type of this attribute is uint32_t. + */ + HSA_CACHE_INFO_SIZE = 3 +} hsa_cache_info_t; + +/** + * @brief Get the current value of an attribute for a given cache object. + * + * @param[in] cache Cache. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_CACHE The cache is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * cache attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_cache_get_info( + hsa_cache_t cache, + hsa_cache_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory caches of a given agent, and + * invoke an application-defined callback on every iteration. + * + * @details Caches are visited in ascending order according to the value of the + * ::HSA_CACHE_INFO_LEVEL attribute. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per cache that is present in + * the agent. The HSA runtime passes two arguments to the callback: the cache + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_caches( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_cache_t cache, void* data), + void* data); + +/** + * @deprecated + * + * @brief Query if a given version of an extension is supported by an agent. + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[in] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check.
The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_agent_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t version_minor, + bool* result); + +/** + * @brief Query if a given version of an extension is supported by an agent. All + * minor versions from 0 up to the returned @p version_minor must be supported. + * + * @param[in] extension Extension identifier. + * + * @param[in] agent Agent. + * + * @param[in] version_major Major version number. + * + * @param[out] version_minor Minor version number. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. The result is true if the specified version of the + * extension is supported, and false otherwise. The result must be false if + * ::hsa_system_extension_supported returns false for the same extension + * version. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p extension is not a valid + * extension, or @p version_minor is NULL, or @p result is NULL. 
+ */ +hsa_status_t HSA_API hsa_agent_major_extension_supported( + uint16_t extension, + hsa_agent_t agent, + uint16_t version_major, + uint16_t *version_minor, + bool* result); + + +/** @} */ + + +/** \defgroup signals Signals + * @{ + */ + +/** + * @brief Signal handle. + */ +typedef struct hsa_signal_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. The value 0 is reserved. + */ + uint64_t handle; +} hsa_signal_t; + +/** + * @brief Signal value. The value occupies 32 bits in small machine mode, and 64 + * bits in large machine mode. + */ +#ifdef HSA_LARGE_MODEL + typedef int64_t hsa_signal_value_t; +#else + typedef int32_t hsa_signal_value_t; +#endif + +/** + * @brief Create a signal. + * + * @param[in] initial_value Initial value of the signal. + * + * @param[in] num_consumers Size of @p consumers. A value of 0 indicates that + * any agent might wait on the signal. + * + * @param[in] consumers List of agents that might consume (wait on) the + * signal. If @p num_consumers is 0, this argument is ignored; otherwise, the + * HSA runtime might use the list to optimize the handling of the signal + * object. If an agent not listed in @p consumers waits on the returned + * signal, the behavior is undefined. The memory associated with @p consumers + * can be reused or freed after the function returns. + * + * @param[out] signal Pointer to a memory location where the HSA runtime will + * store the newly created signal handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p signal is NULL, @p + * num_consumers is greater than 0 but @p consumers is NULL, or @p consumers + * contains duplicates. + */ +hsa_status_t HSA_API hsa_signal_create( + hsa_signal_value_t initial_value, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_t *signal); + +/** + * @brief Destroy a signal previously created by ::hsa_signal_create. + * + * @param[in] signal Signal. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p signal is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The handle in @p signal is 0. + */ +hsa_status_t HSA_API hsa_signal_destroy( + hsa_signal_t signal); + +/** + * @brief Atomically read the current value of a signal. + * + * @param[in] signal Signal. + * + * @return Value of the signal. +*/ +hsa_signal_value_t HSA_API hsa_signal_load_scacquire( + hsa_signal_t signal); + +/** + * @copydoc hsa_signal_load_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_load_relaxed( + hsa_signal_t signal); + +/** + * @deprecated Renamed as ::hsa_signal_load_scacquire. + * + * @copydoc hsa_signal_load_scacquire +*/ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_load_acquire( + hsa_signal_t signal); + +/** + * @brief Atomically set the value of a signal. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value.
+ */ +void HSA_API hsa_signal_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_store_relaxed + */ +void HSA_API hsa_signal_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_store_screlease. + * + * @copydoc hsa_signal_store_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_store_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal without necessarily notifying the + * agents waiting on it. + * + * @details The agents waiting on @p signal may not wake up even when the new + * value satisfies their wait condition. If the application wants to update the + * signal and there is no need to notify any agent, invoking this function can + * be more efficient than calling the non-silent counterpart. + * + * @param[in] signal Signal. + * + * @param[in] value New signal value. + */ +void HSA_API hsa_signal_silent_store_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_silent_store_relaxed + */ +void HSA_API hsa_signal_silent_store_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal and return its previous value. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value New value. + * + * @return Value of the signal prior to the exchange. + * + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacq_screl.
+ * + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_scacquire. + * + * @copydoc hsa_signal_exchange_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); +/** + * @copydoc hsa_signal_exchange_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_exchange_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_exchange_screlease. + * + * @copydoc hsa_signal_exchange_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_exchange_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically set the value of a signal if the observed value is equal to + * the expected value. The observed value is returned regardless of whether the + * replacement was done. + * + * @details If the value of the signal is changed, all the agents waiting + * on @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue + * doorbell signal, the behavior is undefined. + * + * @param[in] expected Value to compare with. + * + * @param[in] value New value. + * + * @return Observed value of the signal. + * + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacq_screl. 
+ * + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_scacquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_scacquire. + * + * @copydoc hsa_signal_cas_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_acquire( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_relaxed( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_cas_scacq_screl + */ +hsa_signal_value_t HSA_API hsa_signal_cas_screlease( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_cas_screlease. + * + * @copydoc hsa_signal_cas_screlease + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_cas_release( + hsa_signal_t signal, + hsa_signal_value_t expected, + hsa_signal_value_t value); + +/** + * @brief Atomically increment the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to add to the value of the signal. + * + */ +void HSA_API hsa_signal_add_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacq_screl. 
+ * + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_add_scacquire. + * + * @copydoc hsa_signal_add_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_add_scacq_screl + */ +void HSA_API hsa_signal_add_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_add_screlease. + * + * @copydoc hsa_signal_add_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_add_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically decrement the value of a signal by a given amount. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to subtract from the value of the signal. + * + */ +void HSA_API hsa_signal_subtract_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacq_screl. + * + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_subtract_scacquire. 
+ * + * @copydoc hsa_signal_subtract_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_subtract_scacq_screl + */ +void HSA_API hsa_signal_subtract_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_subtract_screlease. + * + * @copydoc hsa_signal_subtract_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_subtract_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise AND operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to AND with the value of the signal. + * + */ +void HSA_API hsa_signal_and_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacq_screl. + * + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_and_scacquire. 
+ * + * @copydoc hsa_signal_and_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_and_scacq_screl + */ +void HSA_API hsa_signal_and_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_and_screlease. + * + * @copydoc hsa_signal_and_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_and_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise OR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to OR with the value of the signal. + */ +void HSA_API hsa_signal_or_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_or_scacq_screl. + * + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_scacquire. 
+ * + * @copydoc hsa_signal_or_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_or_scacq_screl + */ +void HSA_API hsa_signal_or_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_or_screlease. + * + * @copydoc hsa_signal_or_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_or_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Atomically perform a bitwise XOR operation between the value of a + * signal and a given value. + * + * @details If the value of the signal is changed, all the agents waiting on + * @p signal for which @p value satisfies their wait condition are awakened. + * + * @param[in] signal Signal. If @p signal is a queue doorbell signal, the + * behavior is undefined. + * + * @param[in] value Value to XOR with the value of the signal. + * + */ +void HSA_API hsa_signal_xor_scacq_screl( + hsa_signal_t signal, + hsa_signal_value_t value); + + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacq_screl. + * + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acq_rel( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_scacquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_scacquire. 
+ * + * @copydoc hsa_signal_xor_scacquire + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_acquire( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_relaxed( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @copydoc hsa_signal_xor_scacq_screl + */ +void HSA_API hsa_signal_xor_screlease( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @deprecated Renamed as ::hsa_signal_xor_screlease. + * + * @copydoc hsa_signal_xor_screlease + */ +void HSA_API HSA_DEPRECATED hsa_signal_xor_release( + hsa_signal_t signal, + hsa_signal_value_t value); + +/** + * @brief Wait condition operator. + */ +typedef enum { + /** + * The two operands are equal. + */ + HSA_SIGNAL_CONDITION_EQ = 0, + /** + * The two operands are not equal. + */ + HSA_SIGNAL_CONDITION_NE = 1, + /** + * The first operand is less than the second operand. + */ + HSA_SIGNAL_CONDITION_LT = 2, + /** + * The first operand is greater than or equal to the second operand. + */ + HSA_SIGNAL_CONDITION_GTE = 3 +} hsa_signal_condition_t; + +/** + * @brief State of the application thread during a signal wait. + */ +typedef enum { + /** + * The application thread may be rescheduled while waiting on the signal. + */ + HSA_WAIT_STATE_BLOCKED = 0, + /** + * The application thread stays active while waiting on a signal. + */ + HSA_WAIT_STATE_ACTIVE = 1 +} hsa_wait_state_t; + + +/** + * @brief Wait until a signal value satisfies a specified condition, or a + * certain amount of time has elapsed. + * + * @details A wait operation can spuriously resume at any time sooner than the + * timeout (for example, due to system or other external factors) even when the + * condition has not been met. + * + * The function is guaranteed to return if the signal value satisfies the + * condition at some point in time during the wait, but the value returned to + * the application might not satisfy the condition. 
The application must ensure + * that signals are used in such a way that wait wakeup conditions are not + * invalidated before dependent threads have woken up. + * + * When the wait operation internally loads the value of the passed signal, it + * uses the memory order indicated in the function name. + * + * @param[in] signal Signal. + * + * @param[in] condition Condition used to compare the signal value with @p + * compare_value. + * + * @param[in] compare_value Value to compare with. + * + * @param[in] timeout_hint Maximum duration of the wait. Specified in the same + * unit as the system timestamp. The operation might block for a shorter or + * longer time even if the condition is not met. A value of UINT64_MAX indicates + * no maximum. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is ultimately decided by + * HSA runtime and may not match the provided hint. A value of + * ::HSA_WAIT_STATE_ACTIVE may improve the latency of response to a signal + * update by avoiding rescheduling overhead. + * + * @return Observed value of the signal, which might not satisfy the specified + * condition. + * +*/ +hsa_signal_value_t HSA_API hsa_signal_wait_scacquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API hsa_signal_wait_relaxed( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @deprecated Renamed as ::hsa_signal_wait_scacquire.
+ * + * @copydoc hsa_signal_wait_scacquire + */ +hsa_signal_value_t HSA_API HSA_DEPRECATED hsa_signal_wait_acquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint); + +/** + * @brief Group of signals. + */ +typedef struct hsa_signal_group_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_signal_group_t; + +/** + * @brief Create a signal group. + * + * @param[in] num_signals Number of elements in @p signals. Must not be 0. + * + * @param[in] signals List of signals in the group. The list must not contain + * any repeated elements. Must not be NULL. + * + * @param[in] num_consumers Number of elements in @p consumers. Must not be 0. + * + * @param[in] consumers List of agents that might consume (wait on) the signal + * group. The list must not contain repeated elements, and must be a subset of + * the set of agents that are allowed to wait on all the signals in the + * group. If an agent not listed in @p consumers waits on the returned group, + * the behavior is undefined. The memory associated with @p consumers can be + * reused or freed after the function returns. Must not be NULL. + * + * @param[out] signal_group Pointer to newly created signal group. Must not be + * NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_signals is 0, @p signals + * is NULL, @p num_consumers is 0, @p consumers is NULL, or @p signal_group is + * NULL. 
+ */ +hsa_status_t HSA_API hsa_signal_group_create( + uint32_t num_signals, + const hsa_signal_t *signals, + uint32_t num_consumers, + const hsa_agent_t *consumers, + hsa_signal_group_t *signal_group); + +/** + * @brief Destroy a signal group previously created by ::hsa_signal_group_create. + * + * @param[in] signal_group Signal group. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + */ +hsa_status_t HSA_API hsa_signal_group_destroy( + hsa_signal_group_t signal_group); + +/** + * @brief Wait until the value of at least one of the signals in a signal group + * satisfies its associated condition. + * + * @details The function is guaranteed to return if the value of at least one of + * the signals in the group satisfies its associated condition at some point in + * time during the wait, but the signal value returned to the application may no + * longer satisfy the condition. The application must ensure that signals in the + * group are used in such a way that wait wakeup conditions are not invalidated + * before dependent threads have woken up. + * + * When this operation internally loads the value of the passed signal, it uses + * the memory order indicated in the function name. + * + * @param[in] signal_group Signal group. + * + * @param[in] conditions List of conditions. Each condition, and the value at + * the same index in @p compare_values, is used to compare the value of the + * signal at that index in @p signal_group (the signal passed by the application + * to ::hsa_signal_group_create at that particular index). The size of @p + * conditions must not be smaller than the number of signals in @p signal_group; + * any extra elements are ignored. Must not be NULL. + * + * @param[in] compare_values List of comparison values.
The size of @p + * compare_values must not be smaller than the number of signals in @p + * signal_group; any extra elements are ignored. Must not be NULL. + * + * @param[in] wait_state_hint Hint used by the application to indicate the + * preferred waiting state. The actual waiting state is decided by the HSA runtime + * and may not match the provided hint. A value of ::HSA_WAIT_STATE_ACTIVE may + * improve the latency of response to a signal update by avoiding rescheduling + * overhead. + * + * @param[out] signal Signal in the group that satisfied the associated + * condition. If several signals satisfied their condition, the function can + * return any of those signals. Must not be NULL. + * + * @param[out] value Observed value for @p signal, which might no longer satisfy + * the specified condition. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL_GROUP @p signal_group is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p conditions is NULL, @p + * compare_values is NULL, @p signal is NULL, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_scacquire( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** + * @copydoc hsa_signal_group_wait_any_scacquire + */ +hsa_status_t HSA_API hsa_signal_group_wait_any_relaxed( + hsa_signal_group_t signal_group, + const hsa_signal_condition_t *conditions, + const hsa_signal_value_t *compare_values, + hsa_wait_state_t wait_state_hint, + hsa_signal_t *signal, + hsa_signal_value_t *value); + +/** @} */ + +/** \defgroup memory Memory + * @{ + */ + +/** + * @brief A memory region represents a block of virtual memory with certain + * properties. 
For example, the HSA runtime represents fine-grained memory in + * the global segment using a region. A region might be associated with more + * than one agent. + */ +typedef struct hsa_region_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_region_t; + +/** @} */ + + +/** \defgroup queue Queues + * @{ + */ + +/** + * @brief Queue type. Intended to be used for dynamic queue protocol + * determination. + */ +typedef enum { + /** + * Queue supports multiple producers. Use of multiproducer queue mechanics is + * required. + */ + HSA_QUEUE_TYPE_MULTI = 0, + /** + * Queue only supports a single producer. In some scenarios, the application + * may want to limit the submission of AQL packets to a single agent. Queues + * that support a single producer may be more efficient than queues supporting + * multiple producers. Use of multiproducer queue mechanics is not supported. + */ + HSA_QUEUE_TYPE_SINGLE = 1, + /** + * Queue supports multiple producers and cooperative dispatches. Cooperative + * dispatches are able to use GWS synchronization. Queues of this type may be + * limited in number. The runtime may return the same queue to serve multiple + * ::hsa_queue_create calls when this type is given. Callers must inspect the + * returned queue to discover queue size. Queues of this type are reference + * counted and require a matching number of ::hsa_queue_destroy calls to + * release. Use of multiproducer queue mechanics is required. See + * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this + * type. + */ + HSA_QUEUE_TYPE_COOPERATIVE = 2 +} hsa_queue_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_queue_type_t constants. + */ +typedef uint32_t hsa_queue_type32_t; + +/** + * @brief Queue features. + */ +typedef enum { + /** + * Queue supports kernel dispatch packets. 
+ */ + HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1, + + /** + * Queue supports agent dispatch packets. + */ + HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2 +} hsa_queue_feature_t; + +/** + * @brief User mode queue. + * + * @details The queue structure is read-only and allocated by the HSA runtime, + * but agents can directly modify the contents of the buffer pointed by @a + * base_address, or use HSA runtime APIs to access the doorbell signal. + * + */ +typedef struct hsa_queue_s { + /** + * Queue type. + */ + hsa_queue_type32_t type; + + /** + * Queue features mask. This is a bit-field of ::hsa_queue_feature_t + * values. Applications should ignore any unknown set bits. + */ + uint32_t features; + +#ifdef HSA_LARGE_MODEL + void* base_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Starting address of the HSA runtime-allocated buffer used to store the AQL + * packets. Must be aligned to the size of an AQL packet. + */ + void* base_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; +#else + uint32_t reserved0; + void* base_address; +#endif + + /** + * Signal object used by the application to indicate the ID of a packet that + * is ready to be processed. The HSA runtime manages the doorbell signal. If + * the application tries to replace or destroy this signal, the behavior is + * undefined. + * + * If @a type is ::HSA_QUEUE_TYPE_SINGLE, the doorbell signal value must be + * updated in a monotonically increasing fashion. If @a type is + * ::HSA_QUEUE_TYPE_MULTI, the doorbell signal value can be updated with any + * value. + */ + hsa_signal_t doorbell_signal; + + /** + * Maximum number of packets the queue can hold. Must be a power of 2. + */ + uint32_t size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Queue identifier, which is unique over the lifetime of the application. + */ + uint64_t id; + +} hsa_queue_t; + +/** + * @brief Create a user mode queue. 
+ * + * @details The HSA runtime creates the queue structure, the underlying packet + * buffer, the completion signal, and the write and read indexes. The initial + * value of the write and read indexes is 0. The type of every packet in the + * buffer is initialized to ::HSA_PACKET_TYPE_INVALID. + * + * The application should only rely on the error code returned to determine if + * the queue is valid. + * + * @param[in] agent Agent where to create the queue. + * + * @param[in] size Number of packets the queue is expected to + * hold. Must be a power of 2 between 1 and the value of + * ::HSA_AGENT_INFO_QUEUE_MAX_SIZE in @p agent. The size of the newly + * created queue is the maximum of @p size and the value of + * ::HSA_AGENT_INFO_QUEUE_MIN_SIZE in @p agent. + * + * @param[in] type Type of the queue, a bitwise OR of hsa_queue_type_t values. + * If the value of ::HSA_AGENT_INFO_QUEUE_TYPE in @p agent is ::HSA_QUEUE_TYPE_SINGLE, + * then @p type must also be ::HSA_QUEUE_TYPE_SINGLE. + * + * @param[in] callback Callback invoked by the HSA runtime for every + * asynchronous event related to the newly created queue. May be NULL. The HSA + * runtime passes three arguments to the callback: a code identifying the event + * that triggered the invocation, a pointer to the queue where the event + * originated, and the application data. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @param[in] private_segment_size Hint indicating the maximum + * expected private segment usage per work-item, in bytes. There may + * be performance degradation if the application places a kernel + * dispatch packet in the queue and the corresponding private segment + * usage exceeds @p private_segment_size. If the application does not + * want to specify any particular value for this argument, @p + * private_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. 
+ * + * @param[in] group_segment_size Hint indicating the maximum expected + * group segment usage per work-group, in bytes. There may be + * performance degradation if the application places a kernel dispatch + * packet in the queue and the corresponding group segment usage + * exceeds @p group_segment_size. If the application does not want to + * specify any particular value for this argument, @p + * group_segment_size must be UINT32_MAX. If the queue does not + * support kernel dispatch packets, this argument is ignored. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE_CREATION @p agent does not + * support queues of the given type. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, + * @p size is 0, @p type is an invalid queue type, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_queue_create( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + +/** + * @brief Create a queue for which the application or a kernel is responsible + * for processing the AQL packets. + * + * @details The application can use this function to create queues where AQL + * packets are not parsed by the packet processor associated with an agent, + * but rather by a unit of execution running on that agent (for example, a + * thread in the host application). 
+ * + * The application is responsible for ensuring that all the producers and + * consumers of the resulting queue can access the provided doorbell signal + * and memory region. The application is also responsible for ensuring that the + * unit of execution processing the queue packets supports the indicated + * features (AQL packet types). + * + * When the queue is created, the HSA runtime allocates the packet buffer using + * @p region, and the write and read indexes. The initial value of the write and + * read indexes is 0, and the type of every packet in the buffer is initialized + * to ::HSA_PACKET_TYPE_INVALID. The value of the @e size, @e type, @e features, + * and @e doorbell_signal fields in the returned queue match the values passed + * by the application. + * + * @param[in] region Memory region that the HSA runtime should use to allocate + * the AQL packet buffer and any other queue metadata. + * + * @param[in] size Number of packets the queue is expected to hold. Must be a + * power of 2 greater than 0. + * + * @param[in] type Queue type. + * + * @param[in] features Supported queue features. This is a bit-field of + * ::hsa_queue_feature_t values. + * + * @param[in] doorbell_signal Doorbell signal that the HSA runtime must + * associate with the returned queue. The signal handle must not be 0. + * + * @param[out] queue Memory location where the HSA runtime stores a pointer to + * the newly created queue. The application should not rely on the value + * returned for this argument but only in the status code to determine if the + * queue is valid. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is not a power of two, @p + * size is 0, @p type is an invalid queue type, the doorbell signal handle is + * 0, or @p queue is NULL. + * + */ +hsa_status_t HSA_API hsa_soft_queue_create( + hsa_region_t region, + uint32_t size, + hsa_queue_type32_t type, + uint32_t features, + hsa_signal_t doorbell_signal, + hsa_queue_t **queue); + +/** + * @brief Destroy a user mode queue. + * + * @details When a queue is destroyed, the state of the AQL packets that have + * not yet been fully processed (their completion phase has not finished) + * becomes undefined. It is the responsibility of the application to ensure that + * all pending queue operations are finished if their results are required. + * + * The resources allocated by the HSA runtime during queue creation (queue + * structure, ring buffer, doorbell signal) are released. The queue should not + * be accessed after being destroyed. + * + * @param[in] queue Pointer to a queue created using ::hsa_queue_create. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_destroy( + hsa_queue_t *queue); + +/** + * @brief Inactivate a queue. + * + * @details Inactivating the queue aborts any pending executions and prevents any + * new packets from being processed. Any more packets written to the queue once + * it is inactivated will be ignored by the packet processor. + * + * @param[in] queue Pointer to a queue. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE The queue is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p queue is NULL. + */ +hsa_status_t HSA_API hsa_queue_inactivate( + hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_read_index_scacquire. + * + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_read_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the read index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Read index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_read_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_read_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_read_index_relaxed( + const hsa_queue_t *queue); + +/** + * @deprecated Renamed as ::hsa_queue_load_write_index_scacquire. + * + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_load_write_index_acquire( + const hsa_queue_t *queue); + +/** + * @brief Atomically load the write index of a queue. + * + * @param[in] queue Pointer to a queue. + * + * @return Write index of the queue pointed by @p queue. + */ +uint64_t HSA_API hsa_queue_load_write_index_scacquire( + const hsa_queue_t *queue); + +/** + * @copydoc hsa_queue_load_write_index_scacquire + */ +uint64_t HSA_API hsa_queue_load_write_index_relaxed( + const hsa_queue_t *queue); + +/** + * @brief Atomically set the write index of a queue. + * + * @details It is recommended that the application uses this function to update + * the write index when there is a single agent submitting work to the queue + * (the queue type is ::HSA_QUEUE_TYPE_SINGLE). + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the write index. + * + */ +void HSA_API hsa_queue_store_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_write_index_screlease. 
+ * + * @copydoc hsa_queue_store_write_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_write_index_relaxed + */ +void HSA_API hsa_queue_store_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacq_screl. + * + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @brief Atomically set the write index of a queue if the observed value is + * equal to the expected value. The application can inspect the returned value + * to determine if the replacement was done. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] expected Expected value. + * + * @param[in] value Value to assign to the write index if @p expected matches + * the observed write index. Must be greater than @p expected. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_scacquire. + * + * @copydoc hsa_queue_cas_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_acquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_cas_write_index_screlease. 
+ * + * @copydoc hsa_queue_cas_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_cas_write_index_release( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @copydoc hsa_queue_cas_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_cas_write_index_screlease( + const hsa_queue_t *queue, + uint64_t expected, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacq_screl. + * + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acq_rel( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically increment the write index of a queue by an offset. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to add to the write index. + * + * @return Previous value of the write index. + */ +uint64_t HSA_API hsa_queue_add_write_index_scacq_screl( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_scacquire. + * + * @copydoc hsa_queue_add_write_index_scacquire + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_acquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_scacquire( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_add_write_index_screlease. 
+ * + * @copydoc hsa_queue_add_write_index_screlease + */ +uint64_t HSA_API HSA_DEPRECATED hsa_queue_add_write_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_add_write_index_scacq_screl + */ +uint64_t HSA_API hsa_queue_add_write_index_screlease( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @brief Atomically set the read index of a queue. + * + * @details Modifications of the read index are not allowed and result in + * undefined behavior if the queue is associated with an agent for which + * only the corresponding packet processor is permitted to update the read + * index. + * + * @param[in] queue Pointer to a queue. + * + * @param[in] value Value to assign to the read index. + * + */ +void HSA_API hsa_queue_store_read_index_relaxed( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @deprecated Renamed as ::hsa_queue_store_read_index_screlease. + * + * @copydoc hsa_queue_store_read_index_screlease + */ +void HSA_API HSA_DEPRECATED hsa_queue_store_read_index_release( + const hsa_queue_t *queue, + uint64_t value); + +/** + * @copydoc hsa_queue_store_read_index_relaxed + */ +void HSA_API hsa_queue_store_read_index_screlease( + const hsa_queue_t *queue, + uint64_t value); +/** @} */ + + +/** \defgroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief Packet type. + */ +typedef enum { + /** + * Vendor-specific packet. + */ + HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, + /** + * The packet has been processed in the past, but has not been reassigned to + * the packet processor. A packet processor must not process a packet of this + * type. All queues support this packet type. + */ + HSA_PACKET_TYPE_INVALID = 1, + /** + * Packet used by agents for dispatching jobs to kernel agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). 
+ */ + HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_AND = 3, + /** + * Packet used by agents for dispatching jobs to agents. Not all + * queues support packets of this type (see ::hsa_queue_feature_t). + */ + HSA_PACKET_TYPE_AGENT_DISPATCH = 4, + /** + * Packet used by agents to delay processing of subsequent packets, and to + * express complex dependencies between multiple packets. All queues support + * this packet type. + */ + HSA_PACKET_TYPE_BARRIER_OR = 5 +} hsa_packet_type_t; + +/** + * @brief Scope of the memory fence operation associated with a packet. + */ +typedef enum { + /** + * No scope (no fence is applied). The packet relies on external fences to + * ensure visibility of memory updates. + */ + HSA_FENCE_SCOPE_NONE = 0, + /** + * The fence is applied with agent scope for the global segment. + */ + HSA_FENCE_SCOPE_AGENT = 1, + /** + * The fence is applied across both agent and system scope for the global + * segment. + */ + HSA_FENCE_SCOPE_SYSTEM = 2 +} hsa_fence_scope_t; + +/** + * @brief Sub-fields of the @a header field that is present in any AQL + * packet. The offset (with respect to the address of @a header) of a sub-field + * is identical to its enumeration constant. The width of each sub-field is + * determined by the corresponding value in ::hsa_packet_header_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Packet type. The value of this sub-field must be one of + * ::hsa_packet_type_t. If the type is ::HSA_PACKET_TYPE_VENDOR_SPECIFIC, the + * packet layout is vendor-specific. + */ + HSA_PACKET_HEADER_TYPE = 0, + /** + * Barrier bit. If the barrier bit is set, the processing of the current + * packet only launches when all preceding packets (within the same queue) are + * complete. 
+ */ + HSA_PACKET_HEADER_BARRIER = 8, + /** + * Acquire fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied before the packet enters the + * active phase. An acquire fence ensures that any subsequent global segment + * or image loads by any unit of execution that belongs to a dispatch that has + * not yet entered the active phase on any queue of the same kernel agent, + * sees any data previously released at the scopes specified by the acquire + * fence. The value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9, + /** + * Release fence scope. The value of this sub-field determines the scope and + * type of the memory fence operation applied after kernel completion but + * before the packet is completed. A release fence makes any global segment or + * image data that was stored by any unit of execution that belonged to a + * dispatch that has completed the active phase on any queue of the same + * kernel agent visible in all the scopes specified by the release fence. The + * value of this sub-field must be one of ::hsa_fence_scope_t. + */ + HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11, + /** + * @deprecated Renamed as ::HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11 + } hsa_packet_header_t; + +/** + * @brief Width (in bits) of the sub-fields in ::hsa_packet_header_t. + */ + typedef enum { + HSA_PACKET_HEADER_WIDTH_TYPE = 8, + HSA_PACKET_HEADER_WIDTH_BARRIER = 1, + HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE. + */ + HSA_PACKET_HEADER_WIDTH_ACQUIRE_FENCE_SCOPE = 2, + HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE = 2, + /** + * @deprecated Use HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE.
+ */ + HSA_PACKET_HEADER_WIDTH_RELEASE_FENCE_SCOPE = 2 + } hsa_packet_header_width_t; + +/** + * @brief Sub-fields of the kernel dispatch packet @a setup field. The offset + * (with respect to the address of @a setup) of a sub-field is identical to its + * enumeration constant. The width of each sub-field is determined by the + * corresponding value in ::hsa_kernel_dispatch_packet_setup_width_t. The + * offset and the width are expressed in bits. + */ + typedef enum { + /** + * Number of dimensions of the grid. Valid values are 1, 2, or 3. + * + */ + HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 + } hsa_kernel_dispatch_packet_setup_t; + +/** + * @brief Width (in bits) of the sub-fields in + * ::hsa_kernel_dispatch_packet_setup_t. + */ + typedef enum { + HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2 + } hsa_kernel_dispatch_packet_setup_width_t; + +/** + * @brief AQL kernel dispatch packet + */ +typedef struct hsa_kernel_dispatch_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Dispatch setup parameters. Used to configure kernel dispatch parameters + * such as the number of dimensions in the grid. The parameters are described + * by ::hsa_kernel_dispatch_packet_setup_t. + */ + uint16_t setup; + + /** + * X dimension of work-group, in work-items. Must be greater than 0. + */ + uint16_t workgroup_size_x; + + /** + * Y dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 dimension, the only valid value is 1. + */ + uint16_t workgroup_size_y; + + /** + * Z dimension of work-group, in work-items. Must be greater than + * 0. If the grid has 1 or 2 dimensions, the only valid value is 1. + */ + uint16_t workgroup_size_z; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * X dimension of grid, in work-items. Must be greater than 0. 
Must + * not be smaller than @a workgroup_size_x. + */ + uint32_t grid_size_x; + + /** + * Y dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 dimension, the only valid value is 1. Must not be smaller than @a + * workgroup_size_y. + */ + uint32_t grid_size_y; + + /** + * Z dimension of grid, in work-items. Must be greater than 0. If the grid has + * 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a + * workgroup_size_z. + */ + uint32_t grid_size_z; + + /** + * Size in bytes of private memory allocation request (per work-item). + */ + uint32_t private_segment_size; + + /** + * Size in bytes of group memory allocation request (per work-group). Must not + * be less than the sum of the group memory used by the kernel (and the + * functions it calls directly or indirectly) and the dynamically allocated + * group segment variables. + */ + uint32_t group_segment_size; + + /** + * Opaque handle to a code object that includes an implementation-defined + * executable code for the kernel. + */ + uint64_t kernel_object; + +#ifdef HSA_LARGE_MODEL + void* kernarg_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Pointer to a buffer containing the kernel arguments. May be NULL. + * + * The buffer must be allocated using ::hsa_memory_allocate, and must not be + * modified once the kernel dispatch packet is enqueued until the dispatch has + * completed execution. + */ + void* kernarg_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* kernarg_address; +#endif + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_kernel_dispatch_packet_t; + +/** + * @brief Agent dispatch packet. + */ +typedef struct hsa_agent_dispatch_packet_s { + /** + * Packet header. 
Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Application-defined function to be performed by the destination agent. + */ + uint16_t type; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + +#ifdef HSA_LARGE_MODEL + void* return_address; +#elif defined HSA_LITTLE_ENDIAN + /** + * Address where to store the function return values, if any. + */ + void* return_address; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; +#else + uint32_t reserved1; + void* return_address; +#endif + + /** + * Function arguments. + */ + uint64_t arg[4]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_agent_dispatch_packet_t; + +/** + * @brief Barrier-AND packet. + */ +typedef struct hsa_barrier_and_packet_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as satisfied + * dependencies. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_and_packet_t; + +/** + * @brief Barrier-OR packet. + */ +typedef struct hsa_barrier_or_packet_s { + /** + * Packet header. 
Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + * Reserved. Must be 0. + */ + uint16_t reserved0; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Array of dependent signal objects. Signals with a handle value of 0 are + * allowed and are interpreted by the packet processor as dependencies not + * satisfied. + */ + hsa_signal_t dep_signal[5]; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; + +} hsa_barrier_or_packet_t; + +/** @} */ + +/** \addtogroup memory Memory + * @{ + */ + +/** + * @brief Memory segments associated with a region. + */ +typedef enum { + /** + * Global segment. Used to hold data that is shared by all agents. + */ + HSA_REGION_SEGMENT_GLOBAL = 0, + /** + * Read-only segment. Used to hold data that remains constant during the + * execution of a kernel. + */ + HSA_REGION_SEGMENT_READONLY = 1, + /** + * Private segment. Used to hold data that is local to a single work-item. + */ + HSA_REGION_SEGMENT_PRIVATE = 2, + /** + * Group segment. Used to hold data that is shared by the work-items of a + * work-group. + */ + HSA_REGION_SEGMENT_GROUP = 3, + /** + * Kernarg segment. Used to store kernel arguments. + */ + HSA_REGION_SEGMENT_KERNARG = 4 +} hsa_region_segment_t; + +/** + * @brief Global region flags. + */ +typedef enum { + /** + * The application can use memory in the region to store kernel arguments, and + * provide the values for the kernarg segment of a kernel dispatch. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_FINE_GRAINED must be set. 
+ */ + HSA_REGION_GLOBAL_FLAG_KERNARG = 1, + /** + * Updates to memory in this region are immediately visible to all the + * agents under the terms of the HSA memory model. If this + * flag is set, then ::HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED must not be set. + */ + HSA_REGION_GLOBAL_FLAG_FINE_GRAINED = 2, + /** + * Updates to memory in this region can be performed by a single agent at + * a time. If a different agent in the system is allowed to access the + * region, the application must explicitly invoke ::hsa_memory_assign_agent + * in order to transfer ownership to that agent for a particular buffer. + */ + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4 +} hsa_region_global_flag_t; + +/** + * @brief Attributes of a memory region. + */ +typedef enum { + /** + * Segment where memory in the region can be used. The type of this + * attribute is ::hsa_region_segment_t. + */ + HSA_REGION_INFO_SEGMENT = 0, + /** + * Flag mask. The value of this attribute is undefined if the value of + * ::HSA_REGION_INFO_SEGMENT is not ::HSA_REGION_SEGMENT_GLOBAL. The type of + * this attribute is uint32_t, a bit-field of ::hsa_region_global_flag_t + * values. + */ + HSA_REGION_INFO_GLOBAL_FLAGS = 1, + /** + * Size of this region, in bytes. The type of this attribute is size_t. + */ + HSA_REGION_INFO_SIZE = 2, + /** + * Maximum allocation size in this region, in bytes. Must not exceed the value + * of ::HSA_REGION_INFO_SIZE. The type of this attribute is size_t. + * + * If the region is in the global or readonly segments, this is the maximum + * size that the application can pass to ::hsa_memory_allocate. + * + * If the region is in the group segment, this is the maximum size (per + * work-group) that can be requested for a given kernel dispatch. If the + * region is in the private segment, this is the maximum size (per work-item) + * that can be requested for a specific kernel dispatch, and must be at least + * 256 bytes.
+ */ + HSA_REGION_INFO_ALLOC_MAX_SIZE = 4, + /** + * Maximum size (per work-group) of private memory that can be requested for a + * specific kernel dispatch. Must be at least 65536 bytes. The type of this + * attribute is uint32_t. The value of this attribute is undefined if the + * region is not in the private segment. + */ + HSA_REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE = 8, + /** + * Indicates whether memory in this region can be allocated using + * ::hsa_memory_allocate. The type of this attribute is bool. + * + * The value of this flag is always false for regions in the group and private + * segments. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED = 5, + /** + * Allocation granularity of buffers allocated by ::hsa_memory_allocate in + * this region. The size of a buffer allocated in this region is a multiple of + * the value of this attribute. The value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region. The type + * of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE = 6, + /** + * Alignment of buffers allocated by ::hsa_memory_allocate in this region. The + * value of this attribute is only defined if + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED is true for this region, and must be + * a power of 2. The type of this attribute is size_t. + */ + HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT = 7 +} hsa_region_info_t; + +/** + * @brief Get the current value of an attribute of a region. + * + * @param[in] region A valid region. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * region attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_region_get_info( + hsa_region_t region, + hsa_region_info_t attribute, + void* value); + +/** + * @brief Iterate over the memory regions associated with a given agent, and + * invoke an application-defined callback on every iteration. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per region that is + * accessible from the agent. The HSA runtime passes two arguments to the + * callback, the region and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and ::hsa_agent_iterate_regions returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_agent_iterate_regions( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_region_t region, void* data), + void* data); + +/** + * @brief Allocate a block of memory in a given region. + * + * @param[in] region Region where to allocate memory from. The region must have + * the ::HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED flag set. + * + * @param[in] size Allocation size, in bytes. Must not be zero. This value is + * rounded up to the nearest multiple of ::HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE + * in @p region. + * + * @param[out] ptr Pointer to the location where to store the base address of + * the allocated block. 
The returned base address is aligned to the value of + * ::HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT in @p region. If the allocation + * fails, the returned value is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_REGION The region is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION The host is not allowed to + * allocate memory in @p region, or @p size is greater than the value of + * HSA_REGION_INFO_ALLOC_MAX_SIZE in @p region. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p size is 0. + */ +hsa_status_t HSA_API hsa_memory_allocate(hsa_region_t region, + size_t size, + void** ptr); + +/** + * @brief Deallocate a block of memory previously allocated using + * ::hsa_memory_allocate. + * + * @param[in] ptr Pointer to a memory block. If @p ptr does not match a value + * previously returned by ::hsa_memory_allocate, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + */ +hsa_status_t HSA_API hsa_memory_free(void* ptr); + +/** + * @brief Copy a block of memory from the location pointed to by @p src to the + * memory block pointed to by @p dst. + * + * @param[out] dst Buffer where the content is to be copied. If @p dst is in + * coarse-grained memory, the copied data is only visible to the agent currently + * assigned (::hsa_memory_assign_agent) to @p dst. + * + * @param[in] src A valid pointer to the source of data to be copied. The source + * buffer must not overlap with the destination buffer. 
If the source buffer is + * in coarse-grained memory then it must be assigned to an agent, from which the + * data will be retrieved. + * + * @param[in] size Number of bytes to copy. If @p size is 0, no copy is + * performed and the function returns success. Copying a number of bytes larger + * than the size of the buffers pointed by @p dst or @p src results in undefined + * behavior. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT The source or destination + * pointers are NULL. + */ +hsa_status_t HSA_API hsa_memory_copy( + void *dst, + const void *src, + size_t size); + +/** + * @brief Change the ownership of a global, coarse-grained buffer. + * + * @details The contents of a coarse-grained buffer are visible to an agent + * only after ownership has been explicitly transferred to that agent. Once the + * operation completes, the previous owner can no longer access the data in the + * buffer. + * + * An implementation of the HSA runtime is allowed, but not required, to change + * the physical location of the buffer when ownership is transferred to a + * different agent. In general the application must not assume this + * behavior. The virtual location (address) of the passed buffer is never + * modified. + * + * @param[in] ptr Base address of a global buffer. The pointer must match an + * address previously returned by ::hsa_memory_allocate. The size of the buffer + * affected by the ownership change is identical to the size of that previous + * allocation. If @p ptr points to a fine-grained global buffer, no operation is + * performed and the function returns success. If @p ptr does not point to + * global memory, the behavior is undefined. + * + * @param[in] agent Agent that becomes the owner of the buffer.
The + * application is responsible for ensuring that @p agent has access to the + * region that contains the buffer. It is allowed to change ownership to an + * agent that is already the owner of the buffer, with the same or different + * access permissions. + * + * @param[in] access Access permissions requested for the new owner. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is NULL, or @p access is + * not a valid access value. + */ +hsa_status_t HSA_API hsa_memory_assign_agent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access); + +/** + * + * @brief Register a global, fine-grained buffer. + * + * @details Registering a buffer serves as an indication to the HSA runtime that + * the memory might be accessed from a kernel agent other than the + * host. Registration is a performance hint that allows the HSA runtime + * implementation to know which buffers will be accessed by some of the kernel + * agents ahead of time. + * + * Registration is only recommended for buffers in the global segment that have + * not been allocated using the HSA allocator (::hsa_memory_allocate), but an OS + * allocator instead. Registering an OS-allocated buffer in the base profile is + * equivalent to a no-op. + * + * Registrations should not overlap. + * + * @param[in] ptr A buffer in global, fine-grained memory. If a NULL pointer is + * passed, no operation is performed. If the buffer has been allocated using + * ::hsa_memory_allocate, or has already been registered, no operation is + * performed. + * + * @param[in] size Requested registration size in bytes. 
A size of 0 is + * only allowed if @p ptr is NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p size is 0 but @p ptr + * is not NULL. + */ +hsa_status_t HSA_API hsa_memory_register( + void *ptr, + size_t size); + +/** + * + * @brief Deregister memory previously registered using ::hsa_memory_register. + * + * @details If the memory interval being deregistered does not match a previous + * registration (start and end addresses), the behavior is undefined. + * + * @param[in] ptr A pointer to the base of the buffer to be deregistered. If + * a NULL pointer is passed, no operation is performed. + * + * @param[in] size Size of the buffer to be deregistered. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + */ +hsa_status_t HSA_API hsa_memory_deregister( + void *ptr, + size_t size); + +/** @} */ + + +/** \defgroup instruction-set-architecture Instruction Set Architecture. + * @{ + */ + +/** + * @brief Instruction set architecture. + */ +typedef struct hsa_isa_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_isa_t; + +/** + * @brief Retrieve a reference to an instruction set architecture handle out of + * a symbolic name. + * + * @param[in] name Vendor-specific name associated with a particular + * instruction set architecture. @p name must start with the vendor name and a + * colon (for example, "AMD:"). The rest of the name is vendor-specific. Must be + * a NUL-terminated string.
+ * + * @param[out] isa Memory location where the HSA runtime stores the ISA handle + * corresponding to the given name. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA_NAME The given name does not + * correspond to any instruction set architecture. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p name is NULL, or @p isa is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_from_name( + const char *name, + hsa_isa_t *isa); + +/** + * @brief Iterate over the instruction sets supported by the given agent, and + * invoke an application-defined callback on every iteration. The iterator is + * deterministic: if an agent supports several instruction set architectures, + * they are traversed in the same order in every invocation of this function. + * + * @param[in] agent A valid agent. + * + * @param[in] callback Callback to be invoked once per instruction set + * architecture. The HSA runtime passes two arguments to the callback: the + * ISA and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * that status value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. 
+ */ +hsa_status_t HSA_API hsa_agent_iterate_isas( + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_isa_t isa, void *data), + void *data); + +/** + * @brief Instruction set architecture attributes. + */ +typedef enum { + /** + * The length of the ISA name in bytes, not including the NUL terminator. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_NAME_LENGTH = 0, + /** + * Human-readable description. The type of this attribute is character array + * with the length equal to the value of ::HSA_ISA_INFO_NAME_LENGTH attribute. + */ + HSA_ISA_INFO_NAME = 1, + /** + * @deprecated + * + * Number of call conventions supported by the instruction set architecture. + * Must be greater than zero. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_COUNT = 2, + /** + * @deprecated + * + * Number of work-items in a wavefront for a given call convention. Must be a + * power of 2 in the range [1,256]. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONT_SIZE = 3, + /** + * @deprecated + * + * Number of wavefronts per compute unit for a given call convention. In + * practice, other factors (for example, the amount of group memory used by a + * work-group) may further limit the number of wavefronts per compute + * unit. The type of this attribute is uint32_t. + */ + HSA_ISA_INFO_CALL_CONVENTION_INFO_WAVEFRONTS_PER_COMPUTE_UNIT = 4, + /** + * Machine models supported by the instruction set architecture. The type of + * this attribute is a bool[2]. If the ISA supports the small machine model, + * the element at index ::HSA_MACHINE_MODEL_SMALL is true. If the ISA supports + * the large model, the element at index ::HSA_MACHINE_MODEL_LARGE is true. + */ + HSA_ISA_INFO_MACHINE_MODELS = 5, + /** + * Profiles supported by the instruction set architecture. The type of this + * attribute is a bool[2]. If the ISA supports the base profile, the element + * at index ::HSA_PROFILE_BASE is true. 
If the ISA supports the full profile, + * the element at index ::HSA_PROFILE_FULL is true. + */ + HSA_ISA_INFO_PROFILES = 6, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture. The type of this attribute is a bool[3]. The value at a given + * index is true if the corresponding rounding mode in + * ::hsa_default_float_rounding_mode_t is supported. At least one default mode + * has to be supported. + * + * If the default mode is supported, then + * ::HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES must report that + * both the zero and the near rounding modes are supported. + */ + HSA_ISA_INFO_DEFAULT_FLOAT_ROUNDING_MODES = 7, + /** + * Default floating-point rounding modes supported by the instruction set + * architecture in the Base profile. The type of this attribute is a + * bool[3]. The value at a given index is true if the corresponding rounding + * mode in ::hsa_default_float_rounding_mode_t is supported. The value at + * index HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT must be false. At least one + * of the values at indexes ::HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO or + * HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR must be true. + */ + HSA_ISA_INFO_BASE_PROFILE_DEFAULT_FLOAT_ROUNDING_MODES = 8, + /** + * Flag indicating that the f16 HSAIL operation is at least as fast as the + * f32 operation in the instruction set architecture. The type of this + * attribute is bool. + */ + HSA_ISA_INFO_FAST_F16_OPERATION = 9, + /** + * Maximum number of work-items of each dimension of a work-group. Each + * maximum must be greater than 0. No maximum can exceed the value of + * ::HSA_ISA_INFO_WORKGROUP_MAX_SIZE. The type of this attribute is + * uint16_t[3]. + */ + HSA_ISA_INFO_WORKGROUP_MAX_DIM = 12, + /** + * Maximum total number of work-items in a work-group. The type + * of this attribute is uint32_t. + */ + HSA_ISA_INFO_WORKGROUP_MAX_SIZE = 13, + /** + * Maximum number of work-items of each dimension of a grid.
Each maximum must + * be greater than 0, and must not be smaller than the corresponding value in + * ::HSA_ISA_INFO_WORKGROUP_MAX_DIM. No maximum can exceed the value of + * ::HSA_ISA_INFO_GRID_MAX_SIZE. The type of this attribute is + * ::hsa_dim3_t. + */ + HSA_ISA_INFO_GRID_MAX_DIM = 14, + /** + * Maximum total number of work-items in a grid. The type of this + * attribute is uint64_t. + */ + HSA_ISA_INFO_GRID_MAX_SIZE = 16, + /** + * Maximum number of fbarriers per work-group. Must be at least 32. The + * type of this attribute is uint32_t. + */ + HSA_ISA_INFO_FBARRIER_MAX_SIZE = 17 +} hsa_isa_info_t; + +/** + * @deprecated The concept of call convention has been deprecated. If the + * application wants to query the value of an attribute for a given instruction + * set architecture, use ::hsa_isa_get_info_alt instead. If the application + * wants to query an attribute that is specific to a given combination of ISA + * and wavefront, use ::hsa_wavefront_get_info. + * + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[in] index Call convention index. Used only for call convention + * attributes, otherwise ignored. Must have a value between 0 (inclusive) and + * the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT (not + * inclusive) in @p isa. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_INDEX The index is out of range. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_get_info( + hsa_isa_t isa, + hsa_isa_info_t attribute, + uint32_t index, + void *value); + +/** + * @brief Get the current value of an attribute for a given instruction set + * architecture (ISA). + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * instruction set architecture attribute, or @p value is + * NULL. + */ +hsa_status_t HSA_API hsa_isa_get_info_alt( + hsa_isa_t isa, + hsa_isa_info_t attribute, + void *value); + +/** + * @brief Retrieve the exception policy support for a given combination of + * instruction set architecture and profile. + * + * @param[in] isa A valid instruction set architecture. + * + * @param[in] profile Profile. + * + * @param[out] mask Pointer to a memory location where the HSA runtime stores a + * mask of ::hsa_exception_policy_t values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is not a valid + * profile, or @p mask is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_exception_policies( + hsa_isa_t isa, + hsa_profile_t profile, + uint16_t *mask); + +/** + * @brief Floating-point types. + */ +typedef enum { + /** + * 16-bit floating-point type. + */ + HSA_FP_TYPE_16 = 1, + /** + * 32-bit floating-point type. + */ + HSA_FP_TYPE_32 = 2, + /** + * 64-bit floating-point type. + */ + HSA_FP_TYPE_64 = 4 +} hsa_fp_type_t; + +/** + * @brief Flush to zero modes. + */ +typedef enum { + /** + * Flush to zero. + */ + HSA_FLUSH_MODE_FTZ = 1, + /** + * Do not flush to zero. + */ + HSA_FLUSH_MODE_NON_FTZ = 2 +} hsa_flush_mode_t; + +/** + * @brief Round methods. + */ +typedef enum { + /** + * Single round method. + */ + HSA_ROUND_METHOD_SINGLE = 1, + /** + * Double round method. + */ + HSA_ROUND_METHOD_DOUBLE = 2 +} hsa_round_method_t; + +/** + * @brief Retrieve the round method (single or double) used to implement the + * floating-point multiply add instruction (mad) for a given combination of + * instruction set architecture, floating-point type, and flush to zero + * modifier. + * + * @param[in] isa Instruction set architecture. + * + * @param[in] fp_type Floating-point type. + * + * @param[in] flush_mode Flush to zero modifier. + * + * @param[out] round_method Pointer to a memory location where the HSA + * runtime stores the round method used by the implementation. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p fp_type is not a valid + * floating-point type, or @p flush_mode is not a valid flush to zero modifier, + * or @p round_method is NULL. + */ +hsa_status_t HSA_API hsa_isa_get_round_method( + hsa_isa_t isa, + hsa_fp_type_t fp_type, + hsa_flush_mode_t flush_mode, + hsa_round_method_t *round_method); + +/** + * @brief Wavefront handle + */ +typedef struct hsa_wavefront_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_wavefront_t; + +/** + * @brief Wavefront attributes. + */ +typedef enum { + /** + * Number of work-items in the wavefront. Must be a power of 2 in the range + * [1,256]. The type of this attribute is uint32_t. + */ + HSA_WAVEFRONT_INFO_SIZE = 0 +} hsa_wavefront_info_t; + +/** + * @brief Get the current value of a wavefront attribute. + * + * @param[in] wavefront A wavefront. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_WAVEFRONT The wavefront is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * wavefront attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_wavefront_get_info( + hsa_wavefront_t wavefront, + hsa_wavefront_info_t attribute, + void *value); + +/** + * @brief Iterate over the different wavefronts supported by an instruction set + * architecture, and invoke an application-defined callback on every iteration. + * + * @param[in] isa Instruction set architecture. 
+ * + * @param[in] callback Callback to be invoked once per wavefront that is + * supported by the agent. The HSA runtime passes two arguments to the callback: + * the wavefront handle and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and that value is returned. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA The instruction set architecture is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_isa_iterate_wavefronts( + hsa_isa_t isa, + hsa_status_t (*callback)(hsa_wavefront_t wavefront, void *data), + void *data); + +/** + * @deprecated Use ::hsa_agent_iterate_isas to query which instruction set + * architectures are supported by a given agent. + * + * @brief Check if the instruction set architecture of a code object can be + * executed on an agent associated with another architecture. + * + * @param[in] code_object_isa Instruction set architecture associated with a + * code object. + * + * @param[in] agent_isa Instruction set architecture associated with an agent. + * + * @param[out] result Pointer to a memory location where the HSA runtime stores + * the result of the check. If the two architectures are compatible, the result + * is true; if they are incompatible, the result is false. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p code_object_isa or @p agent_isa are + * invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_isa_compatible( + hsa_isa_t code_object_isa, + hsa_isa_t agent_isa, + bool *result); + +/** @} */ + + +/** \defgroup executable Executable + * @{ + */ + +/** + * @brief Code object reader handle. A code object reader is used to + * load a code object from file (when created using + * ::hsa_code_object_reader_create_from_file), or from memory (if created using + * ::hsa_code_object_reader_create_from_memory). + */ +typedef struct hsa_code_object_reader_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_object_reader_t; + +/** + * @brief Create a code object reader to operate on a file. + * + * @param[in] file File descriptor. The file must have been opened by + * the application with at least read permissions prior to calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_file( + hsa_file_t file, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Create a code object reader to operate on memory.
+ * + * @param[in] code_object Memory buffer that contains a vendor-specific code + * object. The buffer is owned and managed by the application; the lifetime of + * the buffer must exceed that of any associated code object reader. + * + * @param[in] size Size of the buffer pointed to by @p code_object. Must not be + * 0. + * + * @param[out] code_object_reader Memory location to store the newly created code + * object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object is NULL, @p size + * is zero, or @p code_object_reader is NULL. + */ +hsa_status_t HSA_API hsa_code_object_reader_create_from_memory( + const void *code_object, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +/** + * @brief Destroy a code object reader. + * + * @details The code object reader handle becomes invalid after completion of + * this function. Any file or memory used to create the code object reader is not + * closed, removed, or deallocated by this function. + * + * @param[in] code_object_reader Code object reader to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + */ +hsa_status_t HSA_API hsa_code_object_reader_destroy( + hsa_code_object_reader_t code_object_reader); + +/** + * @brief Struct containing an opaque handle to an executable, which contains + * ISA for finalized kernels and indirect functions together with the allocated + * global or readonly segment variables they reference.
+ */ +typedef struct hsa_executable_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_t; + +/** + * @brief Executable state. + */ +typedef enum { + /** + * Executable state, which allows the user to load code objects and define + * external variables. Variable addresses, kernel code handles, and + * indirect function code handles are not available in query operations until + * the executable is frozen (zero always returned). + */ + HSA_EXECUTABLE_STATE_UNFROZEN = 0, + /** + * Executable state, which allows the user to query variable addresses, + * kernel code handles, and indirect function code handles using query + * operations. Loading new code objects, as well as defining external + * variables, is not allowed in this state. + */ + HSA_EXECUTABLE_STATE_FROZEN = 1 +} hsa_executable_state_t; + +/** + * @deprecated Use ::hsa_executable_create_alt instead, which allows the + * application to specify the default floating-point rounding mode of the + * executable and assumes an unfrozen initial state. + * + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] executable_state Executable state. If the state is + * ::HSA_EXECUTABLE_STATE_FROZEN, the resulting executable is useless because no + * code objects can be loaded, and no variables can be defined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores the newly + * created executable handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_create( + hsa_profile_t profile, + hsa_executable_state_t executable_state, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Create an empty executable. + * + * @param[in] profile Profile used in the executable. + * + * @param[in] default_float_rounding_mode Default floating-point rounding mode + * used in the executable. Allowed rounding modes are near and zero (default is + * not allowed). + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] executable Memory location where the HSA runtime stores newly + * created executable handle. The initial state of the executable is + * ::HSA_EXECUTABLE_STATE_UNFROZEN. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p profile is invalid, or + * @p executable is NULL. + */ +hsa_status_t HSA_API hsa_executable_create_alt( + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_executable_t *executable); + +/** + * @brief Destroy an executable. 
+ * + * @details An executable handle becomes invalid after the executable has been + * destroyed. Code object handles that were loaded into this executable are + * still valid after the executable has been destroyed, and can be used as + * intended. Resources allocated outside and associated with this executable + * (such as external global or readonly variables) can be released after the + * executable has been destroyed. + * + * Executable should not be destroyed while kernels are in flight. + * + * @param[in] executable Executable. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + */ +hsa_status_t HSA_API hsa_executable_destroy( + hsa_executable_t executable); + +/** + * @brief Loaded code object handle. + */ +typedef struct hsa_loaded_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_loaded_code_object_t; + +/** + * @brief Load a program code object into an executable. + * + * @details A program code object contains information about resources that are + * accessible by all kernel agents that run the executable, and can be loaded + * at most once into an executable. + * + * If the program code object uses extensions, the implementation must support + * them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] code_object_reader A code object reader that holds the program + * code object to load. If a code object reader is destroyed before all the + * associated executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. 
Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The program code object is + * not compatible with the executable or the implementation (for example, the + * code object uses an extension that is not supported by the implementation). + */ +hsa_status_t HSA_API hsa_executable_load_program_code_object( + hsa_executable_t executable, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Load an agent code object into an executable. + * + * @details The agent code object contains all defined agent + * allocation variables, functions, indirect functions, and kernels in a given + * program for a given instruction set architecture. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. 
+ * + * The default floating-point rounding mode of the code object associated with + * @p code_object_reader must match that of the executable + * (::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE), or be default (in which + * case the value of ::HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE is used). + * If the agent code object uses extensions, the implementation and the agent + * must support them for this operation to return successfully. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent to load code object for. A code object can be loaded + * into an executable at most once for a given agent. The instruction set + * architecture of the code object must be supported by the agent. + * + * @param[in] code_object_reader A code object reader that holds the code object + * to load. If a code object reader is destroyed before all the associated + * executables are destroyed, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] loaded_code_object Pointer to a memory location where the HSA + * runtime stores the loaded code object handle. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE The executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER @p code_object_reader + * is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS The code object read by @p + * code_object_reader is not compatible with the agent (for example, the agent + * does not support the instruction set architecture of the code object), the + * executable (for example, there is a default floating-point mode mismatch + * between the two), or the implementation. + */ +hsa_status_t HSA_API hsa_executable_load_agent_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_reader_t code_object_reader, + const char *options, + hsa_loaded_code_object_t *loaded_code_object); + +/** + * @brief Freeze the executable. + * + * @details No modifications to executable can be made after freezing: no code + * objects can be loaded to the executable, and no external variables can be + * defined. Freezing the executable does not prevent querying the executable's + * attributes. The application must define all the external variables in an + * executable before freezing it. + * + * @param[in] executable Executable. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext__" prefix are reserved for extensions. A + * vendor-specific option begins with the "-_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_UNDEFINED One or more variables are + * undefined in the executable. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is already frozen. 
+ */ +hsa_status_t HSA_API hsa_executable_freeze( + hsa_executable_t executable, + const char *options); + +/** + * @brief Executable attributes. + */ +typedef enum { + /** + * Profile this executable is created for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_EXECUTABLE_INFO_PROFILE = 1, + /** + * Executable state. The type of this attribute is ::hsa_executable_state_t. + */ + HSA_EXECUTABLE_INFO_STATE = 2, + /** + * Default floating-point rounding mode specified when executable was created. + * The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXECUTABLE_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 3 +} hsa_executable_info_t; + +/** + * @brief Get the current value of an attribute for a given executable. + * + * @param[in] executable Executable. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_info( + hsa_executable_t executable, + hsa_executable_info_t attribute, + void *value); + +/** + * @brief Define an external global variable with program allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with program allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. 
+ * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * be in global memory and can be read and written by any agent in the + * system. The application cannot deallocate the buffer pointed by @p address + * before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_global_variable_define( + hsa_executable_t executable, + const char *variable_name, + void *address); + +/** + * @brief Define an external global variable with agent allocation. + * + * @details This function allows the application to provide the definition + * of a variable in the global segment memory with agent allocation. The + * variable must be defined before loading a code object into an executable. + * In addition, code objects loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. 
The Programmer's Reference + * Manual describes the standard name mangling scheme. + * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a global region + * that is only visible to @p agent. The application cannot deallocate the + * buffer pointed by @p address before @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_agent_global_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Define an external readonly variable. + * + * @details This function allows the application to provide the definition + * of a variable in the readonly segment memory. The variable must be defined + * before loading a code object into an executable. In addition, code objects + * loaded must not define the variable. + * + * @param[in] executable Executable. Must not be in frozen state. + * + * @param[in] agent Agent for which the variable is being defined. + * + * @param[in] variable_name Name of the variable. The Programmer's Reference + * Manual describes the standard name mangling scheme. 
+ * + * @param[in] address Address where the variable is defined. This address must + * have been previously allocated using ::hsa_memory_allocate in a readonly + * region associated with @p agent. The application cannot deallocate the buffer + * pointed by @p address before @p executable is destroyed. + * + * @param[in] address Address where the variable is defined. The buffer pointed + * by @p address is owned by the application, and cannot be deallocated before + * @p executable is destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE Executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT @p agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_VARIABLE_ALREADY_DEFINED The variable is + * already defined. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no variable with the + * @p variable_name. + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p variable_name is NULL. + */ +hsa_status_t HSA_API hsa_executable_readonly_variable_define( + hsa_executable_t executable, + hsa_agent_t agent, + const char *variable_name, + void *address); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). Invoking this function is equivalent to invoking + * ::hsa_executable_validate_alt with no options. + * + * @param[in] executable Executable. Must be in frozen state. 
+ * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate( + hsa_executable_t executable, + uint32_t *result); + +/** + * @brief Validate an executable. Checks that all code objects have matching + * machine model, profile, and default floating-point rounding mode. Checks that + * all declarations have definitions. Checks declaration-definition + * compatibility (see the HSA Programming Reference Manual for compatibility + * rules). + * + * @param[in] executable Executable. Must be in frozen state. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] result Memory location where the HSA runtime stores the + * validation result. If the executable passes validation, the result is 0. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE @p executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p result is NULL. + */ +hsa_status_t HSA_API hsa_executable_validate_alt( + hsa_executable_t executable, + const char *options, + uint32_t *result); + +/** + * @brief Executable symbol handle.
+ * + * The lifetime of an executable object symbol matches that of the executable + * associated with it. An operation on a symbol whose associated executable has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_executable_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_executable_symbol_t; + +/** + * @deprecated Use ::hsa_executable_get_symbol_by_name instead. + * + * @brief Get the symbol handle for a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[in] agent Agent associated with the symbol. If the symbol is + * independent of any agent (for example, a variable with program + * allocation), this argument is ignored. + * + * @param[in] call_convention Call convention associated with the symbol. If the + * symbol does not correspond to an indirect function, this argument is ignored. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL.
+ */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_get_symbol( + hsa_executable_t executable, + const char *module_name, + const char *symbol_name, + hsa_agent_t agent, + int32_t call_convention, + hsa_executable_symbol_t *symbol); + +/** + * @brief Retrieve the symbol handle corresponding to a given symbol name. + * + * @param[in] executable Executable. + * + * @param[in] symbol_name Symbol name. Must be a NUL-terminated character + * array. The Programmer's Reference Manual describes the standard name mangling + * scheme. + * + * @param[in] agent Pointer to the agent for which the symbol with the given + * name is defined. If the symbol corresponding to the given name has program + * allocation, @p agent must be NULL. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or @p + * symbol is NULL. + */ +hsa_status_t HSA_API hsa_executable_get_symbol_by_name( + hsa_executable_t executable, + const char *symbol_name, + const hsa_agent_t *agent, + hsa_executable_symbol_t *symbol); + +/** + * @brief Symbol type. + */ +typedef enum { + /** + * Variable. + */ + HSA_SYMBOL_KIND_VARIABLE = 0, + /** + * Kernel. + */ + HSA_SYMBOL_KIND_KERNEL = 1, + /** + * Indirect function. + */ + HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2 +} hsa_symbol_kind_t; + +/** + * @brief Linkage type of a symbol. + */ +typedef enum { + /** + * Module linkage. + */ + HSA_SYMBOL_LINKAGE_MODULE = 0, + /** + * Program linkage.
+ */ + HSA_SYMBOL_LINKAGE_PROGRAM = 1 +} hsa_symbol_linkage_t; + +/** + * @brief Allocation type of a variable. + */ +typedef enum { + /** + * Agent allocation. + */ + HSA_VARIABLE_ALLOCATION_AGENT = 0, + /** + * Program allocation. + */ + HSA_VARIABLE_ALLOCATION_PROGRAM = 1 +} hsa_variable_allocation_t; + +/** + * @brief Memory segment associated with a variable. + */ +typedef enum { + /** + * Global memory segment. + */ + HSA_VARIABLE_SEGMENT_GLOBAL = 0, + /** + * Readonly memory segment. + */ + HSA_VARIABLE_SEGMENT_READONLY = 1 +} hsa_variable_segment_t; + +/** + * @brief Executable symbol attributes. + */ +typedef enum { + /** + * The kind of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2, + /** + * @deprecated + * + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * @deprecated + * + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_EXECUTABLE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * @deprecated + * + * Agent associated with this symbol. 
If the symbol is a variable, the + * value of this attribute is only defined if + * ::HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION is + * ::HSA_VARIABLE_ALLOCATION_AGENT. The type of this attribute is hsa_agent_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_AGENT = 20, + /** + * The address of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint64_t. + * + * If executable's state is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 is + * returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * @deprecated + * + * The allocation kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * @deprecated + * + * The segment kind of the variable. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * @deprecated + * + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * @deprecated + * + * Size of the variable. The value of this attribute is undefined if + * the symbol is not a variable. The type of this attribute is uint32_t. 
+ * + * A value of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * @deprecated + * + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Kernel object handle, used in the kernel dispatch packet. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint64_t. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. 
+ */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is + * true, the kernel may use more private memory than the reported value, and + * the application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * @deprecated + * + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Indirect function object handle. The value of this attribute is undefined + * if the symbol is not an indirect function, or the associated agent does + * not support the Full Profile. The type of this attribute depends on the + * machine model: the type is uint32_t for small machine model, and uint64_t + * for large model. + * + * If the state of the executable is ::HSA_EXECUTABLE_STATE_UNFROZEN, then 0 + * is returned. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_OBJECT = 23, + /** + * @deprecated + * + * Call convention of the indirect function. 
The value of this attribute is + * undefined if the symbol is not an indirect function, or the associated + * agent does not support the Full Profile. The type of this attribute is + * uint32_t. + */ + HSA_EXECUTABLE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_executable_symbol_info_t; + +/** + * @brief Get the current value of an attribute for a given executable symbol. + * + * @param[in] executable_symbol Executable symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE_SYMBOL The executable symbol is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * executable symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API hsa_executable_symbol_get_info( + hsa_executable_symbol_t executable_symbol, + hsa_executable_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in an executable, and invoke an + * application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL.
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_iterate_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the kernels, indirect functions, and agent allocation + * variables in an executable for a given agent, and invoke an application- + * defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] agent Agent. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes four arguments to the callback: the executable, the + * agent, a symbol, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_executable_iterate_agent_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL.
+ */ +hsa_status_t HSA_API hsa_executable_iterate_agent_symbols( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_agent_t agent, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** + * @brief Iterate over the program allocation variables in an executable, and + * invoke an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per executable symbol. The + * HSA runtime passes three arguments to the callback: the executable, a symbol, + * and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_executable_iterate_program_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_executable_iterate_program_symbols( + hsa_executable_t executable, + hsa_status_t (*callback)(hsa_executable_t exec, + hsa_executable_symbol_t symbol, + void *data), + void *data); + +/** @} */ + + +/** \defgroup code-object Code Objects (deprecated). + * @{ + */ + +/** + * @deprecated + * + * @brief Struct containing an opaque handle to a code object, which contains + * ISA for finalized kernels and indirect functions together with information + * about the global or readonly segment variables they reference. + */ +typedef struct hsa_code_object_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal.
+ */ + uint64_t handle; +} hsa_code_object_t; + +/** + * @deprecated + * + * @brief Application data handle that is passed to the serialization + * and deserialization functions. + */ +typedef struct hsa_callback_data_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_callback_data_t; + +/** + * @deprecated + * + * @brief Serialize a code object. Can be used for offline finalization, + * install-time finalization, disk code caching, etc. + * + * @param[in] code_object Code object. + * + * @param[in] alloc_callback Callback function for memory allocation. Must not + * be NULL. The HSA runtime passes three arguments to the callback: the + * allocation size, the application data, and a pointer to a memory location + * where the application stores the allocation result. The HSA runtime invokes + * @p alloc_callback once to allocate a buffer that contains the serialized + * version of @p code_object. If the callback returns a status code other than + * ::HSA_STATUS_SUCCESS, this function returns the same code. + * + * @param[in] callback_data Application data that is passed to @p + * alloc_callback. May be NULL. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] serialized_code_object Memory location where the HSA runtime + * stores a pointer to the serialized code object. Must not be NULL. + * + * @param[out] serialized_code_object_size Memory location where the HSA runtime + * stores the size (in bytes) of @p serialized_code_object. The returned value + * matches the allocation size passed by the HSA runtime to @p + * alloc_callback. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully.
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p alloc_callback, @p + * serialized_code_object, or @p serialized_code_object_size are NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_serialize( + hsa_code_object_t code_object, + hsa_status_t (*alloc_callback)(size_t size, + hsa_callback_data_t data, + void **address), + hsa_callback_data_t callback_data, + const char *options, + void **serialized_code_object, + size_t *serialized_code_object_size); + +/** + * @deprecated + * + * @brief Deserialize a code object. + * + * @param[in] serialized_code_object A serialized code object. Must not be NULL. + * + * @param[in] serialized_code_object_size The size (in bytes) of @p + * serialized_code_object. Must not be 0. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @param[out] code_object Memory location where the HSA runtime stores the + * deserialized code object. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p serialized_code_object, or @p + * code_object are NULL, or @p serialized_code_object_size is 0.
+ */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_deserialize( + void *serialized_code_object, + size_t serialized_code_object_size, + const char *options, + hsa_code_object_t *code_object); + +/** + * @deprecated + * + * @brief Destroy a code object. + * + * @details The lifetime of a code object must exceed that of any executable + * where it has been loaded. If an executable that loaded @p code_object has not + * been destroyed, the behavior is undefined. + * + * @param[in] code_object Code object. The handle becomes invalid after it has + * been destroyed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_destroy( + hsa_code_object_t code_object); + +/** + * @deprecated + * + * @brief Code object type. + */ +typedef enum { + /** + * Produces code object that contains ISA for all kernels and indirect + * functions in HSA source. + */ + HSA_CODE_OBJECT_TYPE_PROGRAM = 0 +} hsa_code_object_type_t; + +/** + * @deprecated + * + * @brief Code object attributes. + */ +typedef enum { + /** + * The version of the code object. The type of this attribute is a + * NUL-terminated char[64]. The name must be at most 63 characters long (not + * including the NUL terminator) and all array elements not used for the name + * must be NUL. + */ + HSA_CODE_OBJECT_INFO_VERSION = 0, + /** + * Type of code object. The type of this attribute is + * ::hsa_code_object_type_t. + */ + HSA_CODE_OBJECT_INFO_TYPE = 1, + /** + * Instruction set architecture this code object is produced for. The type of + * this attribute is ::hsa_isa_t. + */ + HSA_CODE_OBJECT_INFO_ISA = 2, + /** + * Machine model this code object is produced for. The type of this attribute + * is ::hsa_machine_model_t. 
+ */ + HSA_CODE_OBJECT_INFO_MACHINE_MODEL = 3, + /** + * Profile this code object is produced for. The type of this attribute is + * ::hsa_profile_t. + */ + HSA_CODE_OBJECT_INFO_PROFILE = 4, + /** + * Default floating-point rounding mode used when the code object is + * produced. The type of this attribute is + * ::hsa_default_float_rounding_mode_t. + */ + HSA_CODE_OBJECT_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 5 +} hsa_code_object_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code object. + * + * @param[in] code_object Code object. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code object attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_info( + hsa_code_object_t code_object, + hsa_code_object_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Load code object into the executable. + * + * @details Every global or readonly variable that is external must be defined + * before loading the code object. An internal global or readonly variable is + * allocated once the code object, that is being loaded, references this + * variable and this variable is not allocated. + * + * Any module linkage declaration must have been defined either by a define + * variable or by loading a code object that has a symbol with module linkage + * definition. + * + * @param[in] executable Executable. 
+ * + * @param[in] agent Agent to load code object for. The agent must support the + * default floating-point rounding mode used by @p code_object. + * + * @param[in] code_object Code object to load. The lifetime of the code object + * must exceed that of the executable: if @p code_object is destroyed before @p + * executable, the behavior is undefined. + * + * @param[in] options Standard and vendor-specific options. Unknown options are + * ignored. A standard option begins with the "-hsa_" prefix. Options beginning + * with the "-hsa_ext_<extension_name>_" prefix are reserved for extensions. A + * vendor-specific option begins with the "-<vendor_name>_" prefix. Must be a + * NUL-terminated string. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p agent is not compatible + * with @p code_object (for example, @p agent does not support the default + * floating-point rounding mode specified by @p code_object), or @p code_object + * is not compatible with @p executable (for example, @p code_object and @p + * executable have different machine models or profiles). + * + * @retval ::HSA_STATUS_ERROR_FROZEN_EXECUTABLE @p executable is frozen. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_executable_load_code_object( + hsa_executable_t executable, + hsa_agent_t agent, + hsa_code_object_t code_object, + const char *options); + +/** + * @deprecated + * + * @brief Code object symbol handle.
+ * + * The lifetime of a code object symbol matches that of the code object + * associated with it. An operation on a symbol whose associated code object has + * been destroyed results in undefined behavior. + */ +typedef struct hsa_code_symbol_s { + /** + * Opaque handle. Two handles reference the same object of the enclosing type + * if and only if they are equal. + */ + uint64_t handle; +} hsa_code_symbol_t; + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol( + hsa_code_object_t code_object, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Get the symbol handle within a code object for a given symbol name. + * + * @param[in] code_object Code object. + * + * @param[in] module_name Module name. Must be NULL if the symbol has + * program linkage. + * + * @param[in] symbol_name Symbol name. + * + * @param[out] symbol Memory location where the HSA runtime stores the symbol + * handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid.
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_SYMBOL_NAME There is no symbol with a name + * that matches @p symbol_name. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p symbol_name is NULL, or + * @p symbol is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_get_symbol_from_name( + hsa_code_object_t code_object, + const char *module_name, + const char *symbol_name, + hsa_code_symbol_t *symbol); + +/** + * @deprecated + * + * @brief Code object symbol attributes. + */ +typedef enum { + /** + * The type of the symbol. The type of this attribute is ::hsa_symbol_kind_t. + */ + HSA_CODE_SYMBOL_INFO_TYPE = 0, + /** + * The length of the symbol name in bytes, not including the NUL terminator. + * The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_NAME_LENGTH = 1, + /** + * The name of the symbol. The type of this attribute is character array with + * the length equal to the value of ::HSA_CODE_SYMBOL_INFO_NAME_LENGTH + * attribute. + */ + HSA_CODE_SYMBOL_INFO_NAME = 2, + /** + * The length of the module name in bytes (not including the NUL terminator) + * to which this symbol belongs if this symbol has module linkage, otherwise 0 + * is returned. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH = 3, + /** + * The module name to which this symbol belongs if this symbol has module + * linkage, otherwise an empty string is returned. The type of this attribute + * is character array with the length equal to the value of + * ::HSA_CODE_SYMBOL_INFO_MODULE_NAME_LENGTH attribute. + */ + HSA_CODE_SYMBOL_INFO_MODULE_NAME = 4, + /** + * The linkage kind of the symbol. The type of this attribute is + * ::hsa_symbol_linkage_t. + */ + HSA_CODE_SYMBOL_INFO_LINKAGE = 5, + /** + * Indicates whether the symbol corresponds to a definition. The type of this + * attribute is bool. + */ + HSA_CODE_SYMBOL_INFO_IS_DEFINITION = 17, + /** + * The allocation kind of the variable. 
The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_allocation_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALLOCATION = 6, + /** + * The segment kind of the variable. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * ::hsa_variable_segment_t. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SEGMENT = 7, + /** + * Alignment of the symbol in memory. The value of this attribute is undefined + * if the symbol is not a variable. The type of this attribute is uint32_t. + * + * The current alignment of the variable in memory may be greater than the + * value specified in the source program variable declaration. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_ALIGNMENT = 8, + /** + * Size of the variable. The value of this attribute is undefined if the + * symbol is not a variable. The type of this attribute is uint32_t. + * + * A size of 0 is returned if the variable is an external variable and has an + * unknown dimension. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_SIZE = 9, + /** + * Indicates whether the variable is constant. The value of this attribute is + * undefined if the symbol is not a variable. The type of this attribute is + * bool. + */ + HSA_CODE_SYMBOL_INFO_VARIABLE_IS_CONST = 10, + /** + * Size of kernarg segment memory that is required to hold the values of the + * kernel arguments, in bytes. Must be a multiple of 16. The value of this + * attribute is undefined if the symbol is not a kernel. The type of this + * attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11, + /** + * Alignment (in bytes) of the buffer used to pass arguments to the kernel, + * which is the maximum of 16 and the maximum alignment of any of the kernel + * arguments. The value of this attribute is undefined if the symbol is not a + * kernel. The type of this attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT = 12, + /** + * Size of static group segment memory required by the kernel (per + * work-group), in bytes. The value of this attribute is undefined + * if the symbol is not a kernel. The type of this attribute is uint32_t. + * + * The reported amount does not include any dynamically allocated group + * segment memory that may be requested by the application when a kernel is + * dispatched. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13, + /** + * Size of static private, spill, and arg segment memory required by + * this kernel (per work-item), in bytes. The value of this attribute is + * undefined if the symbol is not a kernel. The type of this attribute is + * uint32_t. + * + * If the value of ::HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK is true, + * the kernel may use more private memory than the reported value, and the + * application must add the dynamic call stack usage to @a + * private_segment_size when populating a kernel dispatch packet. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14, + /** + * Dynamic callstack flag. The value of this attribute is undefined if the + * symbol is not a kernel. The type of this attribute is bool. + * + * If this flag is set (the value is true), the kernel uses a dynamically + * sized call stack. This can happen if recursive calls, calls to indirect + * functions, or the HSAIL alloca instruction are present in the kernel. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK = 15, + /** + * Call convention of the kernel. The value of this attribute is undefined if + * the symbol is not a kernel. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_CALL_CONVENTION = 18, + /** + * Call convention of the indirect function. The value of this attribute is + * undefined if the symbol is not an indirect function. The type of this + * attribute is uint32_t. 
+ */ + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 +} hsa_code_symbol_info_t; + +/** + * @deprecated + * + * @brief Get the current value of an attribute for a given code symbol. + * + * @param[in] code_symbol Code symbol. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_SYMBOL The code symbol is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * code symbol attribute, or @p value is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_symbol_get_info( + hsa_code_symbol_t code_symbol, + hsa_code_symbol_info_t attribute, + void *value); + +/** + * @deprecated + * + * @brief Iterate over the symbols in a code object, and invoke an + * application-defined callback on every iteration. + * + * @param[in] code_object Code object. + * + * @param[in] callback Callback to be invoked once per code object symbol. The + * HSA runtime passes three arguments to the callback: the code object, a + * symbol, and the application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_code_object_iterate_symbols returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT @p code_object is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API HSA_DEPRECATED hsa_code_object_iterate_symbols( + hsa_code_object_t code_object, + hsa_status_t (*callback)(hsa_code_object_t code_object, + hsa_code_symbol_t symbol, + void *data), + void *data); + +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif + +#endif // header guard diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_finalize.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_finalize.h new file mode 100644 index 0000000000..014e49bfe0 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_finalize.h @@ -0,0 +1,531 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. 
+// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ +#define HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_FINALIZER +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct BrigModuleHeader; +typedef struct BrigModuleHeader* BrigModule_t; + +/** \defgroup ext-alt-finalizer-extensions Finalization Extensions + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + */ +enum { + /** + * The HSAIL program is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_PROGRAM = 0x2000, + /** + * The HSAIL module is invalid. + */ + HSA_EXT_STATUS_ERROR_INVALID_MODULE = 0x2001, + /** + * Machine model or profile of the HSAIL module do not match the machine model + * or profile of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE = 0x2002, + /** + * The HSAIL module is already a part of the HSAIL program. + */ + HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED = 0x2003, + /** + * Compatibility mismatch between symbol declaration and symbol definition. 
+ */ + HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH = 0x2004, + /** + * The finalization encountered an error while finalizing a kernel or + * indirect function. + */ + HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED = 0x2005, + /** + * Mismatch between a directive in the control directive structure and in + * the HSAIL kernel. + */ + HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH = 0x2006 +}; + +/** @} */ + +/** \defgroup ext-alt-finalizer-program Finalization Program + * @{ + */ + +/** + * @brief HSAIL (BRIG) module. The HSA Programmer's Reference Manual contains + * the definition of the BrigModule_t type. + */ +typedef BrigModule_t hsa_ext_module_t; + +/** + * @brief An opaque handle to a HSAIL program, which groups a set of HSAIL + * modules that collectively define functions and variables used by kernels and + * indirect functions. + */ +typedef struct hsa_ext_program_s { + /** + * Opaque handle. + */ + uint64_t handle; +} hsa_ext_program_t; + +/** + * @brief Create an empty HSAIL program. + * + * @param[in] machine_model Machine model used in the HSAIL program. + * + * @param[in] profile Profile used in the HSAIL program. + * + * @param[in] default_float_rounding_mode Default float rounding mode used in + * the HSAIL program. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[out] program Memory location where the HSA runtime stores the newly + * created HSAIL program handle. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p machine_model is invalid, + * @p profile is invalid, @p default_float_rounding_mode is invalid, or + * @p program is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_program_create( + hsa_machine_model_t machine_model, + hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, + hsa_ext_program_t *program); + +/** + * @brief Destroy a HSAIL program. + * + * @details The HSAIL program handle becomes invalid after it has been + * destroyed. Code object handles produced by ::hsa_ext_program_finalize are + * still valid after the HSAIL program has been destroyed, and can be used as + * intended. Resources allocated outside and associated with the HSAIL program + * (such as HSAIL modules that are added to the HSAIL program) can be released + * after the HSAIL program has been destroyed. + * + * @param[in] program HSAIL program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + */ +hsa_status_t HSA_API hsa_ext_program_destroy( + hsa_ext_program_t program); + +/** + * @brief Add a HSAIL module to an existing HSAIL program. + * + * @details The HSA runtime does not perform a deep copy of the HSAIL module + * upon addition. Instead, it stores a pointer to the HSAIL module. The + * ownership of the HSAIL module belongs to the application, which must ensure + * that @p module is not released before destroying the HSAIL program. + * + * The HSAIL module is successfully added to the HSAIL program if @p module is + * valid, if all the declarations and definitions for the same symbol are + * compatible, and if @p module specifies a machine model and profile that match + * the HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] module HSAIL module. The application can add the same HSAIL module + * to @p program at most once. The HSAIL module must specify the same machine + * model and profile as @p program. 
If the floating-point rounding mode of @p + * module is not default, then it should match that of @p program. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_MODULE The HSAIL module is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_INCOMPATIBLE_MODULE The machine model of @p + * module does not match machine model of @p program, or the profile of @p + * module does not match profile of @p program. + * + * @retval ::HSA_EXT_STATUS_ERROR_MODULE_ALREADY_INCLUDED The HSAIL module is + * already a part of the HSAIL program. + * + * @retval ::HSA_EXT_STATUS_ERROR_SYMBOL_MISMATCH Symbol declaration and symbol + * definition compatibility mismatch. See the symbol compatibility rules in the + * HSA Programmer's Reference Manual. + */ +hsa_status_t HSA_API hsa_ext_program_add_module( + hsa_ext_program_t program, + hsa_ext_module_t module); + +/** + * @brief Iterate over the HSAIL modules in a program, and invoke an + * application-defined callback on every iteration. + * + * @param[in] program HSAIL program. + * + * @param[in] callback Callback to be invoked once per HSAIL module in the + * program. The HSA runtime passes three arguments to the callback: the program, + * a HSAIL module, and the application data. If @p callback returns a status + * other than ::HSA_STATUS_SUCCESS for a particular iteration, the traversal + * stops and ::hsa_ext_program_iterate_modules returns that status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. 
+ * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t HSA_API hsa_ext_program_iterate_modules( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, hsa_ext_module_t module, + void* data), + void* data); + +/** + * @brief HSAIL program attributes. + */ +typedef enum { + /** + * Machine model specified when the HSAIL program was created. The type + * of this attribute is ::hsa_machine_model_t. + */ + HSA_EXT_PROGRAM_INFO_MACHINE_MODEL = 0, + /** + * Profile specified when the HSAIL program was created. The type of + * this attribute is ::hsa_profile_t. + */ + HSA_EXT_PROGRAM_INFO_PROFILE = 1, + /** + * Default float rounding mode specified when the HSAIL program was + * created. The type of this attribute is ::hsa_default_float_rounding_mode_t. + */ + HSA_EXT_PROGRAM_INFO_DEFAULT_FLOAT_ROUNDING_MODE = 2 +} hsa_ext_program_info_t; + +/** + * @brief Get the current value of an attribute for a given HSAIL program. + * + * @param[in] program HSAIL program. + * + * @param[in] attribute Attribute to query. + * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behaviour is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * HSAIL program attribute, or @p value is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_program_get_info( + hsa_ext_program_t program, + hsa_ext_program_info_t attribute, + void *value); + +/** + * @brief Finalizer-determined call convention. + */ +typedef enum { + /** + * Finalizer-determined call convention. + */ + HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO = -1 +} hsa_ext_finalizer_call_convention_t; + +/** + * @brief Control directives specify low-level information about the + * finalization process. + */ +typedef struct hsa_ext_control_directives_s { + /** + * Bitset indicating which control directives are enabled. The bit assigned to + * a control directive is determined by the corresponding value in + * BrigControlDirective. + * + * If a control directive is disabled, its corresponding field value (if any) + * must be 0. Control directives that are only present or absent (such as + * partial workgroups) have no corresponding field as the presence of the bit + * in this mask is sufficient. + */ + uint64_t control_directives_mask; + /** + * Bitset of HSAIL exceptions that must have the BREAK policy enabled. The bit + * assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enablebreakexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t break_exceptions_mask; + /** + * Bitset of HSAIL exceptions that must have the DETECT policy enabled. The + * bit assigned to an HSAIL exception is determined by the corresponding value + * in BrigExceptionsMask. If the kernel contains an enabledetectexceptions + * control directive, the finalizer uses the union of the two masks. + */ + uint16_t detect_exceptions_mask; + /** + * Maximum size (in bytes) of dynamic group memory that will be allocated by + * the application for any dispatch of the kernel. If the kernel contains a + * maxdynamicsize control directive, the two values should match. 
+ */ + uint32_t max_dynamic_group_size; + /** + * Maximum number of grid work-items that will be used by the application to + * launch the kernel. If the kernel contains a maxflatgridsize control + * directive, the value of @a max_flat_grid_size must not be greater than the + * value of the directive, and takes precedence. + * + * The value specified for maximum absolute grid size must be greater than or + * equal to the product of the values specified by @a required_grid_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATGRIDSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint64_t max_flat_grid_size; + /** + * Maximum number of work-group work-items that will be used by the + * application to launch the kernel. If the kernel contains a + * maxflatworkgroupsize control directive, the value of @a + * max_flat_workgroup_size must not be greater than the value of the + * directive, and takes precedence. + * + * The value specified for maximum absolute work-group size must be greater than or + * equal to the product of the values specified by @a required_workgroup_size. + * + * If the bit at position BRIG_CONTROL_MAXFLATWORKGROUPSIZE is set in @a + * control_directives_mask, this field must be greater than 0. + */ + uint32_t max_flat_workgroup_size; + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + /** + * Grid size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredgridsize control directive, the + * dimensions should match. + * + * The specified grid size must be consistent with @a required_workgroup_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_grid_size. Note that the listed invariants must hold + * only if all the corresponding control directives are enabled. 
+ * + * If the bit at position BRIG_CONTROL_REQUIREDGRIDSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + uint64_t required_grid_size[3]; + /** + * Work-group size that will be used by the application in any dispatch of the + * kernel. If the kernel contains a requiredworkgroupsize control directive, + * the dimensions should match. + * + * The specified work-group size must be consistent with @a required_grid_size + * and @a required_dim. Also, the product of the three dimensions must not + * exceed @a max_flat_workgroup_size. Note that the listed invariants must + * hold only if all the corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDWORKGROUPSIZE is set in @a + * control_directives_mask, the three dimension values must be greater than 0. + */ + hsa_dim3_t required_workgroup_size; + /** + * Number of dimensions that will be used by the application to launch the + * kernel. If the kernel contains a requireddim control directive, the two + * values should match. + * + * The specified dimensions must be consistent with @a required_grid_size and + * @a required_workgroup_size. This invariant must hold only if all the + * corresponding control directives are enabled. + * + * If the bit at position BRIG_CONTROL_REQUIREDDIM is set in @a + * control_directives_mask, this field must be 1, 2, or 3. + */ + uint8_t required_dim; + /** + * Reserved. Must be 0. + */ + uint8_t reserved2[75]; +} hsa_ext_control_directives_t; + +/** + * @brief Finalize an HSAIL program for a given instruction set architecture. + * + * @details Finalize all of the kernels and indirect functions that belong to + * the same HSAIL program for a specific instruction set architecture (ISA). The + * transitive closure of all functions specified by call or scall must be + * defined. Kernels and indirect functions that are being finalized must be + * defined. 
Kernels and indirect functions that are referenced in kernels and + * indirect functions being finalized may or may not be defined, but must be + * declared. All the global/readonly segment variables that are referenced in + * kernels and indirect functions being finalized may or may not be defined, but + * must be declared. + * + * @param[in] program HSAIL program. + * + * @param[in] isa Instruction set architecture to finalize for. + * + * @param[in] call_convention A call convention used in a finalization. Must + * have a value between ::HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO (inclusive) + * and the value of the attribute ::HSA_ISA_INFO_CALL_CONVENTION_COUNT in @p + * isa (not inclusive). + * + * @param[in] control_directives Low-level control directives that influence + * the finalization process. + * + * @param[in] options Vendor-specific options. May be NULL. + * + * @param[in] code_object_type Type of code object to produce. + * + * @param[out] code_object Code object generated by the Finalizer, which + * contains the machine code for the kernels and indirect functions in the HSAIL + * program. The code object is independent of the HSAIL module that was used to + * generate it. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES There is a failure to allocate + * resources required for the operation. + * + * @retval ::HSA_EXT_STATUS_ERROR_INVALID_PROGRAM The HSAIL program is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ISA @p isa is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_DIRECTIVE_MISMATCH The directive in + * the control directive structure and in the HSAIL kernel mismatch, or if the + * same directive is used with a different value in one of the functions used by + * this kernel. 
+ * + * @retval ::HSA_EXT_STATUS_ERROR_FINALIZATION_FAILED The Finalizer + * encountered an error while compiling a kernel or an indirect function. + */ +hsa_status_t HSA_API hsa_ext_program_finalize( + hsa_ext_program_t program, + hsa_isa_t isa, + int32_t call_convention, + hsa_ext_control_directives_t control_directives, + const char *options, + hsa_code_object_type_t code_object_type, + hsa_code_object_t *code_object); + +/** @} */ + +#define hsa_ext_finalizer_1_00 + +typedef struct hsa_ext_finalizer_1_00_pfn_s { + hsa_status_t (*hsa_ext_program_create)( + hsa_machine_model_t machine_model, hsa_profile_t profile, + hsa_default_float_rounding_mode_t default_float_rounding_mode, + const char *options, hsa_ext_program_t *program); + + hsa_status_t (*hsa_ext_program_destroy)(hsa_ext_program_t program); + + hsa_status_t (*hsa_ext_program_add_module)(hsa_ext_program_t program, + hsa_ext_module_t module); + + hsa_status_t (*hsa_ext_program_iterate_modules)( + hsa_ext_program_t program, + hsa_status_t (*callback)(hsa_ext_program_t program, + hsa_ext_module_t module, void *data), + void *data); + + hsa_status_t (*hsa_ext_program_get_info)( + hsa_ext_program_t program, hsa_ext_program_info_t attribute, + void *value); + + hsa_status_t (*hsa_ext_program_finalize)( + hsa_ext_program_t program, hsa_isa_t isa, int32_t call_convention, + hsa_ext_control_directives_t control_directives, const char *options, + hsa_code_object_type_t code_object_type, hsa_code_object_t *code_object); +} hsa_ext_finalizer_1_00_pfn_t; + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // HSA_RUNTIME_INC_HSA_EXT_FINALIZE_H_ diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_image.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_image.h new file mode 100644 index 0000000000..b25f168395 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ext_image.h @@ -0,0 +1,1454 @@ 
+//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_EXT_IMAGE_H +#define HSA_EXT_IMAGE_H + +#include "hsa.h" + +#undef HSA_API +#ifdef HSA_EXPORT_IMAGES +#define HSA_API HSA_API_EXPORT +#else +#define HSA_API HSA_API_IMPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + +/** \defgroup ext-images Images and Samplers + * @{ + */ + +/** + * @brief Enumeration constants added to ::hsa_status_t by this extension. + * + * @remark Additions to hsa_status_t + */ +enum { + /** + * Image format is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED = 0x3000, + /** + * Image size is not supported. + */ + HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED = 0x3001, + /** + * Image pitch is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED = 0x3002, + /** + * Sampler descriptor is not supported or invalid. + */ + HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED = 0x3003 +}; + +/** + * @brief Enumeration constants added to ::hsa_agent_info_t by this + * extension. + * + * @remark Additions to hsa_agent_info_t + */ +enum { + /** + * Maximum number of elements in 1D images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1D_MAX_ELEMENTS = 0x3000, + /** + * Maximum number of elements in 1DA images. Must be at least 16384. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DA_MAX_ELEMENTS = 0x3001, + /** + * Maximum number of elements in 1DB images. Must be at least 65536. The type + * of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_1DB_MAX_ELEMENTS = 0x3002, + /** + * Maximum dimensions (width, height) of 2D images, in image elements. The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2D_MAX_ELEMENTS = 0x3003, + /** + * Maximum dimensions (width, height) of 2DA images, in image elements. 
The X + * and Y maximums must be at least 16384. The type of this attribute is + * size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DA_MAX_ELEMENTS = 0x3004, + /** + * Maximum dimensions (width, height) of 2DDEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DDEPTH_MAX_ELEMENTS = 0x3005, + /** + * Maximum dimensions (width, height) of 2DADEPTH images, in image + * elements. The X and Y maximums must be at least 16384. The type of this + * attribute is size_t[2]. + */ + HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS = 0x3006, + /** + * Maximum dimensions (width, height, depth) of 3D images, in image + * elements. The maximum along any dimension must be at least 2048. The type + * of this attribute is size_t[3]. + */ + HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS = 0x3007, + /** + * Maximum number of image layers in an image array. Must be at least 2048. The + * type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS = 0x3008, + /** + * Maximum number of read-only image handles that can be created for an agent at any one + * time. Must be at least 128. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES = 0x3009, + /** + * Maximum number of write-only and read-write image handles (combined) that + * can be created for an agent at any one time. Must be at least 64. The type of this + * attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES = 0x300A, + /** + * Maximum number of sampler handles that can be created for an agent at any one + * time. Must be at least 16. The type of this attribute is size_t. + */ + HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS = 0x300B, + /** + * Image pitch alignment. The agent only supports linear image data + * layouts with a row pitch that is a multiple of this value. Must be + * a power of 2. The type of this attribute is size_t.
+ */ + HSA_EXT_AGENT_INFO_IMAGE_LINEAR_ROW_PITCH_ALIGNMENT = 0x300C +}; + +/** + * @brief Image handle, populated by ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. Image + * handles are only unique within an agent, not across agents. + * + */ +typedef struct hsa_ext_image_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; + +} hsa_ext_image_t; + +/** + * @brief Geometry associated with the image. This specifies the + * number of image dimensions and whether the image is an image + * array. See the Image Geometry section in the HSA + * Programming Reference Manual for definitions on each + * geometry. The enumeration values match the BRIG type @p + * hsa_ext_brig_image_geometry_t. + */ +typedef enum { +/** + * One-dimensional image addressed by width coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1D = 0, + + /** + * Two-dimensional image addressed by width and height coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2D = 1, + + /** + * Three-dimensional image addressed by width, height, and depth coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_3D = 2, + + /** + * Array of one-dimensional images with the same size and format. 1D arrays + * are addressed by width and index coordinate. + */ + HSA_EXT_IMAGE_GEOMETRY_1DA = 3, + + /** + * Array of two-dimensional images with the same size and format. 2D arrays + * are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DA = 4, + + /** + * One-dimensional image addressed by width coordinate. It has + * specific restrictions compared to ::HSA_EXT_IMAGE_GEOMETRY_1D. An + * image with an opaque image data layout will always use a linear + * image data layout, and one with an explicit image data layout + * must specify ::HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR. + */ + HSA_EXT_IMAGE_GEOMETRY_1DB = 5, + + /** + * Two-dimensional depth image addressed by width and height coordinates. 
+ */ + HSA_EXT_IMAGE_GEOMETRY_2DDEPTH = 6, + + /** + * Array of two-dimensional depth images with the same size and format. 2D + * arrays are addressed by width, height, and index coordinates. + */ + HSA_EXT_IMAGE_GEOMETRY_2DADEPTH = 7 +} hsa_ext_image_geometry_t; + +/** + * @brief Channel type associated with the elements of an image. See + * the Channel Type section in the HSA Programming Reference + * Manual for definitions on each channel type. The + * enumeration values and definition match the BRIG type @p + * hsa_ext_brig_image_channel_type_t. + */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0, + HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9, + HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12, + HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13, + HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14, + HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15 +} hsa_ext_image_channel_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_type_t constants. + */ +typedef uint32_t hsa_ext_image_channel_type32_t; + +/** + * + * @brief Channel order associated with the elements of an image. See + * the Channel Order section in the HSA Programming Reference + * Manual for definitions on each channel order. The + * enumeration values match the BRIG type @p + * hsa_ext_brig_image_channel_order_t. 
+ */ +typedef enum { + HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0, + HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1, + HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2, + HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4, + HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7, + HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8, + HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9, + HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10, + HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13, + HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14, + HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15, + HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16, + HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18, + HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19 +} hsa_ext_image_channel_order_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_image_channel_order_t constants. + */ +typedef uint32_t hsa_ext_image_channel_order32_t; + + +/** + * @brief Image format. + */ +typedef struct hsa_ext_image_format_s { + /** + * Channel type. + */ + hsa_ext_image_channel_type32_t channel_type; + + /** + * Channel order. + */ + hsa_ext_image_channel_order32_t channel_order; +} hsa_ext_image_format_t; + +/** + * @brief Implementation independent image descriptor. + */ +typedef struct hsa_ext_image_descriptor_s { + /** + * Image geometry. + */ + hsa_ext_image_geometry_t geometry; + /** + * Width of the image, in components. + */ + size_t width; + /** + * Height of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_2D, ::HSA_EXT_IMAGE_GEOMETRY_3D, + * HSA_EXT_IMAGE_GEOMETRY_2DA, HSA_EXT_IMAGE_GEOMETRY_2DDEPTH, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t height; + /** + * Depth of the image, in components. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, otherwise must be 0. 
+ */ + size_t depth; + /** + * Number of image layers in the image array. Only used if the geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH, otherwise must be 0. + */ + size_t array_size; + /** + * Image format. + */ + hsa_ext_image_format_t format; +} hsa_ext_image_descriptor_t; + +/** + * @brief Image capability. + */ +typedef enum { + /** + * Images of this geometry, format, and layout are not supported by + * the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED = 0x0, + /** + * Read-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_ONLY = 0x1, + /** + * Write-only images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_WRITE_ONLY = 0x2, + /** + * Read-write images of this geometry, format, and layout are + * supported by the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_WRITE = 0x4, + /** + * @deprecated Images of this geometry, format, and layout can be accessed from + * read-modify-write atomic operations in the agent. + */ + HSA_EXT_IMAGE_CAPABILITY_READ_MODIFY_WRITE = 0x8, + /** + * Images of this geometry, format, and layout are guaranteed to + * have a consistent data layout regardless of how they are + * accessed by the associated agent. + */ + HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT = 0x10 +} hsa_ext_image_capability_t; + +/** + * @brief Image data layout. + * + * @details An image data layout denotes such aspects of image data + * layout as tiling and organization of channels in memory. Some image + * data layouts may only apply to specific image geometries, formats, + * and access permissions. Different agents may support different + * image layout identifiers, including vendor specific layouts. 
Note + * that an agent may not support the same image data layout for + * different access permissions to images with the same image + * geometry, size, and format. If multiple agents support the same + * image data layout then it is possible to use separate image handles + * for each agent that references the same image data. + */ + +typedef enum { + /** + * An implementation specific opaque image data layout which can + * vary depending on the agent, geometry, image format, image size, + * and access permissions. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE = 0x0, + /** + * The image data layout is specified by the following rules in + * ascending byte address order. For a 3D image, 2DA image array, + * or 1DA image array, the image data is stored as a linear sequence + * of adjacent 2D image slices, 2D images, or 1D images + * respectively, spaced according to the slice pitch. Each 2D image + * is stored as a linear sequence of adjacent image rows, spaced + * according to the row pitch. Each 1D or 1DB image is stored as a + * single image row. Each image row is stored as a linear sequence + * of image elements. Each image element is stored as a linear + * sequence of image components specified by the left to right + * channel order definition. Each image component is stored using + * the memory type specified by the channel type. + * + * The 1DB image geometry always uses the linear image data layout. + */ + HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR = 0x1 +} hsa_ext_image_data_layout_t; + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, and image format for an image created with an opaque image + * data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. 
+ * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, or @p capability_mask is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_get_capability( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + +/** + * @brief Retrieve the supported image capabilities for a given combination of + * agent, geometry, image format, and image layout for an image created with + * an explicit image data layout. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] geometry Geometry. + * + * @param[in] image_format Pointer to an image format. Must not be NULL. + * + * @param[in] image_data_layout The image data layout. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_get_capability instead. + * + * @param[out] capability_mask Pointer to a memory location where the HSA + * runtime stores a bit-mask of supported image capability + * (::hsa_ext_image_capability_t) values. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_format is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p capability_mask is NULL. 
+ */ +hsa_status_t HSA_API hsa_ext_image_get_capability_with_layout( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + +/** + * @brief Agent specific image size and alignment requirements, populated by + * ::hsa_ext_image_data_get_info and ::hsa_ext_image_data_get_info_with_layout. + */ +typedef struct hsa_ext_image_data_info_s { + /** + * Image data size, in bytes. + */ + size_t size; + + /** + * Image data alignment, in bytes. Must always be a power of 2. + */ + size_t alignment; + +} hsa_ext_image_data_info_t; + +/** + * @brief Retrieve the image data requirements for a given combination of agent, image + * descriptor, and access permission for an image created with an opaque image + * data layout. + * + * @details The optimal image data size and alignment requirements may + * vary depending on the image attributes specified in @p + * image_descriptor, the @p access_permission, and the @p agent. Also, + * different implementations of the HSA runtime may return different + * requirements for the same input values. + * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors as long + * as ::hsa_ext_image_get_capability reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by @p agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. 
The @p agent must support the image format + * specified in @p image_descriptor for the given @p + * access_permission. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The @p + * agent does not support the image format specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor with the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * access_permission is not a valid access permission value, or @p + * image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Retrieve the image data requirements for a given combination of + * image descriptor, access permission, image data layout, image data row pitch, + * and image data slice pitch for an image created with an explicit image + * data layout. + * + * @details The image data size and alignment requirements may vary + * depending on the image attributes specified in @p image_descriptor, + * the @p access_permission, and the image layout. However, different + * implementations of the HSA runtime will return the same + * requirements for the same input values. 
+ * + * The implementation must return the same image data requirements for + * different access permissions with matching image descriptors and + * matching image layouts as long as ::hsa_ext_image_get_capability + * reports + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT. Image + * descriptors match if they have the same values, with the exception + * that s-form channel orders match the corresponding non-s-form + * channel order and vice versa. Image layouts match if they are the + * same image data layout and use the same image row and slice pitch + * values. + * + * @param[in] agent Agent to be associated with the image handle. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] access_permission Access permission of the image when + * accessed by an agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. + * + * @param[in] image_data_layout The image data layout to use. + * It is invalid to use ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use + * ::hsa_ext_image_data_get_info instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. + * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data.
If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image_data_info Memory location where the runtime stores the + * size and alignment requirements. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The image + * format specified by @p image_descriptor is not supported for the + * @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The image + * dimensions specified by @p image_descriptor are not supported for + * the @p access_permission and @p image_data_layout specified. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The row and + * slice pitch specified by @p image_data_row_pitch and @p + * image_data_slice_pitch are invalid or not supported. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is + * NULL, @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image_data_info is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_data_get_info_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + +/** + * @brief Creates an agent specific image handle to an image with an + * opaque image data layout. 
+ * + * @details Images with an opaque image data layout created with + * different access permissions but matching image descriptors and + * same agent can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability for the image format specified in + * the image descriptor. Image descriptors match if they have the same + * values, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. + * + * If necessary, an application can use image operations (import, + * export, copy, clear) to prepare the image for the intended use + * regardless of the access permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by agent. The access permission defines how the agent + * is allowed to access the image using the image handle created and + * must match the corresponding HSAIL image handle type. The agent + * must support the image format specified in @p image_descriptor for + * the given @p access_permission. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent + * does not have the capability to support the image format contained + * in @p image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given + * @p access_permission). + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p access_permission is not a valid access permission + * value, or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + +/** + * @brief Creates an agent specific image handle to an image with an explicit + * image data layout. + * + * @details Images with an explicit image data layout created with + * different access permissions but matching image descriptors and + * matching image layout can share the same image data if + * ::HSA_EXT_IMAGE_CAPABILITY_ACCESS_INVARIANT_DATA_LAYOUT is reported + * by ::hsa_ext_image_get_capability_with_layout for the image format + * specified in the image descriptor and specified image data + * layout. Image descriptors match if they have the same values, with + * the exception that s-form channel orders match the corresponding + * non-s-form channel order and vice versa. Image layouts match if + * they are the same image data layout and use the same image row and + * slice pitch values.
+ * + * If necessary, an application can use image operations (import, export, copy, + * clear) to prepare the image for the intended use regardless of the access + * permissions. + * + * @param[in] agent agent to be associated with the image handle created. + * + * @param[in] image_descriptor Pointer to an image descriptor. Must not be NULL. + * + * @param[in] image_data Image data buffer that must have been allocated + * according to the size and alignment requirements dictated by + * ::hsa_ext_image_data_get_info_with_layout. Must not be NULL. + * + * Any previous memory contents are preserved upon creation. The application is + * responsible for ensuring that the lifetime of the image data exceeds that of + * all the associated images. + * + * @param[in] access_permission Access permission of the image when + * accessed by the agent. The access permission defines how the agent + * is allowed to access the image and must match the corresponding + * HSAIL image handle type. The agent must support the image format + * specified in @p image_descriptor for the given @p access_permission + * and @p image_data_layout. + * + * @param[in] image_data_layout The image data layout to use for the + * @p image_data. It is invalid to use + * ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE; use ::hsa_ext_image_create + * instead. + * + * @param[in] image_data_row_pitch The size in bytes for a single row + * of the image in the image data. If 0 is specified then the default + * row pitch value is used: image width * image element byte size. + * The value used must be greater than or equal to the default row + * pitch, and be a multiple of the image element byte size. For the + * linear image layout it must also be a multiple of the image linear + * row pitch alignment for the agents that will access the image data + * using image instructions. 
+ * + * @param[in] image_data_slice_pitch The size in bytes of a single + * slice of a 3D image, or the size in bytes of each image layer in an + * image array in the image data. If 0 is specified then the default + * slice pitch value is used: row pitch * height if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * ::HSA_EXT_IMAGE_GEOMETRY_2DADEPTH; row pitch if geometry is + * ::HSA_EXT_IMAGE_GEOMETRY_1DA; and 0 otherwise. The value used must + * be 0 if the default slice pitch is 0, be greater than or equal to + * the default slice pitch, and be a multiple of the row pitch. + * + * @param[out] image Pointer to a memory location where the HSA runtime stores + * the newly created image handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED The agent does + * not have the capability to support the image format contained in the image + * descriptor using the specified @p access_permission and @p image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_SIZE_UNSUPPORTED The agent + * does not support the image dimensions specified by @p + * image_descriptor using the specified @p access_permission and @p + * image_data_layout. + * + * @retval ::HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED The agent does + * not support the row and slice pitch specified by @p image_data_row_pitch + * and @p image_data_slice_pitch, or the values are invalid. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources (for example, the agent does not + * support the creation of more image handles with the given + * @p access_permission).
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p image_descriptor is NULL, @p + * image_data is NULL, @p image_data does not have a valid alignment, + * @p image_data_layout is ::HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE, + * or @p image is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_create_with_layout( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + +/** + * @brief Destroy an image handle previously created using ::hsa_ext_image_create or + * ::hsa_ext_image_create_with_layout. + * + * @details Destroying the image handle does not free the associated image data, + * or modify its contents. The application should not destroy an image handle while + * there are references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_image_destroy( + hsa_agent_t agent, + hsa_ext_image_t image); + +/** + * @brief Copies a portion of one image (the source) to another image (the + * destination). + * + * @details The source and destination image formats should be the + * same, with the exception that s-form channel orders match the + * corresponding non-s-form channel order and vice versa. For example, + * it is allowed to copy a source image with a channel order of + * HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB to a destination image with a + * channel order of HSA_EXT_IMAGE_CHANNEL_ORDER_RGB. 
+ * + * The source and destination images do not have to be of the same geometry and + * appropriate scaling is performed by the HSA runtime. It is possible to copy + * subregions between any combinations of source and destination geometries, provided + * that the dimensions of the subregions are the same. For example, it is + * allowed to copy a rectangular region from a 2D image to a slice of a 3D + * image. + * + * If the source and destination image data overlap, or the combination of + * offset and range references an out-of-bounds element in any of the images, + * the behavior is undefined. + * + * @param[in] agent Agent associated with both the source and destination image handles. + * + * @param[in] src_image Image handle of source image. The agent associated with the source + * image handle must be identical to that of the destination image. + * + * @param[in] src_offset Pointer to the offset within the source image where to + * copy the data from. Must not be NULL. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] dst_offset Pointer to the offset within the destination + * image where to copy the data. Must not be NULL. + * + * @param[in] range Dimensions of the image portion to be copied. The HSA + * runtime computes the size of the image data to be copied using this + * argument. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_offset is + * NULL, @p dst_offset is NULL, or @p range is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_copy( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + +/** + * @brief Image region.
+ */ +typedef struct hsa_ext_image_region_s { + /** + * Offset within an image (in coordinates). + */ + hsa_dim3_t offset; + + /** + * Dimension size of the image range (in coordinates). The x, y, and z dimensions + * correspond to width, height, and depth or index respectively. + */ + hsa_dim3_t range; +} hsa_ext_image_region_t; + +/** + * @brief Import a linearly organized image data from memory directly to an + * image handle. + * + * @details This operation updates the image data referenced by the image handle + * from the source memory. The size of the data imported from memory is + * implicitly derived from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the source memory or destination image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the import operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_memory Source memory. Must not be NULL. + * + * @param[in] src_row_pitch The size in bytes of a single row of the image in the + * source memory. If the value is smaller than the destination image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] src_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image layer in an image array in the source memory. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p src_row_pitch, then the value used for @p src_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p src_row_pitch * destination image region height, then the value used for + * @p src_row_pitch * destination image region height is used. 
+ * Otherwise, the value is not used. + * + * @param[in] dst_image Image handle of destination image. + * + * @param[in] image_region Pointer to the image region to be updated. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p src_memory is NULL, or @p + * image_region is NULL. + * + */ +hsa_status_t HSA_API hsa_ext_image_import( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Export the image data to linearly organized memory. + * + * @details The operation updates the destination memory with the image data of + * @p src_image. The size of the data exported to memory is implicitly derived + * from the image region. + * + * It is the application's responsibility to avoid out of bounds memory access. + * + * None of the destination memory or source image data memory can + * overlap. Overlapping of any of the source and destination image + * data memory within the export operation produces undefined results. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] src_image Image handle of source image. + * + * @param[in] dst_memory Destination memory. Must not be NULL. + * + * @param[in] dst_row_pitch The size in bytes of a single row of the image in the + * destination memory. If the value is smaller than the source image region + * width * image element byte size, then region width * image element byte + * size is used. + * + * @param[in] dst_slice_pitch The size in bytes of a single 2D slice of a 3D image, + * or the size in bytes of each image in an image array in the destination memory. 
+ * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_1DA and the value is smaller than the + * value used for @p dst_row_pitch, then the value used for @p dst_row_pitch is used. + * If the geometry is ::HSA_EXT_IMAGE_GEOMETRY_3D, ::HSA_EXT_IMAGE_GEOMETRY_2DA, or + * HSA_EXT_IMAGE_GEOMETRY_2DADEPTH and the value is smaller than the value used for + * @p dst_row_pitch * source image region height, then the value used for + * @p dst_row_pitch * source image region height is used. + * Otherwise, the value is not used. + * + * @param[in] image_region Pointer to the image region to be exported. Must not + * be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p dst_memory is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_export( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Clear a region of an image so that every image element has + * the specified value. + * + * @param[in] agent Agent associated with the image handle. + * + * @param[in] image Image handle for image to be cleared. + * + * @param[in] data The value to which to set each image element being + * cleared. It is specified as an array of image component values. The + * number of array elements must match the number of access components + * for the image channel order. The type of each array element must + * match the image access type of the image channel type. When the + * value is used to set the value of an image element, the conversion + * method corresponding to the image channel type is used. 
See the + * Channel Order section and Channel Type section in + * the HSA Programming Reference Manual for more + * information. Must not be NULL. + * + * @param[in] image_region Pointer to the image region to clear. Must not be + * NULL. If the region references an out-of-bounds element, the behavior is + * undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p data is NULL, or @p + * image_region is NULL. + */ +hsa_status_t HSA_API hsa_ext_image_clear( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + +/** + * @brief Sampler handle. Samplers are populated by + * ::hsa_ext_sampler_create. Sampler handles are only unique within an + * agent, not across agents. + */ +typedef struct hsa_ext_sampler_s { + /** + * Opaque handle. For a given agent, two handles reference the same object of + * the enclosing type if and only if they are equal. + */ + uint64_t handle; +} hsa_ext_sampler_t; + +/** + * @brief Sampler address modes. The sampler address mode describes + * the processing of out-of-range image coordinates. See the + * Addressing Mode section in the HSA Programming Reference + * Manual for definitions on each address mode. The values + * match the BRIG type @p hsa_ext_brig_sampler_addressing_t. + */ +typedef enum { + /** + * Out-of-range coordinates are not handled. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED = 0, + + /** + * Clamp out-of-range coordinates to the image edge. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE = 1, + + /** + * Clamp out-of-range coordinates to the image border color. 
+ */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER = 2, + + /** + * Wrap out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT = 3, + + /** + * Mirror out-of-range coordinates back into the valid coordinate + * range so the image appears as repeated tiles with every other + * tile a reflection. + */ + HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT = 4 + +} hsa_ext_sampler_addressing_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_addressing_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_addressing_mode32_t; + +/** + * @brief Sampler coordinate normalization modes. See the + * Coordinate Normalization Mode section in the HSA + * Programming Reference Manual for definitions on each + * coordinate normalization mode. The values match the BRIG type @p + * hsa_ext_brig_sampler_coord_normalization_t. + */ +typedef enum { + + /** + * Coordinates are used to directly address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED = 0, + + /** + * Coordinates are scaled by the image dimension size before being + * used to address an image element. + */ + HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED = 1 + +} hsa_ext_sampler_coordinate_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_coordinate_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_coordinate_mode32_t; + + +/** + * @brief Sampler filter modes. See the Filter Mode section + * in the HSA Programming Reference Manual for definitions + * on each filter mode. The enumeration values match the BRIG type @p + * hsa_ext_brig_sampler_filter_t. + */ +typedef enum { + /** + * Filter to the image element nearest (in Manhattan distance) to the + * specified coordinate. 
+ */ + HSA_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + + /** + * Filter to the image element calculated by combining the elements in a 2x2 + * square block or 2x2x2 cube block around the specified coordinate. The + * elements are combined using linear interpolation. + */ + HSA_EXT_SAMPLER_FILTER_MODE_LINEAR = 1 + +} hsa_ext_sampler_filter_mode_t; + +/** + * @brief A fixed-size type used to represent ::hsa_ext_sampler_filter_mode_t constants. + */ +typedef uint32_t hsa_ext_sampler_filter_mode32_t; + +/** + * @brief Implementation independent sampler descriptor. + */ +typedef struct hsa_ext_sampler_descriptor_s { + /** + * Sampler coordinate mode describes the normalization of image coordinates. + */ + hsa_ext_sampler_coordinate_mode32_t coordinate_mode; + + /** + * Sampler filter type describes the type of sampling performed. + */ + hsa_ext_sampler_filter_mode32_t filter_mode; + + /** + * Sampler address mode describes the processing of out-of-range image + * coordinates. + */ + hsa_ext_sampler_addressing_mode32_t address_mode; + +} hsa_ext_sampler_descriptor_t; + +/** + * @brief Create an agent specific sampler handle for a given agent + * independent sampler descriptor and agent. + * + * @param[in] agent Agent to be associated with the sampler handle created. + * + * @param[in] sampler_descriptor Pointer to a sampler descriptor. Must not be + * NULL. + * + * @param[out] sampler Memory location where the HSA runtime stores the newly + * created sampler handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * + * @retval ::HSA_EXT_STATUS_ERROR_SAMPLER_DESCRIPTOR_UNSUPPORTED The + * @p agent does not have the capability to support the properties + * specified by @p sampler_descriptor or it is invalid. 
+ * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to allocate + * the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p sampler_descriptor is NULL, or + * @p sampler is NULL. + */ +hsa_status_t HSA_API hsa_ext_sampler_create( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + +/** + * @brief Destroy a sampler handle previously created using ::hsa_ext_sampler_create. + * + * @details The sampler handle should not be destroyed while there are + * references to it queued for execution or currently being used in a + * kernel dispatch. + * + * @param[in] agent Agent associated with the sampler handle. + * + * @param[in] sampler Sampler handle to destroy. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + */ +hsa_status_t HSA_API hsa_ext_sampler_destroy( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + +#define hsa_ext_images_1_00 + +/** + * @brief The function pointer table for the images v1.00 extension. Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. 
+ */ +typedef struct hsa_ext_images_1_00_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + +} hsa_ext_images_1_00_pfn_t; + +#define hsa_ext_images_1 + +/** + * @brief The function pointer table for the images v1 extension. 
Can be returned by ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table. + */ +typedef struct hsa_ext_images_1_pfn_s { + + hsa_status_t (*hsa_ext_image_get_capability)( + hsa_agent_t agent, + hsa_ext_image_geometry_t geometry, + const hsa_ext_image_format_t *image_format, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_t *image); + + hsa_status_t (*hsa_ext_image_destroy)( + hsa_agent_t agent, + hsa_ext_image_t image); + + hsa_status_t (*hsa_ext_image_copy)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + const hsa_dim3_t* src_offset, + hsa_ext_image_t dst_image, + const hsa_dim3_t* dst_offset, + const hsa_dim3_t* range); + + hsa_status_t (*hsa_ext_image_import)( + hsa_agent_t agent, + const void *src_memory, + size_t src_row_pitch, + size_t src_slice_pitch, + hsa_ext_image_t dst_image, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_export)( + hsa_agent_t agent, + hsa_ext_image_t src_image, + void *dst_memory, + size_t dst_row_pitch, + size_t dst_slice_pitch, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_image_clear)( + hsa_agent_t agent, + hsa_ext_image_t image, + const void* data, + const hsa_ext_image_region_t *image_region); + + hsa_status_t (*hsa_ext_sampler_create)( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler); + + hsa_status_t (*hsa_ext_sampler_destroy)( + hsa_agent_t agent, + hsa_ext_sampler_t sampler); + + hsa_status_t (*hsa_ext_image_get_capability_with_layout)( + hsa_agent_t agent, + hsa_ext_image_geometry_t 
geometry, + const hsa_ext_image_format_t *image_format, + hsa_ext_image_data_layout_t image_data_layout, + uint32_t *capability_mask); + + hsa_status_t (*hsa_ext_image_data_get_info_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_data_info_t *image_data_info); + + hsa_status_t (*hsa_ext_image_create_with_layout)( + hsa_agent_t agent, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_access_permission_t access_permission, + hsa_ext_image_data_layout_t image_data_layout, + size_t image_data_row_pitch, + size_t image_data_slice_pitch, + hsa_ext_image_t *image); + +} hsa_ext_images_1_pfn_t; +/** @} */ + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ven_amd_loader.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ven_amd_loader.h new file mode 100644 index 0000000000..47236c86e9 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/hsa_ven_amd_loader.h @@ -0,0 +1,667 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +// HSA AMD extension for additional loader functionality. + +#ifndef HSA_VEN_AMD_LOADER_H +#define HSA_VEN_AMD_LOADER_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @brief Queries equivalent host address for given @p device_address, and + * records it in @p host_address. 
+ * + * + * @details Contents of memory pointed to by @p host_address would be identical + * to contents of memory pointed to by @p device_address. Only difference + * between the two is host accessibility: @p host_address is always accessible + * from host, @p device_address might not be accessible from host. + * + * If @p device_address already points to host accessible memory, then the value + * of @p device_address is simply copied into @p host_address. + * + * The lifetime of @p host_address is the same as the lifetime of @p + * device_address, and both lifetimes are limited by the lifetime of the + * executable that is managing these addresses. + * + * + * @param[in] device_address Device address to query equivalent host address + * for. + * + * @param[out] host_address Pointer to application-allocated buffer to record + * queried equivalent host address in. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p device_address is invalid or + * null, or @p host_address is null. + */ +hsa_status_t hsa_ven_amd_loader_query_host_address( + const void *device_address, + const void **host_address); + +/** + * @brief The storage type of the code object that is backing loaded memory + * segment. + */ +typedef enum { + /** + * Loaded memory segment is not backed by any code object (anonymous), as the + * case would be with BSS (uninitialized data). + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE = 0, + /** + * Loaded memory segment is backed by the code object that is stored in the + * file. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE = 1, + /** + * Loaded memory segment is backed by the code object that is stored in the + * memory. + */ + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY = 2 +} hsa_ven_amd_loader_code_object_storage_type_t; + +/** + * @brief Loaded memory segment descriptor. 
+ * + * + * @details Loaded memory segment descriptor describes underlying loaded memory + * segment. Loaded memory segment is created/allocated by the executable during + * the loading of the code object that is backing underlying memory segment. + * + * The lifetime of underlying memory segment is limited by the lifetime of the + * executable that is managing underlying memory segment. + */ +typedef struct hsa_ven_amd_loader_segment_descriptor_s { + /** + * Agent underlying memory segment is allocated on. If the code object that is + * backing underlying memory segment is program code object, then 0. + */ + hsa_agent_t agent; + /** + * Executable that is managing this underlying memory segment. + */ + hsa_executable_t executable; + /** + * Storage type of the code object that is backing underlying memory segment. + */ + hsa_ven_amd_loader_code_object_storage_type_t code_object_storage_type; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then null; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then null-terminated + * filepath to the code object; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then host + * accessible pointer to the first byte of the code object. + */ + const void *code_object_storage_base; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE, then the length of + * the filepath to the code object (including null-terminating character); + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY, then the size, in + * bytes, of the memory occupied by the code object. 
+ */ + size_t code_object_storage_size; + /** + * If the storage type of the code object that is backing underlying memory + * segment is: + * - HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE, then 0; + * - other, then offset, in bytes, from the beginning of the code object to + * the first byte in the code object data is copied from. + */ + size_t code_object_storage_offset; + /** + * Starting address of the underlying memory segment. + */ + const void *segment_base; + /** + * Size, in bytes, of the underlying memory segment. + */ + size_t segment_size; +} hsa_ven_amd_loader_segment_descriptor_t; + +/** + * @brief Either queries loaded memory segment descriptors, or total number of + * loaded memory segment descriptors. + * + * + * @details If @p segment_descriptors is not null and @p num_segment_descriptors + * points to number that exactly matches total number of loaded memory segment + * descriptors, then queries loaded memory segment descriptors, and records them + * in @p segment_descriptors. If @p segment_descriptors is null and @p + * num_segment_descriptors points to zero, then queries total number of loaded + * memory segment descriptors, and records it in @p num_segment_descriptors. In + * all other cases returns appropriate error code (see below). + * + * The caller of this function is responsible for the allocation/deallocation + * and the lifetime of @p segment_descriptors and @p num_segment_descriptors. + * + * The lifetime of loaded memory segments that are described by queried loaded + * memory segment descriptors is limited by the lifetime of the executable that + * is managing loaded memory segments. + * + * Queried loaded memory segment descriptors are always self-consistent: they + * describe a complete set of loaded memory segments that are being backed by + * fully loaded code objects that are present at the time (i.e. this function + * is blocked until all executable manipulations are fully complete). 
+ * + * + * @param[out] segment_descriptors Pointer to application-allocated buffer to + * record queried loaded memory segment descriptors in. Can be null if @p + * num_segment_descriptors points to zero. + * + * @param[in,out] num_segment_descriptors Pointer to application-allocated + * buffer that contains either total number of loaded memory segment descriptors + * or zero. + * + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p segment_descriptors is null + * while @p num_segment_descriptors points to non-zero number, @p + * segment_descriptors is not null while @p num_segment_descriptors points to + * zero, or @p num_segment_descriptors is null. + * + * @retval HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS @p num_segment_descriptors + * does not point to number that exactly matches total number of loaded memory + * segment descriptors. + */ +hsa_status_t hsa_ven_amd_loader_query_segment_descriptors( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + +/** + * @brief Obtains the handle of executable to which the device address belongs. + * + * @details This method should not be used to obtain executable handle by using + * a host address. The executable returned is expected to be alive until it is + * destroyed by the user. + * + * @retval HSA_STATUS_SUCCESS Function is executed successfully. + * + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED Runtime is not initialized. + * + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT The input is invalid or there + * is no executable found for this kernel code object. 
+ */ +hsa_status_t hsa_ven_amd_loader_query_executable( + const void *device_address, + hsa_executable_t *executable); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the loaded code objects in an executable, and invoke + * an application-defined callback on every iteration. + * + * @param[in] executable Executable. + * + * @param[in] callback Callback to be invoked once per loaded code object. The + * HSA runtime passes three arguments to the callback: the executable, a + * loaded code object, and the application data. If @p callback returns a + * status other than ::HSA_STATUS_SUCCESS for a particular iteration, the + * traversal stops and + * ::hsa_ven_amd_loader_executable_iterate_loaded_code_objects returns that + * status value. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_EXECUTABLE The executable is invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. + */ +hsa_status_t hsa_ven_amd_loader_executable_iterate_loaded_code_objects( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + +/** + * @brief Loaded code object kind. + */ +typedef enum { + /** + * Program code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_PROGRAM = 1, + /** + * Agent code object. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT = 2 +} hsa_ven_amd_loader_loaded_code_object_kind_t; + +/** + * @brief Loaded code object attributes. + */ +typedef enum hsa_ven_amd_loader_loaded_code_object_info_e { + /** + * The executable in which this loaded code object is loaded. 
The + * type of this attribute is ::hsa_executable_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_EXECUTABLE = 1, + /** + * The kind of this loaded code object. The type of this attribute is + * ::uint32_t interpreted as ::hsa_ven_amd_loader_loaded_code_object_kind_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND = 2, + /** + * The agent on which this loaded code object is loaded. The + * value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_KIND is + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_KIND_AGENT. The type of this + * attribute is ::hsa_agent_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT = 3, + /** + * The storage type of the code object reader used to load the loaded code object. + * The type of this attribute is ::uint32_t interpreted as a + * ::hsa_ven_amd_loader_code_object_storage_type_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE = 4, + /** + * The memory address of the first byte of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE = 5, + /** + * The memory size in bytes of the code object that was loaded. + * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY. The type of this + * attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE = 6, + /** + * The file descriptor of the code object that was loaded. 
+ * The value of this attribute is only defined if + * ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE is + * ::HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE. The type of this + * attribute is ::int. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE = 7, + /** + * The signed byte address difference of the memory address at which the code + * object is loaded minus the virtual address specified in the code object + * that is loaded. The value of this attribute is only defined if the + * executable in which the code object is loaded is frozen. The type of this + * attribute is ::int64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA = 8, + /** + * The base memory address at which the code object is loaded. This is the + * base address of the allocation for the lowest addressed segment of the code + * object that is loaded. Note that any non-loaded segments before the first + * loaded segment are ignored. The value of this attribute is only defined if + * the executable in which the code object is loaded is frozen. The type of + * this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE = 9, + /** + * The byte size of the loaded code object's contiguous memory allocation. The + * value of this attribute is only defined if the executable in which the code + * object is loaded is frozen. The type of this attribute is ::uint64_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE = 10, + /** + * The length of the URI in bytes, not including the NUL terminator. The type + * of this attribute is uint32_t. + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH = 11, + /** + * The URI name from which the code object was loaded. The type of this + * attribute is a NUL terminated \p char* with the length equal to the value + * of ::HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH attribute. 
+ * The URI name syntax is defined by the following BNF syntax: + * + * code_object_uri ::== file_uri | memory_uri + * file_uri ::== "file://" file_path [ range_specifier ] + * memory_uri ::== "memory://" process_id range_specifier + * range_specifier ::== [ "#" | "?" ] "offset=" number "&" "size=" number + * file_path ::== URI_ENCODED_OS_FILE_PATH + * process_id ::== DECIMAL_NUMBER + * number ::== HEX_NUMBER | DECIMAL_NUMBER | OCTAL_NUMBER + * + * ``number`` is a C integral literal where hexadecimal values are prefixed by + * "0x" or "0X", and octal values by "0". + * + * ``file_path`` is the file's path specified as a URI encoded UTF-8 string. + * In URI encoding, every character that is not in the regular expression + * ``[a-zA-Z0-9/_.~-]`` is encoded as two uppercase hexadecimal digits + * preceded by "%". Directories in the path are separated by "/". + * + * ``offset`` is a 0-based byte offset to the start of the code object. For a + * file URI, it is from the start of the file specified by the ``file_path``, + * and if omitted defaults to 0. For a memory URI, it is the memory address + * and is required. + * + * ``size`` is the number of bytes in the code object. For a file URI, if + * omitted it defaults to the size of the file. It is required for a memory + * URI. + * + * ``process_id`` is the identity of the process owning the memory. For Linux + * it is the C unsigned integral decimal literal for the process ID (PID). + * + * For example: + * + * file:///dir1/dir2/file1 + * file:///dir3/dir4/file2#offset=0x2000&size=3000 + * memory://1234#offset=0x20000&size=3000 + */ + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI = 12, +} hsa_ven_amd_loader_loaded_code_object_info_t; + +/** + * @brief Get the current value of an attribute for a given loaded code + * object. + * + * @param[in] loaded_code_object Loaded code object. + * + * @param[in] attribute Attribute to query. 
+ * + * @param[out] value Pointer to an application-allocated buffer where to store + * the value of the attribute. If the buffer passed by the application is not + * large enough to hold the value of @p attribute, the behavior is undefined. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The loaded code object is + * invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p attribute is an invalid + * loaded code object attribute, or @p value is NULL. + */ +hsa_status_t hsa_ven_amd_loader_loaded_code_object_get_info( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + +//===----------------------------------------------------------------------===// + +/** + * @brief Create a code object reader to operate on a file with size and offset. + * + * @param[in] file File descriptor. The file must have been opened by the + * application with at least read permissions prior to calling this function. The + * file must contain a vendor-specific code object. + * + * The file is owned and managed by the application; the lifetime of the file + * descriptor must exceed that of any associated code object reader. + * + * @param[in] size Size of the code object embedded in @p file. + * + * @param[in] offset 0-based offset relative to the beginning of the @p file + * that denotes the beginning of the code object embedded within the @p file. + * + * @param[out] code_object_reader Memory location to store the newly created + * code object reader handle. Must not be NULL. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. 
+ * + * @retval ::HSA_STATUS_ERROR_INVALID_FILE @p file is not opened with at least + * read permissions. This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT_READER by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT The bytes starting at offset + * do not form a valid code object, the file size is 0, or offset > file size. + * This condition may also be reported as + * ::HSA_STATUS_ERROR_INVALID_CODE_OBJECT by the + * ::hsa_executable_load_agent_code_object function. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES The HSA runtime failed to + * allocate the required resources. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p code_object_reader is NULL. + */ +hsa_status_t +hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + +//===----------------------------------------------------------------------===// + +/** + * @brief Iterate over the available executables, and invoke an + * application-defined callback on every iteration. While + * ::hsa_ven_amd_loader_iterate_executables is executing any calls to + * ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy will be blocked. + * + * @param[in] callback Callback to be invoked once per executable. The HSA + * runtime passes two arguments to the callback: the executable and the + * application data. If @p callback returns a status other than + * ::HSA_STATUS_SUCCESS for a particular iteration, the traversal stops and + * ::hsa_ven_amd_loader_iterate_executables returns that status value. If + * @p callback invokes ::hsa_executable_create, ::hsa_executable_create_alt, or + * ::hsa_executable_destroy then the behavior is undefined. + * + * @param[in] data Application data that is passed to @p callback on every + * iteration. May be NULL. 
+ * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p callback is NULL. +*/ +hsa_status_t +hsa_ven_amd_loader_iterate_executables( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); + +//===----------------------------------------------------------------------===// + +/** + * @brief Extension version. + */ +#define hsa_ven_amd_loader 001003 + +/** + * @brief Extension function table version 1.00. + */ +typedef struct hsa_ven_amd_loader_1_00_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); +} hsa_ven_amd_loader_1_00_pfn_t; + +/** + * @brief Extension function table version 1.01. 
+ */ +typedef struct hsa_ven_amd_loader_1_01_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); +} hsa_ven_amd_loader_1_01_pfn_t; + +/** + * @brief Extension function table version 1.02. + */ +typedef struct hsa_ven_amd_loader_1_02_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t 
offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); +} hsa_ven_amd_loader_1_02_pfn_t; + +/** + * @brief Extension function table version 1.03. + */ +typedef struct hsa_ven_amd_loader_1_03_pfn_s { + hsa_status_t (*hsa_ven_amd_loader_query_host_address)( + const void *device_address, + const void **host_address); + + hsa_status_t (*hsa_ven_amd_loader_query_segment_descriptors)( + hsa_ven_amd_loader_segment_descriptor_t *segment_descriptors, + size_t *num_segment_descriptors); + + hsa_status_t (*hsa_ven_amd_loader_query_executable)( + const void *device_address, + hsa_executable_t *executable); + + hsa_status_t (*hsa_ven_amd_loader_executable_iterate_loaded_code_objects)( + hsa_executable_t executable, + hsa_status_t (*callback)( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void *data), + void *data); + + hsa_status_t (*hsa_ven_amd_loader_loaded_code_object_get_info)( + hsa_loaded_code_object_t loaded_code_object, + hsa_ven_amd_loader_loaded_code_object_info_t attribute, + void *value); + + hsa_status_t + (*hsa_ven_amd_loader_code_object_reader_create_from_file_with_offset_size)( + hsa_file_t file, + size_t offset, + size_t size, + hsa_code_object_reader_t *code_object_reader); + + hsa_status_t + (*hsa_ven_amd_loader_iterate_executables)( + hsa_status_t (*callback)( + hsa_executable_t executable, + void *data), + void *data); +} hsa_ven_amd_loader_1_03_pfn_t; + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* HSA_VEN_AMD_LOADER_H */ diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/public/amdid.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/public/amdid.h new file mode 100644 index 0000000000..c77b918b17 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/public/amdid.h @@ -0,0 +1,30 @@ +/*****************************************************************************\ +* +* *** NEED TO REPLACE THIS WITH OPEN SOURCE HEADER *** +* +* +* Copyright (c) 1999-2004 ATI Technologies 
Inc. (unpublished) +* +* +\*****************************************************************************/ + +#ifndef _AMDID_H +#define _AMDID_H + +#define FAMILY_UNKNOWN 0 + +#define FAMILY_SI 110 // Southern Islands: Tahiti, Pitcairn, Cape Verde + +#define FAMILY_TN 105 // Fusion Trinity: Devastator, Scrapper + +#define FAMILY_CI 120 // Sea Islands: Hawaii, Bonaire + +#define FAMILY_KV 125 // Fusion Kaveri: Spectre, Spooky; Fusion Kabini: Kalindi + +#define FAMILY_VI 130 // Volcanic Islands: Iceland, Tonga + +#define FAMILY_CZ 135 // Carrizo + +#define AMD_VENDOR_ID 0x1002 // used for GPUs -- AMD also has 1022 for CPU HW + +#endif // _AMDID_H diff --git a/shared/amdgpu-windows-interop/sc/HSAIL/include/public/cz_id.h b/shared/amdgpu-windows-interop/sc/HSAIL/include/public/cz_id.h new file mode 100644 index 0000000000..9ec36585a3 --- /dev/null +++ b/shared/amdgpu-windows-interop/sc/HSAIL/include/public/cz_id.h @@ -0,0 +1,46 @@ +/*****************************************************************************\ +* +* *** NEED TO REPLACE THIS WITH OPEN SOURCE HEADER *** +* +* +* Copyright (c) 2013 Advanced Micro Devices, Inc. 
+* +* +\*****************************************************************************/ + +#ifndef _CZ_ID_H +#define _CZ_ID_H + +enum { + CARRIZO_A1 = 0x02, + CARRIZO_BRISTOL_A0 = 0x10, + STONEY_A0 = 0x61, + CZ_UNKNOWN = 0xFF +}; + + +// for the original CARRIZO + other CARRIZO kickers +#define ASICREV_IS_CARRIZO(eChipRev) ((eChipRev) == CARRIZO_A1) + +// +// Carrizo device IDs +// +#define DEVICE_ID_CZ_9874 0x9874 + +// CARRIZO ASIC internal revision number +#define INTERNAL_REV_CARRIZO_A1 0x01 + +// CARRIZO PCI Revision IDs +#define PRID_CZ_C4 0xC4 // Client B10 +#define PRID_CZ_C5 0xC5 // Client B8 +#define PRID_CZ_C6 0xC6 // Client B6 +#define PRID_CZ_C7 0xC7 // Client B4 + +// eCARRIZO PCI Revision IDs +#define PRID_eCZ_84 0x84 // eCZ B10 +#define PRID_eCZ_81 0x81 // eCZ B8/iTemp +#define PRID_eCZ_85 0x85 // eCZ B8 +#define PRID_eCZ_87 0x87 // eCZ B4 + +#endif // _CZ_ID_H +