diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml index 8e0122885a..5ac783f6c9 100644 --- a/.github/workflows/therock-ci-linux.yml +++ b/.github/workflows/therock-ci-linux.yml @@ -38,7 +38,7 @@ jobs: with: repository: "ROCm/TheRock" path: "TheRock" - ref: 6fab5d65a552483bcfa1f6ccaaabf699c8188c1e # 2025-11-06 commit + ref: eb8f187ff47eb6af9cd5aaa0b8d9a04b06b12796 # 2025-11-15 commit - name: Install python deps run: | @@ -66,7 +66,6 @@ jobs: run: | # Remove patches here if they cannot be applied cleanly, and they have not been deleted from TheRock repo # rm ./TheRock/patches/amd-mainline/rocm-systems/*.patch - rm ./TheRock/patches/amd-mainline/rocm-systems/0008-Find-bundled-libelf.patch ./TheRock/build_tools/fetch_sources.py --jobs 12 --no-include-rocm-systems --no-include-rocm-libraries --no-include-ml-frameworks @@ -110,7 +109,7 @@ jobs: uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: aws-region: us-east-2 - role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external + role-to-assume: arn:aws:iam::692859939525:role/therock-ci-external - name: Post Build Upload if: always() diff --git a/.github/workflows/therock-ci-windows.yml b/.github/workflows/therock-ci-windows.yml index 810d02ea0b..6110c95869 100644 --- a/.github/workflows/therock-ci-windows.yml +++ b/.github/workflows/therock-ci-windows.yml @@ -39,7 +39,7 @@ jobs: with: repository: "ROCm/TheRock" path: "TheRock" - ref: 6fab5d65a552483bcfa1f6ccaaabf699c8188c1e # 2025-11-06 commit + ref: eb8f187ff47eb6af9cd5aaa0b8d9a04b06b12796 # 2025-11-15 commit - name: Set up Python uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 @@ -54,7 +54,6 @@ jobs: run: | # Remove patches here if they cannot be applied cleanly, and they have not been deleted from TheRock repo # rm ./TheRock/patches/amd-mainline/rocm-systems/*.patch - rm 
./TheRock/patches/amd-mainline/rocm-systems/0008-Find-bundled-libelf.patch git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-systems/*.patch - name: Install requirements @@ -72,10 +71,6 @@ jobs: with: version: '3.62.0' - - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 - with: - version: '3.62.0' - # After other installs, so MSVC get priority in the PATH. - name: Configure MSVC uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 @@ -138,7 +133,7 @@ jobs: uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: aws-region: us-east-2 - role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external + role-to-assume: arn:aws:iam::692859939525:role/therock-ci-external special-characters-workaround: true - name: Post Build Upload diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml index ad904a77c5..38b27b07b9 100644 --- a/.github/workflows/therock-test-packages.yml +++ b/.github/workflows/therock-test-packages.yml @@ -92,7 +92,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: repository: "ROCm/TheRock" - ref: 6fab5d65a552483bcfa1f6ccaaabf699c8188c1e # 2025-11-06 commit + ref: eb8f187ff47eb6af9cd5aaa0b8d9a04b06b12796 # 2025-11-15 commit - name: Run setup test environment workflow uses: './.github/actions/setup_test_environment' diff --git a/shared/amdgpu-windows-interop/pal/CMakeLists.txt b/shared/amdgpu-windows-interop/pal/CMakeLists.txt index deb1cf5314..2be3e0d49e 100644 --- a/shared/amdgpu-windows-interop/pal/CMakeLists.txt +++ b/shared/amdgpu-windows-interop/pal/CMakeLists.txt @@ -1,92 +1,92 @@ -## - ####################################################################################################################### - # - # Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- # - # Permission is hereby granted, free of charge, to any person obtaining a copy - # of this software and associated documentation files (the "Software"), to deal - # in the Software without restriction, including without limitation the rights - # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - # copies of the Software, and to permit persons to whom the Software is - # furnished to do so, subject to the following conditions: - # - # The above copyright notice and this permission notice shall be included in all - # copies or substantial portions of the Software. - # - # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - # SOFTWARE. 
- # - ####################################################################################################################### -cmake_minimum_required(VERSION 3.21) -project(PAL LANGUAGES CXX) -add_library(pal INTERFACE) -set_target_properties(pal PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF - POSITION_INDEPENDENT_CODE TRUE -) -target_compile_features(pal INTERFACE cxx_std_20) -if (NOT PAL_CLIENT_INTERFACE_MAJOR_VERSION EQUAL 932) - message(WARNING "PAL: PAL_CLIENT_INTERFACE_MAJOR_VERSION ${PAL_CLIENT_INTERFACE_MAJOR_VERSION} not supported !!!") -endif() -target_link_libraries(pal - INTERFACE - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palCompilerDeps.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palUtil.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_uuid.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/cwpack.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_lz4.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/addrlib.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/amdrdf.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/zstd.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/vam.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/UberTraceService.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/DriverUtilsService.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_settings.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/SettingsRpcService2.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcServer.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddNet.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcShared.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddSocket.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/devdriver.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_common.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCommon.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCore.lib - 
${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_libyaml.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/mpack.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/metrohash.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/stb_sprintf.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcClient.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventStreamer.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventClient.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventParser.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventServer.lib - ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddYaml.lib - SetupAPI.Lib -) -target_compile_definitions(pal - INTERFACE - PAL_CLIENT_INTERFACE_MAJOR_VERSION=932 - GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION=42 - PAL_BUILD_RDF=1 - PAL_DEVELOPER_BUILD=0 - PAL_KMT_BUILD=1 -) -target_include_directories(pal - INTERFACE - inc - inc/core - inc/gpuUtil - inc/util - shared/inc - shared/devdriver/shared/legacy/inc - shared/devdriver/third_party/dd_crc32/inc - shared/metrohash/src -) +## + ####################################################################################################################### + # + # Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to deal + # in the Software without restriction, including without limitation the rights + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + # copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. 
+ # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + # SOFTWARE. + # + ####################################################################################################################### +cmake_minimum_required(VERSION 3.21) +project(PAL LANGUAGES CXX) +add_library(pal INTERFACE) +set_target_properties(pal PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + POSITION_INDEPENDENT_CODE TRUE +) +target_compile_features(pal INTERFACE cxx_std_20) +if (NOT PAL_CLIENT_INTERFACE_MAJOR_VERSION EQUAL 932) + message(WARNING "PAL: PAL_CLIENT_INTERFACE_MAJOR_VERSION ${PAL_CLIENT_INTERFACE_MAJOR_VERSION} not supported !!!") +endif() +target_link_libraries(pal + INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palCompilerDeps.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/palUtil.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_uuid.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/cwpack.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/pal_lz4.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/addrlib.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/amdrdf.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/zstd.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/vam.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/UberTraceService.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/DriverUtilsService.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_settings.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/SettingsRpcService2.lib + 
${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcServer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddNet.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcShared.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddSocket.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/devdriver.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_common.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCommon.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddCore.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/dd_libyaml.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/mpack.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/metrohash.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/stb_sprintf.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddRpcClient.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventStreamer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventClient.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventParser.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddEventServer.lib + ${CMAKE_CURRENT_SOURCE_DIR}/lib/Release/x64/ddYaml.lib + SetupAPI.Lib +) +target_compile_definitions(pal + INTERFACE + PAL_CLIENT_INTERFACE_MAJOR_VERSION=932 + GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION=42 + PAL_BUILD_RDF=1 + PAL_DEVELOPER_BUILD=0 + PAL_KMT_BUILD=1 +) +target_include_directories(pal + INTERFACE + inc + inc/core + inc/gpuUtil + inc/util + shared/inc + shared/devdriver/shared/legacy/inc + shared/devdriver/third_party/dd_crc32/inc + shared/metrohash/src +) diff --git a/shared/amdgpu-windows-interop/pal/LICENSE b/shared/amdgpu-windows-interop/pal/LICENSE index c8fbdd17ab..45b7b6b50c 100644 --- a/shared/amdgpu-windows-interop/pal/LICENSE +++ b/shared/amdgpu-windows-interop/pal/LICENSE @@ -1,21 +1,21 @@ -The MIT License (MIT) - -Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +The MIT License (MIT) + +Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/shared/amdgpu-windows-interop/pal/inc/core/pal.h b/shared/amdgpu-windows-interop/pal/inc/core/pal.h index 343df6fe44..544a556f1d 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/pal.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/pal.h @@ -1,833 +1,833 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file pal.h - * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines common types, enums, etc. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palFormat.h" -#include "palSysUtil.h" - -// Forward declarations of global types (must be done outside of Pal namespace). -#if (PAL_KMT_BUILD) && !defined(__unix__) -struct HMONITOR__; -struct HWND__; -#endif - -#if PAL_KMT_BUILD -struct _SECURITY_ATTRIBUTES; -#endif - -/// Library-wide namespace encapsulating all PAL entities. -namespace Pal -{ - -typedef Util::int8 int8; ///< 8-bit integer. -typedef Util::int16 int16; ///< 16-bit integer. -typedef Util::int32 int32; ///< 32-bit integer. -typedef Util::int64 int64; ///< 64-bit integer. -typedef Util::uint8 uint8; ///< Unsigned 8-bit integer. -typedef Util::uint16 uint16; ///< Unsigned 16-bit integer. -typedef Util::uint32 uint32; ///< Unsigned 32-bit integer. -typedef Util::uint64 uint64; ///< Unsigned 64-bit integer. -typedef Util::gpusize gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from - /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. -typedef Util::Result Result; ///< The PAL core and utility companion share the same result codes for convenience. - -typedef Util::Rational Rational; ///< A ratio of two unsigned integers. - -#if defined(_WIN32) -typedef HMONITOR__* OsDisplayHandle; ///< OsDisplayHandle corresponds to an HMONITOR on Windows. -typedef HWND__* OsWindowHandle; ///< OsWindowHandle corresponds to an HWND on Windows. 
-typedef void* OsExternalHandle; ///< OsExternalHandle corresponds to a generic HANDLE on Windows - -typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on Vulkan. - -constexpr OsWindowHandle NullWindowHandle = nullptr; ///< Value representing a null or invalid window handle. -#elif defined(__unix__) - -typedef void* OsDisplayHandle; ///< The Display Handle for Linux except X11 platform -typedef uint32 OsExternalHandle; ///< OsExternalHandle corresponds to a generic handle on linux -typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on linux. - -/// OsWindowHandle corresponds to a window on X-Windows or surface on Wayland. -union OsWindowHandle -{ - void* pSurface; ///< Native surface handle in wayland is a pointer. - uint64 win; ///< Native window handle in X is a 32-bit integer (but stored here as 64 bit). -}; -constexpr OsWindowHandle NullWindowHandle = {nullptr}; ///< Value representing a null or invalid window handle. - -// don't check for the Linux Platform type; just compare the larger member of the union -inline bool operator==(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) - { return (lhs.pSurface == rhs.pSurface); } -inline bool operator!=(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) - { return (lhs.pSurface != rhs.pSurface); } -#else -#error "Unsupported OS platform detected!" -#endif - -#if PAL_CLIENT_EXAMPLE -typedef void* AddrHandle; ///< Corresponds to an ADDR_HANDLE. -#endif - -constexpr uint32 InvalidVidPnSourceId = ~0u; ///< In cases where PAL cannot abstract a Windows VidPnSourceId, this - /// represents an invalid value. (Note: zero is a valid value.) - -constexpr uint32 MaxVertexBuffers = 32; ///< Maximum number of vertex buffers per pipeline. -constexpr uint32 MaxColorTargets = 8; ///< Maximum number of color targets. 
-constexpr uint32 MaxStreamOutTargets = 4; ///< Maximum number of stream output target buffers. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 -constexpr uint32 MaxDescriptorSets = 2; ///< Maximum number of descriptor sets. -#endif -constexpr uint32 MaxMsaaRasterizerSamples = 16; ///< Maximum number of MSAA samples supported by the rasterizer. -constexpr uint32 MaxAvailableEngines = 12; ///< Maximum number of engines for a particular engine type. -constexpr uint32 MaxNumPlanes = 3; ///< Maximum number of format planes. - -constexpr uint64 InternalApiPsoHash = UINT64_MAX; ///< Default Hash for PAL internal pipelines. - -/// Specifies a category of GPU engine. Each category corresponds directly to a hardware engine. There may be multiple -/// engines available for a given type; the available engines on a particular GPU can be queried via -/// Device::GetProperties, returned in DeviceProperties.engineProperties[]. -enum EngineType : uint32 -{ - /// Corresponds to the graphics hardware engine (a.k.a. graphcis ring a.k.a 3D). - EngineTypeUniversal, - - /// Corresponds to asynchronous compute engines (ACE). - EngineTypeCompute, - - /// Corresponds to SDMA engines. - EngineTypeDma, - - /// Virtual engine that only supports inserting sleeps, used for implementing frame-pacing. - EngineTypeTimer, - - /// Number of engine types. - EngineTypeCount, -}; - -/// Specifies a category of GPU work. Each queue type only supports specific types of work. Determining which -/// QueueTypes are supported on which engines can be queried via IDevice::GetProperties, returned in -/// DeviceProperties.engineProperties[]. -enum QueueType : uint32 -{ - /// Supports graphics commands (draws), compute commands (dispatches), and copy commands. - QueueTypeUniversal, - - /// Supports compute commands (dispatches), and copy commands. - QueueTypeCompute, - - /// Supports copy commands. - QueueTypeDma, - - /// Virtual engine that only supports inserting sleeps, used for implementing frame pacing. 
- /// This is a software-only queue. - QueueTypeTimer, - - /// Number of queue types. - QueueTypeCount, -}; - -/// Defines flags for describing which queues are supported. -enum QueueTypeSupport : uint32 -{ - SupportQueueTypeUniversal = (1 << static_cast(QueueTypeUniversal)), - SupportQueueTypeCompute = (1 << static_cast(QueueTypeCompute)), - SupportQueueTypeDma = (1 << static_cast(QueueTypeDma)), - SupportQueueTypeTimer = (1 << static_cast(QueueTypeTimer)), - -}; - -// Many command buffers break down into multiple command streams targeting internal sub-engines. For example, Universal -// command buffers build a primary stream (DE) but may also build a second stream for async compute engine (ACE). -enum class SubEngineType : uint32 -{ - Primary = 0, // Subqueue that is the queue itself, rather than an ancillary queue. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 - AsyncCompute = 1, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. - ConstantEngine = 2, // CP constant update engine that runs in parallel with draw engine. - // Internal usage only. -#else - ConstantEngine = 1, // CP constant update engine that runs in parallel with draw engine. - AsyncCompute = 2, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. - Pup = 3, // Subqueue that is the queue itself but for PUP-style packets, rather than an - // ancillary queue -#endif - Count, -}; - -/// Defines the execution priority for a queue, specified either at queue creation or via IQueue::SetExecutionPriority() -/// on platforms that support it. QueuePriority::Normal corresponds to the default priority. -enum class QueuePriority : uint32 -{ - Normal = 0, ///< Normal priority (default). - Idle = 1, ///< Idle, or low priority (lower than Normal). - Medium = 2, ///< Medium priority (higher than Normal). - High = 3, ///< High priority (higher than Normal). - Realtime = 4, ///< Real time priority (higher than Normal). 
- Count -}; - -/// Defines flags for describing which queue priority levels are supported. -enum QueuePrioritySupport : uint32 -{ - SupportQueuePriorityNormal = (1 << static_cast(QueuePriority::Normal)), - SupportQueuePriorityIdle = (1 << static_cast(QueuePriority::Idle)), - SupportQueuePriorityMedium = (1 << static_cast(QueuePriority::Medium)), - SupportQueuePriorityHigh = (1 << static_cast(QueuePriority::High)), - SupportQueuePriorityRealtime = (1 << static_cast(QueuePriority::Realtime)), -}; - -/// Selects one of a few possible memory heaps accessible by a GPU. -enum GpuHeap : uint32 -{ - GpuHeapLocal = 0x0, ///< Local heap visible to the CPU. - GpuHeapInvisible = 0x1, ///< Local heap not visible to the CPU. - GpuHeapGartUswc = 0x2, ///< GPU-accessible uncached system memory. - GpuHeapGartCacheable = 0x3, ///< GPU-accessible cached system memory. - GpuHeapCount -}; - -/// Describes the desired access for a memory allocation. -enum GpuHeapAccess : uint32 -{ - GpuHeapAccessExplicit = 0x0, ///< Memory access is not known. Heaps will be explicitly defined. - GpuHeapAccessCpuNoAccess = 0x1, ///< Memory access from CPU not required. - GpuHeapAccessGpuMostly = 0x2, ///< Memory optimized for reads/writes from GPU and accessible from CPU. - GpuHeapAccessCpuReadMostly = 0x3, ///< Memory optimized for reads from CPU. - GpuHeapAccessCpuWriteMostly = 0x4, ///< Memory optimized for writes from CPU. - GpuHeapAccessCpuMostly = 0x5, ///< Memory optimized for read/writes from CPU. - GpuHeapAccessCount -}; - -#if defined(__unix__) -/// Describes possible handle types. 
-enum class HandleType : uint32 -{ - GemFlinkName = 0x0, ///< GEM flink name (needs DRM authentication, used by DRI2) - Kms = 0x1, ///< KMS handle which is used by all driver ioctls - DmaBufFd = 0x2, ///< DMA-buf fd handle - KmsNoImport = 0x3, ///< Deprecated in favour of and same behaviour as HandleTypeDmaBufFd, use that instead of this -}; -#endif - -/// Comparison function determines how a pass/fail condition is determined between two values. For depth/stencil -/// comparison, the first value comes from source data and the second value comes from destination data. -enum class CompareFunc : uint8 -{ - Never = 0x0, - Less = 0x1, - Equal = 0x2, - LessEqual = 0x3, - Greater = 0x4, - NotEqual = 0x5, - GreaterEqual = 0x6, - _Always = 0x7, - - // Unfortunately for Linux clients, X.h includes a "#define Always 2" macro. Clients have their choice of either - // undefing Always before including this header or using _Always when dealing with PAL. -#ifndef Always - Always = _Always, -#endif - - Count -}; - -/// Defines an offset into a 2D pixel region. -struct Offset2d -{ - int32 x; ///< X offset. - int32 y; ///< Y offset. -}; - -/// Defines an offset into a 3D pixel region. -struct Offset3d -{ - int32 x; ///< X offset. - int32 y; ///< Y offset. - int32 z; ///< Z offset. -}; - -/// Defines an floating-point offset into a 3D pixel region. -struct Offset3dFloat -{ - float x; ///< X offset. - float y; ///< Y offset. - float z; ///< Z offset. -}; - -/// Defines a width and height for a 2D image region. The dimensions could be pixels, blocks, or bytes -/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you -/// get it right. -struct Extent2d -{ - uint32 width; ///< Width of region. - uint32 height; ///< Height of region. -}; - -/// Defines a signed width and height, for a 2D image region. 
The dimensions could be pixels, blocks, or bytes -/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you -/// get it right. -struct SignedExtent2d -{ - int32 width; ///< Width of region. - int32 height; ///< Height of region. -}; - -/// Defines a width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes -/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you -/// get it right. -struct Extent3d -{ - uint32 width; ///< Width of region. - uint32 height; ///< Height of region. - uint32 depth; ///< Depth of region. -}; - -constexpr bool operator==(const Extent3d& x, const Extent3d& y) -{ - return (x.width == y.width) && (x.height == y.height) && (x.depth == y.depth); -} - -constexpr bool operator!=(const Extent3d& x, const Extent3d& y) { return (x == y) == false; } - -/// Defines a signed width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes -/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you -/// get it right. -struct SignedExtent3d -{ - int32 width; ///< Width of region. - int32 height; ///< Height of region. - int32 depth; ///< Depth of region. -}; - -/// Defines a floating-point width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or -/// bytes depending on context, so be sure to check documentation for the PAL interface of interest to be sure you -/// get it right. -struct Extent3dFloat -{ - float width; ///< Width of region. - float height; ///< Height of region. - float depth; ///< Depth of region. -}; - -/// Defines a region in 1D space. -struct Range -{ - int32 offset; ///< Starting position. - uint32 extent; ///< Region size. -}; - -/// Defines a rectangular region in 2D space. -struct Rect -{ - Offset2d offset; ///< Top left corner. 
- Extent2d extent; ///< Rectangle width and height. -}; - -/// Defines a cubic region in 3D space. -struct Box -{ - Offset3d offset; ///< Top left front corner. - Extent3d extent; ///< Box width, height and depth. -}; - -/// ShaderHash represents a 128-bit shader hash. -struct ShaderHash -{ - uint64 lower; ///< Lower 64-bits of hash - uint64 upper; ///< Upper 64-bits of hash -}; - -/// PipelineHash represents a concatenated pair of 64-bit hashes. -struct PipelineHash -{ - uint64 stable; ///< Lower 64-bits of hash. "Stable" portion, suitable for e.g. shader replacement use cases. - uint64 unique; ///< Upper 64-bits of hash. "Unique" portion, suitable for e.g. pipeline cache use cases. -}; - -/// Common shader pre and post compilation stats. -struct CommonShaderStats -{ - uint32 numUsedVgprs; ///< Number of VGPRs used by this shader - uint32 numUsedSgprs; ///< Number of SGPRs used by this shader - - uint32 ldsSizePerThreadGroup; ///< LDS size per thread group in bytes. - size_t ldsUsageSizeInBytes; ///< LDS usage by this shader. - - size_t scratchMemUsageInBytes; ///< Amount of scratch mem used by this shader. - gpusize gpuVirtAddress; ///< Gpu mem address of shader ISA code. - - union - { - struct - { - uint32 isWave32 : 1; ///< If set, specifies that the shader is compiled in wave32 mode. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as a 32-bit uint. - } flags; ///< Shader compilation stat flags. -}; - -/// Per-thread stack sizes -struct CompilerStackSizes -{ - uint32 backendSize; ///< Managed by compiler backend - uint32 frontendSize; ///< Managed by compiler frontend -}; - -///@{ -/// Determines whether two ShaderHashes or PipelineHashes are equal. -/// -/// @param [in] hash1 The first 128-bit shader hash or pipeline hash -/// @param [in] hash2 The second 128-bit shader hash or pipeline hash -/// -/// @returns True if the hashes are equal. 
-constexpr bool ShaderHashesEqual(const ShaderHash hash1, const ShaderHash hash2) - { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } -constexpr bool operator==(const ShaderHash hash1, const ShaderHash hash2) - { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } -constexpr bool operator!=(const ShaderHash hash1, const ShaderHash hash2) - { return ((hash1.lower != hash2.lower) || (hash1.upper != hash2.upper)); } -constexpr bool PipelineHashesEqual(const PipelineHash hash1, const PipelineHash hash2) - { return ((hash1.stable == hash2.stable) && (hash1.unique == hash2.unique)); } -///@} - -///@{ -/// Determines whether the given ShaderHash or PipelineHash is non-zero. -/// -/// @param [in] hash A 128-bit shader hash or pipeline hash -/// -/// @returns True if the hash is non-zero. -constexpr bool ShaderHashIsNonzero(const ShaderHash hash) { return ((hash.upper | hash.lower) != 0); } -constexpr bool PipelineHashIsNonzero(const PipelineHash hash) { return ((hash.stable | hash.unique) != 0); } -///@} - -/// Specifies the Display Output Post-Processing (DOPP) desktop texture information, which are provided by OpenGL via -/// interop. The DOPP is an OpenGL extension to allow its client to access the desktop texture directly without the -/// need of copying to system memory. This is only supported on Windows. -struct DoppDesktopInfo -{ - gpusize gpuVirtAddr; ///< The VA of the dopp desktop texture. Set to 0 for the non-dopp resource. - uint32 vidPnSourceId; ///< Display source id of the dopp desktop texture. -}; - -/// Specifies the Direct Capture resource information. Direct Capture is an extension that allows to access on-screen -/// primary, motion vectors, depth, and camera matrix directly. This is only supported on Windows. -struct DirectCaptureInfo -{ - uint32 vidPnSourceId; ///< VidPnSource ID of the on-screen primary. 
- union - { - struct - { - uint32 preflip : 1; ///< Requires pre-flip primary access - uint32 postflip : 1; ///< Requires post-flip primary access. A DirectCapture resource cannot - /// have pre-flip and post-flip access at the same time - uint32 accessDesktop : 1; ///< Requires acces to the desktop - uint32 shared : 1; ///< This resource will be shared between APIs - uint32 frameGenRatio : 4; ///< Frame generation ratio - uint32 paceGeneratedFrame : 1; ///< Requires pacing the generated frames - uint32 requiresDisplayDcc : 1; ///< Requires display dcc support - uint32 requestMotionVectors : 1; ///< Request DirectCapture access to motion vector data if available - uint32 requestDepth : 1; ///< Request DirectCapture access to depth data if available - uint32 requestCamera : 1; ///< Request DirectCapture access to camera matrix data if available - uint32 initMotionVectors : 1; ///< Initialize the DirectCapture resource to access motion vector data - uint32 initDepth : 1; ///< Initialize the DirectCapture resource to access depth data - uint32 initCamera : 1; ///< Initialize the DirectCapture resource to access camera matrix - uint32 requestHudLessImage : 1; ///< Request DirectCapture access to HUD less image if available - uint32 initHudLessImage : 1; ///< Initialize the DirectCapture resource to access HUD less image - uint32 reserved : 14; - }; - uint32 u32All; - } usageFlags; - - OsExternalHandle hNewFrameEvent; ///< Event to notify of a new frame available for pre-flip or post-flip access - OsExternalHandle hFatalErrorEvent; ///< Event to notify of a fatal error -}; - -/// Specifies parameters for opening a shared GPU resource from a non-PAL device or non-local process. -struct ExternalResourceOpenInfo -{ - OsExternalHandle hExternalResource; ///< External GPU resource from another non-PAL device to open. -#if defined(__unix__) - HandleType handleType; ///< Type of the external GPU resource to be opened. 
-#endif - - union - { - struct - { - uint32 ntHandle : 1; ///< The provided hExternalResource is an NT handle instead of a default - /// KMT handle. - uint32 androidHwBufHandle : 1; ///< The provided hExternalResource is android hardware buffer handle - /// instead of fd. - uint32 isDopp : 1; ///< This is a Dopp texture, doppDesktopInfo is in use. - uint32 isDirectCapture : 1; ///< This is a Direct Capture resource, directCaptureInfo is in use. - uint32 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. - uint32 reserved : 27; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< External resource open flags. - - union - { - DoppDesktopInfo doppDesktopInfo; ///< The information of dopp desktop texture. - DirectCaptureInfo directCaptureInfo; ///< The information of direct capture resource. - }; -}; - -/// Packed pixel display enumeration. -/// -/// In the medical imaging market space, there are several 10-bit per component color and grayscale displays -/// available.In addition to being high precision, these displays tend to be very high resolution.For grayscale -/// displays,one method of getting high pixel resolution in 10b precision is a proprietary method called -/// "packed pixel".Each of these packed pixel formats packs two/three 10-bit luminance values into a single -/// R8G8B8 pixel. -/// -/// Example Displays: -/// -/// EIZO GS510 -/// NEC MD21GS -/// TOTOKU ME55Xi2 -/// FIMI 3/5MP -/// -/// -/// The enumerations are named in a way to describe the format of the packed pixels. 
Names for -/// formats with two or three pixels packed into a single word (corresponding to a simple RGB pixel) -/// follow this convention: -/// -/// LLLLLL_RRRRRR (L=left pixel, R=right pixel) or -/// LLL_MMM_RRR (L=left pixel, M=middle pixel, R=right pixel) -/// -/// The bit order for a pixel follows this convention: -/// -/// (ColorBand)MSB(ColorBand)LSB -/// -/// For example: G70B54 means that the MSBs are in 7-0 of the green channel, and the LSBs -/// are stored in bits 5-4. -/// -enum class PackedPixelType : uint32 -{ - NotPacked = 0, ///< Pixels not packed, for standard color RGB8 monitor - SplitG70B54_R70B10, ///< 10-bit mono, split screen - SplitB70G10_R70G76, ///< 10-bit mono, split screen - G70B54_R70B10, ///< 10-bit mono, 2 adjacent pixels - B70R32_G70R76, ///< 10-bit mono, 2 adjacent pixels - B70R30_G70R74, ///< 12-bit mono, 2 adjacent pixels - B70_G70_R70, ///< 8-bit mono, 3 adjacent pixels - R70G76, ///< 10-bit mono, single pixel - G70B54, ///< 10-bit mono, single pixel - Native, ///< 10-bit color, without packing -}; - -/// Enumerates the logging priority levels supported by PAL. -enum class LogLevel : uint32 -{ - Debug = 0, ///< Debug messages - Verbose, ///< High frequency messages - Info, ///< Low frequency messages - Alert, ///< Warnings - Error, ///< Critical issues - Always ///< All messages -}; - -/// Enumerates all log categories explicitly defined by PAL -enum class LogCategory : uint64 -{ - Correctness = 0, ///< Application correctness - Performance, ///< Application performance - Internal, ///< Internal logging - Display, ///< Display Info - Count -}; - -/// String table used to register log categories -constexpr const char* LogCategoryTable[] = -{ - "Correctness", - "Performance", - "Internal", - "Display" -}; - -/// Typedef for log category masks. 
-typedef uint64 LogCategoryMask; - -/// Log category mask for messages related to application correctness -constexpr LogCategoryMask LogCategoryMaskCorrectness = (1 << static_cast(LogCategory::Correctness)); - -/// Log category mask for messages related to application performance -constexpr LogCategoryMask LogCategoryMaskPerformance = (1 << static_cast(LogCategory::Performance)); - -/// Log category mask for messages related to internal messages -constexpr LogCategoryMask LogCategoryMaskInternal = (1 << static_cast(LogCategory::Internal)); - -/// Log category mask for messages related to display information (e.g. HDR format) -constexpr LogCategoryMask LogCategoryMaskDisplay = (1 << static_cast(LogCategory::Display)); - -/// Defines the modes that the GPU Profiling layer can be enabled with. If the GpuProfilerMode is -/// GpuProfilerTraceEnabledTtv or GpuProfilerTraceEnabledRgp, then the GpuProfilerConfig_TraceModeMask is examined to -/// configure the trace type (spm, sqtt or both) requested. -enum GpuProfilerMode : uint32 -{ - GpuProfilerDisabled = 0, ///< Gpu Profiler is disabled. - GpuProfilerCounterAndTimingOnly = 1, ///< Traces are disabled but perf counter and timing operations are enabled. - GpuProfilerTraceEnabledTtv = 2, ///< Traces are output in format (.csv, .out) for Thread trace viewer. - GpuProfilerTraceEnabledRgp = 3, ///< Trace data is output as .rgp file for Radeon Gpu Profiler. -}; - -// Defines the trigger keys for capturing the GPU profiler. -typedef Util::KeyCode GpuProfilerCaptureTriggerKey; - -#define PAL_EVENT_LOGGING_VERSION 528 - -/// This enumeration identifies the source/owner of a resource object, used for event logging. 
-enum ResourceOwner : uint32 -{ - ResourceOwnerApplication = 0, ///< The resource is owned by the application - ResourceOwnerPalClient = 1, ///< The resource is owned by the PAL client - ResourceOwnerPal = 2, ///< The resource is owned by PAL - ResourceOwnerUnknown = 3, ///< The resource owner is unknown -}; - -/// This enumeration lists the usage/category of a resource object to give context in event logging. -enum ResourceCategory : uint32 -{ - ResourceCategoryApplication = 0, ///< The resource is used by the application. - ResourceCategoryRpm = 1, ///< The resource is used by RPM - ResourceCategoryProfiling = 2, ///< The resource is used for profiling (e.g. SQTT, SPM, etc) - ResourceCategoryDebug = 3, ///< The resource is used for debug purposes - ResourceCategoryRayTracing = 4, ///< The resource is used for ray tracing - ResourceCategoryVideo = 5, ///< The resource is used for video encode/decode - ResourceCategoryMisc = 6, ///< Miscellaneous, resource doesn't fit in any of the above categories - ResourceCategoryUnknown = 7, ///< The resource category is unknown -}; - -/// Set of information about resource ownership and usage, used for event logging. -struct ResourceEventInfo -{ - ResourceOwner owner; ///< Resource owner - ResourceCategory category; ///< Resource category -}; - -/// General purpose on/off/default tri-state enum. -enum class TriState : uint8 -{ - Default = 0, ///< Let implementation decide whether to enable or disable - Enable = 1, ///< Force enable - Disable = 2, ///< Force disable - Count -}; - -/// Defines the modes that the GPU Profiling layer can be enabled with. -/** - *********************************************************************************************************************** - * @mainpage - * - * Introduction - * ------------ - * The Platform Abstraction Library (PAL) provides hardware and OS abstractions for Radeon (GCN+) user-mode 3D graphics - * drivers. 
The level of abstraction is chosen to support performant driver implementations of several APIs while - * hiding the client from hardware and operating system details. - * - * PAL client drivers will have no HW-specific code; their responsibility is to translate API/DDI commands into PAL - * commands as efficiently as possible. This means that the client should be unaware of hardware registers, PM4 - * commands, SP3 shaders, etc. However, PAL is an abstraction of AMD hardware only, so many things in the PAL interface - * have an obvious correlation to hardware features. - * - * PAL client drivers should have little OS-specific code. PAL and its companion utility collection provide - * OS abstractions for almost everything a client might need, but there are some cases where this is unavoidable: - * - * + Handling dynamic library infrastructure. I.e., the client has to implement DllMain() on Windows, etc. - * + OS-specific APIs or extensions. DX may have Windows-specific functionality in the core API, and Vulkan may - * export certain OS-specific features as extensions (like for presenting contents to the screen). - * + Single OS clients (e.g., DX) may choose to make OS-specific calls directly simply out of convenience with no down - * side. - * - * - * The following diagram illustrates the software stack when running a 3D application with a PAL-based UMD. Non-AMD - * components are in gray, UMD client code is blue, AMD static libs linked into the UMD are green, and the AMD KMD - * is in red. - * - * @image html swStack.png - * - * PAL is a relatively _thick_ abstraction layer, typically accounting for the majority of code (excluding SC) in any - * particular UMD built on PAL. The level of abstraction tends to be higher in areas where client APIs are similar, - * and lower (closer to hardware) in areas where client APIs diverge significantly. The overall philosophy is to share - * as much code as possible without impacting client driver performance. 
Our committed goal is that CPU-limited - * performance should be within 5% of what a native solution could achieve, and GPU-limited performance should be within - * 2%. - * - * PAL uses a C++ interface. The public interface is defined in .../pal/inc, and client must _only_ include headers - * from that directory. The interface is spread over many header files - typically one per class - in order to clarify - * dependencies and reduce build times. There are two sub-directories in .../pal/inc: - * - * + .../pal/inc/core - Defines the PAL Core (see @ref Overview). - * + .../pal/inc/gpuUtil - Defines the PAL GPU Utility Collection (see @ref GpuUtilOverview). - * + .../pal/inc/util - Defines the PAL Utility Collection (see @ref UtilOverview). - * - * - * @copydoc VersionHistory - * - * Next: @ref Build - *********************************************************************************************************************** - */ - -/** - *********************************************************************************************************************** - * @page Overview PAL Core Overview - * - * ### Introduction - * PAL's core interface is defined in the @ref Pal namespace, and defines an object-oriented model for interacting with - * the GPU and OS. The interface closely resembles the Vulkan and DX12 APIs. Some common features of these - * APIs that are central to the PAL interface: - * - * - All shader stages, and some additional "shader adjacent" state, are glommed together into a monolithic pipeline - * object. - * - Explicit, free-threaded command buffer generation. - * - Support for multiple, asynchronous engines for executing GPU work (graphics, compute, DMA). - * - Explicit system and GPU memory management. - * - Flexible shader resource binding model. - * - Explicit management of stalls, cache flushes, and compression state changes. 
- * - * However, as a common component supporting multiple APIs, the PAL interface tends to be lower level in places where - * client APIs diverge. - * - * ### Settings - * The PAL library has a number of configuration settings available for the client to modify either programmatically - * or via external settings. PAL also includes infrastructure for building/loading client-specific settings. - * See @ref Settings for a detailed description of this support. - * - * ### Initialization - * The first step to interacting with the PAL core is creating an IPlatform object and enumerating IDevice objects - * representing GPUs attached to the system and, optionally, IScreen objects representing displays attached to the - * system. See @ref LibInit for a detailed description. - * - * ### System Memory Allocation - * Clients have a lot of control over PAL's system memory allocations. Most PAL objects require the client to provide - * system memory; the client first calls a GetSize() method and then passes a pointer to PAL on the actual create call. - * Further, when PAL needs to make an internal allocation, it will optionally call a client callback, which can be - * specified on platform creation. This callback will specify a category for the allocation, which may imply an - * expected lifetime. - * - * ### Interface Classes - * The following diagram illustrates the relationship of some key PAL interfaces and how they interact to render a - * typical frame in a modern game. Below that is a listing of all of PAL's interface classes, and a very brief - * description of their purpose. Follow the link for each interface to see detailed reference documentation. - * - * @image html scheduling.png - * - * - __OS Abstractions__ - * + _IPlatform_: Root-level object created by clients that interact with PAL. Mostly responsible for enumerating - * devices and screens attached to the system and returning any system-wide properties.

- * + _IDevice_: Configurable context for querying properties of a particular GPU and interacting with it. Acts as a - * factory for almost all other PAL objects.

- * + _IQueue_: A device has one or more _engines_ which are able to issue certain types of work. Tahiti, for example, - * has 1 universal engine (supports graphics, compute, or copy commands), 2 compute engines (support - * compute or copy commands), and 2 DMA engines (support only copy commands). An IQueue object is a - * context for submitting work on a particular engine. This mainly takes the form of submitting command - * buffers and presenting images to the screen. Work performed in a queue will be started in order, but - * work executed on different queues (even if the queues reference the same engine) is not guaranteed - * to be ordered without explicit synchronization.

- * + _IQueueSemaphore_: Queue semaphores can be signaled and waited on from an IQueue in order to control execution - * order between queues.

- * + _IFence_: Used for coarse-grain CPU/GPU synchronization. Fences can be signalled from the GPU as part of a - * command buffer submission on a queue, then waited on from the CPU.

- * + _IGpuMemory_: Represents a GPU-accessible memory allocation. Can either be virtual (only VA allocation which - * must be explicitly mapped via an IQueue operation) or physical. Residency of physical allocations - * must be managed by the client either globally for a device (IDevice::AddGpuMemoryReferences) or by - * specifying allocations referenced by command buffers at submit.

- * + _ICmdAllocator_: GPU memory allocation pool used for backing an ICmdBuffer. The client is free to create one - * allocator per device, or one per thread to remove thread contention.

- * + _IScreen_: Represents a display attached to the system. Mostly used for managing full-screen flip - * presents.

- * + _IPrivateScreen_: Represents a display that is not otherwise visible to the OS, typically a VR head mounted - * display.

- * - __Hardware IP Abstractions__ - * + __All IP__ - * - _ICmdBuffer_: Clients build command buffers to execute the desired work on the GPU, and submit them on a - * corresponding queue. Different types of work can be executed depending on the _queueType_ of - * the command buffer (graphics work, compute work, DMA work).

- * - _IImage_: Images are a 1D, 2D, or 3D collection of pixels (i.e., _texture_) that can be accessed by the - * GPU in various ways: texture sampling, BLT source/destination, UAV, etc.

- * + __GFXIP-only__ - * - _IShader_: Container for shader byte code used as an input to pipeline creation. No compilation occurs - * until an IPipeline is created. Currently, AMDIL is the only supported input language.

- * - _IPipeline_: Comprised of all shader stages (CS for compute, VS/HS/DS/GS/PS for graphics), resource mappings - * describing how user data entries are to be used by the shaders, and some other fixed-function - * state like depth/color formats, blend enable, MSAA enable, etc.

- * - _IColorTargetView_: IImage view allowing the image to be bound as a color target (i.e., RTV.).

- * - _IDepthStencilView_: IImage view allowing the image to be bound as a depth/stencil target (i.e., DSV).

- * - _IGpuEvent_: Used for fine-grained (intra-command buffer) synchronization between the CPU and GPU. GPU - * events can be set/reset from either the CPU or GPU and waited on from either.

- * - _IQueryPool_: Collection of query slots for tracking occlusion or pipeline stats query results.

- * - __Dynamic State Objects__: _IColorBlendState_, _IDepthStencilState_, _IMsaaState_, _IScissorState_, - * and _IViewportState_ define logical collections of related fixed function graphics - * state, similar to DX11.

- * - _IPerfExperiment_: Used for gathering performance counter and thread trace data.

- * - _IBorderColorPalette_: Provides a collection of indexable colors for use by samplers that clamp to an - * arbitrary border color.

- * - __Common Base Classes__ - * + _IDestroyable_: Defines a _Destroy()_ method for the PAL interface. Calling _Destroy()_ will release any - * internally allocated resources for the object, but the client is still responsible for freeing - * the system memory provided for the object.

- * + _IGpuMemoryBindable_: Defines a set of methods for binding GPU memory to the object. Interfaces that inherit - * _IGpuMemoryBindable_ require GPU memory in order to be used by the GPU. The client - * must query the requirements (e.g., alignment, size, heaps) and allocate/bind GPU memory - * for the object. _IGpuMemoryBindable_ inherits from _IDestroyable_.

- * - * ### %Format Info - * Several helper methods are available for dealing with image formats in the @ref Formats namespace. - * - * ### Graphics/Compute Execution Model - * Most graphics/compute work is defined by first binding a set of states then issuing a draw or dispatch command to - * kick off the work. The complete set of graphics states available in PAL is illustrated below; compute is a subset - * of this that only includes the pipeline, user data entries, and border color palette. - * - * @image html stateBreakdown.jpg - * - * Most of these correspond directly to a PAL interface object above, and these items are bound by calling a - * corresponding _CmdBind...()_ method in the ICmdBuffer interface. The states marked in yellow and orange, however, - * are _immediate_ states for which there is no object, you just specify the required state values in the corresponding - * _CmdSet...()_ method in the ICmdBuffer interface. - * - * User data entries are the way that input resources are specified for the pipeline on an upcoming draw/dispatch. This - * mapping is complicated, and is described fully in @ref ResourceBinding. - * - * A final complication worth noting is that PAL provides no implicit surface synchronization. The client is - * responsible for explicitly inserting barriers to resolve data hazards, flush/invalidate caches, and ensure images - * are in the proper compression state. For more detail, see ICmdBuffer::CmdReleaseThenAcquire, CmdRelease, CmdAcquire, - * CmdReleaseEvent, CmdAcquireEvent and AcquireReleaseInfo. - * - *********************************************************************************************************************** - */ - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file pal.h + * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines common types, enums, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palFormat.h" +#include "palSysUtil.h" + +// Forward declarations of global types (must be done outside of Pal namespace). +#if (PAL_KMT_BUILD) && !defined(__unix__) +struct HMONITOR__; +struct HWND__; +#endif + +#if PAL_KMT_BUILD +struct _SECURITY_ATTRIBUTES; +#endif + +/// Library-wide namespace encapsulating all PAL entities. 
+namespace Pal +{ + +typedef Util::int8 int8; ///< 8-bit integer. +typedef Util::int16 int16; ///< 16-bit integer. +typedef Util::int32 int32; ///< 32-bit integer. +typedef Util::int64 int64; ///< 64-bit integer. +typedef Util::uint8 uint8; ///< Unsigned 8-bit integer. +typedef Util::uint16 uint16; ///< Unsigned 16-bit integer. +typedef Util::uint32 uint32; ///< Unsigned 32-bit integer. +typedef Util::uint64 uint64; ///< Unsigned 64-bit integer. +typedef Util::gpusize gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from + /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. +typedef Util::Result Result; ///< The PAL core and utility companion share the same result codes for convenience. + +typedef Util::Rational Rational; ///< A ratio of two unsigned integers. + +#if defined(_WIN32) +typedef HMONITOR__* OsDisplayHandle; ///< OsDisplayHandle corresponds to an HMONITOR on Windows. +typedef HWND__* OsWindowHandle; ///< OsWindowHandle corresponds to an HWND on Windows. +typedef void* OsExternalHandle; ///< OsExternalHandle corresponds to a generic HANDLE on Windows + +typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on Vulkan. + +constexpr OsWindowHandle NullWindowHandle = nullptr; ///< Value representing a null or invalid window handle. +#elif defined(__unix__) + +typedef void* OsDisplayHandle; ///< The Display Handle for Linux except X11 platform +typedef uint32 OsExternalHandle; ///< OsExternalHandle corresponds to a generic handle on linux +typedef uint32 OsVideoSessionHandle; ///< OsVideoSessionHandle corresponds to a video session handle on linux. + +/// OsWindowHandle corresponds to a window on X-Windows or surface on Wayland. +union OsWindowHandle +{ + void* pSurface; ///< Native surface handle in wayland is a pointer. + uint64 win; ///< Native window handle in X is a 32-bit integer (but stored here as 64 bit). 
+}; +constexpr OsWindowHandle NullWindowHandle = {nullptr}; ///< Value representing a null or invalid window handle. + +// don't check for the Linux Platform type; just compare the larger member of the union +inline bool operator==(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) + { return (lhs.pSurface == rhs.pSurface); } +inline bool operator!=(const Pal::OsWindowHandle& lhs, const Pal::OsWindowHandle& rhs) + { return (lhs.pSurface != rhs.pSurface); } +#else +#error "Unsupported OS platform detected!" +#endif + +#if PAL_CLIENT_EXAMPLE +typedef void* AddrHandle; ///< Corresponds to an ADDR_HANDLE. +#endif + +constexpr uint32 InvalidVidPnSourceId = ~0u; ///< In cases where PAL cannot abstract a Windows VidPnSourceId, this + /// represents an invalid value. (Note: zero is a valid value.) + +constexpr uint32 MaxVertexBuffers = 32; ///< Maximum number of vertex buffers per pipeline. +constexpr uint32 MaxColorTargets = 8; ///< Maximum number of color targets. +constexpr uint32 MaxStreamOutTargets = 4; ///< Maximum number of stream output target buffers. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 +constexpr uint32 MaxDescriptorSets = 2; ///< Maximum number of descriptor sets. +#endif +constexpr uint32 MaxMsaaRasterizerSamples = 16; ///< Maximum number of MSAA samples supported by the rasterizer. +constexpr uint32 MaxAvailableEngines = 12; ///< Maximum number of engines for a particular engine type. +constexpr uint32 MaxNumPlanes = 3; ///< Maximum number of format planes. + +constexpr uint64 InternalApiPsoHash = UINT64_MAX; ///< Default Hash for PAL internal pipelines. + +/// Specifies a category of GPU engine. Each category corresponds directly to a hardware engine. There may be multiple +/// engines available for a given type; the available engines on a particular GPU can be queried via +/// Device::GetProperties, returned in DeviceProperties.engineProperties[]. +enum EngineType : uint32 +{ + /// Corresponds to the graphics hardware engine (a.k.a. 
graphcis ring a.k.a 3D). + EngineTypeUniversal, + + /// Corresponds to asynchronous compute engines (ACE). + EngineTypeCompute, + + /// Corresponds to SDMA engines. + EngineTypeDma, + + /// Virtual engine that only supports inserting sleeps, used for implementing frame-pacing. + EngineTypeTimer, + + /// Number of engine types. + EngineTypeCount, +}; + +/// Specifies a category of GPU work. Each queue type only supports specific types of work. Determining which +/// QueueTypes are supported on which engines can be queried via IDevice::GetProperties, returned in +/// DeviceProperties.engineProperties[]. +enum QueueType : uint32 +{ + /// Supports graphics commands (draws), compute commands (dispatches), and copy commands. + QueueTypeUniversal, + + /// Supports compute commands (dispatches), and copy commands. + QueueTypeCompute, + + /// Supports copy commands. + QueueTypeDma, + + /// Virtual engine that only supports inserting sleeps, used for implementing frame pacing. + /// This is a software-only queue. + QueueTypeTimer, + + /// Number of queue types. + QueueTypeCount, +}; + +/// Defines flags for describing which queues are supported. +enum QueueTypeSupport : uint32 +{ + SupportQueueTypeUniversal = (1 << static_cast(QueueTypeUniversal)), + SupportQueueTypeCompute = (1 << static_cast(QueueTypeCompute)), + SupportQueueTypeDma = (1 << static_cast(QueueTypeDma)), + SupportQueueTypeTimer = (1 << static_cast(QueueTypeTimer)), + +}; + +// Many command buffers break down into multiple command streams targeting internal sub-engines. For example, Universal +// command buffers build a primary stream (DE) but may also build a second stream for async compute engine (ACE). +enum class SubEngineType : uint32 +{ + Primary = 0, // Subqueue that is the queue itself, rather than an ancillary queue. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 + AsyncCompute = 1, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. 
+ ConstantEngine = 2, // CP constant update engine that runs in parallel with draw engine. + // Internal usage only. +#else + ConstantEngine = 1, // CP constant update engine that runs in parallel with draw engine. + AsyncCompute = 2, // Auxiliary ACE subqueue, together with a primary subqueue forms a "ganged" submit. + Pup = 3, // Subqueue that is the queue itself but for PUP-style packets, rather than an + // ancillary queue +#endif + Count, +}; + +/// Defines the execution priority for a queue, specified either at queue creation or via IQueue::SetExecutionPriority() +/// on platforms that support it. QueuePriority::Normal corresponds to the default priority. +enum class QueuePriority : uint32 +{ + Normal = 0, ///< Normal priority (default). + Idle = 1, ///< Idle, or low priority (lower than Normal). + Medium = 2, ///< Medium priority (higher than Normal). + High = 3, ///< High priority (higher than Normal). + Realtime = 4, ///< Real time priority (higher than Normal). + Count +}; + +/// Defines flags for describing which queue priority levels are supported. +enum QueuePrioritySupport : uint32 +{ + SupportQueuePriorityNormal = (1 << static_cast(QueuePriority::Normal)), + SupportQueuePriorityIdle = (1 << static_cast(QueuePriority::Idle)), + SupportQueuePriorityMedium = (1 << static_cast(QueuePriority::Medium)), + SupportQueuePriorityHigh = (1 << static_cast(QueuePriority::High)), + SupportQueuePriorityRealtime = (1 << static_cast(QueuePriority::Realtime)), +}; + +/// Selects one of a few possible memory heaps accessible by a GPU. +enum GpuHeap : uint32 +{ + GpuHeapLocal = 0x0, ///< Local heap visible to the CPU. + GpuHeapInvisible = 0x1, ///< Local heap not visible to the CPU. + GpuHeapGartUswc = 0x2, ///< GPU-accessible uncached system memory. + GpuHeapGartCacheable = 0x3, ///< GPU-accessible cached system memory. + GpuHeapCount +}; + +/// Describes the desired access for a memory allocation. 
+enum GpuHeapAccess : uint32 +{ + GpuHeapAccessExplicit = 0x0, ///< Memory access is not known. Heaps will be explicitly defined. + GpuHeapAccessCpuNoAccess = 0x1, ///< Memory access from CPU not required. + GpuHeapAccessGpuMostly = 0x2, ///< Memory optimized for reads/writes from GPU and accessible from CPU. + GpuHeapAccessCpuReadMostly = 0x3, ///< Memory optimized for reads from CPU. + GpuHeapAccessCpuWriteMostly = 0x4, ///< Memory optimized for writes from CPU. + GpuHeapAccessCpuMostly = 0x5, ///< Memory optimized for read/writes from CPU. + GpuHeapAccessCount +}; + +#if defined(__unix__) +/// Describes possible handle types. +enum class HandleType : uint32 +{ + GemFlinkName = 0x0, ///< GEM flink name (needs DRM authentication, used by DRI2) + Kms = 0x1, ///< KMS handle which is used by all driver ioctls + DmaBufFd = 0x2, ///< DMA-buf fd handle + KmsNoImport = 0x3, ///< Deprecated in favour of and same behaviour as HandleTypeDmaBufFd, use that instead of this +}; +#endif + +/// Comparison function determines how a pass/fail condition is determined between two values. For depth/stencil +/// comparison, the first value comes from source data and the second value comes from destination data. +enum class CompareFunc : uint8 +{ + Never = 0x0, + Less = 0x1, + Equal = 0x2, + LessEqual = 0x3, + Greater = 0x4, + NotEqual = 0x5, + GreaterEqual = 0x6, + _Always = 0x7, + + // Unfortunately for Linux clients, X.h includes a "#define Always 2" macro. Clients have their choice of either + // undefing Always before including this header or using _Always when dealing with PAL. +#ifndef Always + Always = _Always, +#endif + + Count +}; + +/// Defines an offset into a 2D pixel region. +struct Offset2d +{ + int32 x; ///< X offset. + int32 y; ///< Y offset. +}; + +/// Defines an offset into a 3D pixel region. +struct Offset3d +{ + int32 x; ///< X offset. + int32 y; ///< Y offset. + int32 z; ///< Z offset. +}; + +/// Defines an floating-point offset into a 3D pixel region. 
+struct Offset3dFloat +{ + float x; ///< X offset. + float y; ///< Y offset. + float z; ///< Z offset. +}; + +/// Defines a width and height for a 2D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent2d +{ + uint32 width; ///< Width of region. + uint32 height; ///< Height of region. +}; + +/// Defines a signed width and height, for a 2D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct SignedExtent2d +{ + int32 width; ///< Width of region. + int32 height; ///< Height of region. +}; + +/// Defines a width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent3d +{ + uint32 width; ///< Width of region. + uint32 height; ///< Height of region. + uint32 depth; ///< Depth of region. +}; + +constexpr bool operator==(const Extent3d& x, const Extent3d& y) +{ + return (x.width == y.width) && (x.height == y.height) && (x.depth == y.depth); +} + +constexpr bool operator!=(const Extent3d& x, const Extent3d& y) { return (x == y) == false; } + +/// Defines a signed width, height, and depth for a 3D image region. The dimensions could be pixels, blocks, or bytes +/// depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct SignedExtent3d +{ + int32 width; ///< Width of region. + int32 height; ///< Height of region. + int32 depth; ///< Depth of region. +}; + +/// Defines a floating-point width, height, and depth for a 3D image region. 
The dimensions could be pixels, blocks, or +/// bytes depending on context, so be sure to check documentation for the PAL interface of interest to be sure you +/// get it right. +struct Extent3dFloat +{ + float width; ///< Width of region. + float height; ///< Height of region. + float depth; ///< Depth of region. +}; + +/// Defines a region in 1D space. +struct Range +{ + int32 offset; ///< Starting position. + uint32 extent; ///< Region size. +}; + +/// Defines a rectangular region in 2D space. +struct Rect +{ + Offset2d offset; ///< Top left corner. + Extent2d extent; ///< Rectangle width and height. +}; + +/// Defines a cubic region in 3D space. +struct Box +{ + Offset3d offset; ///< Top left front corner. + Extent3d extent; ///< Box width, height and depth. +}; + +/// ShaderHash represents a 128-bit shader hash. +struct ShaderHash +{ + uint64 lower; ///< Lower 64-bits of hash + uint64 upper; ///< Upper 64-bits of hash +}; + +/// PipelineHash represents a concatenated pair of 64-bit hashes. +struct PipelineHash +{ + uint64 stable; ///< Lower 64-bits of hash. "Stable" portion, suitable for e.g. shader replacement use cases. + uint64 unique; ///< Upper 64-bits of hash. "Unique" portion, suitable for e.g. pipeline cache use cases. +}; + +/// Common shader pre and post compilation stats. +struct CommonShaderStats +{ + uint32 numUsedVgprs; ///< Number of VGPRs used by this shader + uint32 numUsedSgprs; ///< Number of SGPRs used by this shader + + uint32 ldsSizePerThreadGroup; ///< LDS size per thread group in bytes. + size_t ldsUsageSizeInBytes; ///< LDS usage by this shader. + + size_t scratchMemUsageInBytes; ///< Amount of scratch mem used by this shader. + gpusize gpuVirtAddress; ///< Gpu mem address of shader ISA code. + + union + { + struct + { + uint32 isWave32 : 1; ///< If set, specifies that the shader is compiled in wave32 mode. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit uint. 
+ } flags; ///< Shader compilation stat flags. +}; + +/// Per-thread stack sizes +struct CompilerStackSizes +{ + uint32 backendSize; ///< Managed by compiler backend + uint32 frontendSize; ///< Managed by compiler frontend +}; + +///@{ +/// Determines whether two ShaderHashes or PipelineHashes are equal. +/// +/// @param [in] hash1 The first 128-bit shader hash or pipeline hash +/// @param [in] hash2 The second 128-bit shader hash or pipeline hash +/// +/// @returns True if the hashes are equal. +constexpr bool ShaderHashesEqual(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } +constexpr bool operator==(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower == hash2.lower) && (hash1.upper == hash2.upper)); } +constexpr bool operator!=(const ShaderHash hash1, const ShaderHash hash2) + { return ((hash1.lower != hash2.lower) || (hash1.upper != hash2.upper)); } +constexpr bool PipelineHashesEqual(const PipelineHash hash1, const PipelineHash hash2) + { return ((hash1.stable == hash2.stable) && (hash1.unique == hash2.unique)); } +///@} + +///@{ +/// Determines whether the given ShaderHash or PipelineHash is non-zero. +/// +/// @param [in] hash A 128-bit shader hash or pipeline hash +/// +/// @returns True if the hash is non-zero. +constexpr bool ShaderHashIsNonzero(const ShaderHash hash) { return ((hash.upper | hash.lower) != 0); } +constexpr bool PipelineHashIsNonzero(const PipelineHash hash) { return ((hash.stable | hash.unique) != 0); } +///@} + +/// Specifies the Display Output Post-Processing (DOPP) desktop texture information, which are provided by OpenGL via +/// interop. The DOPP is an OpenGL extension to allow its client to access the desktop texture directly without the +/// need of copying to system memory. This is only supported on Windows. +struct DoppDesktopInfo +{ + gpusize gpuVirtAddr; ///< The VA of the dopp desktop texture. Set to 0 for the non-dopp resource. 
+ uint32 vidPnSourceId; ///< Display source id of the dopp desktop texture. +}; + +/// Specifies the Direct Capture resource information. Direct Capture is an extension that allows direct access to the +/// on-screen primary, motion vectors, depth, and camera matrix. This is only supported on Windows. +struct DirectCaptureInfo +{ + uint32 vidPnSourceId; ///< VidPnSource ID of the on-screen primary. + union + { + struct + { + uint32 preflip : 1; ///< Requires pre-flip primary access + uint32 postflip : 1; ///< Requires post-flip primary access. A DirectCapture resource cannot + /// have pre-flip and post-flip access at the same time + uint32 accessDesktop : 1; ///< Requires access to the desktop + uint32 shared : 1; ///< This resource will be shared between APIs + uint32 frameGenRatio : 4; ///< Frame generation ratio + uint32 paceGeneratedFrame : 1; ///< Requires pacing the generated frames + uint32 requiresDisplayDcc : 1; ///< Requires display dcc support + uint32 requestMotionVectors : 1; ///< Request DirectCapture access to motion vector data if available + uint32 requestDepth : 1; ///< Request DirectCapture access to depth data if available + uint32 requestCamera : 1; ///< Request DirectCapture access to camera matrix data if available + uint32 initMotionVectors : 1; ///< Initialize the DirectCapture resource to access motion vector data + uint32 initDepth : 1; ///< Initialize the DirectCapture resource to access depth data + uint32 initCamera : 1; ///< Initialize the DirectCapture resource to access camera matrix + uint32 requestHudLessImage : 1; ///< Request DirectCapture access to HUD less image if available + uint32 initHudLessImage : 1; ///< Initialize the DirectCapture resource to access HUD less image + uint32 reserved : 14; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit uint. + } usageFlags; ///< Direct Capture usage flags. + + OsExternalHandle hNewFrameEvent; ///< Event to notify of a new frame available for pre-flip or post-flip access + OsExternalHandle hFatalErrorEvent; ///< Event to notify of a fatal error +}; +
+/// Specifies parameters for opening a shared GPU resource from a non-PAL device or non-local process. +struct ExternalResourceOpenInfo +{ + OsExternalHandle hExternalResource; ///< External GPU resource from another non-PAL device to open. +#if defined(__unix__) + HandleType handleType; ///< Type of the external GPU resource to be opened. +#endif + + union + { + struct + { + uint32 ntHandle : 1; ///< The provided hExternalResource is an NT handle instead of a default + /// KMT handle. + uint32 androidHwBufHandle : 1; ///< The provided hExternalResource is an Android hardware buffer handle + /// instead of fd. + uint32 isDopp : 1; ///< This is a Dopp texture, doppDesktopInfo is in use. + uint32 isDirectCapture : 1; ///< This is a Direct Capture resource, directCaptureInfo is in use. + uint32 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External resource open flags. + + union + { + DoppDesktopInfo doppDesktopInfo; ///< The information of dopp desktop texture. + DirectCaptureInfo directCaptureInfo; ///< The information of direct capture resource. + }; +}; + +/// Packed pixel display enumeration. +/// +/// In the medical imaging market space, there are several 10-bit per component color and grayscale displays +/// available. In addition to being high precision, these displays tend to be very high resolution. For grayscale +/// displays, one method of getting high pixel resolution in 10b precision is a proprietary method called +/// "packed pixel". Each of these packed pixel formats packs two/three 10-bit luminance values into a single +/// R8G8B8 pixel. +/// +/// Example Displays: +/// +/// EIZO GS510 +/// NEC MD21GS +/// TOTOKU ME55Xi2 +/// FIMI 3/5MP +/// +/// +/// The enumerations are named in a way to describe the format of the packed pixels.
Names for +/// formats with two or three pixels packed into a single word (corresponding to a simple RGB pixel) +/// follow this convention: +/// +/// LLLLLL_RRRRRR (L=left pixel, R=right pixel) or +/// LLL_MMM_RRR (L=left pixel, M=middle pixel, R=right pixel) +/// +/// The bit order for a pixel follows this convention: +/// +/// (ColorBand)MSB(ColorBand)LSB +/// +/// For example: G70B54 means that the MSBs are in bits 7-0 of the green channel, and the LSBs +/// are stored in bits 5-4 of the blue channel. +/// +enum class PackedPixelType : uint32 +{ + NotPacked = 0, ///< Pixels not packed, for standard color RGB8 monitor + SplitG70B54_R70B10, ///< 10-bit mono, split screen + SplitB70G10_R70G76, ///< 10-bit mono, split screen + G70B54_R70B10, ///< 10-bit mono, 2 adjacent pixels + B70R32_G70R76, ///< 10-bit mono, 2 adjacent pixels + B70R30_G70R74, ///< 12-bit mono, 2 adjacent pixels + B70_G70_R70, ///< 8-bit mono, 3 adjacent pixels + R70G76, ///< 10-bit mono, single pixel + G70B54, ///< 10-bit mono, single pixel + Native, ///< 10-bit color, without packing +}; + +/// Enumerates the logging priority levels supported by PAL. +enum class LogLevel : uint32 +{ + Debug = 0, ///< Debug messages + Verbose, ///< High frequency messages + Info, ///< Low frequency messages + Alert, ///< Warnings + Error, ///< Critical issues + Always ///< All messages +}; + +/// Enumerates all log categories explicitly defined by PAL +enum class LogCategory : uint64 +{ + Correctness = 0, ///< Application correctness + Performance, ///< Application performance + Internal, ///< Internal logging + Display, ///< Display Info + Count +}; + +/// String table used to register log categories +constexpr const char* LogCategoryTable[] = +{ + "Correctness", + "Performance", + "Internal", + "Display" +}; + +/// Typedef for log category masks.
+typedef uint64 LogCategoryMask; + +/// Log category mask for messages related to application correctness +constexpr LogCategoryMask LogCategoryMaskCorrectness = (1 << static_cast<uint32>(LogCategory::Correctness)); + +/// Log category mask for messages related to application performance +constexpr LogCategoryMask LogCategoryMaskPerformance = (1 << static_cast<uint32>(LogCategory::Performance)); + +/// Log category mask for messages related to internal messages +constexpr LogCategoryMask LogCategoryMaskInternal = (1 << static_cast<uint32>(LogCategory::Internal)); + +/// Log category mask for messages related to display information (e.g. HDR format) +constexpr LogCategoryMask LogCategoryMaskDisplay = (1 << static_cast<uint32>(LogCategory::Display)); + +/// Defines the modes that the GPU Profiling layer can be enabled with. If the GpuProfilerMode is +/// GpuProfilerTraceEnabledTtv or GpuProfilerTraceEnabledRgp, then the GpuProfilerConfig_TraceModeMask is examined to +/// configure the trace type (spm, sqtt or both) requested. +enum GpuProfilerMode : uint32 +{ + GpuProfilerDisabled = 0, ///< Gpu Profiler is disabled. + GpuProfilerCounterAndTimingOnly = 1, ///< Traces are disabled but perf counter and timing operations are enabled. + GpuProfilerTraceEnabledTtv = 2, ///< Traces are output in format (.csv, .out) for Thread trace viewer. + GpuProfilerTraceEnabledRgp = 3, ///< Trace data is output as .rgp file for Radeon Gpu Profiler. +}; + +// Defines the trigger keys for capturing the GPU profiler. +typedef Util::KeyCode GpuProfilerCaptureTriggerKey; + +#define PAL_EVENT_LOGGING_VERSION 528 + +/// This enumeration identifies the source/owner of a resource object, used for event logging.
+enum ResourceOwner : uint32 +{ + ResourceOwnerApplication = 0, ///< The resource is owned by the application + ResourceOwnerPalClient = 1, ///< The resource is owned by the PAL client + ResourceOwnerPal = 2, ///< The resource is owned by PAL + ResourceOwnerUnknown = 3, ///< The resource owner is unknown +}; + +/// This enumeration lists the usage/category of a resource object to give context in event logging. +enum ResourceCategory : uint32 +{ + ResourceCategoryApplication = 0, ///< The resource is used by the application. + ResourceCategoryRpm = 1, ///< The resource is used by RPM + ResourceCategoryProfiling = 2, ///< The resource is used for profiling (e.g. SQTT, SPM, etc) + ResourceCategoryDebug = 3, ///< The resource is used for debug purposes + ResourceCategoryRayTracing = 4, ///< The resource is used for ray tracing + ResourceCategoryVideo = 5, ///< The resource is used for video encode/decode + ResourceCategoryMisc = 6, ///< Miscellaneous, resource doesn't fit in any of the above categories + ResourceCategoryUnknown = 7, ///< The resource category is unknown +}; + +/// Set of information about resource ownership and usage, used for event logging. +struct ResourceEventInfo +{ + ResourceOwner owner; ///< Resource owner + ResourceCategory category; ///< Resource category +}; + +/// General purpose on/off/default tri-state enum. +enum class TriState : uint8 +{ + Default = 0, ///< Let implementation decide whether to enable or disable + Enable = 1, ///< Force enable + Disable = 2, ///< Force disable + Count +}; + +/// Defines the modes that the GPU Profiling layer can be enabled with. +/** + *********************************************************************************************************************** + * @mainpage + * + * Introduction + * ------------ + * The Platform Abstraction Library (PAL) provides hardware and OS abstractions for Radeon (GCN+) user-mode 3D graphics + * drivers. 
The level of abstraction is chosen to support performant driver implementations of several APIs while + * hiding the client from hardware and operating system details. + * + * PAL client drivers will have no HW-specific code; their responsibility is to translate API/DDI commands into PAL + * commands as efficiently as possible. This means that the client should be unaware of hardware registers, PM4 + * commands, SP3 shaders, etc. However, PAL is an abstraction of AMD hardware only, so many things in the PAL interface + * have an obvious correlation to hardware features. + * + * PAL client drivers should have little OS-specific code. PAL and its companion utility collection provide + * OS abstractions for almost everything a client might need, but there are some cases where this is unavoidable: + * + * + Handling dynamic library infrastructure. I.e., the client has to implement DllMain() on Windows, etc. + * + OS-specific APIs or extensions. DX may have Windows-specific functionality in the core API, and Vulkan may + * export certain OS-specific features as extensions (like for presenting contents to the screen). + * + Single OS clients (e.g., DX) may choose to make OS-specific calls directly simply out of convenience with no down + * side. + * + * + * The following diagram illustrates the software stack when running a 3D application with a PAL-based UMD. Non-AMD + * components are in gray, UMD client code is blue, AMD static libs linked into the UMD are green, and the AMD KMD + * is in red. + * + * @image html swStack.png + * + * PAL is a relatively _thick_ abstraction layer, typically accounting for the majority of code (excluding SC) in any + * particular UMD built on PAL. The level of abstraction tends to be higher in areas where client APIs are similar, + * and lower (closer to hardware) in areas where client APIs diverge significantly. The overall philosophy is to share + * as much code as possible without impacting client driver performance. 
Our committed goal is that CPU-limited + * performance should be within 5% of what a native solution could achieve, and GPU-limited performance should be within + * 2%. + * + * PAL uses a C++ interface. The public interface is defined in .../pal/inc, and clients must _only_ include headers + * from that directory. The interface is spread over many header files - typically one per class - in order to clarify + * dependencies and reduce build times. There are three sub-directories in .../pal/inc: + * + * + .../pal/inc/core - Defines the PAL Core (see @ref Overview). + * + .../pal/inc/gpuUtil - Defines the PAL GPU Utility Collection (see @ref GpuUtilOverview). + * + .../pal/inc/util - Defines the PAL Utility Collection (see @ref UtilOverview). + * + * + * @copydoc VersionHistory + * + * Next: @ref Build + *********************************************************************************************************************** + */ + +/** + *********************************************************************************************************************** + * @page Overview PAL Core Overview + * + * ### Introduction + * PAL's core interface is defined in the @ref Pal namespace, and defines an object-oriented model for interacting with + * the GPU and OS. The interface closely resembles the Vulkan and DX12 APIs. Some common features of these + * APIs that are central to the PAL interface: + * + * - All shader stages, and some additional "shader adjacent" state, are glommed together into a monolithic pipeline + * object. + * - Explicit, free-threaded command buffer generation. + * - Support for multiple, asynchronous engines for executing GPU work (graphics, compute, DMA). + * - Explicit system and GPU memory management. + * - Flexible shader resource binding model. + * - Explicit management of stalls, cache flushes, and compression state changes.
+ * + * However, as a common component supporting multiple APIs, the PAL interface tends to be lower level in places where + * client APIs diverge. + * + * ### Settings + * The PAL library has a number of configuration settings available for the client to modify either programmatically + * or via external settings. PAL also includes infrastructure for building/loading client-specific settings. + * See @ref Settings for a detailed description of this support. + * + * ### Initialization + * The first step to interacting with the PAL core is creating an IPlatform object and enumerating IDevice objects + * representing GPUs attached to the system and, optionally, IScreen objects representing displays attached to the + * system. See @ref LibInit for a detailed description. + * + * ### System Memory Allocation + * Clients have a lot of control over PAL's system memory allocations. Most PAL objects require the client to provide + * system memory; the client first calls a GetSize() method and then passes a pointer to PAL on the actual create call. + * Further, when PAL needs to make an internal allocation, it will optionally call a client callback, which can be + * specified on platform creation. This callback will specify a category for the allocation, which may imply an + * expected lifetime. + * + * ### Interface Classes + * The following diagram illustrates the relationship of some key PAL interfaces and how they interact to render a + * typical frame in a modern game. Below that is a listing of all of PAL's interface classes, and a very brief + * description of their purpose. Follow the link for each interface to see detailed reference documentation. + * + * @image html scheduling.png + * + * - __OS Abstractions__ + * + _IPlatform_: Root-level object created by clients that interact with PAL. Mostly responsible for enumerating + * devices and screens attached to the system and returning any system-wide properties.

+ * + _IDevice_: Configurable context for querying properties of a particular GPU and interacting with it. Acts as a + * factory for almost all other PAL objects.

+ * + _IQueue_: A device has one or more _engines_ which are able to issue certain types of work. Tahiti, for example, + * has 1 universal engine (supports graphics, compute, or copy commands), 2 compute engines (support + * compute or copy commands), and 2 DMA engines (support only copy commands). An IQueue object is a + * context for submitting work on a particular engine. This mainly takes the form of submitting command + * buffers and presenting images to the screen. Work performed in a queue will be started in order, but + * work executed on different queues (even if the queues reference the same engine) is not guaranteed + * to be ordered without explicit synchronization.

+ * + _IQueueSemaphore_: Queue semaphores can be signaled and waited on from an IQueue in order to control execution + * order between queues.

+ * + _IFence_: Used for coarse-grain CPU/GPU synchronization. Fences can be signalled from the GPU as part of a + * command buffer submission on a queue, then waited on from the CPU.

+ * + _IGpuMemory_: Represents a GPU-accessible memory allocation. Can either be virtual (only VA allocation which + * must be explicitly mapped via an IQueue operation) or physical. Residency of physical allocations + * must be managed by the client either globally for a device (IDevice::AddGpuMemoryReferences) or by + * specifying allocations referenced by command buffers at submit.

+ * + _ICmdAllocator_: GPU memory allocation pool used for backing an ICmdBuffer. The client is free to create one + * allocator per device, or one per thread to remove thread contention.

+ * + _IScreen_: Represents a display attached to the system. Mostly used for managing full-screen flip + * presents.

+ * + _IPrivateScreen_: Represents a display that is not otherwise visible to the OS, typically a VR head mounted + * display.

+ * - __Hardware IP Abstractions__ + * + __All IP__ + * - _ICmdBuffer_: Clients build command buffers to execute the desired work on the GPU, and submit them on a + * corresponding queue. Different types of work can be executed depending on the _queueType_ of + * the command buffer (graphics work, compute work, DMA work).

+ * - _IImage_: Images are a 1D, 2D, or 3D collection of pixels (i.e., _texture_) that can be accessed by the + * GPU in various ways: texture sampling, BLT source/destination, UAV, etc.

+ * + __GFXIP-only__ + * - _IShader_: Container for shader byte code used as an input to pipeline creation. No compilation occurs + * until an IPipeline is created. Currently, AMDIL is the only supported input language.

+ * - _IPipeline_: Comprised of all shader stages (CS for compute, VS/HS/DS/GS/PS for graphics), resource mappings + * describing how user data entries are to be used by the shaders, and some other fixed-function + * state like depth/color formats, blend enable, MSAA enable, etc.

+ * - _IColorTargetView_: IImage view allowing the image to be bound as a color target (i.e., RTV.).

+ * - _IDepthStencilView_: IImage view allowing the image to be bound as a depth/stencil target (i.e., DSV).

+ * - _IGpuEvent_: Used for fine-grained (intra-command buffer) synchronization between the CPU and GPU. GPU + * events can be set/reset from either the CPU or GPU and waited on from either.

+ * - _IQueryPool_: Collection of query slots for tracking occlusion or pipeline stats query results.

+ * - __Dynamic State Objects__: _IColorBlendState_, _IDepthStencilState_, _IMsaaState_, _IScissorState_, + * and _IViewportState_ define logical collections of related fixed function graphics + * state, similar to DX11.

+ * - _IPerfExperiment_: Used for gathering performance counter and thread trace data.

+ * - _IBorderColorPalette_: Provides a collection of indexable colors for use by samplers that clamp to an + * arbitrary border color.

+ * - __Common Base Classes__ + * + _IDestroyable_: Defines a _Destroy()_ method for the PAL interface. Calling _Destroy()_ will release any + * internally allocated resources for the object, but the client is still responsible for freeing + * the system memory provided for the object.

+ * + _IGpuMemoryBindable_: Defines a set of methods for binding GPU memory to the object. Interfaces that inherit + * _IGpuMemoryBindable_ require GPU memory in order to be used by the GPU. The client + * must query the requirements (e.g., alignment, size, heaps) and allocate/bind GPU memory + * for the object. _IGpuMemoryBindable_ inherits from _IDestroyable_.

+ * + * ### %Format Info + * Several helper methods are available for dealing with image formats in the @ref Formats namespace. + * + * ### Graphics/Compute Execution Model + * Most graphics/compute work is defined by first binding a set of states then issuing a draw or dispatch command to + * kick off the work. The complete set of graphics states available in PAL is illustrated below; compute is a subset + * of this that only includes the pipeline, user data entries, and border color palette. + * + * @image html stateBreakdown.jpg + * + * Most of these correspond directly to a PAL interface object above, and these items are bound by calling a + * corresponding _CmdBind...()_ method in the ICmdBuffer interface. The states marked in yellow and orange, however, + * are _immediate_ states for which there is no object, you just specify the required state values in the corresponding + * _CmdSet...()_ method in the ICmdBuffer interface. + * + * User data entries are the way that input resources are specified for the pipeline on an upcoming draw/dispatch. This + * mapping is complicated, and is described fully in @ref ResourceBinding. + * + * A final complication worth noting is that PAL provides no implicit surface synchronization. The client is + * responsible for explicitly inserting barriers to resolve data hazards, flush/invalidate caches, and ensure images + * are in the proper compression state. For more detail, see ICmdBuffer::CmdReleaseThenAcquire, CmdRelease, CmdAcquire, + * CmdReleaseEvent, CmdAcquireEvent and AcquireReleaseInfo. 
+ * + *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h index 9fca2615d2..542d470247 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdAllocator.h @@ -1,204 +1,204 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palCmdAllocator.h - * @brief Defines the Platform Abstraction Library (PAL) ICmdAllocator interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" - -namespace Pal -{ - -// Forward declarations. -struct GpuMemSubAllocInfo; -class IGpuMemory; - -/// Flags controlling the creation of ICmdAllocator objects. -union CmdAllocatorCreateFlags -{ - struct - { - uint32 threadSafe : 1; ///< If set, the allocator will acquire a lock each time it is accessed; - /// otherwise it will not attempt to protect itself from multithreaded - /// access. - uint32 autoMemoryReuse : 1; ///< If set, the allocator will track when the GPU finishes accessing - /// each piece of command memory and attempt to reuse memory which the - /// GPU is done with before allocating more memory from the OS. If not - /// set, memory will only be recycled after a call to - /// @ref ICmdAllocator::Reset(). - uint32 disableBusyChunkTracking : 1; ///< If set, the allocator will not do any GPU-side tracking of which - /// command chunks are still in use. It will be the client's (or the - /// application's) responsibility to guarantee that command chunks are - /// not returned to the allocator before the GPU has finished processing - /// them. Failure to guarantee this will result in undefined behavior. - /// This flag has no effect if @ref autoMemoryReuse is not set. - uint32 autoTrimMemory : 1; ///< If set the allocator will automatically trim down the allocations - /// (where all chunks are idle on the freeList). A minimum of - /// allocFreeThreshold allocations are kept for fast reuse. 
- uint32 reserved : 28; ///< Reserved for future use. - }; - - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Different type of allocation data that an ICmdAllocator allocates and distributes to command buffers. -enum CmdAllocType : uint32 -{ - CommandDataAlloc = 0, ///< Data allocated is for executable commands. - EmbeddedDataAlloc, ///< Data allocated is for embedded data. - LargeEmbeddedDataAlloc, ///< Data allocated is for embedded data, allocation is >32kb - GpuScratchMemAlloc, ///< Data allocated is GPU-only accessible at command buffer execution-time. Possible - /// uses like GPU events. - CmdAllocatorTypeCount ///< Number of allocation types for ICmdAllocator's. -}; - -/// Specifies properties for creation of an ICmdAllocator object. Input structure to IDevice::CreateCmdAllocator(). -struct CmdAllocatorCreateInfo -{ - CmdAllocatorCreateFlags flags; ///< Flags controlling command allocator creation. - - struct - { - GpuHeap allocHeap; ///< Preferred allocation heap. For @ref GpuScratchMemAlloc, this field is - /// ignored and the allocation will always be in GPU-invisible memory. For - /// all other allocation types, this must be CPU-mappable. - /// For best performance, command allocators that will be used by the - /// UVD engine should prefer the Local heap - gpusize allocSize; ///< Size, in bytes, of the GPU memory allocations this allocator will create. - /// It must be an integer multiple of suballocSize. - gpusize suballocSize; ///< Size, in bytes, of the chunks of GPU memory this allocator will give to - /// command buffers. It must be an integer multiple of 4096. - /// Must be greater than zero even if the client doesn't plan on using this - /// allocation type. - uint32 allocFreeThreshold; ///< Minimum count of free allocations that the allocator should keep around - /// for fast reuse. It is used when the autoTrimMemory flag is set. - } allocInfo[CmdAllocatorTypeCount]; ///< Information for each allocation type. 
-}; - -/// Output structure for QueryUtilizationInfo(). -/// The CmdAllocator utilization data can be queried by PAL clients in order to decide whether to trim the allocations. -struct CmdAllocatorUtilizationInfo -{ - uint32 numAllocations; ///< Number of allocations owned by the allocator. - uint32 numFreeChunks; ///< Number of chunks that are reset and not in use. - uint32 numBusyChunks; ///< Number of chunks that in use by the GPU. - uint32 numReuseChunks; ///< Number of chunks that have been 'returned' to the allocator for reuse. -}; - -/** - *********************************************************************************************************************** - * @interface ICmdAllocator - * @brief Allocates and distributes GPU memory to command buffers on the client's behalf. - * - * All ICmdBuffer objects must be associated with an ICmdAllocator at creation. Command buffers may switch command - * allocators when ICmdBuffer::Reset() is called. The set of command buffers associated with a given command allocator - * will query that allocator for additional GPU memory as they are building commands. - * - * To protect against race conditions the client must ask for a thread safe command allocator unless its can guarantee - * that all command buffers associated with a given command allocator will be built, reset, and destroyed in a thread- - * safe manner. It is illegal to destroy a command allocator while it still has command buffers associated with it. - * - * @see IDevice::CreateCmdAllocator() - *********************************************************************************************************************** - */ -class ICmdAllocator : public IDestroyable -{ -public: - /// Explicitly resets a command allocator, marking all internal GPU memory allocations as unused. 
- /// - /// The client is responsible for guaranteeing that all command buffers associated with this allocator have finished - /// GPU execution and have been explicitly reset before calling this function. - /// - /// @param [in] freeMemory If the all GPU and CPU memory allocations should be returned to the OS. - /// - /// @returns Success if the command allocator was successfully reset. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Result Reset(bool freeMemory) = 0; - - /// Explicitly trims a command allocator, deleting as many unused internal GPU memory allocations as possible. - /// - /// @returns Success if the command allocator was successfully trimmed. - /// - /// @param [in] allocTypeMask Gives control whether trimming will be applied for each CmdAllocType. - /// Use (1 << CmdAllocatorTypeCount) - 1 to apply trimming to all types. - /// When trimming only the embedded date use (1 << EmbeddedDataAlloc). - /// @param [in] dynamicThreshold Minimum count of free allocations that the allocator should keep around - - virtual Result Trim(uint32 allocTypeMask, uint32 dynamicThreshold) = 0; - - /// Query the numbers of allocations and chunks of the given CmdAllocator type. - /// This may help clients to decide whether they may apply trimming or not. - /// - /// @returns Success if valid values can be reported. - /// - /// @param [in] type CmdAllocType that is being queried - /// @param [out] pUtilizationInfo The allocation and chunk counts will be stored here. - - virtual Result QueryUtilizationInfo(CmdAllocType type, CmdAllocatorUtilizationInfo* pUtilizationInfo) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. 
- void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - ICmdAllocator() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~ICmdAllocator() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdAllocator.h + * @brief Defines the Platform Abstraction Library (PAL) ICmdAllocator interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +// Forward declarations. +struct GpuMemSubAllocInfo; +class IGpuMemory; + +/// Flags controlling the creation of ICmdAllocator objects. +union CmdAllocatorCreateFlags +{ + struct + { + uint32 threadSafe : 1; ///< If set, the allocator will acquire a lock each time it is accessed; + /// otherwise it will not attempt to protect itself from multithreaded + /// access. + uint32 autoMemoryReuse : 1; ///< If set, the allocator will track when the GPU finishes accessing + /// each piece of command memory and attempt to reuse memory which the + /// GPU is done with before allocating more memory from the OS. If not + /// set, memory will only be recycled after a call to + /// @ref ICmdAllocator::Reset(). + uint32 disableBusyChunkTracking : 1; ///< If set, the allocator will not do any GPU-side tracking of which + /// command chunks are still in use. 
It will be the client's (or the + /// application's) responsibility to guarantee that command chunks are + /// not returned to the allocator before the GPU has finished processing + /// them. Failure to guarantee this will result in undefined behavior. + /// This flag has no effect if @ref autoMemoryReuse is not set. + uint32 autoTrimMemory : 1; ///< If set the allocator will automatically trim down the allocations + /// (where all chunks are idle on the freeList). A minimum of + /// allocFreeThreshold allocations are kept for fast reuse. + uint32 reserved : 28; ///< Reserved for future use. + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Different type of allocation data that an ICmdAllocator allocates and distributes to command buffers. +enum CmdAllocType : uint32 +{ + CommandDataAlloc = 0, ///< Data allocated is for executable commands. + EmbeddedDataAlloc, ///< Data allocated is for embedded data. + LargeEmbeddedDataAlloc, ///< Data allocated is for embedded data, allocation is >32kb + GpuScratchMemAlloc, ///< Data allocated is GPU-only accessible at command buffer execution-time. Possible + /// uses like GPU events. + CmdAllocatorTypeCount ///< Number of allocation types for ICmdAllocator's. +}; + +/// Specifies properties for creation of an ICmdAllocator object. Input structure to IDevice::CreateCmdAllocator(). +struct CmdAllocatorCreateInfo +{ + CmdAllocatorCreateFlags flags; ///< Flags controlling command allocator creation. + + struct + { + GpuHeap allocHeap; ///< Preferred allocation heap. For @ref GpuScratchMemAlloc, this field is + /// ignored and the allocation will always be in GPU-invisible memory. For + /// all other allocation types, this must be CPU-mappable. + /// For best performance, command allocators that will be used by the + /// UVD engine should prefer the Local heap + gpusize allocSize; ///< Size, in bytes, of the GPU memory allocations this allocator will create. + /// It must be an integer multiple of suballocSize. 
+ gpusize suballocSize; ///< Size, in bytes, of the chunks of GPU memory this allocator will give to + /// command buffers. It must be an integer multiple of 4096. + /// Must be greater than zero even if the client doesn't plan on using this + /// allocation type. + uint32 allocFreeThreshold; ///< Minimum count of free allocations that the allocator should keep around + /// for fast reuse. It is used when the autoTrimMemory flag is set. + } allocInfo[CmdAllocatorTypeCount]; ///< Information for each allocation type. +}; + +/// Output structure for QueryUtilizationInfo(). +/// The CmdAllocator utilization data can be queried by PAL clients in order to decide whether to trim the allocations. +struct CmdAllocatorUtilizationInfo +{ + uint32 numAllocations; ///< Number of allocations owned by the allocator. + uint32 numFreeChunks; ///< Number of chunks that are reset and not in use. + uint32 numBusyChunks; ///< Number of chunks that are in use by the GPU. + uint32 numReuseChunks; ///< Number of chunks that have been 'returned' to the allocator for reuse. +}; + +/** + *********************************************************************************************************************** + * @interface ICmdAllocator + * @brief Allocates and distributes GPU memory to command buffers on the client's behalf. + * + * All ICmdBuffer objects must be associated with an ICmdAllocator at creation. Command buffers may switch command + * allocators when ICmdBuffer::Reset() is called. The set of command buffers associated with a given command allocator + * will query that allocator for additional GPU memory as they are building commands. + * + * To protect against race conditions the client must ask for a thread safe command allocator unless it can guarantee + * that all command buffers associated with a given command allocator will be built, reset, and destroyed in a thread- + * safe manner.
It is illegal to destroy a command allocator while it still has command buffers associated with it. + * + * @see IDevice::CreateCmdAllocator() + *********************************************************************************************************************** + */ +class ICmdAllocator : public IDestroyable +{ +public: + /// Explicitly resets a command allocator, marking all internal GPU memory allocations as unused. + /// + /// The client is responsible for guaranteeing that all command buffers associated with this allocator have finished + /// GPU execution and have been explicitly reset before calling this function. + /// + /// @param [in] freeMemory Whether all GPU and CPU memory allocations should be returned to the OS. + /// + /// @returns Success if the command allocator was successfully reset. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result Reset(bool freeMemory) = 0; + + /// Explicitly trims a command allocator, deleting as many unused internal GPU memory allocations as possible. + /// + /// @returns Success if the command allocator was successfully trimmed. + /// + /// @param [in] allocTypeMask Gives control whether trimming will be applied for each CmdAllocType. + /// Use (1 << CmdAllocatorTypeCount) - 1 to apply trimming to all types. + /// When trimming only the embedded data use (1 << EmbeddedDataAlloc). + /// @param [in] dynamicThreshold Minimum count of free allocations that the allocator should keep around + + virtual Result Trim(uint32 allocTypeMask, uint32 dynamicThreshold) = 0; + + /// Query the numbers of allocations and chunks of the given CmdAllocator type. + /// This may help clients to decide whether they may apply trimming or not. + /// + /// @returns Success if valid values can be reported. + /// + /// @param [in] type CmdAllocType that is being queried + /// @param [out] pUtilizationInfo The allocation and chunk counts will be stored here.
+ + virtual Result QueryUtilizationInfo(CmdAllocType type, CmdAllocatorUtilizationInfo* pUtilizationInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + ICmdAllocator() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~ICmdAllocator() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h index 059bcfe9e2..ca38a765c1 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdBuffer.h @@ -1,5204 +1,5118 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved.
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palCmdBuffer.h - * @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types. 
- *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDevice.h" -#include "palGpuMemory.h" -#include "palImage.h" -#include "palMsaaState.h" -#include "palPipeline.h" -#include "palQueryPool.h" -#include "palCmdTracking.h" - -/// HSA kernel dispatch packet typedef -typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t; -/// AMD kernel code typedef -typedef struct amd_kernel_code_s amd_kernel_code_t; -/// AMD kernel descriptor -namespace llvm {namespace amdhsa {struct kernel_descriptor_t;} } - -namespace Util -{ -class VirtualLinearAllocator; -class Event; -} - -namespace Pal -{ - -// Forward declarations. -class IBorderColorPalette; -class ICmdAllocator; -class ICmdBuffer; -class IColorBlendState; -class IColorTargetView; -class IDepthStencilState; -class IDepthStencilView; -class IGpuEvent; -class IGpuMemory; -class IIndirectCmdGenerator; -class IMsaaState; -class IPerfExperiment; -class IQueue; -class IQueryPool; -enum class PerfTraceMarkerType : uint32; -enum class PointOrigin : uint32; - -struct VideoCodecInfo; -struct VideoCodecAuxInfo; - -/// Specifies a pipeline bind point (i.e., compute or graphics). -enum class PipelineBindPoint : uint32 -{ - Compute = 0x0, - Graphics = 0x1, - Count -}; - -/// Fully specifies a type of graphics primitive and vertex ordering for geometry. -enum class PrimitiveTopology : uint8 -{ - PointList = 0x0, - LineList = 0x1, - LineStrip = 0x2, - TriangleList = 0x3, - TriangleStrip = 0x4, - RectList = 0x5, ///< Each rect is three 2D axis-aligned rectangle vertices. - QuadList = 0x6, - QuadStrip = 0x7, - LineListAdj = 0x8, - LineStripAdj = 0x9, - TriangleListAdj = 0xA, - TriangleStripAdj = 0xB, - Patch = 0xC, - TriangleFan = 0xD, - LineLoop = 0xE, - Polygon = 0xF, - TwoDRectList = 0x10, ///< Each rect is the bounding box of an arbitrary 2D triangle. 
- /// Support is optional, see support2DRectList in DeviceProperties. - Count -}; - -/// Specifies how triangle primitives should be rasterized. -enum class FillMode : uint8 -{ - Points = 0x0, - Wireframe = 0x1, - Solid = 0x2, - Count -}; - -/// Specifies the triangle face direction that should result in culled primitives. -enum class CullMode : uint8 -{ - _None = 0x0, ///< All triangles are rasterized. - Front = 0x1, ///< Front facing triangles are culled. - Back = 0x2, ///< Back facing triangles are culled. - FrontAndBack = 0x3, ///< All triangles are culled. - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. -#ifndef None - None = _None, ///< All triangles are rasterized. -#endif -}; - -/// Specifies vertex winding order corresponding to a front facing triangle. @see CullMode. -enum class FaceOrientation : uint8 -{ - Ccw = 0x0, ///< Counter-clockwise vertex winding primitives are front facing. - Cw = 0x1 ///< Clockwise vertex winding primitives are front facing. -}; - -/// Specifies which vertex of a primitive is the _provoking vertex_. This impacts which vertex's "flat" VS outputs -/// are passed to the PS (i.e., flat shading). -enum class ProvokingVertex : uint8 -{ - First = 0x0, - Last = 0x1 -}; - -/// Specifies bit size of each element in an index buffer. -enum class IndexType : uint32 -{ - Idx8 = 0x0, - Idx16 = 0x1, - Idx32 = 0x2, - Count -}; - -/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic(). 
-enum class AtomicOp : uint32 -{ - AddInt32 = 0x00, - SubInt32 = 0x01, - MinUint32 = 0x02, - MaxUint32 = 0x03, - MinSint32 = 0x04, - MaxSint32 = 0x05, - AndInt32 = 0x06, - OrInt32 = 0x07, - XorInt32 = 0x08, - IncUint32 = 0x09, - DecUint32 = 0x0A, - AddInt64 = 0x0B, - SubInt64 = 0x0C, - MinUint64 = 0x0D, - MaxUint64 = 0x0E, - MinSint64 = 0x0F, - MaxSint64 = 0x10, - AndInt64 = 0x11, - OrInt64 = 0x12, - XorInt64 = 0x13, - IncUint64 = 0x14, - DecUint64 = 0x15, - Count -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 -/// Specifies the point in the GPU pipeline where an action should take place. -/// -/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps. -/// -/// @note The numeric value of these enums are ordered such that a "newState < oldState" comparison will generally yield -/// true if a stall is necessary to resolve a hazard between those two pipe points. This guideline does not -/// hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with -/// graphics shader work. -/// -/// @see ICmdBuffer::CmdSetEvent() -/// @see ICmdBuffer::CmdResetEvent() -/// @see ICmdBuffer::CmdPredicateEvent() -/// @see ICmdBuffer::CmdBarrier() -/// @see ICmdBuffer::CmdWriteTimestamp() -/// @see ICmdBuffer::CmdWriteImmediate() -enum HwPipePoint : uint32 -{ - HwPipeTop = 0x0, ///< Earliest possible point in the GPU pipeline (CP PFP), can be - /// used as wait point for indirect args and index buffer fetch. - HwPipePostPrefetch = 0x1, ///< Indirect arguments have been fetched for all prior - /// draws/dispatches (CP ME). - HwPipePreRasterization = 0x2, ///< All prior generated VS/HS/DS/GS waves have completed, can be - /// used as release point for VB/IB fetch and streamout target. - HwPipePostPs = 0x3, ///< All prior generated PS waves have completed. - /// Only valid as a pipe point to wait on (release point). 
- HwPipePreColorTarget = 0x4, ///< Represents the same point in pipe to HwPipePostPs, but provides - /// clients with a better option to accurately specify the pipeline - /// sync request. And PAL uses it as entry-point to add partial - /// flushes to prevent write-after-read hazard from corner cases. - /// Only valid as a wait point (acquire point). - HwPipePreIndexBuffer = HwPipeTop, ///< As late as possible before index buffer fetches (CP PFP). - HwPipePostIndexBuffer = HwPipePreRasterization,///< All prior index buffer fetches have completed. - - // The following points apply to compute-specific work: - HwPipePreCs = HwPipePostPrefetch, ///< As late as possible before CS waves are launched (CP ME). - HwPipePostCs = 0x5, ///< All prior generated CS waves have completed. - - // The following points apply to BLT-specific work: - HwPipePreBlt = HwPipePostPrefetch, ///< As late as possible before BLT operations are launched. - HwPipePostBlt = 0x6, ///< All prior requested BLTs have completed. - - HwPipeBottom = 0x7, ///< All prior GPU work (graphics, compute, or BLT) has completed. - HwPipePointCount -}; -#endif - -/// Bitmask values that can be OR'ed together to specify a synchronization scope. See srcStageMask and dstStageMask in -/// @ref AcquireReleaseInfo. -/// -/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before* -/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior -/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs. -/// -/// Note that flag numerical order does not indicate any happens-before or happens-after relationships. Clients should -/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering. 
-enum PipelineStageFlag : uint32 -{ - PipelineStageTopOfPipe = 0x00000001, - PipelineStageFetchIndirectArgs = 0x00000002, - PipelineStagePostPrefetch = 0x00000004, - PipelineStageFetchIndices = 0x00000008, - PipelineStageStreamOut = 0x00000010, - PipelineStageVs = 0x00000020, - PipelineStageHs = 0x00000040, - PipelineStageDs = 0x00000080, - PipelineStageGs = 0x00000100, - PipelineStagePs = 0x00000200, - PipelineStageSampleRate = 0x00000400, - PipelineStageEarlyDsTarget = 0x00000800, - PipelineStageLateDsTarget = 0x00001000, - PipelineStageColorTarget = 0x00002000, - PipelineStageCs = 0x00004000, - PipelineStageBlt = 0x00008000, - PipelineStageBottomOfPipe = 0x00010000, - PipelineStageDsTarget = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget, - PipelineStageAllStages = 0x0001FFFF -}; - -/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time. Such a -/// mask should be specified in the usages field of ImageLayout. These combined usages can be examined by PAL to infer -/// the layout (i.e., compression state) of the image. -/// -/// @note There is no layout corresponding to CmdClear*(). The layout flags passed to those functions will determine -/// the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the -/// layout the same. -enum ImageLayoutUsageFlags : uint32 -{ - LayoutUninitializedTarget = 0x00000001, ///< Initial state of any image that can be used as a color or - /// depth/stencil target. A layout transition out of this state will - /// likely result in a mask RAM initialization BLT. If this bit is - /// set, no other bits may be set. - LayoutColorTarget = 0x00000002, ///< Color target bound via CmdBindTargets(). This bit is exclusive - /// with LayoutDepthStencilTarget. - LayoutDepthStencilTarget = 0x00000004, ///< Depth/stencil target bound via CmdBindTargets(). This bit is - /// exclusive with LayoutColorTarget. 
- LayoutShaderRead = 0x00000008, ///< Any shader read state including texture, UAV, constant buffer, - /// vertex buffer. - LayoutShaderFmaskBasedRead = 0x00000010, ///< Images in this state support the load_fptr AMD IL instruction, - /// which will read decompressed fmask in order to access compressed - /// MSAA color data from a shader. - LayoutShaderWrite = 0x00000020, ///< Writeable UAV. - LayoutCopySrc = 0x00000040, ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage or - /// CmdCopyTiledImageToMemory() source image. - LayoutCopyDst = 0x00000080, ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage or - /// CmdCopyMemoryToTiledImage() destination image. - LayoutResolveSrc = 0x00000100, ///< CmdResolveImage() source. - LayoutResolveDst = 0x00000200, ///< CmdResolveImage() destination. - LayoutPresentWindowed = 0x00000400, ///< Windowed-mode IQueue::Present(). - LayoutPresentFullscreen = 0x00000800, ///< Fullscreen (flip) present. Layout must be supported by the - /// display engine. - LayoutUncompressed = 0x00001000, ///< Metadata fully decompressed/expanded layout - LayoutSampleRate = 0x00002000, ///< CmdBindSampleRateImage() source. - LayoutVideoEncodeRead = 0x00004000, ///< Video encoder input image layout, output is buffer so no layout. - LayoutVideoDecodeWrite = 0x00008000, ///< Video decoder output image layout, input is buffer so no layout. - LayoutAllUsages = 0x0000FFFF, -}; - -/// Bitmask values that can be ORed together to specify all potential engines an image might be used on. Such a -/// mask should be specified in the engines field of ImageLayout. -/// -/// Generally speaking, image transition inside the all video queues doesn't require barrier including stall, cache -/// sync and layout transition. For transition across queues, we rely inter-queue sync to guarantee the stall -/// and cache sync. However, it's possible the layout transition is incompatible and we need handle it. 
Clients can -/// call @ref IImage::IsLayoutTransitionCompatible() to check if the transition is compatible or not; if not, -/// must issue a barrier to do the layout transition. Note that Layout transitions must always be executed on Universal -/// or Compute queues; and DMA queue only supports metadata initialization transition. -/// -/// If the client API is unable to determine which engines might be used, it should specify all possible engines -/// corresponding to the usage flags. -enum ImageLayoutEngineFlags : uint32 -{ - LayoutUniversalEngine = 0x1, - LayoutComputeEngine = 0x2, - LayoutDmaEngine = 0x4, - LayoutVideoEncodeEngine = 0x8, - LayoutVideoDecodeEngine = 0x10, - LayoutVideoJpegDecodeEngine = 0x20, - LayoutAllEngines = 0x3F -}; - -/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or -/// GPU memory in a ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages. -enum CacheCoherencyUsageFlags : uint32 -{ - CoherCpu = 0x00000001, ///< Data read or written by CPU. - CoherShaderRead = 0x00000002, ///< Data read by a GPU shader. - CoherShaderWrite = 0x00000004, ///< Data written by a GPU shader. - CoherCopySrc = 0x00000008, ///< Source of a ICmdBuffer::CmdCopy*() call. - CoherCopyDst = 0x00000010, ///< Destination of a ICmdBuffer::CmdCopy*() call. - CoherColorTarget = 0x00000020, ///< Color target. - CoherDepthStencilTarget = 0x00000040, ///< Depth stencil target. - CoherResolveSrc = 0x00000080, ///< Source of a CmdResolveImage() call. - CoherResolveDst = 0x00000100, ///< Destination of a CmdResolveImage() call. - CoherClear = 0x00000200, ///< Destination of a CmdClear() call. - CoherIndirectArgs = 0x00000400, ///< Source argument data read by CmdDrawIndirect() and similar functions. - CoherIndexData = 0x00000800, ///< Index buffer data. - CoherQueueAtomic = 0x00001000, ///< Destination of a CmdMemoryAtomic() call. 
- CoherTimestamp = 0x00002000, ///< Destination of a CmdWriteTimestamp() call. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 - CoherStreamOut = 0x00004000, ///< Data written as stream output. - CoherMemory = 0x00008000, ///< Data read or written directly from/to memory - CoherSampleRate = 0x00010000, ///< CmdBindSampleRateImage() source. - CoherPresent = 0x00020000, ///< Source of present. - CoherCp = 0x00080000, ///< HW Command Processor (CP) encompassing the front - end command - CoherAllUsages = 0x000FFFFF, ///< processing of any queue, including SDMA. -#else - CoherCeLoad = 0x00004000, ///< Source of a CmdLoadCeRam() call. - CoherCeDump = 0x00008000, ///< Destination of CmdDumpCeRam() call. - CoherStreamOut = 0x00010000, ///< Data written as stream output. - CoherMemory = 0x00020000, ///< Data read or written directly from/to memory - CoherSampleRate = 0x00040000, ///< CmdBindSampleRateImage() source. - CoherPresent = 0x00080000, ///< Source of present. - CoherCp = 0x00200000, ///< HW Command Processor (CP) encompassing the front - end command - CoherAllUsages = 0x003FFFFF, ///< processing of any queue, including SDMA. -#endif - - CoherShader = CoherShaderRead | CoherShaderWrite, - CoherCopy = CoherCopySrc | CoherCopyDst, - CoherResolve = CoherResolveSrc | CoherResolveDst, -}; - -/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage(). -enum ClearColorImageFlags : uint32 -{ - ColorClearAutoSync = 0x01, ///< PAL will automatically insert required barrier synchronization before - /// and after the clear assuming all subresources to be cleared are currently - /// ready for rendering as a color target (as is required by API convention in - /// DX12). Allows reduced sync costs in some situations since PAL knows - /// the details of how the clear will be performed. - ColorClearForceSlow = 0x02, ///< Force these to use slow clears. - ColorClearSkipIfSlow = 0x04, ///< Only issue the clear if it is a fast clear. 
- ColorClearInitMetaData = 0x08, ///< PAL will make sure initialize all metadata (including internal metadata state - /// data) for this image to be cleared. This is typically used for placed resource - /// initialization (as required by API convention in DX12); should only be used - /// when this is a full box clear. - ColorClearAllFlags = 0x0F ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil(). -enum ClearDepthStencilFlags : uint32 -{ - DsClearAutoSync = 0x01, ///< PAL will automatically insert required barrier synchronization before - /// and after the clear assuming all subresources to be cleared are currently - /// ready for rendering as a depth/stencil target (as is required by API convention - /// in DX12). Allows reduced sync costs in some situations since PAL knows the - /// details of how the clear will be performed. - DsClearInitMetaData = 0x02, ///< PAL will make sure initialize all metadata (including internal metadata state - /// data) for this image to be cleared. This is typically used for placed resource - /// initialization (as is required by API convention in DX12); should only be used - /// when this is a full box clear. Note that if clients call @ref - /// CmdClearDepthStencil() with this flag, MUST call @ref CmdUpdateHiSPretests() - /// after clear call otherwise HiSPretests will be overridden to initialized state. - DsClearAllFlags = 0x03 ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage(). -enum ResolveImageFlags : uint32 -{ - ImageResolveInvertY = 0x00000001, ///< PAL will invert the y-axis (flip upside down) of the resolved region to - /// the destination image. - ImageResolveDstAsSrgb = 0x00000002, ///< If set, a non-srgb destination image will be treated as srgb format. 
- /// The flag cannot be set when @ref ImageResolveDstAsNorm is set. - ImageResolveDstAsNorm = 0x00000004, ///< If set, a srgb destination image will be treated as non-srgb format. - /// The flag cannot be set when @ref ImageResolveDstAsSrgb is set. - ImageResolveSrcAsNorm = 0x00000008, ///< If set, a srgb source image will be treated as non-srgb format. - ImageResolveAllFlags = 0x0000000F ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/// Specifies properties for creation of an ICmdBuffer object. Input structure to IDevice::CreateCmdBuffer(). -struct CmdBufferCreateInfo -{ - ICmdAllocator* pCmdAllocator; ///< The command buffer will use this command allocator to allocate all GPU memory - /// If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset - /// with a non-null pCmdAllocator before calling ICmdBuffer::Begin. - QueueType queueType; ///< Type of queue commands in this command buffer will target. - /// This defines the set of allowed actions in the command buffer. - QueuePriority queuePriority; ///< Priority level of the queue this command buffer will target. - EngineType engineType; ///< Type of engine the queue commands will run on. - - union - { - struct - { - /// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root" - /// command buffer. Nested command buffers differ from root command buffers in how they are sent to the - /// GPU for execution: root command buffers must be submitted to the hardware by calling - /// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root - /// command buffer. - /// - /// Currently, only Universal and Compute command buffers can be nested. Nesting DMA command buffers is - /// meaningless and unsupported. It is an error to attempt to create a nested DMA command buffer. - /// - /// @see ICmdBuffer::CmdExecuteNestedCmdBuffers. 
- uint32 nested : 1; - - /// Dedicated CUs are reserved for this queue. Thus we have to skip CU mask programming. - uint32 realtimeComputeUnits : 1; - - /// Target queue uses dispatch tunneling. - uint32 dispatchTunneling : 1; - - /// Indicates that each subsequent Dispatch command is desired to be executed in alternating - /// order of forward and reverse workgroup walk order. This can improve cache locality when - /// subsequent Dispatches consume data from the previous Dispatch and the overall footprint - /// does not fit in cache. - /// This is a best effort as not all implementations or Queues may support this. - uint32 dispatchPingPongWalk : 1; - - /// Reserved for future use. - uint32 reserved : 28; - }; - - /// Flags packed as 32-bit uint. - uint32 u32All; - - } flags; ///< Command buffer creation flags. -}; - -/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling -/// root-level command buffer. -union InheritedStateFlags -{ - struct - { - /// Color and depth target views are inherited from the root-level command buffer. The nested command buffer - /// should not modify this state. - uint32 targetViewState : 1; - - /// Occlusion query is inherited from the root-level command buffer. The nested command buffer - /// should not modify this state. - uint32 occlusionQuery : 1; - - /// Predication is inherited from the root-level command buffer. The nested command buffer should not modify - /// this state. - uint32 predication : 1; - - /// Reserved for future usage. - uint32 reserved : 29; - }; - - /// Flags packed as 32-bit uint. - uint32 u32All; -}; - -/// Specifies parameters inherited from primary command buffer into nested command buffer. -struct InheritedStateParams -{ -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891 - uint32 colorTargetCount; ///< Number of color targets bound in the - /// root-level command buffer. 
- SwizzledFormat colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color - /// target. - uint32 sampleCount[MaxColorTargets]; ///< Sample count for each color target. -#endif - InheritedStateFlags stateFlags; ///< States that are inherited from the - /// calling root-level command buffer. -}; - -/// Specifies optional hints to control command buffer building optimizations. -union CmdBufferBuildFlags -{ - struct - { - /// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end - /// limited. These optimizations include removing redundant PM4 commands and reducing the VGT prim group size. - /// This flag might increase the CPU overhead of building command buffers. - uint32 optimizeGpuSmallBatch : 1; - - /// Optimize command buffer building for exclusive command buffer submission. Command buffers built with this - /// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that - /// they are no longer in use. This flag allows PAL to modify the contents of command buffers during - /// submission. - uint32 optimizeExclusiveSubmit : 1; - - /// Optimize command buffer building for single command buffer submission. Command buffers built with this flag - /// cannot be submitted more than once. This flag allows PAL to modify the contents of command buffers during - /// submission. This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set - /// optimizeExclusiveSubmit if this flag is set. - uint32 optimizeOneTimeSubmit : 1; - - /// Indicates that the client is providing custom tessellation distribution settings. If set, it is the clients - /// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided. - uint32 optimizeTessDistributionFactors : 1; - - /// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound - /// pipeline object. 
This optimization might increase the CPU overhead of building command buffers and/or - /// introduce additional front-end GPU bottlenecks. - uint32 prefetchShaders : 1; - - /// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end. - /// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes. - uint32 prefetchCommands : 1; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - /// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(), - /// or CmdWriteCeRam() - uint32 usesCeRamCmds : 1; -#else - uint32 placeholder914 : 1; -#endif - - /// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet. - /// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if - /// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers. - uint32 disallowNestedLaunchViaIb2 : 1; - - /// placeholder - uint32 placeholder1 : 2; - - /// Enable TMZ mode to allow reading TMZ protected allocations. If this command buffer attempts to write - /// non-TMZ memory, the results are undefined. Only valid for graphics and compute. - uint32 enableTmz : 1; - - /// @internal - /// Build this command buffer in system memory - /// - /// @warning This is an internal flag and its existence, its signature and its semantics are not guaranteed - /// across different PAL versions. - uint32 buildInSysMem : 1; - - /// If set, internal operations such as blits, copies, etc. will not affect active Query results. - /// Otherwise they may affect the results. - uint32 disableQueryInternalOps : 1; - - uint32 optimizeContextStatesPerBin : 1; - uint32 optimizePersistentStatesPerBin : 1; - - /// Reserved for future use. - uint32 reserved : 16; - }; - - /// Flags packed as 32-bit uint. - uint32 u32All; -}; - -/// Specifies tessellation accum factors. 
-union TessDistributionFactors -{ - struct - { - /// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for - /// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a - /// different VGT. - uint32 isoDistributionFactor : 8; - uint32 triDistributionFactor : 8; ///< Recommended to be higher than quad factor. - uint32 quadDistributionFactor : 8; - /// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is - /// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still - /// increments the accumulator for the Patch distribution factor. - uint32 donutDistributionFactor : 5; - /// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch - /// are compared against this value to detemine whether this donut gets split up into trapezoids (needs the patch to - /// be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken - /// into trapezoids. - uint32 trapDistributionFactor : 3; - }; - - /// Values packed as 32-bit uint. - uint32 u32All; -}; - -/// Specifies options that direct command buffer building. -struct CmdBufferBuildInfo -{ - /// Command buffer build flags, specifies optional hints to control command buffer build optimizations. - CmdBufferBuildFlags flags; - - /// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested - /// command buffer should not modify the software states. Any software params that may be needed within nested - /// command buffer needs to be provided here. - const InheritedStateParams* pInheritedState; - - /// If non-null, the command buffer will begin with all states set as they are in this previously built command - /// buffer. 
Any state specified in pInheritedState is excluded if it is also provided. - const ICmdBuffer* pStateInheritCmdBuffer; - - /// Optional allocator for PAL to use when allocating temporary memory during command buffer building. PAL will - /// stop using this allocator once command building ends. If no allocator is provided PAL will use an internally - /// managed allocator instead which may be less efficient. PAL will use this allocator in two ways: - /// + Temporary storage within a single command building call. PAL will rewind the allocator before returning to - /// free all memory allocated within the call. - /// + Temporary storage for the entire command building period. When Begin() is called, PAL will save the current - /// position of the allocator and rewind the allocator to that point when End() is called. If the client also - /// wishes to allocate temporary storage that lasts between command building function calls they must allocate it - /// before calling Begin() or PAL will accidentally free it. - Util::VirtualLinearAllocator* pMemAllocator; - - /// Optional tessellation distribution factors that will overwrite PAL set defaults. Clients must also set the - /// optimizeTessDistributionFactors flag for these custom factors to take effect. - /// Nested command buffers inherit this value from the primary. - TessDistributionFactors clientTessDistributionFactors; - - /// Number of context states per PBB bin. - /// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect. - uint8 contextStatesPerBin; - - /// Number of persistent states per PBB bin. - /// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect. - uint8 persistentStatesPerBin; - - /// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer. - uint64 execMarkerClientHandle; -}; - -/// Specifies info on how a compute shader should use resources. 
-struct DynamicComputeShaderInfo -{ - float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively - /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a - /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] - /// and specify the maximum number of waves per compute unit. If the hardware has one wave - /// limit control for multiple shader stages PAL will select the most strict limit. - /// This option is converted internally to set set HW WavesPerSh setting and the non-integer - /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for - /// example specify less number of waves than number of CUs per shader array. - - uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on, - /// throttling it, to enable more graphics work to complete. 0 disables the limit. - - uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before - /// moving to the next compute unit. 0 selects optimal default. - - uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes. - /// Zero indicates that the LDS size determined at pipeline-compilation time will be used. -}; - -/// Specifies info on how a graphics shader should use resources. -struct DynamicGraphicsShaderInfo -{ - float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively - /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a - /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] - /// and specify the maximum number of waves per compute unit. If the hardware has one wave - /// limit control for multiple shader stages PAL will select the most strict limit. 
- /// This option is converted internally to set HW WavesPerSh setting and the non-integer - /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for - /// example specify less number of waves than number of CUs per shader array. -}; - -/// Specifies dynamic states of a graphics pipeline -struct DynamicGraphicsState -{ - uint32 colorWriteMask; ///< Color target write mask. 4b / RT (8 count) - struct - { - uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation. - uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate. - uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate. - uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage. - uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to - /// axis-aligned line end caps during line rasterization. - uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels. - uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend - uint32 vertexBufferCount : 6; ///< Number vertex buffer slots accessed by this pipeline - LogicOp logicOp : 4; ///< Logic operation to perform. - DepthRange depthRange : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport - /// transform: 0 to 1 or -1 to 1). - DepthClampMode depthClampMode : 2; ///< Depth clamping behavior. - uint32 reserved1 : 7; ///< Reserved - uint32 reserved : 5; ///< Reserved for future use. - }; - - union - { - struct - { - uint32 depthClampMode : 1; ///< Whether to enable dynamic state depthClampMode. - uint32 depthRange : 1; ///< Whether to enable dynamic state depthRange. - uint32 logicOp : 1; ///< Whether to enable dynamic state logicOp. - uint32 colorWriteMask : 1; ///< Whether to enable dynamic state colorWriteMask. - uint32 switchWinding : 1; ///< Whether to enable dynamic state switchWinding. 
- uint32 depthClipMode : 1; ///< Whether to enable dynamic state depthClipNear/FarEnable. - uint32 alphaToCoverageEnable : 1; ///< Whether to enable dynamic state alphaToCoverageEnable. - uint32 perpLineEndCapsEnable : 1; ///< Whether to enable dynamic state perpLineEndCapsEnable. - uint32 rasterizerDiscardEnable : 1; ///< Whether to enable dynamic state rasterizerDiscardEnable. - uint32 dualSourceBlendEnable : 1; ///< Whether to enable dynamic state dualSourceBlendEnable - uint32 vertexBufferCount : 1; ///< Whether to enable dynamic state vertexBufferCount. - uint32 reserved1 : 1; ///< Reserved. - uint32 reserved : 20; ///< Reserved for future use. - }; - uint32 u32All; - } enable; -}; - -/// Specifies info on how graphics shaders should use resources. -struct DynamicGraphicsShaderInfos -{ - union - { - // VS/HS/DS/GS or TS/MS are active - struct - { - DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information. - DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information. - DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information. - DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information. - }; - struct - { - DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information. - DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information. - }; - }; - - DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information. - - union - { - struct - { - uint8 vs : 1; // If set, there is dynamic VS shader info. - uint8 hs : 1; // If set, there is dynamic HS shader info. - uint8 ds : 1; // If set, there is dynamic DS shader info. - uint8 gs : 1; // If set, there is dynamic GS shader info. - uint8 ps : 1; // If set, there is dynamic PS shader info. - uint8 ts : 1; // If set, there is dynamic TS shader info. - uint8 ms : 1; // If set, there is dynamic MS shader info. - uint8 reserved : 1; // Reserved. - }; - uint8 u8All; - } enable; -}; - -/// Specifies parameters for binding a pipeline. 
-/// @see ICmdBuffer::CmdBindPipeline -struct PipelineBindParams -{ - PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics). - const IPipeline* pPipeline; ///< New pipeline to be bound. Can be null in order to unbind a previously - /// bound pipeline without binding a new one. - uint64 apiPsoHash; ///< 64-bit identifier provided by client driver based on the Pipeline State - /// Object. There exists a many-to-one correlation for ApiPsoHash to - /// internalPipelineHash to map the two. - union - { - DynamicComputeShaderInfo cs; ///< Dynamic Compute shader information. - - struct - { - DynamicGraphicsShaderInfos gfxShaderInfo; - DynamicGraphicsState gfxDynState; - }; - }; -}; - -/// Specifies per-MRT color target view and current image state. Used as input to ICmdBuffer::CmdBindTargets(). -struct ColorTargetBindInfo -{ - const IColorTargetView* pColorTargetView; ///< Color target view to bind. - ImageLayout imageLayout; ///< Specifies the current image layout based on bitmasks of currently - /// allowed operations and engines that may perform those operations. - /// At minimum, the LayoutColorTarget usage flag and - /// LayoutUniversalEngine engine flag must be set. -}; - -/// Specifies depth/stencil view and current image state of the depth and stencil planes. Used as input to -/// ICmdBuffer::CmdBindTargets(). -struct DepthStencilBindInfo -{ - const IDepthStencilView* pDepthStencilView; ///< Depth/stencil target view to bind. - ImageLayout depthLayout; ///< Specifies the current image layout of the depth plane based on - /// bitmasks of currently allowed operations and engines that may - /// perform those operations. At minimum, the - /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine - /// engine flag must be set. Ignored if the specified view does not - /// have a depth plane. 
- ImageLayout stencilLayout; ///< Specifies the current image layout of the stencil plane based on - /// bitmasks of currently allowed operations and engines that may - /// perform those operations. At minimum, the - /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine - /// engine flag must be set. Ignored if the specified view does not - /// have a stencil plane. -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 -/// Represents a GPU memory or image transition as part of a barrier. -/// -/// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the -/// specified set of destination caches. The source and destination designation is relative to the barrier itself -/// and does not indicate whether a particular cache is a read or write cache. -/// -/// Typically a transition flushes written data from the source caches into the destination caches and thus the source -/// cache mask typically only contains write caches. However, the client is encouraged to include flags for any prior -/// read-only caches accesses as PAL may be able to optimize its cache operations. -/// -/// If the both cache masks are zero the client is indicating that no cache coherency operations are required but PAL -/// may still issue cache operations for internal reasons. -/// -/// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt. -/// -/// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches. -struct BarrierTransition -{ - - uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose - /// results need to be visible for subsequent operations. Flags for prior read operations - /// may be included as well and may be used for internal optimizations. 
- uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read - /// and/or write data flushed from the caches indicated by the srcCacheMask. - - struct - { - const IImage* pImage; ///< If non-null, indicates this transition only applies to the specified image. - /// The remaining members of this structure are ignored if this member is null. - SubresRange subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref - /// LayoutUninitializedTarget this range must cover all subresources of pImage - /// unless the perSubresInit image create flag was specified. - ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and - /// engines up to this point. These masks imply the previous compression state. No - /// usage flags should ever be set in oldLayout.usages that correspond to usages - /// that are not supported by the engine that is performing the transition. The - /// queue type performing the transition must be set in oldLayout.engines. - ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and - /// engines after this point. These masks imply the upcoming compression state. - /// point. This usage mask implies the upcoming compressions state. A difference - /// between oldLayoutUsageMask and newLayoutUsageMask may result in a - /// decompression. - - /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a - /// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum - /// valid position (not quite to the bottom/right border of the pixel). - /// Specifies a custom sample pattern over a 2x2 pixel quad. Can be left null for non-MSAA images or when - /// a valid MsaaQuadSamplePattern is bound prior to the CmdBarrier call. 
- const MsaaQuadSamplePattern* pQuadSamplePattern; - - } imageInfo; ///< Image-specific transition information. -}; - -/// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier(). -/// -/// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate -/// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how -/// the GPU will use the image. -/// -/// This structure directly specifies how #1 is performed. #2 and #3 are managed by the list of @ref BarrierTransition -/// structures passed in pTransitions. -struct BarrierInfo -{ - /// Determine at what point the GPU should stall until all specified waits and transitions have completed. If the - /// specified wait point is unavailable, PAL will wait at the closest available earlier point. - HwPipePoint waitPoint; - - uint32 pipePointWaitCount; ///< Number of entries in pPipePoints. - const HwPipePoint* pPipePoints; ///< The barrier will stall until the hardware pipeline has cleared - /// up to each point specified in this array. One entry in this - /// array is typically enough, but CS and GFX operate in parallel - /// at certain stages. - - uint32 gpuEventWaitCount; ///< Number of entries in ppGpuEvents. - const IGpuEvent** ppGpuEvents; ///< The barrier will stall until each GPU event in this array is - /// in the set state. - - uint32 rangeCheckedTargetWaitCount; ///< Number of entries in ppTargets. - const IImage** ppTargets; ///< The barrier will stall until all previous rendering with any - /// color or depth/stencil image in this list bound as a target - /// has completed. If one of the targets is a nullptr it will - /// perform a full range sync. - - uint32 transitionCount; ///< Number of entries in pTransitions. - const BarrierTransition* pTransitions; ///< List of image/memory transitions to process. See - /// @ref BarrierTransition. 
The same subresource should never - /// be specified more than once in the list of transitions. - /// PAL assumes that all specified subresources are unique. - - uint32 globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined - /// (bitwise logical union) with the @ref srcCacheMask field belonging to every - /// element in @ref pTransitions. If this is zero or if there are no transitions, - /// then no global cache flags are applied during every transition. - - uint32 globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined - /// (bitwise logical union) with the @ref dstCacheMask field belonging to every - /// element in @ref pTransitions. If this is zero or if there are no transitions, - /// then no global cache flags are applied during every transition. - - uint32 reason; ///< The reason that the barrier was invoked. -}; -#endif - -/// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory -/// object that does not contain valid IImage data. PAL may assume image data is not present and skip certain -/// cache operations. -/// -/// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of -/// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier. -/// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution -/// dependencies. -/// -/// PAL specifies these operations using pairs of access scope bitmasks of @ref CacheCoherencyUsageFlags values. -/// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made -/// available (i.e., written back from local caches to the LLC). 
The destination mask (named dstAccessMask or -/// dstGlobalAccessMask) describes which upcoming read/write operations that need visibility (i.e., invalidate -/// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed. -/// -/// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions -/// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero. -/// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations. -/// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations. -/// -/// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations -/// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional -/// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations. -/// If they are provided PAL may detect cases where future read operations use the same caches as the prior read -/// operations and thus can skip the usual visibility operations. -/// -/// Note that, -/// 1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags -/// were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise. -/// 2. One @ref MemBarrier or @ImgBarrier object can only be applied to a single resource otherwise PAL's internal -/// optimization may be incorrect. Don't OR multiple resource transitions' stage or access mask into one -/// @ref MemBarrier or @ImgBarrier when making PAL barrier call. However, you are allowed to OR multiple resource -/// transitions' stage or access mask into the global transition mask. -/// -/// This struct is used by @ref AcquireReleaseInfo. 
-struct MemBarrier -{ -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - union - { - struct - { - uint32 globallyAvailable : 1; ///< Normally, data made available is in the GPU LLC. When this bit is - /// set, available means in memory, available to all clients in the - /// system. This is useful for rare cases like mid command buffer - /// synchronization with the CPU or another external device. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as a 32-bit uint. - } flags; ///< Flags controlling the memory barrier. -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 - GpuMemSubAllocInfo memory; ///< Specifies a portion of an IGpuMemory object this memory barrier affects. - /// Zero values of memory structure indicate full range barrier operations. -#endif - - uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization - /// scope that must be confirmed complete as part of a release. Must be - /// 0 when passed in to CmdAcquire or CmdAcquireEvent. - uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization - /// scope of operations to be performed after the acquire. Must be - /// 0 when passed in to CmdRelease or CmdReleaseEvent. - - uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the - /// availability operation, as defined in the struct comment header. - /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. - uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the - /// visibility operation, as defined in the struct comment header. - /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. -}; - -/// Specifies required layout transition, execution dependencies, *availability*, and/or *visibility* operations on a -/// subresource of an IImage object. 
-/// -/// See the header comment on @ref MemBarrier for a full description of the execution dependencies, availability and -/// visibility operations, including what rules the clients must follow when filling out srcAccessMask and -/// dstAccessMask. -/// -/// This struct is used by @ref AcquireReleaseInfo. -struct ImgBarrier -{ - const IImage* pImage; ///< Relevant image resource for this barrier. - SubresRange subresRange; ///< Selects a range of planes/slices/mips the barrier affects. If newLayout - /// includes @ref LayoutUninitializedTarget this range must cover all subresources of - /// pImage unless the perSubresInit image create flag was specified. - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 - Box box; ///< Restricts the barrier to a sub-section of each subresource. The Z offset/extent - /// must be 0 for 1D/2D images, and the Y offset/extent must be 0 for 1D images. A - /// box with zero extents will be ignored, and the barrier will affect the entire - /// subresource range. This box may be used to restrict ranges of cache flushes or - /// invalidations, or may restrict what data is decompressed. However, the - /// implementation may not be able to optimize particular cases and may expand the - /// barrier to cover the entire subresource range. Specifying a subregion with a box - /// when newLayout includes @ref LayoutUninitializedTarget is not supported. -#endif - - uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization - /// scope that must be confirmed complete as part of a release. Must be - /// 0 when passed in to CmdAcquire or CmdAcquireEvent. - uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization - /// scope of operations to be performed after the acquire. Must be - /// 0 when passed in to CmdRelease or CmdReleaseEvent. 
- - uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the - /// availability operation, as defined in the struct comment header. - /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. - uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the - /// visibility operation, as defined in the struct comment header. - /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. - - ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and - /// engines up to this point. These masks imply the previous compression state. No - /// usage flags should ever be set in oldLayout.usages that correspond to usages - /// that are not supported by the engine that is performing the transition. The - /// engine type performing the transition must be set in oldLayout.engines. Can set - /// both oldLayout and newLayout to zero value for no layout transition case. - ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and - /// engines after this point. These masks imply the upcoming compression state. - /// point. A difference between oldLayoutUsageMask and newLayoutUsageMask may result - /// in a decompression. PAL's implementation will ensure the results of any layout - /// operations are consistent with the requested availability and visibility - /// operations. Can set both oldLayout and newLayout to zero value for no layout - /// transition case. - - /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a grid - /// where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum valid - /// position (not quite to the bottom/right border of the pixel). Specifies a custom sample pattern over a 2x2 - /// pixel quad. 
Can be left null for non-MSAA images or when a valid IMsaaState is bound prior to the barrier - /// call. - const MsaaQuadSamplePattern* pQuadSamplePattern; -}; - -/// Input structure to CmdRelease(), CmdReleaseEvent(), CmdAcquire(), CmdAcquireEvent(), and CmdReleastThenAcquire(). -/// It describes the execution dependencies, memory dependencies, and image layout transitions that must be resolved. -/// -/// Global transition doesn't have buffer or image info so it will assume the worst case and the barrier operations may -/// not be optimal (e.g. metadata may be misaligned and need issue LLC flush/invalidation). It's suggested that if -/// clients know the buffer or image info, try setting up the barrier call with the full buffer or image transition -/// info (including stageMask and accessMask) instead of global transition for optimal performance. -/// -/// Clients may OR multiple MemBarrier into a single MemBarrier on full range barrier cases for simple and saving CPU -/// overhead. To allow more optimization chances (e.g. skip unnecessary stalls for read only transitions) in PAL, -/// it's suggested to split the single grouped MemBarrier into two separate grouped MemBarriers: one is read only -/// MemBarrier and the other is writeable MemBarrier; both are then passed together to the barrier call. -struct AcquireReleaseInfo -{ - uint32 srcGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global - /// synchronization scope that must be confirmed complete as part of a - /// release. Must be 0 when passed in to CmdAcquire or CmdAcquireEvent. - uint32 dstGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global - /// synchronization scope of operations to be performed after the - /// acquire. Must be 0 when passed in to CmdRelease or CmdReleaseEvent. - - uint32 srcGlobalAccessMask; ///< *Access scope* for the global availability operation. 
Serves the - /// same purpose as srcAccessMask in @ref MemoryBarrier, but will cause - /// all relevant caches to be flushed without range checking. - /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. - uint32 dstGlobalAccessMask; ///< *Access scope* for the global visibility operation. Serves the - /// same purpose as dstAccessMask in @ref MemoryBarrier, but will cause - /// all relevant caches to be invalidated without range checking. - /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. - - uint32 memoryBarrierCount; ///< Number of entries in pMemoryBarriers. - const MemBarrier* pMemoryBarriers; ///< Describes memory dependencies specific to a range of a particular - /// IGpuMemory object. - - uint32 imageBarrierCount; ///< Number of entries in pImageBarriers. - const ImgBarrier* pImageBarriers; /// Describes memory dependencies and image layout transitions required - /// for a subresource range of a particular IImage object. - uint32 reason; ///< The reason that the barrier was invoked. - /// See @ref Developer::BarrierReason for internal reason codes, though - /// clients may define their own as well -}; - -/// Specifies barrier type, global (potentially mixed cases of buffer and image), buffer or image. -enum class BarrierType : uint32 -{ - Global, - Buffer, - Image -}; - -/// Number of all HW opaque release token types. -constexpr uint32 NumReleaseTokenTypes = 4; - -/// Synchronization token structure for CmdRelease() and CmdAcquire(). -/// -/// Clients should pass the ReleaseToken returned by CmdRelease() to CmdAcquire() directly without changing the value. -/// If a resource with given subresource range has multiple ReleaseToken, all related ReleaseToken should be passed to -/// CmdAcquire(). -/// -/// Passing ReleaseToken { .fenceValue = N; .type = T } into CmdAcquire() will wait for all prior releases with -/// .fenceValue <= N for .type == T. 
Resource with a large number of subresources may introduce lots of ReleaseToken -/// potentially (e.g. released per subresource). No need to track all ReleaseToken for each resource since clients -/// can optimize this based on the fact that release type and fenceValue are exposed for each ReleaseToken: define a -/// ReleaseToken array with size @ref NumReleaseTokenTypes, only track ReleaseToken with the largest fenceValue per -/// each release type; and then passing the tracked array ReleaseToken values to CmdAcquire() is enough. -union ReleaseToken -{ - struct - { - uint32 fenceValue : 24; ///< Release fence value per token type. - uint32 type : 8; ///< Release token type (HW opaque). Note that please increase the number of bits if - /// it can't hold all types, see @ref NumReleaseTokenTypes for details. - }; - - uint32 u32All; -}; - -/// Specifies parameters for a copy from one range of a source GPU memory allocation to a range of the same size in a -/// destination GPU memory allocation. Used as an input to ICmdBuffer::CmdCopyMemory(). -struct MemoryCopyRegion -{ - gpusize srcOffset; ///< Offset in bytes into the source GPU memory allocation to copy data from. - gpusize dstOffset; ///< Offset in bytes into the destination GPU memory allocation to copy data to. - gpusize copySize; ///< Amount of data to copy in bytes. -}; - -/// Specifies parameters for an image copy from one region in a source image subresource to a region of the same size in -/// a destination image subresource. Used as input to ICmdBuffer::CmdCopyImage(). -/// If the region describes a copy between a 2D and a 3D image, extent.depth and numSlices must be equal and may be -/// larger than 1. -struct ImageCopyRegion -{ - SubresId srcSubres; ///< Selects the source subresource. - Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. - SubresId dstSubres; ///< Selects the destination subresource. 
- Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination - /// subresource. - Extent3d extent; ///< Size of the copy region in pixels. - uint32 numSlices; ///< Number of slices the copy will span. -}; - -/// Specifies parameters for a copy between an image and a GPU memory allocation. The same structure is used regardless -/// of direction, an input for both ICmdBuffer::CmdCopyImageToMemory() and ICmdBuffer::CmdCopyMemoryToImage(). -struct MemoryImageCopyRegion -{ - SubresId imageSubres; ///< Selects the image subresource. - Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. - Extent3d imageExtent; ///< Size of the image region in pixels. - uint32 numSlices; ///< Number of slices the copy will span. - gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. - gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. - SwizzledFormat swizzledFormat;///< If not Undefined, reinterpret both subresources using this format and swizzle. -}; - -/// Specifies parameters for a copy between a PRT and a GPU memory allocation. The same structure is used regardless -/// of direction, an input for both ICmdBuffer::CmdCopyTiledImageToMemory() and ICmdBuffer::CmdCopyMemoryToTiledImage(). -struct MemoryTiledImageCopyRegion -{ - SubresId imageSubres; ///< Selects the image subresource; must not be a part of the packed mip tail. - Offset3d imageOffset; ///< Tile offset to the start of the chosen subresource region. - Extent3d imageExtent; ///< Size of the image region in tiles. - uint32 numSlices; ///< Number of slices the copy will span. - gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. 
- gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. -}; - -/// Used by copy operations to temporarily interpret a range of GPU memory as a "typed buffer". A typed buffer is -/// essentially a linear image with a caller-defined row pitch and depth pitch. Typed buffer copies do not require -/// the GPU memory objects to be created with the "typedBuffer" flag. -struct TypedBufferInfo -{ - SwizzledFormat swizzledFormat; ///< The pixels in this buffer have this format. - gpusize offset; ///< Offset in bytes to the start of the copy region in the buffer's GPU memory - /// allocation. - gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. -}; - -/// Specifies parameters for a copy from one region of a typed buffer to a region of the same size in a destination -/// typed buffer. Used as an input to ICmdBuffer::CmdCopyTypedBuffer(). -struct TypedBufferCopyRegion -{ - TypedBufferInfo srcBuffer; ///< How to interpret the source GPU memory allocation as a typed buffer. - TypedBufferInfo dstBuffer; ///< How to interpret the destination GPU memory allocation as a typed buffer. - Extent3d extent; ///< Size of the copy region in pixels. -}; - -/// Specifies parameters for a scaled copy between an image and a typed buffer. The same structure is used regardless -/// of direction, an input for ICmdBuffer::CmdScaledCopyTypedBufferToImage(). -struct TypedBufferImageScaledCopyRegion -{ - SubresId imageSubres; ///< Selects the image subresource. - Offset2d imageOffset; ///< Pixel offset to the start of the chosen subresource region. - Extent2d imageExtent; ///< Size of the image region in pixels. - TypedBufferInfo bufferInfo; ///< How to interpret the GPU memory allocation as a typed buffer. 
- Extent2d bufferExtent; ///< Size of the typed buffer region in pixels. - SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. -}; - -/// Specifies parameters for a scaled image copy from one region in a source image subresource to a region in the -/// destination image subresource. Used as an input to ICmdBuffer::CmdScaledCopyImage. -struct ImageScaledCopyRegion -{ - SubresId srcSubres; ///< Selects the source subresource. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 - uint32 srcSlices; ///< Number of source image slices to read across. -#endif - union - { - Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. - Offset3dFloat srcOffsetFloat; ///< Alternative representation in floating point. - }; - union - { - SignedExtent3d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates - /// a copy in the reverse direction. - Extent3dFloat srcExtentFloat; ///< Alternative representation in floating point. - }; - - SubresId dstSubres; ///< Selects the destination subresource. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 - uint32 dstSlices; ///< Number of destination image slices to write. -#endif - union - { - Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. - Offset3dFloat dstOffsetFloat; ///< Alternative representation in floating point. - }; - union - { - SignedExtent3d dstExtent; ///< Signed size of the destination region in pixels. A negative size - /// indicates a copy in the reverse direction. - Extent3dFloat dstExtentFloat; ///< Alternative representation in floating point. - }; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 887 - uint32 numSlices; ///< Number of slices the copy will span. -#endif - SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. 
- /// The specified format needs to have been included in the "pViewFormats" list - /// specified at image-creation time, otherwise the result might be incorrect. -}; - -/// Specifies parameters for a color-space-conversion copy from one region in a source image subresource to a region in -/// a destination image subresource. Used as an input to ICmdBuffer::CmdColorSpaceConversionCopy. -struct ColorSpaceConversionRegion -{ - Offset2d srcOffset; ///< Offset to the start of the chosen region in the source subresource(s). - SignedExtent2d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates a copy - /// in the reverse direction. - Offset2d dstOffset; ///< Offset to the start of the chosen region in the destination subresource(s). - SignedExtent2d dstExtent; ///< Signed size of the destination region in pixels. A negative size indicates a - /// copy in the reverse direction. - SubresId rgbSubres; ///< Selects the first subresource of the RGB image where the copy will begin. This - /// can either be the source or destination of the copy, depending on whether the - /// copy is performing an RGB->YUV or YUV->RGB conversion. - uint32 yuvStartSlice; ///< Array slice of the YUV image where the copy will begin. All planes of planar - /// YUV images will be implicitly involved in the copy. This can either be the - /// source or destination of the copy, depending on whether the copy is performing - /// an RGB->YUV or YUV->RGB conversion. - uint32 sliceCount; ///< Number of slices the copy will span. -}; - -/// Specifies the color-space-conversion table used when converting between YUV and RGB Image formats. Used as an input -/// to ICmdBuffer:CmdColorSpaceConversionCopy. -struct ColorSpaceConversionTable -{ - float table[3][4]; ///< Values forming the conversion table matrix, which has three rows and four columns. 
For RGB - /// to YUV conversions, the conversion shader uses the following expressions to evaluate the - /// YUV color: - /// Y = dot( [R G B 1], [row #0] ) - /// U = dot( [R G B 1], [row #1] ) - /// V = dot( [R G B 1], [row #2] ) - /// For YUV to RGB conversions, the conversion shader uses the following expressions to - /// evaluate the RGB color: - /// R = dot( [Y U V 1], [row #0] ) - /// G = dot( [Y U V 1], [row #1] ) - /// B = dot( [Y U V 1], [row #2] ) - /// A fourth row is not needed because alpha is copied directly between the RGB and YUV colors. -}; - -/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConverionCopy -/// to perform a YUV to RGB color space conversion. Represents the BT.601 standard (standard-definition TV). -extern const ColorSpaceConversionTable DefaultCscTableYuvToRgb; - -/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConverionCopy -/// to perform a RGB to YUV color space conversion. Represents the BT.601 standard (standard-definition TV). -extern const ColorSpaceConversionTable DefaultCscTableRgbToYuv; - -/// Specifies flags controlling GPU copy behavior in @ref CmdCopyImage. Format related flags are ignored by DMA queues. -enum CopyImageControlFlags : uint32 -{ - CopyImageFormatConversion = 0x1, ///< Requests that the copy convert between two compatible formats. This is - /// ignored unless both formats support @ref FormatFeatureFormatConversion. - CopyImageRawSwizzle = 0x2, ///< If possible, raw copies will swizzle from the source channel format into the - /// destination channel format (e.g., RGBA to BGRA). - CopyImageEnableScissorTest = 0x4, ///< If set, do scissor test using the specified scissor rectangle. - CopyImageInitDstMetadata = 0x8, ///< Requests copy initializes dst image's metadata; requires full box copy. - CopyImageControlAllFlags = 0xF ///< Clients should NOT use it, for internal static_assert purpose only. 
-}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 955 -enum CopyControlFlags : uint32 -{ - CopyFormatConversion = CopyImageFormatConversion, - CopyRawSwizzle = CopyImageRawSwizzle, - CopyEnableScissorTest = CopyImageEnableScissorTest, - CopyControlAllFlags = 0x7 ///< Clients should NOT use it, for internal static_assert purpose only. -}; -#endif - -/// Specifies flags controlling GPU copy behavior in @ref CmdCopyMemoryToImage. -/// Format related flags are ignored by DMA queues. -enum CopyMemoryToImageControlFlags : uint32 -{ - CopyMemoryToImageInitDstMetadata = 0x1, ///< Requests copy initializes dst image's metadata; requires full box copy. - CopyMemoryToImageControlAllFlags = 0x1 ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single -/// sample destination image. Used as an input to ICmdBuffer::CmdResolveImage(). -struct ImageResolveRegion -{ - uint32 srcPlane; ///< The source color, depth, or stencil plane. - uint32 srcSlice; ///< Selects the source starting slice - Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. - uint32 dstPlane; ///< The destination color, depth, or stencil plane. - uint32 dstMipLevel; ///< Selects destination mip level. - uint32 dstSlice; ///< Selects the destination starting slice - Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. - Extent3d extent; ///< Size of the resolve region in pixels. - uint32 numSlices; ///< Number of slices to be resolved - SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. - /// The format must match both subresource's native formats. - - const MsaaQuadSamplePattern* pQuadSamplePattern; ///< Specifies sample pattern for MSAA depth image. 
It must be a - /// valid pointer if image was created with sampleLocsAlwaysKnown - /// flag set. -}; - -/// A list of the types of PRT+ resolves that can be performed. -enum class PrtPlusResolveType : uint32 -{ - Decode = 0x0, ///< Translate from AMD HW format to format of destination image. - Encode = 0x1, ///< Translate from source image to AMD HW format - Count = 0x2, -}; - -/// Input structure to the CmdResolvePrtPlusImage function -struct PrtPlusImageResolveRegion -{ - Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 - SubresId srcSubresId; ///< Selects the source subresource -#else - uint32 srcMipLevel; ///< Selects source mip level - uint32 srcSlice; ///< Selects the source starting slice -#endif - - Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 - SubresId dstSubresId; ///< Selects the destination subresource -#else - uint32 dstMipLevel; ///< Selects destination mip level - uint32 dstSlice; ///< Selects the destination starting slice -#endif - - Extent3d extent; ///< Size of the resolve region in pixels. - uint32 numSlices; ///< Number of slices to be resolved -}; - -/// Input structure to ICmdBuffer::CmdResolvePrtPlusImageToBuffer() -struct PrtPlusImageToBufferResolveRegion -{ - SubresId srcSubresId; ///< Selects the source subresource - - gpusize dstOffset; ///< Offset into destination subresource - - Extent3d extent; ///< Size of the resolve region in pixels. - uint32 numSlices; ///< Number of slices to be resolved -}; - -/// Input structure to ICmdBuffer::CmdResolvePrtPlusBufferToImage() -struct PrtPlusBufferToImageResolveRegion -{ - gpusize srcOffset; ///< Offset into source subresource - - SubresId dstSubresId; ///< Selects the destination subresource - - Extent3d extent; ///< Size of the resolve region in pixels. 
- uint32 numSlices; ///< Number of slices to be resolved -}; - -/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single -/// sample destination image. Used as an input to ICmdBuffer::CmdResolveImage(). -enum class ResolveMode : uint32 -{ - Average = 0x0, ///< Resolve result is an average of all the individual samples - Minimum = 0x1, ///< Resolve result is the minimum value of all individual samples - Maximum = 0x2, ///< Resolve result is the maximum value of all individual samples - Count = 0x4, -}; - -/// Specifies width of immediate data to be written out. -enum class ImmediateDataWidth : uint32 -{ - ImmediateData32Bit = 0x0, - ImmediateData64Bit = 0x1, - - Count = 0x2, -}; - -/// Specifies flags controlling GPU query behavior. -union QueryControlFlags -{ - struct - { - /// Controls accuracy of query data collection. Available only for occlusion queries. If set, occlusion query - /// is guaranteed to return imprecise non-zero value if any samples pass the depth and stencil test. Using - /// imprecise occlusion query results could improve rendering performance while an occlusion query is active. - uint32 impreciseData : 1; - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies layout of GPU memory used as an input to CmdDrawIndirectMulti. -struct DrawIndirectArgs -{ - uint32 vertexCount; ///< Number of vertices to draw. - uint32 instanceCount; ///< Number of instances to draw. - uint32 firstVertex; ///< Starting index value for the draw. Indices passed to the vertex shader will range from - /// firstVertex to firstVertex + vertexCount - 1. - uint32 firstInstance; ///< Starting instance for the draw. Instace IDs passed to the vertex shader will range from - /// firstInstance to firstInstance + instanceCount - 1. -}; - -/// Specifies layout of GPU memory used as an input to CmdDrawIndexedIndirectMulti. 
-/// -/// Indices passed to the vertex shader will be: -/// -/// + IndexBuffer[firstIndex] + vertexOffset -/// + IndexBuffer[firstIndex + 1] + vertexOffset, -/// + ... -/// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset -struct DrawIndexedIndirectArgs -{ - uint32 indexCount; ///< Number of vertices to draw. - uint32 instanceCount; ///< Number of instances to draw. - uint32 firstIndex; ///< Starting index buffer slot for the draw. - int32 vertexOffset; ///< Offset added to the index fetched from the index buffer before it is passed to the - /// vertex shader. - uint32 firstInstance; ///< Starting instance for the draw. Instace IDs passed to the vertex shader will range from - /// firstInstance to firstInstance + instanceCount - 1. -}; - -/// Specifies layout of GPU memory used as an input to CmdDispatchIndirect. -struct DispatchIndirectArgs -{ - uint32 x; ///< Threadgroups to dispatch in the X dimension. - uint32 y; ///< Threadgroups to dispatch in the Y dimension. - uint32 z; ///< Threadgroups to dispatch in the Z dimension. -}; - -/// Specifies layout of GPU memory used as an input to CmdDispatchMeshIndirect. -using DispatchMeshIndirectArgs = DispatchIndirectArgs; - -/// Specifies the GPU virtual address of an array and the stride in bytes between array elements. -struct CpuVirtAddrAndStride -{ - const void* pCpuVirtAddr; ///< CPU virtual address of the 0th array element. - struct - { - uint64 stride : 32; ///< Distance between array elements in bytes. - uint64 _pad : 32; ///< Padding for structure alignment. - }; -}; - -/// Specifies the GPU virtual address of an array and the stride in bytes between array elements. -struct GpuVirtAddrAndStride -{ - gpusize gpuVirtAddr; ///< GPU virtual address of the 0th array element. - struct - { - uint64 stride : 32; ///< Distance between array elements in bytes. - uint64 _pad : 32; ///< Padding for structure alignment. 
- }; -}; - -/// Flags to describe a dispatch -union DispatchInfoFlags -{ - struct - { - uint32 devDriverOverlay : 1; ///< Flag indicates this dispatch draws the DevDriver overlay - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies the different stages at which a combiner can choose between different shading rates. -enum class VrsCombinerStage : uint32 -{ - ProvokingVertex, ///< Chooses between the shading rate specified by the VrsRateParams struct and the shader - /// rate provided by the provoking vertex. - Primitive, ///< Chooses between previous combiner stage and the shader rate associated with the primitive - Image, ///< Chooses between previous combiner stage and the shader rate associated with an image - PsIterSamples, ///< Chooses between previous combiner stage and the PS_ITER_SAMPLES rate. - Max -}; - -/// Specifies the different possible shading rates. Not all are supported on all HW; see the supportedVrsRates -/// entry in the gfxipProperties structure. -enum class VrsShadingRate : uint32 -{ - _16xSsaa = 0x0, - _8xSsaa = 0x1, - _4xSsaa = 0x2, - _2xSsaa = 0x3, - _1x1 = 0x4, - _1x2 = 0x5, - _2x1 = 0x6, - _2x2 = 0x7, - Count -}; - -/// Indices into the centerOffset array member of the VrsCenterState structure. -enum class VrsCenterRates : uint32 -{ - _1x1 = 0x0, - _1x2 = 0x1, - _2x1 = 0x2, - _2x2 = 0x3, - Max = 0x4, -}; - -/// Specifies the different ways in which a combiner can choose between two different shading rate inputs. -enum class VrsCombiner : uint32 -{ - Passthrough = 0, ///< Keep previous shading rate. - Override = 1, ///< C.xy = B.xy - Min = 2, ///< min(A.xy, B.xy) - Max = 3, ///< max(A.xy, B.xy) - Sum = 4, ///< min(maxRate, A.xy + B.xy) - Count -}; - -/// Structure for defining paramters to the CmdSetPerDrawVrsRate function. -struct VrsRateParams -{ - /// The shading rate to be bound to the render state. 
- VrsShadingRate shadingRate; - - /// The state of all the combiners. - VrsCombiner combinerState[static_cast(VrsCombinerStage::Max)]; - - union - { - struct - { - uint32 exposeVrsPixelsMask : 1; ///< Controls how the shader input mask of a coarse pixel is generated. - /// 0 : Bitwise OR of all fine pixel`s mask - /// 1 : Pack fine pixels` coverage mask into iMask. Layout based - /// on VRS rate - uint32 reserved : 31; - }; - - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Flags controlling VRS rate parameters -}; - -/// Structure for defininig paramters to the CmdSetVrsCenterState function. -struct VrsCenterState -{ - /// The offset is scaled by the coarse pixel size and then added to the center location - /// Center offsets are specified as two 4 bits signed integer value representing a location on a 16x16 grid gd. - /// The offset is scaled by the coarse pixel size and then added to the center location - /// 1x1, 1x2, 2x1 and 2x2 shading rates can all have their own unique offsets - Offset2d centerOffset[static_cast(VrsCenterRates::Max)]; - - union - { - struct - { - uint32 overrideCenterSsaa : 1; ///< Override center interpolants to be evaluated at the sample - /// position. - uint32 overrideCentroidSsaa : 1; ///< Override centroid interpolants to be evaluated at the centroid - /// of each sample group being iterated (simply the sample position - /// in the typical case of 1-sample groups). - uint32 alwaysComputeCentroid : 1; ///< Don't assume the centroid of a fully covered shading region is - /// the center. It is possible all samples could be lit but the - /// center is not lit for certain combinations of centerOffset[] - /// values and programmable sample positions - uint32 reserved : 29; ///< Reserved for future HW - }; - - uint32 u32All; ///< Flags packed as 32-bit uint. 
- } flags; ///< Flags controlling VRS center state -}; - -/// @internal -/// Function pointer type definition for setting pipeline-accessible user data entries to the specified values. Each -/// command buffer object has one such callback per pipeline bind point, so the bind point is implicit. -/// -/// @see ICmdBuffer::CmdSetUserData(). -typedef void (PAL_STDCALL *CmdSetUserDataFunc)( - ICmdBuffer* pCmdBuffer, - uint32 firstEntry, - uint32 entryCount, - const uint32* pEntryValues); - -/// @internal Function pointer type definition for issuing non-indexed draws. -/// -/// @see ICmdBuffer::CmdDraw(). -typedef void (PAL_STDCALL *CmdDrawFunc)( - ICmdBuffer* pCmdBuffer, - uint32 firstVertex, - uint32 vertexCount, - uint32 firstInstance, - uint32 instanceCount, - uint32 drawId); - -/// @internal Function pointer type definition for issuing draws auto. -/// -/// @see ICmdBuffer::CmdDrawOpaque(). -typedef void (PAL_STDCALL *CmdDrawOpaqueFunc)( - ICmdBuffer* pCmdBuffer, - gpusize streamOutFilledSizeVa, - uint32 streamOutOffset, - uint32 stride, - uint32 firstInstance, - uint32 instanceCount); - -/// @internal Function pointer type definition for issuing indexed draws. -/// -/// @see ICmdBuffer::CmdDrawIndexed(). -typedef void (PAL_STDCALL *CmdDrawIndexedFunc)( - ICmdBuffer* pCmdBuffer, - uint32 firstIndex, - uint32 indexCount, - int32 vertexOffset, - uint32 firstInstance, - uint32 instanceCount, - uint32 drawId); - -/// @internal Function pointer type definition for issuing indirect draws. -/// -/// @see ICmdBuffer::CmdDrawIndirectMulti(). -typedef void (PAL_STDCALL *CmdDrawIndirectMultiFunc)( - ICmdBuffer* pCmdBuffer, - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr); - -/// @internal Function pointer type definition for issuing indexed, indirect draws. -/// -/// @see ICmdBuffer::CmdDrawIndexedIndirectMulti(). 
-typedef void (PAL_STDCALL *CmdDrawIndexedIndirectMultiFunc)( - ICmdBuffer* pCmdBuffer, - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr); - -/// @internal Function pointer type definition for issuing direct dispatches. -/// -/// @see ICmdBuffer::CmdDispatch(). -typedef void (PAL_STDCALL *CmdDispatchFunc)( - ICmdBuffer* pCmdBuffer, - DispatchDims size, - DispatchInfoFlags infoFlags); - -/// @internal Function pointer type definition for issuing indirect dispatches. -/// -/// @see ICmdBuffer::CmdDispatchIndirect(). -typedef void (PAL_STDCALL *CmdDispatchIndirectFunc)( - ICmdBuffer* pCmdBuffer, - gpusize gpuVirtAddr); -/// @internal Function pointer type definition for issuing direct dispatches with threadgroup offsets. -/// -/// @see ICmdBuffer::CmdDispatchOffset(). -typedef void (PAL_STDCALL *CmdDispatchOffsetFunc)( - ICmdBuffer* pCmdBuffer, - DispatchDims offset, - DispatchDims launchSize, - DispatchDims logicalSize); - -/// @internal Function pointer type definition for issuing direct mesh dispatches. -/// -/// @see ICmdBuffer::CmdDispatchMesh(). -typedef void (PAL_STDCALL *CmdDispatchMeshFunc)( - ICmdBuffer* pCmdBuffer, - DispatchDims size); - -/// @internal Function pointer type definition for issuing indirect mesh dispatches. -/// -/// @see ICmdBuffer::CmdDispatchMeshIndirectMulti(). -typedef void (PAL_STDCALL *CmdDispatchMeshIndirectMultiFunc)( - ICmdBuffer* pCmdBuffer, - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr); - -/// This struct provides the parameters of all the supported features for kernel dispatch -struct DispatchAqlParams -{ - const hsa_kernel_dispatch_packet_t* pAqlPacket; ///< Pointer to AQL packet contains the essential - /// information (size of workgroup, grid, data - /// segments, handle of kernel code object, kernel - /// arguments) of the kernel to be dispatched. 
- gpusize scratchAddr; ///< GPU VM scratch buffer address - uint32 scratchSize; ///< Scratch buffer size - uint32 scratchOffset; ///< Scratch buffer offset from the base for generic - /// address space -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 920 - const llvm::amdhsa::kernel_descriptor_t* pCpuAqlCode; ///< AMD kernel descriptor on CPU for PM4 emulation -#else - const amd_kernel_code_t* pCpuAqlCode; ///< AMD kernel code object on CPU for PM4 emulation -#endif - gpusize hsaQueueVa; ///< GPU VM address where amd_queue_t is allocated - uint32 wavesPerSh; ///< Waves Per Shade Array - bool useAtc; ///< Indicates whether ATC bit in registers should be set - /// ATC bit is used for indicating if the address - /// is GPUVM(=0) or SVM(=1). Valid only in gfx6 and older - uint64 kernargSegmentSize; ///< The size of kerarg segment that holds the - /// values of the arguments to the kernels - uint32 workitemPrivateSegmentSize; ///< The amount of fixed private address - /// space memory required for a workitem. - uint32 aqlPacketIndex; ///< AQL ID in QueueCreateInfo.aqlPacketList for debugger. - /// Note: debugger support enabled for Navi3x+ - -}; - -/// This structure holds the parameters used during kernel dispatch. -struct DispatchAqlFeedback -{ - uint32 tmpRingSize; ///< Content of the compute_tmpring_size register. -}; - -/// @internal Function pointer type definition for issuing AQL dispatches. -/// -/// @see ICmdBuffer::CmdDispatchAql(). -typedef void (PAL_STDCALL *CmdDispatchAqlFunc)( - ICmdBuffer* pCmdBuffer, - const DispatchAqlParams& dispatchInfo, - DispatchAqlFeedback* pFeedback); - -/// Specifies input assembler state for draws. -/// @see ICmdBuffer::CmdSetInputAssemblyState -struct InputAssemblyStateParams -{ - PrimitiveTopology topology; ///< Defines how vertices should be interpretted and rendered by - /// the graphics pipeline. - uint8 patchControlPoints; ///< # of control points per patch. [0-32] valid. 
Should be set to - /// 0 by clients if topology is not PrimitiveTopology::Patch. - bool primitiveRestartEnable; ///< Enables the index specified by primitiveRestartIndex to _cut_ - /// a primitive (i.e., triangle strip) and begin a new primitive - /// with the next index. - bool primitiveRestartMatchAllBits; ///< Specifies which bits from primitiveRestartIndex to use. - /// false - only check relevant bits based on index type - /// true - check all 32 bits irrespective of index type - uint32 primitiveRestartIndex; ///< When primitiveRestartEnable is true, this is the index value - /// that will restart a primitive. When using a 16-bit index - /// buffer, the upper 16 bits of this value will be ignored. -}; - -/// Specifies parameters for controlling triangle rasterization. -/// @see ICmdBuffer::CmdSetTriangleRasterState -struct TriangleRasterStateParams -{ - struct - { - FillMode frontFillMode : 2; ///< Whether front-facing triangles should be rendered solid or wireframe. - FillMode backFillMode : 2; ///< Whether back-facing triangles should be rendered solid or wireframe. - CullMode cullMode : 2; ///< Specifies which, if any, triangles should be culled based on whether - /// they are front or back facing. - FaceOrientation frontFace : 1; ///< Specifies the vertex winding that results in a front-facing triangle. - ProvokingVertex provokingVertex : 1; ///< Specifies whether the first or last vertex of a primitive is the - }; /// provoking vertex as it affects flat shading. - union - { - struct - { - uint8 frontDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for front-facing - /// triangle-based primitives - uint8 backDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for back-facing - /// triangle-based primitives - uint8 reserved : 6; ///< Reserved for future use. - }; - uint8 u8All; ///< Flags packed as 8-bit uint. - } flags; ///< Triangle raster state flags. 
-}; - -/// Specifies parameters for controlling point and line rasterization. -/// @see ICmdBuffer::CmdSetPointLineRasterState -struct PointLineRasterStateParams -{ - float pointSize; ///< Width of a point primitive in pixels. - float lineWidth; ///< Width of a line primitive in pixels. - float pointSizeMin; ///< Minimum width of a point primitive in pixels. - float pointSizeMax; ///< Maximum width of a point primitive in pixels. -}; - -/// Specifies parameters for controlling line stippling. -/// @see ICmdBuffer::CmdSetLineStippleState -struct LineStippleStateParams -{ - uint16 lineStippleValue; ///< Line stipple bit pattern. - uint32 lineStippleScale; ///< Line stipple repeat factor. -}; - -/// Specifies paramters for setting up depth bias. Depth Bias is used to ensure a primitive can properly be displayed -/// (without Z fighting) in front (or behind) of the previously rendered co-planar primitive. This is useful for decal -/// or shadow rendering. -/// @see ICmdBuffer::CmdSetDepthBiasState -struct DepthBiasParams -{ - float depthBias; ///< Base depth bias to be added to each fragment's Z value. In units of the - /// minimum delta representable in the bound depth buffer. - float depthBiasClamp; ///< Maximum allowed depth bias result. Prevents polygons viewed at a sharp value - /// from generating very large biases. - float slopeScaledDepthBias; ///< Factor multiplied by the depth slope (change in Z coord per x/y pixel) to - /// create more bias for "steep" polygons. This result is applied to the final - /// Z value in addition to the base depthBias parameter. -}; - -/// Specifies parameters for setting the value range to be used for depth bounds testing. -/// @see ICmdBuffer::CmdSetDepthBounds -struct DepthBoundsParams -{ - float min; ///< Minimum depth value in passing range (closest). - float max; ///< Maximum depth value in passing range (farthest). -}; - -/// Specifies parameters for setting bit-masks applied to stencil buffer reads and writes. 
-/// @see ICmdBuffer::CmdSetStencilRefMasks -struct StencilRefMaskParams -{ - - uint8 frontRef; ///< Stencil reference value for front-facing polygons. - uint8 frontReadMask; ///< Bitmask to restrict stencil buffer reads for front-facing polygons. - uint8 frontWriteMask; ///< Bitmask to restrict stencil buffer writes for front-facing polygons. - uint8 frontOpValue; ///< Stencil operation value for front-facing polygons. - /// This is the value used as a parameter for a given stencil operation. - /// For example: StencilOp::IncWrap will use this value when incrementing the current - /// stencil contents. Typically, this would be set to one, but on AMD hardware, - /// this register is 8 bits so there is a greater flexibility. - - uint8 backRef; ///< Stencil reference value for back-facing polygons. - uint8 backReadMask; ///< Bitmask to restrict stencil buffer reads for back-facing polygons. - uint8 backWriteMask; ///< Bitmask to restrict stencil buffer writes for back-facing polygons. - uint8 backOpValue; ///< Stencil operation value for back-facing polygons - See description of frontOpValue - /// for further details. - union - { - uint8 u8All; ///< Flags packed as a 8-bit uint. - struct - { - uint8 updateFrontRef : 1; ///< Updating reference value for front-facing polygons. - uint8 updateFrontReadMask : 1; ///< Updating read mask value for front-facing polygons. - uint8 updateFrontWriteMask : 1; ///< Updating write mask value for front-facing polygons. - uint8 updateFrontOpValue : 1; ///< Updating stencil op value for front-facing polygons. - uint8 updateBackRef : 1; ///< Updating reference value for back-facing polygons. - uint8 updateBackReadMask : 1; ///< Updating read mask value for back-facing polygons. - uint8 updateBackWriteMask : 1; ///< Updating write mask value for back-facing polygons. - uint8 updateBackOpValue : 1; ///< Updating stencil op value for back-facing polygons. 
- }; - } flags; ///< Flags to indicate which of the stencil state values are being updated. -}; - -/// HiS always exposes two pretests. -constexpr uint32 NumHiSPretests = 2; - -/// Hierarchical stencil (HiS) allows work to be discarded by the stencil test at tile rate in certain cases. -/// In order to use HiS, the client will define a set of pretests that will be performed whenever a particular stencil -/// buffer is written. The stencil image will track the results of the pretest for each 8x8 tile, keeping a record of -/// whether any pixel in the tile "may-pass" or "may-fail" the specified pretest. When stencil testing is enabled, -/// the hardware may be able to discard whole tiles early based on what it can glean from the HiS pretest states. -/// -/// Each stencil image has two pretest slots per mip level. Pretest slots are reset when an initialization barrier -/// targets their mip level on the stencil plane. The client can then pass this struct to @ref CmdUpdateHiSPretests -/// to bind one or more valid pretests. It is legal to bind a pretest over a reset slot at any point. -/// -/// @warning Except in special cases, it is illegal to bind a pretest on top of an existing pretest. -/// -/// It is only legal to bind a new pretest on top of an existing pretest if: -/// 1. All array slices within the given mip have been reset using an initialization barrier. -/// 2. The client guarantees that they will rewrite all stencil values in all array slices within the given mip -/// before the next draw with stencil testing enabled by doing either: -/// a. One or more calls to @ref CmdClearDepthStencil. -/// b. One or more draws with the stencil test disabled and stencil writes enabled. -/// -/// Once pretests are selected via @ref CmdUpdateHiSPretests the client should keep track of which tests were enabled -/// on each stencil image and provide them to every call to @ref CmdClearDepthStencil. 
This is optional but PAL will -/// not be able to generate HiS optimized clears unless it is given the current pretests. -/// -/// @warning The pretests provided to @ref CmdUpdateHiSPretests are applied to all mips of all subresource ranges. -/// If the client varies pretests between mips they must guarantee that the given pretests were bound to all -/// mips in the given subresource ranges. -/// -/// This feature works best if the future stencil test behavior is known, either directly told via an API extension -/// or via an app profile in the client layer. For example, if the application 1) clears stencil, 2) does a pass to -/// write stencil, 3) then does a final pass that masks rendering based on the stencil value being > 0, ideally we -/// would choose a pretest of func=Greater, mask=0xFF, and value=0 so that #2 would update the stencil image with -/// per-tile data that lets #3 be accelerated at maximum effeciency. -/// -/// In absence of app-specific knowledge, the following algorithm may be a good generic approach: -/// 1. When the stencil image is cleared, set pretest #0 to func=Equal, mask=0xFF, and value set to the clear value. -/// 2. On the first draw with stencil writes enabled, set pretest #1 with the mask set to the app's current stencil -/// mask, and -/// a. If the stencil op is INC or DEC, set func=GreaterEqual and value the same as in #1. -/// b. If the stencil op is REPLACE, set func=Equal and set value to the app's current stencil ref value. -/// -/// Note that HiS can only be beneficial for GPU performance so clients that do not want to implement app profiles or -/// generic heuristics should at least hard-code both tests to something simple. -struct HiSPretests -{ - struct - { - CompareFunc func; ///< This function is used to compare the pretest value with the image's stencil value. - /// The expression is evaluated with the pretest value as the left-hand operand and the - /// image's stencil value as the right-hand operand. 
- uint8 mask; ///< This value is ANDed with both stencil values before evaluating the comparison. - uint8 value; ///< The pretest value, used as the left-hand operand in the comparison. - bool isValid; ///< True if this pretest contains valid information. Set to false to skip this test. - } test[NumHiSPretests]; ///< The set of pretest slots. -}; - -/// Specifies coordinates for setting up single user clip plane. -/// @see ICmdBuffer::CmdSetUserClipPlanes -struct UserClipPlane -{ - float x; ///< Plane coordinate x - float y; ///< Plane coordinate y - float z; ///< Plane coordinate z - float w; ///< Plane coordinate w -}; - -/// Specifies parameters for setting the constant factor to be used by the blend hardware when programmed with the -/// Blend::ConstantColor, Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend -/// coefficients. -/// @see ICmdBuffer::CmdSetBlendConst -struct BlendConstParams -{ - float blendConst[4]; ///< 4-component RGBA float specifying the new blend constant. -}; - -/// Specifies the parameters for a single viewport -struct Viewport -{ - float originX; ///< X coordinate for the viewport's origin. - float originY; ///< Y coordinate for the viewport's origin. - float width; ///< Width of the viewport. - float height; ///< Height of the viewport. - float minDepth; ///< Minimum depth value of the viewport. Must be in the [0..1] range. - float maxDepth; ///< Maximum depth value of the viewport. Must be in the [0..1] range. - PointOrigin origin; ///< Origin of the viewport relative to NDC. UpperLeft or LowerLeft. -}; - -/// Specifies the range for user-defined depth clamp. -struct DepthClamp -{ - float minDepth; ///< Minimum depth value after viewport transform. - float maxDepth; ///< Maximum depth value after viewport transform. -}; - -/// Specifies the viewport transform parameters for setting a single viewport. 
-/// @see ICmdBuffer::CmdSetViewport -struct ViewportParams -{ - uint32 count; ///< Number of viewports. - float horzDiscardRatio; ///< The ratio between guardband discard rect width and viewport width. - /// For all guard band ratio settings, values less than 1.0f are illegal. - /// Value FLT_MAX opens the guardband as wide as the HW supports. - /// Value 1.0f disables the guardband. - float vertDiscardRatio; ///< The ratio between guardband discard rect height and viewport height. - float horzClipRatio; ///< The ratio between guardband clip rect width and viewport width. - float vertClipRatio; ///< The ratio between guardband clip rect height and viewport height. - DepthRange depthRange; ///< Specifies the target range of Z values - DepthClamp userDepthClamp; ///< Specifies the clamp range of Z values for DepthClampMode::UserDefined. - // Define viewports array at the end of the structure as it is common to only access the first N from the CPU. - Viewport viewports[MaxViewports]; ///< Array of desciptors for each viewport. -}; - -/// Specifies the parameters for specifing the scissor rectangle. -struct ScissorRectParams -{ - uint32 count; ///< Number of scissor rectangles. - Rect scissors[MaxViewports]; ///< Array of scissor regions corresponding to each viewport. -}; - -/// Specifies parameters for setting the global scissor rectangle. -/// @see ICmdBuffer::CmdSetGlobalScissor -struct GlobalScissorParams -{ - Rect scissorRegion; ///< Rectangle of the global scissor window. -}; - -/// Specifies parameters for binding the color targets and depth target. -/// @see ICmdBuffer::CmdBindTargets -struct BindTargetParams -{ - uint32 colorTargetCount; ///< Number of color targets to bind. - ColorTargetBindInfo colorTargets[MaxColorTargets]; ///< Array of color target descriptors. - DepthStencilBindInfo depthTarget; ///< Describes the depth target bind info. -}; - -/// Specifies parameters for binding the stream-output targets. 
-/// @see ICmdBuffer::CmdBindStreamOutTargets -struct BindStreamOutTargetParams -{ - struct - { - gpusize gpuVirtAddr; ///< GPU virtual address of this stream-output target. Must be DWORD-aligned. If - /// this is zero, 'size' is ignored and the target is considered un-bound. - gpusize size; ///< Size of this stream-output target, in bytes. Must be DWORD-aligned. - } target[MaxStreamOutTargets]; ///< Describes the stream-output target for each buffer slot. -}; - -/// Specifies the different types of predication ops available. -enum class PredicateType : uint32 -{ - Zpass = 1, ///< Enable occlusion predicate - PrimCount = 2, ///< Enable streamout predicate - Boolean64 = 3, ///< CP PFP treats memory as a 64bit integer which is either false (0) or true, DX12 style. - Boolean32 = 4, ///< CP PFP treats memory as a 32bit integer which is either false (0) or true, Vulkan style. - Count -}; - -/// Bitfield structure used to specify masks for functions that operate on depth and/or stencil planes of an image. -union DepthStencilSelectFlags -{ - struct - { - /// Select Depth. - uint32 depth : 1; - - /// Select Stencil. - uint32 stencil : 1; - - /// Reserved for future usage. - uint32 reserved : 30; - }; - - /// Flags packed as 32-bit uint. - uint32 u32All; -}; - -/// Specifies information related to clearing a bound color target. Input structure to CmdClearBoundColorTargets(). -struct BoundColorTarget -{ - uint32 targetIndex; ///< Render target index where the target image is currently bound. - SwizzledFormat swizzledFormat; ///< Format and swizzle of the target image. - uint32 samples; ///< Sample count for the target. - uint32 fragments; ///< Fragment count for the target. - ClearColor clearValue; ///< clear color value. -}; - -/// Specifies clear region to clear a bound target. Input structure to CmdClearBoundColorTargets() and -/// CmdClearBoundDepthStencilTargets() -struct ClearBoundTargetRegion -{ - Rect rect; ///< The 2D region to clear. 
- uint32 startSlice; ///< The starting slice to clear. - uint32 numSlices; ///< The number of slices to clear. -}; - -/// Specifies flags controlling CmdSaveComputeState and CmdRestoreComputeState. PAL clients must be aware that saving -/// and restoring specific state in a nested command buffer may not be supported. The rule is simple: if the client -/// requires that the caller leak the given state to the callee, PAL will not support saving and restoring that state. -enum ComputeStateFlags : uint32 -{ - ComputeStatePipelineAndUserData = 0x1, ///< Selects the bound compute pipeline, all non-indirect user data, and all - /// kernel arguments (if applicable). Note that the current user data will - /// be invalidated on CmdSaveComputeState. - ComputeStateBorderColorPalette = 0x2, ///< Selects the bound border color pallete that affects compute pipelines. - ComputeStateAll = 0x3, ///< Selects all state -}; - -/// Provides dynamic command buffer flags during submission -/// The following flags are used for Frame Pacing when delay time is configured to be caculated by KMD. -/// (Currently DX clients require this). -/// For clients that do not need Frame Pacing with KMD caculated delay time, they can ignore these flags: -/// -/// - frameBegin and frameEnd : Client's presenting queue should track its present state, -/// and set frameBegin flag on the first command buffer after present, -/// set frameEnd flag on the the last command buffer before present. (Could be the Present command buffer itself.) -/// We don't need to set them on queues other than the presenting queue. -/// - P2PCmd : Mark a P2P copy command. KMD could use this flag for adjustments for its frame time calculation. -/// For the current frame time algorithm, clients should only set this flag on SW compositing copy command. -/// But KMD may adjust their algorithm, and clients should update the flag depending on KMD needs. -/// -/// The following flags are used for Direct Capture. 
-/// -/// - captureBegin and captureEnd : Direct capture info should be filled if any of these is set. And captureEnd flag -/// also notifies KMD that the on-screen primary is safe to release. -struct CmdBufInfo -{ - union - { - struct - { - uint32 isValid : 1; ///< Indicate if this CmdBufInfo is valid and should be submitted - uint32 frameBegin : 1; ///< First command buffer after Queue creation or Present. - uint32 dfSpmTraceBegin : 1; ///< This command buffer begins a DF SPM trace. - uint32 dfSpmTraceEnd : 1; ///< This command buffer ends a DF SPM trace. - uint32 frameEnd : 1; ///< Last command buffer before Present. - uint32 p2pCmd : 1; ///< Is P2P copy command. See CmdBufInfo comments for details. - uint32 captureBegin : 1; ///< This command buffer begins a Direct Capture frame capture. - uint32 captureEnd : 1; ///< This command buffer ends a Direct Capture frame capture. - uint32 rayTracingExecuted : 1; ///< This command buffer contains ray tracing work. - uint32 preflip : 1; ///< This command buffer has pre-flip access to DirectCapture resource - uint32 postflip : 1; ///< This command buffer has post-flip access to DirectCapture resource - uint32 privateFlip : 1; ///< Need to flip to a private primary surface for DirectCapture feature - uint32 vpBltExecuted : 1; ///< This command buffer comtains VP Blt work. - uint32 disableDccRejected : 1; ///< Reject KMD's DisableDcc request to avoid writing to front buffer. 
- uint32 noFlip : 1; ///< No flip when DirectCapture access submission completes - uint32 frameGenIndex : 4; ///< Index of the DirectCapture feature generated frames - uint32 noRenderPresent : 1; ///< Last command buffer before present which is no render present or not - uint32 motionVectorPropChanged : 1; ///< Indicates whether motion vector properties changed - uint32 depthPropChanged : 1; ///< Indicates whether depth properties changed - uint32 cameraPropChanged : 1; ///< Indicates whether camera matrix properties changed - uint32 capturePrimary : 1; ///< Has Direct Capture primary surface capture - uint32 captureMotionVector : 1; ///< Has Direct Capture motion vector capture - uint32 captureDepth : 1; ///< Has Direct Capture depth capture - uint32 captureCamera : 1; ///< Has Direct Capture camera matrix capture - uint32 hudLessImagePropChanged : 1; ///< Indicates whether HUD less image properties changed - uint32 captureHudLessImage : 1; ///< Has Direct Capture HUD less image capture - uint32 llmDecodeStart : 1; ///< Has LLM decode Start Enabled in the CmdBufInfo packet - uint32 llmDecodeStop : 1; ///< Has LLM decode Stop Enabled in the CmdBufInfo packet - uint32 reserved : 1; ///< Reserved for future usage. - }; - uint32 u32All; ///< Flags packed as uint32. - }; - - const IGpuMemory* pPrimaryMemory; ///< The primary's gpu memory object used for passing its allocation handle - /// to KMD for pre-flip primary access (PFPA). If frame metadata flags - /// specifies that primaryHandle should be sent, clients should set this to - /// current frame pending primary's IGpuMemory object on the creating GPU - /// for the frameEnd command. Otherwise set this to nullptr. - const IGpuMemory* pDirectCapMemory; ///< The Direct Capture gpu memory object. It should be set if flag - /// captureBegin or captureEnd is set. Otherwise set this to nullptr. 
- const IGpuMemory* pPrivFlipMemory; ///< The gpu memory object of the private flip primary surface for the - /// DirectCapture feature. - const Util::Event* pEarlyPresentEvent; ///< The 'early present' event object. This variable can be nullptr. - uint64 frameIndex; ///< The frame index of this command buffer. It is only required for the - /// DirectCapture feature - uint32 vidPnSourceId; ///< The display source id for the DirectCapture feature. Clients must set - /// a valid vidPnSourceId when privateFlip flag is set and pDirectCapMemory - /// is nullptr. - uint64 frameId; ///< Present frame index, incremented at each present - const IGpuMemory* pMotionVectorMemory; ///< The motion vector gpu memory object for the DirectCapture feature. - const IGpuMemory* pDepthMemory; ///< The depth gpu memory object for the DirectCapture feature. - const IGpuMemory* pCameraMemory; ///< The camera gpu memory object for the DirectCapture feature. - const IGpuMemory* pHudLessImageMemory; ///< The HUD less image gpu memory object for DirectCapture. -}; - -/// Specifies rotation angle between two images. Used as input to ICmdBuffer::CmdScaledCopyImage. -enum class ImageRotation : uint32 -{ - Ccw0 = 0x0, ///< Counter clockwise degree 0 - Ccw90 = 0x1, ///< Counter clockwise degree 90 - Ccw180 = 0x2, ///< Counter clockwise degree 180 - Ccw270 = 0x3, ///< Counter clockwise degree 270 - Count -}; - -/// Describes a color-key value which can control a pixel get copied or ignored during a CmdScaledCopyImage operation. -struct ColorKey -{ - uint32 u32Color[4]; ///< The color value for each channel -}; - -/// Uniquely identifies the target of the a Present operation (swap chain / destination window / etc.) so that PAL's debug -/// layers can track frames-per-second or other statistics correctly when applications render to multiple displays or -/// windows. Client drivers which don't care about this can always specify a key value of 0. 
-using UniquePresentKey = uint64; - -/// Convert an OS window handle to a unique present key. -inline UniquePresentKey PresentKeyFromOsWindowHandle(OsWindowHandle handle) -#if defined(_WIN32) - { return reinterpret_cast(handle); } -#else - { return handle.win; } -#endif -/// Convert any pointer to a unique present key. -template -constexpr inline UniquePresentKey PresentKeyFromPointer(T* ptr) { return reinterpret_cast(ptr); } - -/// Specifies the input parameters for debug overlay's visual confirm. This struct is not functional. -/// The client is expected to default initialize this struct and then fill out any state that makes -/// sense under its presentation model. PAL will process any valid input and ignore fields that are -/// default initialized. -struct CmdPostProcessDebugOverlayInfo -{ - PresentMode presentMode; ///< The Presentation Mode of the application. - WsiPlatform wsiPlatform; ///< The WsiPlatform that Swap Chain works upon - UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. -}; - -/// Specifies the input parameters for ICmdBuffer::CmdPostProcessFrame. -struct CmdPostProcessFrameInfo -{ - union - { - struct - { - uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. - uint32 reserved : 31; ///< Reserved for future usage. - }; - uint32 u32All; ///< Flags packed as uint32. - } flags; - - union - { - const IImage* pSrcImage; ///< The image to postprocess (prior to presenting). - const IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to postprocess. - /// Must have been created as a typed buffer. - }; - - CmdPostProcessDebugOverlayInfo debugOverlay; - FullScreenFrameMetadataControlFlags fullScreenFrameMetadataControlFlags; - - Pal::ImageLayout srcImageLayout; -}; - -/// External flags for ScaledCopyImage. -union ScaledCopyFlags -{ - struct - { - uint32 srcColorKey : 1; ///< If set, enables source color-keying by using the value in the ColorKey member. 
- /// That is, any pixel in the source image that matches the color key should not be - /// copied to the destination image, and all of the source pixels that do not match - /// the color key should be copied. Mutually exclusive with dstColorKey. - uint32 dstColorKey : 1; ///< If set, enables destination color-keying by using the value in the ColorKey - /// member. That is, any pixel in the destination image that matches the color key - /// should be replaced with the corresponding pixel from the source image, and all of - /// the destination pixels that do not match the color key should not be replaced. - /// Mutually exclusive with srcColorKey. - uint32 srcAlpha : 1; ///< If set, use alpha channel in source surface as blend factor. - /// color = src alpha * src color + (1.0 - src alpha) * dst color. - uint32 dstAsSrgb : 1; ///< If set, a non-srgb destination image will be treated as srgb format. - /// Cannot be set if @ref dstAsNorm is set. - uint32 dstAsNorm : 1; ///< If set, a srgb destination image will be treated as non-srgb format. - /// Cannot be set if @ref dstAsSrgb is set. - uint32 scissorTest : 1; ///< If set, do scissor test using the specified scissor rectangle. - uint32 coordsInFloat : 1; ///< If set, copy regions are represented in floating point type. - uint32 srcAsNorm : 1; ///< If set, an srgb source image will be treated as non-srgb format. - /// Cannot be set if @ref srcAsSrgb is set. - uint32 srcAsSrgb : 1; ///< If set, a non-srgb source image will be treated as srgb format. - /// Cannot be set if @ref srcAsNorm is set. - uint32 reserved : 23; ///< reserved for future usage. - }; - uint32 u32All; ///< Flags packed as uint32. -}; - -/// Input structure to @ref ICmdBuffer::CmdScaledCopyImage. Specifies parameters needed to execute CmdScaledCopyImage. -struct ScaledCopyInfo -{ - const IImage* pSrcImage; ///< The source image to blt from. - ImageLayout srcImageLayout; ///< The source image layout. 
- const IImage* pDstImage; ///< The dest image to blt to. - ImageLayout dstImageLayout; ///< The dest image layout. - uint32 regionCount; ///< Copy region array size. - const ImageScaledCopyRegion* pRegions; ///< Region array to copy. - TexFilter filter; ///< Controlling how a given texture is sampled. - ImageRotation rotation; ///< Rotation option between two images. - const ColorKey* pColorKey; ///< Color key value. - const Rect* pScissorRect; ///< Scissor test rectangle. - ScaledCopyFlags flags; ///< Copy flags, identifies the type of blt to peform. -}; - -/// Input structure to @ref ICmdBuffer::CmdGenerateMipmaps. Specifies parameters needed to execute CmdGenerateMipmaps. -struct GenMipmapsInfo -{ - const IImage* pImage; ///< Populate mips in this image by reading from existing higher-level mips. - ImageLayout baseMipLayout; ///< The layout of all slices in the read-only base mip; must include LayoutCopySrc. - ImageLayout genMipLayout; ///< The layout of all slices and mips that will be generated; must include - /// LayoutCopySrc and LayoutCopyDst. - SubresRange range; ///< Which subresources should be generated from earlier mips. The starting mipLevel - /// must never be zero because there would be no larger mip to read. - TexFilter filter; ///< Controls texture sampling during mip generation. Linear texture filtering is - /// only supported for images with non-integer formats. - SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret all subresources using this format and swizzle. - /// The specified format needs to have been included in the "pViewFormats" list - /// specified at image-creation time, otherwise the result might be incorrect. -}; - -/// Defines a single memory range to prefetch using CmdPrimeGpuCaches. -struct PrimeGpuCacheRange -{ - gpusize gpuVirtAddr; ///< Base GPU virtual address to be prefetched. - gpusize size; ///< Number of bytes to prefetch. 
Clients should keep range sizes small relative - /// to the GPU caches (e.g., tccSizeInBytes); the PAL implementation may clamp - /// prefetched ranges if they are too large for the cache being prefetched. - uint32 usageMask; ///< Bitmask of CacheCoherencyUsageFlags defining the usage to prefetch for. - /// E.g., if the mask includes CoherShader, then PAL will attempt to prefetch - /// into caches that are on the shader core's data path. This mask must be a - /// subset of the dstCacheMask specified in the last barrier operation executed - /// on this memory range. Performing the cache prefetch is considered a read - /// operation of the specified usage, and so must be properly accounted for - /// in future barrier memory dependencies for this range. - bool addrTranslationOnly; ///< If set, only the address translation caches (i.e., TLB) will be primed; - /// no data caches will be affected. If this is set, the prefetch operation - /// has no bearing on barrier execution or memory dependencies. -}; - -/// Magic number tag for payloads in command buffer dumps -constexpr uint32 CmdBufferPayloadSignature = 0x1337F77D; - -/// Maximum size, in DWORDs, of payload data in command buffer dumps. -constexpr uint32 MaxPayloadSize = 254; - -/// Payload types used in special embedded NOP packets. -enum class CmdBufferPayloadType : uint32 -{ - Integer = 0, ///< Payload consists of a single 32-bit signed integer. - UnsignedInteger = 1, ///< Payload consists of a single 32-bit unsigned integer. - Integer64 = 2, ///< Payload consists of a single 64-bit signed integer. - UnsignedInteger64 = 3, ///< Payload consists of a single 64-bit unsigned integer. - Float = 4, ///< Payload consists of a single 32-bit floating point number. - Double = 5, ///< Payload consists of a single 64-bit double precision floating point number. - Pointer = 6, ///< Payload consists of a single 64-bit pointer address. - String = 7, ///< Payload consists of a variable length string. 
Must contain null-terminator. - Binary = 8, ///< Payload consists of DWORD-aligned binary data. -}; - -/// Structure layout for embedded CmdBuffer payloads. This can be embedded into the command stream with the -/// @ref ICmdBuffer::CmdNop() function. -struct CmdBufferPayload -{ - uint32 signature; ///< Magic number tag indicating the structure to follow. - uint32 payloadSize; ///< Size of the NOP packet (one DWORD) plus the sizeof this structure and the - /// payload data to follow. - /// This value is in DWORDs. Payload size is expected to be under - /// MaxPayloadSize. - CmdBufferPayloadType type; ///< The type of payload. - uint32 payload[1]; ///< Initial DWORD of payload data with the other data to follow. -}; - -/// Flags controlling which sub-queue(s) of a command buffer should insert an RGP trace marker. Zeroing out this -/// union is invalid, because RGP markers must be sent to at least one sub-queue. -union RgpMarkerSubQueueFlags -{ - struct - { - uint32 includeMainSubQueue : 1; ///< If set, includes the main sub-queue in the RGP marker. - uint32 includeGangedSubQueues : 1; ///< If set, includes any ganged sub-queues in the RGP marker. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed into a uint32 -}; - -/** - *********************************************************************************************************************** - * @interface ICmdBuffer - * @brief Contains GPU rendering and other commands recorded by PAL on the client's behalf. - * - * A command buffer can be executed by the GPU multiple times and recycled, provided the command buffer is not pending - * execution on the GPU when it is recycled. - * - * Command buffers are fully independent and there is no persistence of GPU state between submitted command buffers. - * When a new command buffer is recorded, the state is undefined. 
All relevant state must be explicitly set by the - * client before state-dependent operations such as draws and dispatches. - * - * @see IDevice::CreateCmdBuffer() - *********************************************************************************************************************** - */ -class ICmdBuffer : public IDestroyable -{ -public: - /// Resets the command buffer's previous contents and state, then puts it in the _building_ _state_, allowing new - /// commands to be recorded. - /// - /// If this is a root command buffer, the state will be reset to a "clean slate" with nothing bound. If this is a - /// nested command buffer, the state is set to an "undefined" state so that all render state can be inherited from - /// any root command buffer which executes this one. - /// - /// @param [in] info Controls how PAL will generate commands for this command buffer. E.g., specifies whether the - /// command buffer may be submitted more than once, and controls options for optimizing PM4, etc. - /// - /// @returns Success if the command buffer was successfully reset and put into the _building_ _state_. Otherwise, - /// one of the following error codes may be returned: - /// + ErrorInvalidFlags if invalid flags are set in the flags parameter. - /// + ErrorIncompleteCommandBuffer if the command buffer is already in the _building_ _state_. - virtual Result Begin( - const CmdBufferBuildInfo& info) = 0; - - /// Completes recording of a command buffer in the _building_ _state_, making it _executable_. - /// - /// @returns Success if the command buffer was successfully made _executable_. Otherwise, one of the following - /// errors may be returned: - /// + ErrorIncompleteCommandBuffer if the command buffer is not in the _building_ _state_. - /// + ErrorBuildingCommandBuffer if some error occurred while building the command buffer, and it could not - /// be made _executable_. If this error is returned, the command buffer can not be submitted. 
- virtual Result End() = 0; - - /// Explicitly resets a command buffer, releasing any internal resources associated with it. - /// - /// This call must be used to reset command buffers that have previously reported a ErrorIncompleteCommandBuffer - /// error. - /// - /// @note @ref Begin will implicitly cause a command buffer to be reset in addition to putting it in the - /// _building_ _state_. This method just gives a way to release resources between when the client knows - /// it is done with the command buffer and when it is ready to reuse this command buffer object for - /// recording new commands. - /// - /// @param [in] pCmdAllocator If non-null, all future GPU memory allocations will be done using this allocator. - /// Otherwise the command buffer will continue to use its current command allocator. - /// - /// @param [in] returnGpuMemory If true then all GPU memory associated with this command buffer will be returned - /// to the allocator upon reset. If false data chunks will be retained and reused. - /// Note: This flag must be true if changing command allocators. - /// - /// @warning If returnGpuMemory is false, the client must guarantee that this command buffer is not queued for - /// execution, is not currently being executed, and that all other command buffers that have referenced - /// this command buffer in a @ref CmdExecuteNestedCmdBuffers call have also been reset. - /// - /// @returns Success if the command buffer was successfully reset. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Result Reset(ICmdAllocator* pCmdAllocator, bool returnGpuMemory) = 0; - - /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to CmdAllocateEmbeddedData. - /// - /// This a property of the command buffer and its associated command allocator; it may change if the caller - /// specifies a different command allocator on Reset(). 
- /// - /// @returns How many DWORDs of embedded data the command buffer can allocate at once. - virtual uint32 GetEmbeddedDataLimit() const = 0; - - /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to - /// CmdAllocateLargeEmbeddedData. - /// - /// @returns Number of DWORDs that can be allocated in one call to CmdAllocateLargeEmbeddedData - virtual uint32 GetLargeEmbeddedDataLimit() const = 0; - - /// Binds a graphics or compute pipeline to the current command buffer state. - /// - /// Graphics pipelines must be compiled for the PAL ABI. Compute pipelines must either be compiled for the PAL ABI - /// or the HSA ABI, if it's supported. HSA ABI support is indicated by supportHsaAbi in @ref DeviceProperties. - /// - /// PAL ABI pipelines and HSA ABI pipelines use different mechanisms to bind inputs and outputs. PAL ABI pipelines - /// use user data entries set by @ref CmdSetUserData. HSA ABI pipelines use kernel arguments set by @ref - /// CmdSetKernelArguments. Binding or unbinding a compute pipeline can implicitly modify the user data and kernel - /// argument state, please read the @ref CmdSetUserData and @ref CmdSetKernelArguments documentation for details. - /// - /// @param [in] params Parameters necessary to manage dynamic pipeline shader information. - virtual void CmdBindPipeline( - const PipelineBindParams& params) = 0; - - /// Binds the specified MSAA state object to the current command buffer state. - /// - /// @param [in] pMsaaState New MSAA state to be bound. Can be null in order to unbind a previously bound MSAA state - /// object without binding a new one. - virtual void CmdBindMsaaState( - const IMsaaState* pMsaaState) = 0; - - /// Saves a copy of all of the current command buffer state that is used by graphics workloads. This feature is - /// intended to give PAL clients a convenient way to issue their own internal graphics workloads without modifying - /// the application-facing state. 
- /// - /// PAL cannot save multiple layers of state, each call to CmdSaveGraphicsState must be followed by a call to - /// CmdRestoreGraphicsState before the next call to CmdSaveGraphicsState. Any barriers, resolves, blits, etc are not - /// allowed while the state is pushed. - /// - /// This function can only be called on command buffers that support graphics workloads. All query counters will be - /// disabled until CmdRestoreGraphicsState is called. - virtual void CmdSaveGraphicsState() = 0; - - /// Restores all of the command buffer state that is used by graphics workloads. This feature is intended to - /// give PAL clients a convenient way to issue their own internal graphics workloads without modifying the - /// application-facing state. - /// - /// A call to this function must be preceded by a call to CmdSaveGraphicsState - /// - /// This function can only be called on command buffers that support graphics workloads. All previously disabled - /// query counters will be reactivated. - virtual void CmdRestoreGraphicsState() = 0; - - /// Sets the shading rate in the command buffer along with the state of the various combiners. - /// - /// @param [in] rateParams Nwe VRS shading rate parameters to be bound. - virtual void CmdSetPerDrawVrsRate( - const VrsRateParams& rateParams) = 0; - - /// Setup parameters regarding how pixel center will be evaluated with VRS. - /// - /// @param [in] centerState Nwe VRS parameters to be bound that control how pixel center is defined. - virtual void CmdSetVrsCenterState( - const VrsCenterState& centerState) = 0; - - /// Binds the shading rate data in the specified image into the pipeline for use with VRS. Only relevant if the - /// combiner stage for VrsCombinerStage is set to something other than Passthrough. 
- /// - /// This binding point requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageSampleRate - /// - CacheCoherency: @ref CoherSampleRate - /// - ImageLayout: @ref LayoutSampleRate - /// - /// @param [in] pImage Image that contains sample rate data. Pointer can be NULL to force 1x1 shading rate. - virtual void CmdBindSampleRateImage( - const IImage* pImage) = 0; - - /// Binds the specified color/blend state object to the current command buffer state. - /// - /// @param [in] pColorBlendState New color/blend state to be bound. Can be null in order to unbind a previously - /// bound color/blend state object without binding a new one. - virtual void CmdBindColorBlendState( - const IColorBlendState* pColorBlendState) = 0; - - /// Binds the specified depth/stencil state object to the current command buffer state. - /// - /// @param [in] pDepthStencilState New depth/stencil state to be bound. Can be null in order to unbind a previously - /// bound depth/stencil state object without binding a new one. - virtual void CmdBindDepthStencilState( - const IDepthStencilState* pDepthStencilState) = 0; - - /// Sets the value range to be used for depth bounds testing. - /// - /// The depth bounds test is enabled in the graphics pipeline. When enabled, an additional check will be done that - /// will reject a pixel if the pre-existing depth value stored at its destination location is outside of the - /// specified bounds. Applications would typically use this feature to optimize shadow volume rendering. - /// - /// @param [in] params Parameters necessary to set the depth bounds (such as min/max depth). - virtual void CmdSetDepthBounds( - const DepthBoundsParams& params) = 0; - - /// Sets pipeline-accessible user data to the specified values. - /// - /// The values set in user data entries will be interpreted based on the resource mapping specified for each shader - /// in the currently bound pipeline. 
For example, the client can write virtual addresses of tables containing - /// SRDs, immediate SRDs that can be loaded without an indirection, or even a small number of immediate ALU - /// constants. - /// - /// The user data values are only used by PAL ABI pipelines. Almost all pipelines used by PAL clients are compiled - /// for the PAL ABI, but PAL also supports HSA ABI compute pipelines which use @ref CmdSetKernelArguments instead. - /// When an HSA ABI pipeline is bound the current compute user data entries are saved and will be restored if the - /// client later binds a PAL ABI compute pipeline. - /// - /// @warning It's illegal to set compute user data if an HSA ABI pipeline is currently bound. - /// - /// If no compute pipeline is currently bound PAL assumes the client will bind a PAL ABI pipeline and thus accepts - /// user data bindings. Graphics user data are unaffected by all of this because graphics pipelines can only use - /// the PAL ABI. - /// - /// @see PipelineShaderInfo - /// @see ResourceMappingNode - /// @ingroup ResourceBinding - /// - /// @param [in] bindPoint Specifies which type of user-date is to be set (i.e., compute or graphics). - /// @param [in] firstEntry First user data entry to be updated. - /// @param [in] entryCount Number of user data entries to update; size of the pEntryValues array. Must be greater - /// than zero, and (firstEntry + entryCount) must not extend beyond MaxUserDataEntries. - /// @param [in] pEntryValues Array of 32-bit values to be copied into user data. - void CmdSetUserData( - PipelineBindPoint bindPoint, - uint32 firstEntry, - uint32 entryCount, - const uint32* pEntryValues) - { (m_funcTable.pfnCmdSetUserData[static_cast(bindPoint)])(this, firstEntry, entryCount, pEntryValues); } - - /// Copies all pipeline-accessible user-data from one bind point to another. It is invalid if the source and - /// dest parameters refer to the same bind point. 
- /// - /// @see CmdSetUserData for how the user-date entries will be interpreted by the pipeline. - /// - /// @param [in] source Specifies which bind point to copy from. - /// @param [in] dest Specifies which bind point to copy into. - virtual void CmdDuplicateUserData( - PipelineBindPoint source, - PipelineBindPoint dest) = 0; - - /// Sets one or more HSA code object kernel argument values. - /// - /// If the currently bound compute pipeline was compiled using the HSA compute ABI this function must be used to - /// bind that pipeline's arguments. The argument position and value types are static properties of the pipeline - /// and must be known by the client. - /// - /// @note Calling @ref CmdBindPipeline invalidates all prior kernel argument bindings, even if the new pipeline - /// also uses the HSA ABI. Any kernel arguments that the client intends to share between pipelines must - /// be manually rebound. - /// - /// @warning It's illegal to call this function if no compute pipeline is bound or if the bound compute pipeline - /// uses a different ABI (e.g., the PAL compute ABI). - /// - /// @ingroup ResourceBinding - /// - /// @param [in] firstArg The zero-based position of the first kernel argument to bind. - /// @param [in] argCount Number of kernel arguments this call binds. - /// @param [in] ppValues Array of pointers to kernel argument values. - virtual void CmdSetKernelArguments( - uint32 firstArg, - uint32 argCount, - const void*const* ppValues) = 0; - - /// Changes one or more of the command buffer's active vertex buffers. - /// - /// @note If bufferViews.offsetMode is false, PAL will construct SRDs for each bound vertex buffer which are - /// equivalent to the client calling @ref IDevice::CreateUntypedBufferViewSrd on each element of the - /// pBuffers parameter. 
- /// - /// Note that vertex buffers require use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageVs - /// - CacheCoherency: @ref CoherShaderRead - /// - /// @param [in] bufferViews Vertex buffer view descriptors.This parameter defines which vertex mode is used through - /// @ref VertexBufferViews::offsetMode. VertexBufferViews::pVertexBufferViews or - /// VertexBufferViews::pBufferViewInfos must not be nullptr. - virtual void CmdSetVertexBuffers( - const VertexBufferViews& bufferViews) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 877 - void CmdSetVertexBuffers( - uint32 firstBuffer, - uint32 bufferCount, - const BufferViewInfo* pBuffers) - { - const VertexBufferViews bufferViews = - { - .firstBuffer = firstBuffer, - .bufferCount = bufferCount, - .offsetMode = false, - .pBufferViewInfos = pBuffers - }; - CmdSetVertexBuffers(bufferViews); - } -#endif - - /// Binds a range of memory for use as index data (i.e., binds an index buffer). - /// - /// The GPU virtual address must be index element aligned: 2-byte aligned for 16-bit indices or 4-byte aligned for - /// 32-bit indices. - /// - /// The index buffer binding point requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageFetchIndices - /// - CacheCoherency: @ref CoherIndexData - /// - /// @param [in] gpuAddr GPU virtual address of the index data. Can be zero to unbind the previously bound data. - /// @param [in] indexCount Maximum number of indices in the index data; the GPU may read less indices. - /// @param [in] indexType Specifies whether to use 8-bit, 16-bit or 32-bit index data. - virtual void CmdBindIndexData( - gpusize gpuAddr, - uint32 indexCount, - IndexType indexType) = 0; - - /// Binds color and depth/stencil targets to the current command buffer state. - /// - /// The current layout of each target must also be specified. 
- /// - /// The color target binding points require use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageColorTarget - /// - CacheCoherency: @ref CoherColorTarget - /// - ImageLayout: @ref LayoutColorTarget - /// - /// The depth and stencil target binding points require use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget - /// - CacheCoherency: @ref CoherDepthStencilTarget - /// - ImageLayout: @ref LayoutDepthStencilTarget - /// - /// @param [in] params Parameters representing the color and depth/stencil targets to bind to the command buffer. - virtual void CmdBindTargets( - const BindTargetParams& params) = 0; - - /// Binds stream-output target buffers to the current command buffer state. - /// - /// At draw-time, the stream-output targets must be consistent with the soState parameters specified by the - /// currently bound graphics pipeline. - /// - /// The stream-output target buffers require use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageStreamOut - /// - CacheCoherency: @ref CoherStreamOut - /// - /// @param [in] params Parameters representing the stream-output target buffers to bind to the command buffer. - virtual void CmdBindStreamOutTargets( - const BindStreamOutTargetParams& params) = 0; - - /// Sets the constant factor to be used by the blend hardware when programmed with the Blend::ConstantColor, - /// Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend coefficients. - /// - /// @param [in] params Parameters representing the blend constant factor. - virtual void CmdSetBlendConst( - const BlendConstParams& params) = 0; - - /// Sets input assembly state for upcoming draws in this command buffer. - /// - /// At draw-time, the topology specified with this method must be consistent with the _topologyInfo_ parameters - /// specified by the currently bound graphics pipeline. 
- /// - /// @param [in] params Parameters representing the input assembly state for upcoming draws. - virtual void CmdSetInputAssemblyState( - const InputAssemblyStateParams& params) = 0; - - /// Sets parameters controlling triangle rasterization. - /// - /// @param [in] params Parameters to set the triangle raster state (such as fill/cull mode). - virtual void CmdSetTriangleRasterState( - const TriangleRasterStateParams& params) = 0; - - /// Sets parameters controlling point and line rasterization. - /// - /// @param [in] params Parameters to set the point and line rasterization state (such as pointSize and lineWidth). - virtual void CmdSetPointLineRasterState( - const PointLineRasterStateParams& params) = 0; - - /// Sets parameters controlling line stippling. - /// - /// @param [in] params Parameters to set the line stipple state. - virtual void CmdSetLineStippleState( - const LineStippleStateParams& params) = 0; - - /// Sets depth bias parameters. - /// - /// Depth bias is used to ensure a primitive can properly be displayed (without Z fighting) in front (or behind) - /// of the previously rendered co-planar primitive. This is useful for decal or shadow rendering. - /// - /// @param [in] params Parameters for setting the depth bias (such as depth bias, depth bias clamp, and slope - /// scaled depth bias). - virtual void CmdSetDepthBiasState( - const DepthBiasParams& params) = 0; - - /// Sets stencil reference values and mask buffer reads and writes in upcoming draws. Separate reference values - /// can be specified for front-facing and back-facing polygons. Update flags should be set for state which needs to - /// be updated. All other state will be preserved. - /// Setting all the values (reference, read/write masks and stencil op) in the StencilRefMaskParams together - /// takes the faster path. - /// Setting either the ref value, read/write masks or the stencil op value individually takes the slower - /// read-modify-write path. 
- /// - /// @param [in] params Parameters for setting the stencil read and write masks. - virtual void CmdSetStencilRefMasks( - const StencilRefMaskParams& params) = 0; - - /// Sets user defined clip planes, should only be called on universal command buffers. - /// - /// @param [in] firstPlane The index of first plane in user define clip plane array. - /// @param [in] planeCount The count of planes in plane array. - /// @param [in] pPlanes Pointer to plane array. - virtual void CmdSetUserClipPlanes( - uint32 firstPlane, - uint32 planeCount, - const UserClipPlane* pPlanes) = 0; - - /// Sets clip rects, should only be called on universal command buffers. - /// - /// @param [in] clipRule 16-bit clip rule bits are used to determine if pixel shall be discarded or retained. - /// For each pixel, a 4-bit index is computed based on which clip rects the pixel is - /// inside (bitN represents rectN). Then uses this index to check the corresponding bit - /// in clip rule for this pixel - 0 for discarded, 1 for retained. - /// @param [in] rectCount The count of rectangles in rect list. This must be less than or equal to - /// MaxClipRects (4). - /// @param [in] pRectList Pointer to the rect list. - virtual void CmdSetClipRects( - uint16 clipRule, - uint32 rectCount, - const Rect* pRectList) = 0; - - /// Sets user defined MSAA quad-pixel sample pattern, should only be called on universal command buffers - /// This should be called before clearing, rendering, barriering and resolving of MSAA DepthStencil image. - /// - /// @param [in] numSamplesPerPixel Number of samples per pixel - /// @param [in] quadSamplePattern The input msaa sample pattern - virtual void CmdSetMsaaQuadSamplePattern( - uint32 numSamplesPerPixel, - const MsaaQuadSamplePattern& quadSamplePattern) = 0; - - /// Sets the specified viewports to the current command buffer state. - /// - /// @param [in] params Parameters for setting the specified number of viewports. 
- virtual void CmdSetViewports( - const ViewportParams& params) = 0; - - /// Sets the scissor regions corresponding to each viewport to the current command buffer state. - /// - /// @param [in] params Parameters for setting the specified number of scissor regions. - virtual void CmdSetScissorRects( - const ScissorRectParams& params) = 0; - - /// Sets the global scissor rectangle. - /// - /// @param [in] params Parameters for setting the global scissor rectangle from the top left to bottom right - /// coordinate. - virtual void CmdSetGlobalScissor( - const GlobalScissorParams& params) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 - /// Inserts a barrier in the current command stream that can stall GPU execution, flush/invalidate caches, or - /// decompress images before further, dependent work can continue in this command buffer. - /// - /// This operation does not honor the command buffer's predication state, if active. - /// - /// @param [in] barrierInfo See @ref BarrierInfo for detailed information. - virtual void CmdBarrier( - const BarrierInfo& barrierInfo) = 0; -#endif - /// Perform source pipeline stage and cache access optimization based on the acquire/release interface. - /// - /// @param [in] barrierType Barrier transition type @ref BarrierType. - /// @param [in] pImage Image pointer for image transition, required when @ref BarrierType is - /// BarrierType::Image. - /// @param [in/out] pSrcStageMask A source mask of ORed @ref PipelineStageFlag to optimize, can't be null. - /// @param [in/out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null. - /// @param [in/out] pDstStageMask A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null. - /// @param [in/out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize. - /// - /// @returns If need flush and invalidate GL2 cache. 
- /// - /// @note PipelineStageBlt will be converted to more accurate stage(s) based on the underlying implementation of - /// outstanding BLTs, but will be left as PipelineStageBlt if the internal outstanding BLTs can't be expressed - /// as a client-facing PipelineStage (e.g., if there are CP DMA BLTs in flight). - virtual bool OptimizeAcqRelReleaseInfo( - BarrierType barrierType, - const IImage* pImage, - uint32* pSrcStageMask, - uint32* pSrcAccessMask, - uint32* pDstStageMask, - uint32* pDstAccessMask) const = 0; - - /// Performs the release portion of an acquire/release-based barrier. This releases a set of resources from their - /// current usage, while CmdAcquire() is expected to be called to acquire access to the resources for future, - /// different usage. - /// - /// Conceptually, this method will: - /// - Ensure the specified source synchronization scope has completed. - /// - Ensure all specified resources are available in memory. The availability operation will flush all - /// write-back caches to the last-level-cache. - /// - Perform any requested layout transitions. - /// - /// Once all of these operations are complete, the release issues a timestamp event that signals the operation - /// completion. The event type and timestamp value is returned to caller in a packed uint32 token. A corresponding - /// CmdAcquire() call is expected to wait on one or a list of such synchronization tokens and perform any necessary - /// visibility operations and/or layout transitions that could not be predicted at release-time. - /// - /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout - /// transitions. - /// @returns Synchronization token for the release operation. Pass this token to CmdAcquire to confirm completion. 
-#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 - virtual uint32 CmdRelease( -#else - virtual ReleaseToken CmdRelease( -#endif - const AcquireReleaseInfo& releaseInfo) = 0; - - /// Performs the acquire portion of an acquire/release-based barrier. This acquire a set of resources for a new - /// set of usages, assuming CmdRelease() was called to release access for the resource's past usage. - /// - /// Conceptually, this method will: - /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all - /// relevant caches above the last-level-cache. - /// - Perform any requested layout transitions. - /// - Ensure the release(s) have completed by waiting on the synchronization token of the release operation. - /// - /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required layout - /// layout transitions. - /// @param [in] syncTokenCount Number of entries in pSyncTokens, can be zero if no valid release token. - /// @param [in] pSyncTokens Array of synchronization tokens, as returned from CmdRelease, to confirm completion. - /// The token value(s) must have been returned by a CmdRelease call in the same command - /// buffer. pSyncTokens can be null if syncTokenCount is 0. - virtual void CmdAcquire( - const AcquireReleaseInfo& acquireInfo, - uint32 syncTokenCount, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 - const uint32* pSyncTokens) = 0; -#else - const ReleaseToken* pSyncTokens) = 0; -#endif - - /// Performs the release portion of an acquire/release event-based barrier. This releases a set of resources from - /// their current usage, while CmdAcquireEvent() is expected to be called to acquire access to the resources for - /// future, different usage. - /// - /// Conceptually, this method will: - /// - Ensure the specified source synchronization scope has completed. - /// - Ensure all specified resources are available in memory. 
The availability operation will flush all - /// write-back caches to the last-level-cache. - /// - Perform any requested layout transitions. - /// - /// Once all of these operations are complete, the specified IGpuEvent object will be signaled. A corresponding - /// CmdAcquireEvent() call is expected to wait on this event and perform any necessary visibility operations and/or - /// layout transitions that could not be predicted at release-time. - /// - /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout - /// transitions. - /// @param [in] pGpuEvent Event to be signaled once the release has completed. Must be a valid (non-null) GPU - /// event pointer. Call CmdRelease()/CmdAcquire() pair instead if want to release/acquire - /// something but no GPU event is available. - virtual void CmdReleaseEvent( - const AcquireReleaseInfo& releaseInfo, - const IGpuEvent* pGpuEvent) = 0; - - /// Performs the acquire portion of an acquire/release event-based barrier. This acquire a set of resources for a - /// new set of usages, assuming CmdReleaseEvent() was called to release access for the resource's past usage. - /// - /// Conceptually, this method will: - /// - Ensure the release(s) have completed by waiting for the specified IGpuEvent early enough in the pipeline to - /// support the specified destination synchronization scope. - /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all - /// relevant caches above the last-level-cache. - /// - Perform any requested layout transitions. - /// - /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required layout - /// layout transitions. - /// @param [in] gpuEventCount Number of entries in pGpuEvents. - /// @param [in] ppGpuEvents Array of one or more events to wait on. 
Typically these will be set via - /// CmdReleaseEvent(), but it's valid to wait on an event set through a different means - /// like CmdSetEvent() from CPU side. Must be a valid (non-null) pointer to an array of - /// gpuEventCount valid GPU event pointers. Call CmdReleaseThenAcquire() instead if wait - /// to acquire something but no GPU event is available. - virtual void CmdAcquireEvent( - const AcquireReleaseInfo& acquireInfo, - uint32 gpuEventCount, - const IGpuEvent* const* ppGpuEvents) = 0; - - /// Conceptually equivalent to calling CmdRelease() followed immediately by CmdAcquire(), but it potentially has - /// better performance than calling CmdRelease()/CmdAcquire() directly. Can be called in cases where the client/ - /// application cannot detect separate release and acquire points for a transition. - /// - /// @param [in] barrierInfo Describes the synchronization scopes, availability/visibility operations, and the - /// required layout transitions. - virtual void CmdReleaseThenAcquire( - const AcquireReleaseInfo& barrierInfo) = 0; - - /// Issues an instanced, non-indexed draw call using the command buffer's currently bound graphics state. Results - /// in instanceCount * vertexCount vertices being processed. - /// - /// It is an error if the currently bound pipeline contains a mesh and/or task shader. - /// - /// @param [in] firstVertex Starting index value for the draw. Indices passed to the vertex shader will range - /// from firstVertex to firstVertex + vertexCount - 1. - /// @param [in] vertexCount Number of vertices to draw. If zero, the draw will be discarded. - /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range - /// from firstInstance to firstInstance + instanceCount - 1. - /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. - /// @param [in] drawId Draw index for the draw. 
- void CmdDraw( - uint32 firstVertex, - uint32 vertexCount, - uint32 firstInstance, - uint32 instanceCount, - uint32 drawId) - { - m_funcTable.pfnCmdDraw(this, firstVertex, vertexCount, firstInstance, instanceCount, drawId); - } - - /// Issues draw opaque call using the command buffer's currently bound graphics state. - /// Uses the stream-out target of a previous draw as the input vertex data. - /// the number of vertices = (streamOutFilledSize (value of streamOutFilledSizeVa) - streamOutOffset) / stride - /// - /// It is an error if the currently bound pipeline contains a mesh and/or task shader. - /// - /// @param [in] streamOutFilledSizeVa gpuAddress of streamOut filled size for streamOut buffer. - /// @param [in] streamOutOffset the offset of begin of streamOut as vertex. - /// @param [in] stride stride for stream data as vertex. - /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader - /// will range from firstInstance to firstInstance + instanceCount - 1. - /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. - void CmdDrawOpaque( - gpusize streamOutFilledSizeVa, - uint32 streamOutOffset, - uint32 stride, - uint32 firstInstance, - uint32 instanceCount) - { - m_funcTable.pfnCmdDrawOpaque(this, - streamOutFilledSizeVa, - streamOutOffset, - stride, - firstInstance, - instanceCount); - } - - /// Issues an instanced, indexed draw call using the command buffer's currently bound graphics state. Results in - /// instanceCount * indexCount vertices being processed. - /// - /// It is an error if the currently bound pipeline contains a mesh and/or task shader. - /// - /// Indices passed to the vertex shader will be: - /// - /// + IndexBuffer[firstIndex] + vertexOffset - /// + IndexBuffer[firstIndex + 1] + vertexOffset, - /// + ... - /// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset - /// - /// @param [in] firstIndex Starting index buffer slot for the draw. 
- /// @param [in] indexCount Number of vertices to draw. If zero, the draw will be discarded. - /// @param [in] vertexOffset Offset added to the index fetched from the index buffer before it is passed to the - /// vertex shader. - /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range - /// from firstInstance to firstInstance + instanceCount - 1. - /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. - /// @param [in] drawId Draw index for the draw. - void CmdDrawIndexed( - uint32 firstIndex, - uint32 indexCount, - int32 vertexOffset, - uint32 firstInstance, - uint32 instanceCount, - uint32 drawId) - { - m_funcTable.pfnCmdDrawIndexed(this, firstIndex, indexCount, vertexOffset, firstInstance, instanceCount, drawId); - } - - /// Issues instanced, non-indexed draw calls using the command buffer's currently bound graphics state. The draw - /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find - /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its vertexCount or - /// instanceCount is zero. - /// - /// The layout of the argument data is defined in the @ref DrawIndirectArgs structure. - /// - /// It is an error if the currently bound pipeline contains a mesh and/or task shader. - /// - /// This function requires use of the following barrier flags on the indirect memory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @see CmdDraw - /// @see DrawIndirectArgs - /// - /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in - /// memory from one structure to another. - /// The virtual address must be 4 byte aligned. - /// @param [in] maximumCount Maximum count of data structures to loop through. 
If countGpuAddr - /// is nonzero, the value at that memory location is clamped to - /// this maximum. If countGpuAddr is zero, then the number of draws - /// issued exactly matches this number. - /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. - /// Must be 4-byte aligned. - void CmdDrawIndirectMulti( - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr) - { - m_funcTable.pfnCmdDrawIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); - } - - /// Issues instanced, indexed draw calls using the command buffer's currently bound graphics state. The draw - /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find - /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its indexCount or - /// instanceCount is zero. - /// - /// The layout of the argument data is defined in the @ref DrawIndexedIndirectArgs structure. - /// - /// It is an error if the currently bound pipeline contains a mesh and/or task shader. - /// - /// This function requires use of the following barrier flags on the indirect memory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @see CmdDrawIndexed - /// @see DrawIndexedIndirectArgs - /// - /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in - /// memory from one structure to another. - /// The virtual address must be 4 byte aligned. - /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr - /// is nonzero, the value at that memory location is clamped to - /// this maximum. If countGpuAddr is zero, then the number of draws - /// issued exactly matches this number. - /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. - /// Must be 4-byte aligned. 
- void CmdDrawIndexedIndirectMulti( - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr) - { - m_funcTable.pfnCmdDrawIndexedIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); - } - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909 - /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. - /// - /// The thread group size is defined in the compute shader. - /// - /// Supports PAL ABI and HSA ABI pipelines. - /// - /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. - inline void CmdDispatch( - DispatchDims size) - { - m_funcTable.pfnCmdDispatch(this, size, {}); - } -#endif - - /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. - /// - /// The thread group size is defined in the compute shader. - /// - /// Supports PAL ABI and HSA ABI pipelines. - /// - /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. - /// @param [in] infoFlags Additional information about the dispatch. - void CmdDispatch( - DispatchDims size, - DispatchInfoFlags infoFlags) - { - m_funcTable.pfnCmdDispatch(this, size, infoFlags); - } - - /// Dispatches a compute workload using the command buffer's currently bound compute state. The dimensions of the - /// workload come from GPU memory. The dispatch will be discarded if any of its dimensions are zero. - /// - /// The layout of the argument data is defined in the @ref DispatchIndirectArgs structure. - /// - /// @warning Does not support HSA ABI pipelines. 
- /// - /// This function requires use of the following barrier flags on the indirect memory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @see CmdDispatch - /// @see DispatchIndirectArgs - /// - /// @param [in] gpuVirtAddr GPU virtual memory address where the indirect argument data is located. - /// The virtual address must be 4-byte aligned. - void CmdDispatchIndirect( - gpusize gpuVirtAddr) - { - m_funcTable.pfnCmdDispatchIndirect(this, gpuVirtAddr); - } - - /// Dispatches a compute workload of the given dimensions and offsets using the command buffer's currently bound - /// compute state. This command allows targeting regions of thread groups without adding the offset computations in - /// the shader. - /// - /// The caller may also provide a logical thread group count which is larger than the number of groups actually - /// launched. If the shader reads the dispatch's thread group count from PAL metadata it will see the logical size, - /// not the launch size. - /// - /// The combination of an offset, launch size, and logical size give the caller enough flexibility to take an - /// incoming dispatch, split it up into sub-dispatches, and execute those sub-dispatches using multiple - /// CmdDispatchOffset calls in whatever execution pattern they would like. Note that such an optimization - /// would not work if the shader has global logic that does make assumptions about thread group launch order. - /// - /// The thread group size is defined in the compute shader. - /// - /// Supports PAL ABI and HSA ABI pipelines. - /// - /// @param [in] offset The thread groups offsets. Set them to zero if you don't want an offset. - /// @param [in] launchSize Thread groups to dispatch. If any components are zero the dispatch will be discarded. - /// @param [in] logicalSize The thread group dimensions reported to the shader via metadata. 
- void CmdDispatchOffset( - DispatchDims offset, - DispatchDims launchSize, - DispatchDims logicalSize) - { - m_funcTable.pfnCmdDispatchOffset(this, offset, launchSize, logicalSize); - } - - /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if - /// the currently bound graphics pipeline does not contain a mesh and/or task shader. - /// - /// The thread group size is defined in the mesh shader or task shader. - /// - /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. - void CmdDispatchMesh( - DispatchDims size) - { - m_funcTable.pfnCmdDispatchMesh(this, size); - } - - /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if - /// the currently bound graphics pipeline does not contain a mesh shader. The dimensions of the workload come from - /// GPU memory. The dispatch will be discarded if any of its dimensions are zero. - /// - /// The layout of the argument data is defined in the @ref DispatchMeshIndirectArgs structure. - /// - /// This function requires use of the following barrier flags on the indirect memory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @see CmdDispatchMesh - /// @see DispatchMeshIndirectArgs - /// - /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in - /// memory from one structure to another. - /// The virtual address must be 4 byte aligned. - /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr - /// is nonzero, the value at that memory location is clamped to - /// this maximum. If countGpuAddr is zero, then the number of draws - /// issued exactly matches this number. - /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. - /// Must be 4-byte aligned. 
- void CmdDispatchMeshIndirectMulti( - GpuVirtAddrAndStride gpuVirtAddrAndStride, - uint32 maximumCount, - gpusize countGpuAddr) - { - m_funcTable.pfnCmdDispatchMeshIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); - } - - /// Copies multiple regions from one GPU memory allocation to another. - /// - /// None of the destination regions are allowed to overlap each other, nor are destination and source regions - /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping - /// will cause undefined results. - /// - /// This call should be used for buffer memory copy only; don't use it for image memory. - /// - /// For best performance, offsets and copy sizes should be 4-byte aligned. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - /// @param [in] srcGpuMemory GPU memory allocation where the source regions are located. - /// @param [in] dstGpuMemory GPU memory allocation where the destination regions are located. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, and - /// copy size. - virtual void CmdCopyMemory( - const IGpuMemory& srcGpuMemory, - const IGpuMemory& dstGpuMemory, - uint32 regionCount, - const MemoryCopyRegion* pRegions) = 0; - - /// Copies multiple regions from one GPU memory virtual address to another. - /// - /// @note The CmdCopyMemory() path should be preferred because it contains more optimizations due to more - /// knowledge about the memory itself that is lost when only virtual addresses are passed in. 
- /// - /// - /// None of the destination regions are allowed to overlap each other, nor are destination and source regions - /// allowed to overlap when the source and destination GPU memory virtual address are the same. Any illegal - /// overlapping will cause undefined results. - /// - /// For best performance, addresses, offsets, and copy sizes should be 4-byte aligned. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - /// @param [in] srcGpuVirtAddr GPU memory vitrual address where the source regions are located. - /// @param [in] dstGpuVirtAddr GPU memory virtual address where the destination regions are located. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifynig a source offset, destination offset, - /// and copy size. - virtual void CmdCopyMemoryByGpuVa( - gpusize srcGpuVirtAddr, - gpusize dstGpuVirtAddr, - uint32 regionCount, - const MemoryCopyRegion* pRegions) = 0; - - /// Copies multiple regions from one image to another. - /// - /// The source and destination subresource of a particular region are not allowed to be the same, and will produce - /// undefined results. Additionally, destination subresources cannot be present more than once per CmdCopyImage() - /// call. - /// - /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image - /// extents are specified in compression blocks. - /// - /// The source and destination images must to be of the same type (1D, 2D or 3D), or optionally 2D and 3D with the - /// number of slices matching the depth. MSAA source and destination images must have the same number of samples. - /// - /// Each region must satisfy these restrictions. 
- /// - srcOffset >= 0 and dstOffset >= 0 - /// - srcOffset + extent <= srcSubres's extent - /// - dstOffset + extent <= dstSubres's extent - /// - /// Images copied via this function must have x/y/z offsets and width/height/depth extents aligned to the minimum - /// tiled copy alignment specified in @ref DeviceProperties for the engine this function is executed on. Note that - /// the DMA engine supports tiled copies regardless of the alignment; the reported minimum tiled copy alignments - /// are an indication of the minimum alignments for which the copy will be performant. - /// - /// When the per-engine capability flag supportsMismatchedTileTokenCopy (@see DeviceProperties) is false, - /// CmdCopyImage is only valid between two subresources that share the same tileToken (@see SubresLayout). - /// - /// Note that the copy can go through clone copy automatically if, - /// - Both source and destination images are created with @ref ImageCreateInfo::flags::cloneable = 1 - /// - Both source and destination images have same @ref ImageCreateInfo - /// - Source image's layout is compatible with destination images' layout - /// - This is a full image copy - /// - Copy flags @ref CopyControlFlags required to be 0. - /// - /// Basically clone copy clones all subresources' data of one image object in another while preserving the image - /// layout. It does raw copy on image data and metadata; and tries to keep the metadata (like DCC/HiZ/HiS) - /// unchanged but may be not true due to different HW design. - /// e.g. Client compression (fragment and ZPlane compression) will be missed during the compute based raw copy. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. 
- /// - /// @param [in] srcImage Image where source regions reside. - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include - /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] dstImage Image where destination regions reside. - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a source subresource, destination - /// subresource, source x/y/z offset, destination x/y/z offset, and copy size in the - /// x/y/z dimensions. - /// @param [in] pScissorRect Rectangle for scissor test. - /// @param [in] flags A mask of ORed @ref CopyControlFlags that can be used to control copy behavior. - virtual void CmdCopyImage( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const ImageCopyRegion* pRegions, - const Rect* pScissorRect, - uint32 flags) = 0; - - /// Copies data directly (without format conversion) from a GPU memory object to an image. - /// - /// For compressed images, the extents are specified in compression blocks. - /// - /// The size of the data copied from memory is implicitly derived from the image extents. - /// - /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination - /// subresource cannot be present more than once per CmdCopyMemoryToImage() call. - /// - /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopyDst - /// - /// @param [in] srcGpuMemory GPU memory where the source data is located. - /// @param [in] dstImage Image where destination data will be written. - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination - /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. - /// @param [in] flags A mask of ORed @ref CopyMemoryToImageControlFlags that can be used to control copy - /// behavior. - virtual void CmdCopyMemoryToImage( - const IGpuMemory& srcGpuMemory, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const MemoryImageCopyRegion* pRegions, - uint32 flags) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 955 - void CmdCopyMemoryToImage( - const IGpuMemory& srcGpuMemory, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const MemoryImageCopyRegion* pRegions) - { - CmdCopyMemoryToImage(srcGpuMemory, dstImage, dstImageLayout, regionCount, pRegions, 0); - } -#endif - - /// Copies data directly (without format conversion) from an image to a GPU memory object. - /// - /// For compressed images, the extents are specified in compression blocks. - /// - /// The size of the data copied to memory is implicitly derived from the image extents. - /// - /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. 
A - /// destination region cannot be present more than once per CmdCopyImageToMemory() call. - /// - /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopySrc - /// - /// @param [in] srcImage Image where source data will be read from. - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include - /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] dstGpuMemory GPU memory where the destination data will be written. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source - /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. - virtual void CmdCopyImageToMemory( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IGpuMemory& dstGpuMemory, - uint32 regionCount, - const MemoryImageCopyRegion* pRegions) = 0; - - /// Copies data directly (without format conversion) from a GPU memory object to a PRT. - /// - /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. - /// This function always copies entire tiles, even if parts of the tile are internal padding. - /// - /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that - /// operate in texels like the generic CmdCopyMemoryToImage() should be used instead. - /// - /// The size of the data copied from memory is implicitly derived from the image extents. 
- /// - /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination - /// subresource cannot be present more than once per CmdCopyMemoryToTiledImage() call. - /// - /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopyDst - /// - /// @param [in] srcGpuMemory GPU memory where the source data is located. - /// @param [in] dstImage Image where destination data will be written. Must have the "prt" flag set. - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination - /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. - virtual void CmdCopyMemoryToTiledImage( - const IGpuMemory& srcGpuMemory, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const MemoryTiledImageCopyRegion* pRegions) = 0; - - /// Copies data directly (without format conversion) from a PRT to a GPU memory object. - /// - /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. - /// This function always copies entire tiles, even if parts of the tile are internal padding. - /// - /// This function cannot be used to copy any subresources stored in the packed mip tail. 
Other copy functions that - /// operate in texels like the generic CmdCopyImageToMemory() should be used instead. - /// - /// The size of the data copied to memory is implicitly derived from the image extents. - /// - /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A - /// destination region cannot be present more than once per CmdCopyTiledImageToMemory() call. - /// - /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopySrc - /// - /// @param [in] srcImage Image where source data will be read from. - /// @param [in] srcImageLayout Current allowed usages and queues for the source image. These masks must include - /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] dstGpuMemory GPU memory where the destination data will be written. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source - /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. - virtual void CmdCopyTiledImageToMemory( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IGpuMemory& dstGpuMemory, - uint32 regionCount, - const MemoryTiledImageCopyRegion* pRegions) = 0; - - /// Copies multiple regions directly (without format conversion) from one typed buffer to another. - /// - /// For compressed formats, the extents are specified in compression blocks. - /// - /// The buffer memory offsets have to be aligned to the smaller of their texel sizes or 4 bytes. 
- /// - /// None of the destination regions are allowed to overlap each other, nor are destination and source regions - /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping - /// will cause undefined results. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - /// @param [in] srcGpuMemory GPU memory where the source data is located. - /// @param [in] dstGpuMemory GPU memory where the destination data will be written. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source offset, - /// and copy size in the x/y/z dimensions. - virtual void CmdCopyTypedBuffer( - const IGpuMemory& srcGpuMemory, - const IGpuMemory& dstGpuMemory, - uint32 regionCount, - const TypedBufferCopyRegion* pRegions) = 0; - - /// Copies data directly (without format conversion) from a 2D typed buffer to a 2D image. - /// - /// For compressed images, the extents are specified in compression blocks. - /// - /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination - /// subresource cannot be present more than once per CmdScaledCopyTypedBufferToImage() call. - /// - /// MSAA resource is unsupported. The client must resolve both resources before calling this function. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopyDst - /// - /// @param [in] srcGpuMemory GPU memory where the source data is located. - /// @param [in] dstImage Image where destination data will be written. 
- /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, copy size of source - /// region, a destination offset, destination subresource, and copy size of destination - /// region. - virtual void CmdScaledCopyTypedBufferToImage( - const IGpuMemory& srcGpuMemory, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const TypedBufferImageScaledCopyRegion* pRegions) = 0; - - /// Copies a GPU register content to a GPU memory location. - /// - /// The destination memory offset has to be aligned to 4 bytes. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherMemory - /// - /// @param [in] srcRegisterOffset Source register offset in bytes - /// @param [in] dstGpuMemory GPU memory where the destination data will be written. - /// @param [in] dstOffset Destination memory offset in bytes. - virtual void CmdCopyRegisterToMemory( - uint32 srcRegisterOffset, - const IGpuMemory& dstGpuMemory, - gpusize dstOffset) = 0; - - /// Copies multiple scaled regions from one image to another. - /// - /// The source and destination subresource of a particular region are not allowed to be the same, and will produce - /// undefined results. Additionally, destination subresources cannot be present more than once per - /// CmdScaledCopyImage() call. - /// - /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image - /// extents are specified in compression blocks. - /// - /// The source and destination images must to be of the same type (1D, 2D or 3D). 
Both single sampled images and - /// MSAA images are supported. - /// - /// Linear texture filtering is only supported for images with non-integer formats. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. - /// - /// @param [in] copyInfo Specifies parameters needed to execute CmdScaledCopyImage. See - /// @ref ScaledCopyInfo for more information. - virtual void CmdScaledCopyImage( - const ScaledCopyInfo& copyInfo) = 0; - - /// Automatically generates texture data for a range of subresources such that they may be used as intermediate - /// images in a mipmap chain. The existing values in mip N are used to generate mip N+1. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the base mip, @ref CoherCopySrc and @ref CoherCopyDst for the others. - /// - ImageLayout: @ref LayoutCopySrc for the base mip, @ref LayoutCopySrc and @ref LayoutCopyDst for the others. - /// - /// @param [in] genInfo The parameters for CmdGenerateMipmaps. See @ref GenMipmapsInfo for more information. - virtual void CmdGenerateMipmaps( - const GenMipmapsInfo& genInfo) = 0; - - /// Copies multiple scaled regions from one image to another, converting between RGB and YUV color spaces during - /// the copy. The exact conversion between YUV and RGB is controlled by a caller-specified color-space-conversion - /// table. - /// - /// The source and destination images must both be of the 2D type. Only single-sampled images are supported. - /// One of the two images involved must have an RGB color format, and the other must have a YUV color format. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. - /// - /// @param [in] srcImage Images where source region reside. If this is a YUV image, the destination must be - /// RGB, and this copy will convert YUV to RGB. Otherwise, the destination must be YUV, - /// and the copy will convert RGB to YUV. - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include - /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] dstImage Image where destination regions reside. If this is a YUV image, the source must be - /// RGB, and this copy will convert RGB to YUV. Otherwise, the source must be YUV and - /// the copy will convert YUV to RGB. - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] regionCount Number of regions to copy; size of the pRegions array. - /// @param [in] pRegions Array of conversion-copy regions, each entry specifying a source x/y/z offset, source - /// x/y/z extent, destination x/y/z offset, destination x/y/z extent, RGB subresource and - /// YUV subresource(s). - /// @param [in] filter Texture filtering for shader sample instruction. - /// @param [in] cscTable Color-space-conversion table which controls how YUV data is converted to a specific - /// RGB representation and vice-versa. 
- virtual void CmdColorSpaceConversionCopy( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const ColorSpaceConversionRegion* pRegions, - TexFilter filter, - const ColorSpaceConversionTable& cscTable) = 0; - - /// Clones data of one image object in another while preserving the image layout. - /// - /// The source and destination images must be created with identical creation parameters and must specify the - /// cloneable flag. The clone operation clones all subresources. - /// - /// Both resources can be in any layout before the clone operation. After the clone, the source image state is left - /// intact and the destination image layout becomes the same as the source. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. - /// - /// @param [in] srcImage Source image. - /// @param [in] dstImage Destination image. - virtual void CmdCloneImageData( - const IImage& srcImage, - const IImage& dstImage) = 0; - - /// Directly updates a range of GPU memory with a small amount of host data. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopyDst - /// - /// The client is responsible for choosing the proper method for optimal performance. If updating data size is less - /// equal than 8 bytes, CmdWriteImmediate() is preferred. - /// - /// @param [in] dstGpuMemory GPU memory object to be updated. - /// @param [in] dstOffset Byte offset into the GPU memory object to be udpated. Must be a multiple of 4. - /// @param [in] dataSize Amount of data to write, in bytes. Must be a multiple of 4. - /// @param [in] pData Pointer to host data to be copied into the GPU memory. 
- virtual void CmdUpdateMemory( - const IGpuMemory& dstGpuMemory, - gpusize dstOffset, - gpusize dataSize, - const uint32* pData) = 0; - - /// Updates marker surface with a DWORD value to indicate an event completion. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherCp - /// - /// @param [in] dstGpuMemory GPU memory object to be updated. - /// @param [in] offset Byte offset into marker address - /// @param [in] value Marker DWORD value to be copied to the bus addressable or external physical memory. - virtual void CmdUpdateBusAddressableMemoryMarker( - const IGpuMemory& dstGpuMemory, - gpusize offset, - uint32 value) = 0; - - /// Fills a range of GPU memory with the provided 32-bit data. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopyDst - /// - /// @param [in] dstGpuMemory GPU memory object to be filled. - /// @param [in] dstOffset Byte offset into the GPU memory object to be filled. Must be a multiple of 4. - /// @param [in] fillSize Size to fill, in bytes. Must be a multiple of 4. - /// @param [in] data 32-bit value to be repeated in the filled range. - virtual void CmdFillMemory( - const IGpuMemory& dstGpuMemory, - gpusize dstOffset, - gpusize fillSize, - uint32 data) = 0; - - /// Interprets a range of GPU memory as a color buffer and clears it to the specified clear color. - /// - /// The maximum clear range is determined by the buffer offset and buffer extent; if any Ranges are specified they - /// must be specified in texels with respect to the beginning of the buffer and must not exceed its extent. - /// With 96-bit formats, bufferOffset must be specified in bytes. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageCs is expected but the more general @ref PipelineStageBlt is also OK. 
- /// - CacheCoherency: @ref CoherShader is expected but the more general @ref CoherClear is also OK. - /// - /// @param [in] gpuMemory GPU memory to be cleared. - /// @param [in] color Specifies the clear color data and how to interpret it. - /// @param [in] bufferFormat The format of the color data in the buffer. - /// @param [in] bufferOffset The offset to the beginning of the buffer, in units of texels - /// (or bytes for 96-bit texels). - /// @param [in] bufferExtent The extent of the buffer, in units of texels. - /// @param [in] rangeCount Number of ranges within the buffer to clear; size of the pRanges array. - /// If zero, the entire view will be cleared and pRanges will be ignored. - /// @param [in] pRanges Array of ranges within the GPU memory to clear. - virtual void CmdClearColorBuffer( - const IGpuMemory& gpuMemory, - const ClearColor& color, - SwizzledFormat bufferFormat, - uint32 bufferOffset, - uint32 bufferExtent, - uint32 rangeCount = 0, - const Range* pRanges = nullptr) = 0; - - /// Clears the currently bound color targets to the specified clear color. - /// - /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. - /// In practice, this is the case when vkCmdClearColorAttachments() is called in a secondary command buffer in - /// Vulkan where the color attachments are inherited. - /// - /// This requires regionCount being specified since resource size is for sure to be known. The bound color targets - /// shouldn't have UndefinedSwizzledFormat as their swizzle format. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageColorTarget - /// - CacheCoherency: @ref CoherColorTarget - /// - ImageLayout: @ref LayoutColorTarget - /// - /// @param [in] colorTargetCount Number of bound color target that needs to be cleared. - /// @param [in] pBoundColorTargets Color target information for the bound color targets. 
- /// @param [in] regionCount Number of volumes within the image to clear; size of the pClearRegions array. - /// This need to be non-zero. - /// @param [in] pClearRegions Array of volumes within the subresources to clear. - virtual void CmdClearBoundColorTargets( - uint32 colorTargetCount, - const BoundColorTarget* pBoundColorTargets, - uint32 regionCount, - const ClearBoundTargetRegion* pClearRegions) = 0; - - /// Clears a color image to the specified clear color. - /// - /// If any Boxes have been specified, all subresource ranges must contain a single, identical mip level. - /// - /// The imageLayout can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick - /// to layouts that are likely to support compression like @ref LayoutColorTarget. - /// - /// This function requires use of the following barrier flags if @ref flags includes @ref ColorClearAutoSync: - /// - PipelineStage: @ref PipelineStageColorTarget - /// - CacheCoherency: @ref CoherColorTarget - /// - ImageLayout: @ref LayoutColorTarget - /// Otherwise the following barrier flags must be used: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherClear - /// - /// @param [in] image Image to be cleared. - /// @param [in] imageLayout Current allowed usages and engines for the target image. - /// @param [in] color Specifies the clear color data and how to interpret it. - /// @param [in] clearFormat If clearFormat.format is Undefined (e.g. if UndefinedSwizzledFormat is provided), do not - /// reinterpret the subresources' formats. Otherwise, the subresources' formats will be - /// reinterpreted according to this parameter. The specified format needs to have been - /// included in the "pViewFormats" list specified at image-creation time, otherwise - /// corruption may occur. - /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. - /// @param [in] pRanges Array of subresource ranges to clear. 
- /// @param [in] boxCount Number of volumes within the image to clear; size of the pBoxes array. - /// If zero, entire subresources will be cleared and pBoxes will be ignored. - /// @param [in] pBoxes Array of volumes within the subresources to clear. - /// @param [in] flags Mask of ClearColorImageFlags values controlling behavior of the clear. - virtual void CmdClearColorImage( - const IImage& image, - ImageLayout imageLayout, - const ClearColor& color, - const SwizzledFormat& clearFormat, - uint32 rangeCount, - const SubresRange* pRanges, - uint32 boxCount, - const Box* pBoxes, - uint32 flags) = 0; - - /// Clears the currently bound depth/stencil targets to the specified clear values. - /// - /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. - /// In practice, this is the case when vkCmdClearColorAttachments() is called in a secondary command buffer in - /// Vulkan where the color attachments are inherited. - /// - /// This requires regionCount being specified since resource size is for sure to be known. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget - /// - CacheCoherency: @ref CoherDepthStencilTarget - /// - ImageLayout: @ref LayoutDepthStencilTarget - /// - /// @param [in] depth Depth clear value. - /// @param [in] stencil Stencil clear value. - /// @param [in] stencilWriteMask Stencil write mask to clear specific stencil planes. - /// @param [in] samples Sample count. - /// @param [in] fragments Fragment count. - /// @param [in] flag Select to depth, stencil or depth and stencil. - /// @param [in] regionCount Number of volumes within the bound depth/stencil target to clear. - /// @param [in] pClearRegions Array of volumes within the subresources to clear. 
- virtual void CmdClearBoundDepthStencilTargets( - float depth, - uint8 stencil, - uint8 stencilWriteMask, - uint32 samples, - uint32 fragments, - DepthStencilSelectFlags flag, - uint32 regionCount, - const ClearBoundTargetRegion* pClearRegions) = 0; - - /// Clears a depth/stencil image to the specified clear values. - /// - /// If any Rects have been specified, all subresource ranges must contain a single, identical mip level. - /// - /// The layouts can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick to - /// layouts that are likely to support compression like @ref LayoutDepthStencilTarget. - /// - /// This function requires use of the following barrier flags if @ref flags includes @ref DsClearAutoSync: - /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget - /// - CacheCoherency: @ref CoherDepthStencilTarget - /// - ImageLayout: @ref LayoutDepthStencilTarget - /// Otherwise the following barrier flags must be used: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherClear - /// - /// @param [in] image Image to be cleared. - /// @param [in] depth Depth clear value. - /// @param [in] depthLayout Current allowed usages and engines for the depth plane. - /// @param [in] stencil Stencil clear value. - /// @param [in] stencilWriteMask Write-mask to apply to the stencil subresource ranges during the clear. - /// @param [in] stencilLayout Current allowed usages and engines for the stencil plane. - /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. - /// @param [in] pRanges Array of subresource ranges to clear. - /// @param [in] rectCount Number of areas within the image to clear; size of the pRects array. If zero, - /// the entire subresources will be cleared and pRects will be ignored. - /// @param [in] pRects Array of areas within the subresources to clear. 
- /// @param [in] flags Mask of ClearDepthStencilFlags values controlling behavior of the clear. - virtual void CmdClearDepthStencil( - const IImage& image, - ImageLayout depthLayout, - ImageLayout stencilLayout, - float depth, - uint8 stencil, - uint8 stencilWriteMask, - uint32 rangeCount, - const SubresRange* pRanges, - uint32 rectCount, - const Rect* pRects, - uint32 flags) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 910 - /// Clears a range of GPU memory to the specified clear color using the specified buffer view SRD. - /// - /// The maximum clear range is determined by the view; if any Ranges are specified they must fit within the view's - /// range. The view must support shader writes. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageCs - /// - CacheCoherency: @ref CoherShader - /// - /// @note You may use the more general @ref PipelineStageBlt and @ref CoherClear if you wish but they may result in - /// higher barrier overhead. - /// - /// @param [in] gpuMemory GPU memory to be cleared. - /// @param [in] color Specifies the clear color data and how to interpret it. - /// @param [in] pBufferViewSrd The image view SRD that will be used to interpret the image. - /// @param [in] rangeCount Number of ranges within the GPU memory to clear; size of the pRanges array. - /// If zero, the entire view will be cleared and pRanges will be ignored. - /// @param [in] pRanges Array of ranges within the GPU memory to clear. - virtual void CmdClearBufferView( - const IGpuMemory& gpuMemory, - const ClearColor& color, - const void* pBufferViewSrd, - uint32 rangeCount = 0, - const Range* pRanges = nullptr) = 0; - - /// Clears an image to the specified clear color using the specified image view SRD. - /// - /// The clear subresouce range is determined by the view; if any Rects have been specified, the image view must - /// contain a single mip level. The view must support shader writes. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageCs but the more general @ref PipelineStageBlt is also OK. - /// - CacheCoherency: @ref CoherShader but the more general @ref CoherClear is also OK. - /// - ImageLayout: @ref LayoutShaderWrite - /// - /// @param [in] image Image to be cleared. - /// @param [in] imageLayout Current allowed usages and engines for the image, must include LayoutShaderWrite. - /// @param [in] color Specifies the clear color data and how to interpret it. - /// @param [in] pImageViewSrd The image view SRD that will be used to interpret the image. - /// @param [in] rectCount Number of volumes within the image to clear; size of the pRects array. - /// If zero, entire subresources will be cleared and pRects will be ignored. - /// @param [in] pRects Array of volumes within the subresources to clear. The begin and end slices to be - /// cleard are from SubresRange in pImageViewSrd. - virtual void CmdClearImageView( - const IImage& image, - ImageLayout imageLayout, - const ClearColor& color, - const void* pImageViewSrd, - uint32 rectCount = 0, - const Rect* pRects = nullptr) = 0; -#endif - - /// Resolves multiple regions of a multisampled image to a single-sampled image. - /// - /// The source image must be a 2D multisampled image and the destination must be a single-sampled image. - /// The formats of the source and destination images must match unless all regions specify a valid format. - /// - /// For color images, if the source image has an integer numeric format, a single sample is copied (sample 0). - /// - /// For depth/stencil images, the resolve is performed by simply copying sample 0 from every source pixel to the - /// destination pixel. - /// - /// The same subresource may not appear more than once in the specified array of regions. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherResolveSrc for the source and @ref CoherResolveDst for the destination. - /// - ImageLayout: @ref LayoutResolveSrc for the source and @ref LayoutResolveDst for the destination. - /// - /// @param [in] srcImage MSAA source image. - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include - /// LayoutResolveSrc and the ImageLayoutEngineFlags corresponding to the engine this - /// function is being called on. - /// @param [in] dstImage Single-sample destination image. - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutResolveDst and the ImageLayoutEngineFlags corresponding to the engine - /// this function is being called on. - /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. - /// @param [in] resolveMode Resolve mode - /// @param [in] pRegions Specifies src/dst subresources and rectangles. - /// @param [in] flags Mask of ResolveImageFlags values controlling behavior of the resolve. - virtual void CmdResolveImage( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IImage& dstImage, - ImageLayout dstImageLayout, - ResolveMode resolveMode, - uint32 regionCount, - const ImageResolveRegion* pRegions, - uint32 flags) = 0; - - /// Resolves multiple regions of a Sampler Feedback map to another image. - /// - /// The source image must be: - /// Decode: A 2D Feedback map, in which case the destination image will hold the decoded sampled data - /// Encode: A texture, in which case the destination will hold the encoded sampler map - /// - /// The formats of the source and destination images must be 8bpp - /// - /// The same subresource may not appear more than once in the specified array of regions. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. - /// - ImageLayout: @ref LayoutShaderRead for the source and @ref LayoutShaderWrite for the destination. - /// - /// @param [in] srcImage Source image, depends on resolve type - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must - /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine - /// this function is being called on. - /// @param [in] dstImage Destination image, depends on resolve type - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine - /// this function is being called on. - /// @param [in] resolveType Resolve type - /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. - /// @param [in] pRegions Specifies src/dst subresources and rectangles - virtual void CmdResolvePrtPlusImage( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IImage& dstImage, - ImageLayout dstImageLayout, - PrtPlusResolveType resolveType, - uint32 regionCount, - const PrtPlusImageResolveRegion* pRegions) = 0; - - /// Encodes a buffer into a Sampler Feedback image. - /// - /// This interface only supports encoding raw data from a buffer to an encoded Sampler Feedback map - /// - /// The format of the data in the source buffer and destination image must be 8bpp - /// - /// The same subresource may not appear more than once in the specified array of regions. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. 
- /// - ImageLayout: @ref LayoutShaderWrite for the destination. - /// - /// @param [in] srcBuffer Source buffer, depends on the resolve type - /// @param [in] dstImage Destination image, depends on resolve type - /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must - /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine - /// this function is being called on. - /// @param [in] regionCount Number of regions to resolve, size of pRegions array - /// @param [in] pRegions Specifies src/dst subresources and rectangles - virtual void CmdResolvePrtPlusBufferToImage( - const IGpuMemory& srcBuffer, - const IImage& dstImage, - ImageLayout dstImageLayout, - uint32 regionCount, - const PrtPlusBufferToImageResolveRegion* pRegions) = 0; - - /// Decodes a Sampler Feedback map to a buffer - /// - /// This interface only supports decoding a Sampler Feedback map to buffer - /// - /// The format of the data in the source image and destination buffer must be 8bpp - /// - /// The same subresource may not appear more than once in the specified array of regions. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. - /// - ImageLayout: @ref LayoutShaderRead for the source - /// - /// @param [in] srcImage Source image, depends on the resolve type - /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must - /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine - /// this function is being called on. - /// @param [in] dstBuffer Destination buffer, depends on resolve type - /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. - /// @param [in] pRegions Specifies src/dst subresources and rectangles. 
- virtual void CmdResolvePrtPlusImageToBuffer( - const IImage& srcImage, - ImageLayout srcImageLayout, - const IGpuMemory& dstBuffer, - uint32 regionCount, - const PrtPlusImageToBufferResolveRegion* pRegions) = 0; - - /// Puts the specified event into the _set_ state when all prior GPU work has progressed past the given stages. - /// - /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. - /// - /// @param [in] gpuEvent GPU event to be set. - /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that - /// restricts which stages of prior GPU work must happen before the event is set. The set - /// will be performed at the earliest possible stage after the prior stages. - virtual void CmdSetEvent( - const IGpuEvent& gpuEvent, - uint32 stageMask) = 0; - - /// Puts the specified event into the _reset_ state when all prior GPU work has progressed past the given stages. - /// - /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. - /// - /// @param [in] gpuEvent GPU event to be reset. - /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that - /// restricts which stages of prior GPU work must happen before the event is reset. The - /// reset will be performed at the earliest possible stage after the prior stages. - virtual void CmdResetEvent( - const IGpuEvent& gpuEvent, - uint32 stageMask) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 - /// Puts the specified GPU event into the _set_ state when all previous GPU work reaches the specified point in the - /// pipeline. - /// - /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. - /// - /// @param [in] gpuEvent GPU event to be set. 
- /// @param [in] setPoint Point in the graphics pipeline where the GPU event will be _set_, indicating all prior - /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't - /// support this operation at the exact specified point, the set will be performed at the - /// earliest possible point _after_ the specified point. - inline void CmdSetEvent( - const IGpuEvent& gpuEvent, - HwPipePoint setPoint) - { CmdSetEvent(gpuEvent, HwPipePointToStage[setPoint]); } - - /// Puts the specified GPU event into the _reset_ state when all previous GPU work reaches the specified point in - /// the pipeline. - /// - /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. - /// - /// @param [in] gpuEvent GPU event to be reset. - /// @param [in] resetPoint Point in the graphics pipeline where the GPU event will be _reset_, indicating all prior - /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't - /// support this operation at the exact specified point, the reset will be performed at the - /// earliest possible point _after_ the specified point. - inline void CmdResetEvent( - const IGpuEvent& gpuEvent, - HwPipePoint resetPoint) - { CmdResetEvent(gpuEvent, HwPipePointToStage[resetPoint]); } -#endif - - /// Predicate the subsequent jobs in the command buffer if the event is set. - /// - /// @param [in] gpuEvent GPU event to be checked. - virtual void CmdPredicateEvent( - const IGpuEvent& gpuEvent) = 0; - - /// Performs the specified 32- or 64-bit memory operation. These operations are atomic with respect to shader - /// atomic operations. - /// - /// The data size (32-bit or 64-bit) is determined by the operation type. For 32-bit atomics, only the lower - /// 32-bits of srcData is used. - /// - /// The destination GPU memory offset must be 4-byte aligned for 32-bit atomics and 8-byte aligned for 64-bit - /// atomics. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherQueueAtomic - /// - /// @param [in] dstGpuMemory Destination GPU memory object. - /// @param [in] dstOffset Offset into the memory object where the atomic will be performed. - /// @param [in] srcData Source data for the atomic operation. Use depends on the atomicOp. - /// @param [in] atomicOp Specifies which atomic operation to perform. @see AtomicOp. - virtual void CmdMemoryAtomic( - const IGpuMemory& dstGpuMemory, - gpusize dstOffset, - uint64 srcData, - AtomicOp atomicOp) = 0; - - /// Starts a query operation for the given slot of a query pool. - /// - /// The query slot must have been previously cleared with CmdResetQueryPool() before starting a query. - /// - /// @note Queries may not span multiple command buffers. - /// - /// @param [in] queryPool Query pool for this query. - /// @param [in] queryType The type of query this operation will produce. - /// @param [in] slot Slot in pQueryPool where the results of this query should be accumulated. - /// @param [in] flags Flags controlling query behavior. @see QueryControlFlags. - virtual void CmdBeginQuery( - const IQueryPool& queryPool, - QueryType queryType, - uint32 slot, - QueryControlFlags flags) = 0; - - /// Stops a query operation for the given slot of a query pool. - /// - /// The query slot must have an open query on it when this is called. - /// - /// @param [in] queryPool Query pool for this query. - /// @param [in] queryType The type of query this operation will produce. - /// @param [in] slot Slot in pQueryPool where the query is running. - virtual void CmdEndQuery( - const IQueryPool& queryPool, - QueryType queryType, - uint32 slot) = 0; - - /// Resolves the results of a range of queries to the specified query type into the specified GPU memory location. 
- /// - /// This function requires use of the following barrier flags on @ref dstGpuMemory: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopyDst - /// - /// This operation does not honor the command buffer's predication state, if active. - /// - /// @param [in] queryPool Query pool holding the source queries. - /// @param [in] flags Flags that control the result data layout and how the results are retrieved. - /// @param [in] queryType The type of queries this resolve will produce. - /// @param [in] startQuery First slot in pQueryPool to resolve. - /// @param [in] queryCount Number of query pool slots to resolve. - /// @param [in] dstGpuMemory Destination GPU memory object. - /// @param [in] dstOffset 4-byte aligned offset into pDstGpuMemory where the results should be written. - /// @param [in] dstStride 4-byte aligned stride between where results are written into pDstGpuMemory. - virtual void CmdResolveQuery( - const IQueryPool& queryPool, - QueryResultFlags flags, - QueryType queryType, - uint32 startQuery, - uint32 queryCount, - const IGpuMemory& dstGpuMemory, - gpusize dstOffset, - gpusize dstStride) = 0; - - /// Rests a range of slots in a query pool. A query slot must be reset each time before a query can be started - /// using that slot. - /// - /// @param [in] queryPool Query pool to be reset. - /// @param [in] startQuery First slot in pQueryPool to be reset. - /// @param [in] queryCount Number of slots to reset. - virtual void CmdResetQueryPool( - const IQueryPool& queryPool, - uint32 startQuery, - uint32 queryCount) = 0; - - /// Writes a GPU performance timestamp to memory when all prior GPU work has progressed past the given stages. - /// - /// The timestamp data is a 64-bit value that increments once per clock. @ref timestampFrequency in DeviceProperties - /// reports the frequency the timestamps are clocked at. Timestamps are only supported by engines that report - /// @ref supportsTimestamps in DeviceProperties. 
- /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: the same flag(s) specified in @ref stageMask. - /// - CacheCoherency: @ref CoherTimestamp - /// - /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. - /// - /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that - /// restricts which stages of prior GPU work must happen before the timestamp is written. - /// The timestamp will be performed at the earliest possible stage after the prior stages. - /// Note that the SDMA engine only supports bottom-of-pipe timestamps. - /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. - /// @param [in] dstOffset Offset into pDstGpuMemory where the timestamp should be written. Must be aligned to - /// minTimestampAlignment in DeviceProperties. - virtual void CmdWriteTimestamp( - uint32 stageMask, - const IGpuMemory& dstGpuMemory, - gpusize dstOffset) = 0; - - /// Writes an immediate value to memory when all prior GPU work has progressed past the given stages. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: the same flag(s) specified in @ref stageMask. - /// - CacheCoherency: @ref CoherCp - /// - /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. - /// - /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope - /// that restricts which stages of prior GPU work must happen before the immediate - /// value is written. The write will be occur at the earliest possible stage after - /// the prior stages. Note that the SDMA engine only supports bottom-of-pipe writes. - /// @param [in] data Value to be written to gpu address. - /// @param [in] ImmediateDataWidth Size of the data to be written out. - /// @param [in] address GPU address where immediate value should be written. 
- virtual void CmdWriteImmediate( - uint32 stageMask, - uint64 data, - ImmediateDataWidth dataSize, - gpusize address) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 - /// Writes a HwPipePostPrefetch or HwPipeBottom timestamp to the specified memory location. - /// - /// The timestamp data is a 64-bit value that increments once per clock. timestampFrequency in DeviceProperties - /// reports the frequency the timestamps are clocked at. - /// - /// Timestamps are only supported by engines that report supportsTimestamps in DeviceProperties. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: the same flag specified in @ref pipePoint. - /// - CacheCoherency: @ref CoherTimestamp - /// - /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. - /// - /// @param [in] pipePoint Specifies where in the pipeline the timestamp should be sampled and written. The only - /// valid choices are HwPipePostPrefetch and HwPipeBottom. HwPipePostPrefetch timestamps - /// are not supported on the SDMA engine, so all timestamps will be executed as - /// bottom-of-pipe. - /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. - /// @param [in] dstOffset Offset into pDstGpuMemory where the timestamp should be written. Must be aligned to - /// minTimestampAlignment in DeviceProperties. - inline void CmdWriteTimestamp( - HwPipePoint pipePoint, - const IGpuMemory& dstGpuMemory, - gpusize dstOffset) - { CmdWriteTimestamp(HwPipePointToStage[pipePoint], dstGpuMemory, dstOffset); } - - /// Writes a top-of-pipe or bottom-of-pipe immediate value to the specified memory location. - /// - /// This function requires use of the following barrier flags: - /// - PipelineStage: the same flag specified in @ref pipePoint. - /// - CacheCoherency: @ref CoherCp - /// - /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. 
- /// - /// @param [in] pipePoint Specifies where in the pipeline the timestamp should be sampled and written. - /// The only valid choices are HwPipeTop, HwPipePostPrefetch and HwPipeBottom. - /// Top-of-pipe timestamps are not supported on the SDMA engine, so all timestamps - /// will be executed as bottom-of-pipe. - /// @param [in] data Value to be written to gpu address. - /// @param [in] ImmediateDataWidth Size of the data to be written out. - /// @param [in] address GPU address where immediate value should be written. - inline void CmdWriteImmediate( - HwPipePoint pipePoint, - uint64 data, - ImmediateDataWidth dataSize, - gpusize address) - { CmdWriteImmediate(HwPipePointToStage[pipePoint], data, dataSize, address); } -#endif - - /// Loads the current stream-out buffer-filled-sizes stored on the GPU from memory, typically from a target of a - /// prior CmdSaveBufferFilledSizes() call. - /// - /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use - /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherCp - /// - /// @param [in] gpuVirtAddr Array of GPU virtual addresses to load each counter from. If any of these are zero, - /// the corresponding filled-size counter is not loaded. - virtual void CmdLoadBufferFilledSizes( - const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; - - /// Saves the current stream-out buffer-filled-sizes into GPU memory. - /// - /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use - /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherCp - /// - /// @param [in] gpuVirtAddr Array of GPU virtual addresses to save each counter into. 
If any of these are zero, - /// the corresponding filled-size counter is not saved. - virtual void CmdSaveBufferFilledSizes( - const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; - - /// Set the offset to buffer-filled-size for a stream-out target. - /// - /// @param [in] bufferId Stream-out buffer ID, it could be in the range [0, MaxStreamOutTargets). - /// @param [in] offset The value to be written into the buffer filled size counter. - /// - virtual void CmdSetBufferFilledSize( - uint32 bufferId, - uint32 offset) = 0; - - /// Binds the specified border color palette for use by samplers. - /// - /// @param [in] pipelineBindPoint Specifies which pipeline type is affected (i.e., graphics or compute). - /// @param [in] pPalette Border color palette object to bind. - virtual void CmdBindBorderColorPalette( - PipelineBindPoint pipelineBindPoint, - const IBorderColorPalette* pPalette) = 0; - - /// Sets predication for this command buffer to use the specified GPU memory location. Any draw, dispatch or copy - /// operation between this command and the corresponding reset/disable call will be skipped if the value in spec- - /// ified location matches the passed-in predicated value - /// - /// This function requires use of the following barrier flags on @ref pGpuMemory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @param [in] pQueryPool pointer to QueryPool obj, not-nullptr means this is a QueryPool based predication - /// - Zpass/Occlusion based predication - /// - or PrimCount/Streamout based predication - /// @param [in] slot Slot to use for setting occlusion predication, valid when pQueryPool is not nullptr - /// @param [in] pGpuMemory GPU memory object for the predication value, only valid when pQueryPool is nullptr - /// @param [in] offset GPU memory offset for the predication value - /// @param [in] predType Predication type. 
- /// @param [in] predPolarity Controls the polarity of the predication test - /// true = draw_if_visible_or_no_overflow - /// false = draw_if_not_visible_or_overflow - /// @param [in] waitResults Hint only valid for Zpass/Occlusion. - /// false = wait_until_final_zpass_written - /// true = draw_if_not_final_zpass_written - /// @param [in] accumulateData true(1) = allow_accumulation of Zpass and PrimCount across command buffer boundaries. - /// - /// pQueryPool and gpuVirtAddr should be exclusively set, when both are nullptr/0, other params will be ignored - /// and it means to reset/disable predication so that the following commands can perform normally. - virtual void CmdSetPredication( - IQueryPool* pQueryPool, - uint32 slot, - const IGpuMemory* pGpuMemory, - gpusize offset, - PredicateType predType, - bool predPolarity, - bool waitResults, - bool accumulateData) = 0; - - /// Suspend/resume any active predication for this command buffer - /// - /// @param [in] suspend Controls if predication should be paused - /// true = suspend active predication - /// false = resume active predication - /// - /// Any suspended predication must be resumed prior to disabling predication using CmdSetPredication with pQueryPool - /// and gpuVirtAddr with nullptr/0. This is only valid on universal and compute command buffers. - virtual void CmdSuspendPredication( - bool suspend) = 0; - - /// Begins a conditional block in the current command buffer. All commands between this and the corresponding - /// CmdEndIf() (or CmdElse() if it is present) command are executed if the specified condition is true. - /// - /// This function requires use of the following barrier flags on @ref gpuMemory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. - /// @param [in] offset Offset within the memory object where the tested memory location begins. 
- /// @param [in] data Source data to compare against the value in GPU memory. - /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. - /// @param [in] compareFunc Function controlling how the data operands are compared. - virtual void CmdIf( - const IGpuMemory& gpuMemory, - gpusize offset, - uint64 data, - uint64 mask, - CompareFunc compareFunc) = 0; - - /// Begins a conditional block in the current command buffer. All commands between this and the corresponding - /// CmdEndIf() command are executed if the condition specified in the innermost active conditional block are false. - virtual void CmdElse() = 0; - - /// Ends the innermost active conditional block in the current command buffer. - virtual void CmdEndIf() = 0; - - /// Begins a while loop in the current command buffer. All commands between this and the corresponding CmdEndWhile() - /// command are executed repeatedly as long as the specified condition remains true. - /// - /// This function requires use of the following barrier flags on @ref gpuMemory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. - /// @param [in] offset Offset within the memory object where the tested memory location begins. - /// @param [in] data Source data to compare against the value in GPU memory. - /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. - /// @param [in] compareFunc Function controlling how the data operands are compared. - virtual void CmdWhile( - const IGpuMemory& gpuMemory, - gpusize offset, - uint64 data, - uint64 mask, - CompareFunc compareFunc) = 0; - - /// Ends the innermost active while loop in the current command buffer. 
- virtual void CmdEndWhile() = 0; - - /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a - /// GPU register. - /// - /// The client (or application) is supposed to do necessary barriers before calling this function, but for now this - /// is only need to wait some display or timer related registers. - /// - /// @param [in] registerOffset The offset in bytes of GPU register to be tested. - /// @param [in] data Source data to compare against the value of GPU register. - /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. - /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never shouldn't - /// be used as the hardware does not support it. - virtual void CmdWaitRegisterValue( - uint32 registerOffset, - uint32 data, - uint32 mask, - CompareFunc compareFunc) = 0; - - /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a - /// GPU memory location. - /// - /// The client (or application) is expected to transiton the memory to proper state before calling this function. - /// The memory location for the condition must be 4-byte aligned. - /// This function requires use of the following barrier flags on @ref gpuVirtAddr: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherCp - /// - /// @param [in] gpuVirtAddr GPU memory address containing the data to be tested. - /// @param [in] data Source data to compare against the value in GPU memory. - /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. - /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not - /// be used as the hardware does not support it. 
- virtual void CmdWaitMemoryValue( - gpusize gpuVirtAddr, - uint32 data, - uint32 mask, - CompareFunc compareFunc) = 0; - - /// Stalls a command buffer execution until an external device writes to the marker surface in the GPU bus - /// addressable memory location. - /// - /// This function requires use of the following barrier flags on @ref gpuMemory: - /// - PipelineStage: @ref PipelineStagePostPrefetch - /// - CacheCoherency: @ref CoherCp - /// - /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. - /// @param [in] data Source data to compare against the value in GPU memory. - /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. - /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not - /// be used as the hardware does not support it. - virtual void CmdWaitBusAddressableMemoryMarker( - const IGpuMemory& gpuMemory, - uint32 data, - uint32 mask, - CompareFunc compareFunc) = 0; - - /// Begins the specified performance experiment. - /// - /// @param [in] pPerfExperiment Performance experiment to begin. - virtual void CmdBeginPerfExperiment( - IPerfExperiment* pPerfExperiment) = 0; - - /// Updates the sqtt token mask on the specified performance experiment. - /// - /// @param [in] pPerfExperiment Performance experiment to update. - /// @param [in] tokenConfig updated token and reg mask to apply. - /// - /// @note: This function is only valid to call if pPerfExperiment is a thread trace experiment that is currently - // active. - virtual void CmdUpdatePerfExperimentSqttTokenMask( - IPerfExperiment* pPerfExperiment, - const ThreadTraceTokenConfig& tokenConfig) = 0; - - /// Updates the sqtt token mask on all running traces, if any. - /// - /// @note This may overwrite the stall settings (making them more conservative) - /// @param [in] tokenConfig updated token and reg mask to apply. 
- virtual void CmdUpdateSqttTokenMask( - const ThreadTraceTokenConfig& tokenConfig) = 0; - - /// Ends the specified performance experiment. - /// - /// @param [in] pPerfExperiment Performance experiment to end. - virtual void CmdEndPerfExperiment( - IPerfExperiment* pPerfExperiment) = 0; - - /// Inserts a trace marker into the command buffer. - /// - /// A trace marker can be inserted to mark particular points of interest in a command buffer to be viewed with the - /// trace data collected in a performance experiment. - /// - /// @param [in] markerType Selects one of two generic marker categories ("A" or "B"). - /// @param [in] markerData 32-bit marker value to be inserted. - virtual void CmdInsertTraceMarker( - PerfTraceMarkerType markerType, - uint32 markerData) = 0; - - /// Inserts a set of SQ thread trace markers for consumption by the Radeon GPU Profiler (RGP). - /// - /// Only supported on Universal and Compute engines. - /// - /// @param [in] numDwords Number of dwords in pData to be inserted as SQTT markers. - /// @param [in] pData SQTT marker data. See the RGP SQTT Instrumentation Specification for details on how this - /// data should be formatted. - virtual void CmdInsertRgpTraceMarker( - RgpMarkerSubQueueFlags subQueueFlags, - uint32 numDwords, - const void* pData) = 0; - - /// This function is to be used to copy the DF SPM (MALL SPM) data from the output buffers to an accessible buffer. - /// The buffer that HW outputs to is allocated with a special KMD flag and therefore cannot be the same as the - /// normal IPerfExperiment buffer so we need a special command to get the data. - /// - /// The bulk of the implementation for this is done by the KMD. They are in charge of starting and stopping the - /// trace as well as all of the register programming. When KMD recieves a dfSpmTraceEnd bit from a CmdBufInfo - /// flag, they will wait for the command buffer to be completely idle before stopping the trace. 
Therefore, a - /// CmdEndPerfExperiment call does not stop this particular sample, the end of a command buffer with a - /// dfSpmTraceEnd does. This means that calling CmdCopyDfSpmTraceData in the same command buffer as - /// dfSpmTraceEnd will give you incorrect data. The sample will still be in progress when the copy happens. - /// You must call CmdCopyDfSpmTraceData in a separate command buffer after one where the dfSpmTraceEnd bit is - /// set. - /// - /// There is also a metadata buffer that does not need a special KMD flag. It is also stored in a separate buffer - /// and is copied along with the output buffer with this command. It contains a uint32 trace size, a uint32 pad, - /// a uint64 start trace GPU timestamp and a uint64 stop trace GPU timestamp and is placed at the beginning of the - /// dstGpuMemory. - /// - /// The minimum size of the dstGpuMemory should be the size of the metadata struct plus the size of the DF SPM - /// ringSize given to the perf experiment. The SPM data may not fill the entire memory, but the client is - /// responsible for parsing the data. - /// - /// This function requires use of the following barrier flags on @ref dstGpuMemory: - /// - PipelineStage: @ref PipelineStageBlt - /// - CacheCoherency: @ref CoherCopyDst - /// - /// @param [in] perfExperiment The perfExperiment that we will be copying the data from - /// @param [in] dstGpuMemory The memory location that the DF SPM trace data will be copied to. - /// @param [in] dstOffset The offset into the destination memory that the data will be copied to. - virtual void CmdCopyDfSpmTraceData( - const IPerfExperiment& perfExperiment, - const IGpuMemory& dstGpuMemory, - gpusize dstOffset) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - /// Loads data from the provided GPU Memory object into Constant Engine RAM. - /// - /// @param [in] srcGpuMemory GPU Memory object containing the source data to be loaded to CE RAM. 
- /// @param [in] memOffset Offset within the memory object where the source data is located, - /// must be 32-byte aligned. - /// @param [in] ramOffset Byte offset destination in CE RAM where the data should be loaded, - /// must be 32-byte aligned. - /// @param [in] dwordSize Number of DWORDs that should be loaded into CE RAM, must be a multiple of 8. - void CmdLoadCeRam( - const IGpuMemory& srcGpuMemory, - gpusize memOffset, - uint32 ramOffset, - uint32 dwordSize) {} - - /// Dumps data from Constant Engine RAM to the provided GPU Memory address which may be located in a GPU ring buffer - /// managed by the CE. The CE can be used to automatically handle the synchronization between the DE and CE when - /// manipulating a GPU ring buffer. In order for PAL to instruct the CE to handle this, we need to know the current - /// position (entry) within the ring buffer being dumped to, as well as the total size (in entries) of the ring. - /// - /// @param [in] dstGpuMemory GPU Memory object destination where the data should be dumped from CE RAM. - /// @param [in] memOffset Offset within the memory object where data should be dumped, must be 4 byte aligned. - /// @param [in] ramOffset Byte offset source in CE RAM for data that should be dumped, must be 4 byte aligned. - /// @param [in] dwordSize Number of DWORDs that should be dumped from CE RAM into GPU Memory - /// @param [in] currRingPos Current position (ring entry) in the GPU ring buffer being managed by the CE which the - /// dump location corresponds to. - /// @param [in] ringSize Number of entries in the GPU ring buffer being managed by the CE. If the memory being - /// dumped into is not managed in a ring-like fashion, this should be set to zero. 
- void CmdDumpCeRam( - const IGpuMemory& dstGpuMemory, - gpusize memOffset, - uint32 ramOffset, - uint32 dwordSize, - uint32 currRingPos, - uint32 ringSize) {} - - /// Writes CPU data to Constant Engine RAM - /// - /// @param [in] pSrcData Pointer to the source CPU data to be written to CE RAM. - /// @param [in] ramOffset Byte offset in CE RAM where the data should be written, must be 4 byte aligned. - /// @param [in] dwordSize Number of DWORDs that should be written from pSrcData into CE RAM. - virtual void CmdWriteCeRam( - const void* pSrcData, - uint32 ramOffset, - uint32 dwordSize) {} -#endif - - /// Allocates a chunk of command space that the client can use to embed constant data directly in the command - /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address - /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within - /// this command buffer (e.g., as an SRD table address). - /// - /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the - /// value reported by GetEmbeddedDataLimit(). - /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. - /// @param [out] pGpuAddress The GPU address of the embedded space. - /// - /// @returns The DWORD-aligned CPU address of the embedded space. - virtual uint32* CmdAllocateEmbeddedData( - uint32 sizeInDwords, - uint32 alignmentInDwords, - gpusize* pGpuAddress) = 0; - - /// Allocates a chunk of command space that the client can use to embed constant data directly in the command - /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address - /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within - /// this command buffer (e.g., as an SRD table address). 
- /// - /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the - /// value reported by GetLargeEmbeddedDataLimit(). - /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. - /// @param [out] pGpuAddress The GPU address of the embedded space. - /// - /// @returns The DWORD-aligned CPU address of the embedded space. - virtual uint32* CmdAllocateLargeEmbeddedData( - uint32 sizeInDwords, - uint32 alignmentInDwords, - gpusize* pGpuAddress) = 0; - - /// Get memory from scratch memory and bind to GPU event. For now only GpuEventPool and CmdBuffer's internal - /// GpuEvent use this path to allocate and bind GPU memory. These usecases assume the bound GPU memory is GPU access - /// only, so client is responsible for resetting the event from GPU, and cannot call Set(), Reset(), GetStatus(). - /// - /// @param [in] pGpuEvent The GPU event that needs to bind a memory. Must not be nullptr. - /// - /// @returns Success if the GPU event successfully binds a GPU memory. Otherwise, one of the following errors may - /// be returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Result AllocateAndBindGpuMemToEvent( - IGpuEvent* pGpuEvent) = 0; - - /// Issues commands to prime GPU caches shortly before accessing the specified GPU address range(s). The benefit of - /// this prefetching is likely to be platform-dependent based on the GPU's cache hierarchy, memory subsystem, - /// available prefetching tools in hardware, etc., so caller beware. - /// - /// This operation may read data from memory into caches and therefore counts as a general BLT SRC operation with - /// regard to barrier execution and memory dependencies. - /// - /// @param [in] rangeCount Number of entries in pRanges. - /// @param [in] pRanges Array of structs defining a memory range and properties controlling prefetching of that - /// range. 
- virtual void CmdPrimeGpuCaches( - uint32 rangeCount, - const PrimeGpuCacheRange* pRanges) = 0; - - /// Issues commands which execute the specified group of nested command buffers. The observable behavior of this - /// operation should be indiscernible from directly recording the nested command buffers' commands directly into - /// this command buffer. Naturally, the queue type of the nested command buffers must match this command buffer. - /// - /// Conceptually, executing a nested command buffer is similar to calling a subroutine: the root command buffer is - /// like the "caller", while the nested ones are the "callees". - /// - /// If any nested command buffers were allocated from a @ref ICmdAllocator with @ref autoMemoryReuse enabled, - /// resetting or destroying those nested command buffers will render them retroactively uncallable. This effectively - /// makes the caller command buffer invalid and illegal to submit even if it was otherwise valid and executable in - /// the past. If the nested command allocator has autoMemoryReuse disabled, the calls to reset nested command - /// buffers remain valid until the allocator itself is reset. - /// - /// State inheritance/leakage between the caller and callee(s) has the following behavior: - /// + The callee only inherits the state specified in the callee CmdBufferBuildInfo. It is up to the client to - /// bind any default state necessary when they called @ref ICmdBuffer::Begin() to begin building the callee. - /// By default no state is inherited and all state must be specified by the client. - /// + The callee leaks any render and resource-binding state back into the caller after it completes. It is up to - /// the client to rebind the caller's state after this operation completes if they don't want state leakage. - /// + Both of the above points apply in between callees, if more than one command buffer is being executed by this - /// call. 
- /// - /// @param [in] cmdBufferCount Number of nested command buffers to execute. (i.e., size of the ppCmdBuffers - /// array). This must be at least one, otherwise making this call is pointless. - /// @param [in,out] ppCmdBuffers Array of nested command buffers to execute. It is an error condition if any - /// of the following are true: (Debug assertions are used to check them.) - /// + ppCmdBuffers is null. - /// + Any member of ppCmdBuffers is null. - /// + Any member of ppCmdBuffers is a root command buffer, or has a different - /// queue type than this command buffer. - virtual void CmdExecuteNestedCmdBuffers( - uint32 cmdBufferCount, - ICmdBuffer*const* ppCmdBuffers) = 0; - - /// Saves a copy of some set of the current command buffer state that is used by compute workloads. This feature is - /// intended to give PAL clients a convenient way to issue their own internal compute workloads without modifying - /// the application-facing state. - /// - /// PAL cannot save multiple layers of state, each call to CmdSaveComputeState must be followed by a call to - /// CmdRestoreComputeState before the next call to CmdSaveComputeState. - /// - /// This function can only be called on command buffers that support compute workloads. All query counters will be - /// disabled until CmdRestoreComputeState is called. - /// - /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to save. - virtual void CmdSaveComputeState( - uint32 stateFlags) = 0; - - /// Restores some set of the command buffer state that is used by compute workloads. This feature is intended to - /// give PAL clients a convenient way to issue their own internal compute workloads without modifying the - /// application-facing state. - /// - /// A call to this function must be preceded by a call to CmdSaveComputeState and the save stateFlags must contain - /// all restore stateFlags, otherwise the values of the restored state are undefined. 
- /// - /// This function can only be called on command buffers that support compute workloads. All previously disabled - /// query counters will be reactivated. - /// - /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to restore. - virtual void CmdRestoreComputeState( - uint32 stateFlags) = 0; - - /// Issues commands which complete two tasks: using the provided @ref IIndirectCmdGenerator object to translate the - /// indirect argument buffer into a format understandable by the GPU; and then executing the generated commands. - /// - /// The virtual address must be 4-byte aligned. - /// - /// The indirect argument data offset in memory must be 4-byte aligned. The expected layout of the argument data - /// is defined by the @ref IIndirectCmdGenerator object. - /// - /// It is unsafe to call this method on a command buffer which was not begun with either the optimizeOneTimeSubmit - /// or optimizeExclusiveSubmit flags. This is because there is a potential race condition if the same command buffer - /// is generating indirect commands on multiple Queues simultaneously. - /// - /// This function requires use of the following barrier flags on the indirect memory: - /// - PipelineStage: @ref PipelineStageFetchIndirectArgs - /// - CacheCoherency: @ref CoherIndirectArgs - /// - /// @param [in] generator Indirect command generator object which can translate the indirect argument buffer - /// into a command buffer format which the GPU can understand. - /// @param [in] gpuVirtAddr Gpu virtual address where the indirect argument data is located. - /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr is nonzero, the - /// value at that memory location is clamped to this maximum. If countGpuAddr is zero, - /// Then the number of draws issued exactly matches this number. - /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. Must be 4-byte aligned. 
- virtual void CmdExecuteIndirectCmds( - const IIndirectCmdGenerator& generator, - gpusize gpuVirtAddr, - uint32 maximumCount, - gpusize countGpuAddr) = 0; - - /// Updates one or more HiS pretests bound to the given stencil image within a range of mip levels. - /// See @ref HiSPretests for a summary of HiS. - /// - /// @warning Improper use of pretests can cause corruption. Please see @ref HiSPretests for more information. - /// - /// @param [in] image The stencil image that will receive the new pretest(s). - /// @param [in] pretests The new pretest(s). - /// @param [in] firstMip The beginning of the mip range which will receive the new pretest(s). - /// @param [in] numMips The number of mips in the mip range which will receive the new pretest(s). - virtual void CmdUpdateHiSPretests( - const IImage* pImage, - const HiSPretests& pretests, - uint32 firstMip, - uint32 numMips) = 0; - - /// Reserve @ref CommandDataAlloc space for external command packets up to a size of @ref sizeInDwords. - /// This method is only supported on command buffers for the following queue types: - /// - /// @warning @ref CmdCommitSpace must be called once after this function is called. - // Failing to pair up these function calls will result in undefined behavior. - /// - /// @param [in] sizeInDwords Size of the command buffer space to reserve in dwords. - /// If this param is 0, the default command stream reserve limit will be used. - /// @param [in] reserveInNewChunk Selection to reserve space in a new chunk or current chunk. - /// - /// @returns A pointer to the reserved command space. - virtual uint32* CmdReserveSpace( - uint32 sizeInDwords, - bool reserveInNewChunk) = 0; - - /// Ensure data is commited the command buffer and unused space is reclaimed. - /// This method is only supported on command buffers for the following queue types: - /// - /// @param [in] pCmdSpace Pointer to the next unused dword in the command buffer. 
- virtual void CmdCommitSpace( - uint32* pCmdSpace) = 0; - - /// Executes any internal postprocessing commands to be performed on a frame, such as drawing the dev driver - /// overlay. Calling this prior to presenting (via any path) is a requirement, and must be prior to or - /// concurrent with frameEnd if FSFM is applicable. This must be called using the image that will be the - /// source of the present. - /// - /// @param [in] postProcessInfo Information about the frame to be postprocessed. - /// @param [out] pAddedGpuWork (Optional) Set to true if commands were added as part of this call. - virtual void CmdPostProcessFrame( - const CmdPostProcessFrameInfo& postProcessInfo, - bool* pAddedGpuWork) = 0; - - /// Inserts a string embedded inside a NOP packet with a signature that is recognized by tools and can be printed - /// inside a command buffer disassembly. Note that this is a real NOP that will really be submitted to the GPU - /// and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug dumps. - /// - /// The maximum length of a string that may be embedded in the command buffer is currently 128 characters, - /// including the NUL-terminator. This is defined in the internal command buffer class in MaxCommentStringLength. - /// - /// @param [in] pComment Pointer to NUL-terminated string that will be inserted into the command buffer. - virtual void CmdCommentString( - const char* pComment) = 0; - - /// Inserts the specified payload embedded inside a NOP packet. Note that this is a real NOP that will be submitted - /// to the GPU and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug - /// dumps. - /// - /// @param [in] pPayload Pointer to binary data to embed. - /// @param [in] payloadSize Size of the payload in DWORDs, expected to be under MaxPayloadSize. 
- virtual void CmdNop( - const void* pPayload, - uint32 payloadSize) = 0; - - /// Marks the begin or end of a user-defined region of GPU work; analyzed post-mortem in crash-dump analysis tools. - /// Each 'Begin' marker must be paired with a corresponding 'End' marker; however, markers may be nested by - /// inserting multiple 'Begin' markers consecutively. - /// - /// @warning This function is a no-op if Crash Analysis mode is not enabled. - /// - /// @param [in] isBegin Whether this is a 'Begin' marker (true) or an 'End' marker (false). - /// @param [in] sourceId The application layer ID at which the marker is being created: - /// 0x0 => Application - /// 0x1 => API (e.g. DX12, Vulkan, etc.) - /// 0x2 => PAL - /// Developers may use IDs within the range of 10 - 15 to define a custom - /// application layer. - /// @param [in] pMarkerName A NULL-terminated string containing a name for this marker, used for annotation - /// purposes in external tools. Only valid for 'Begin' markers, and will be ignored if - /// isBeginMarker is false. - /// @param [in] markerNameSize Size of the marker string, in bytes. - /// - /// @returns Non-zero counter value of the embedded execution marker. - /// If Crash Analysis mode is disabled, this will always return zero. - virtual uint32 CmdInsertExecutionMarker( - bool isBegin, - uint8 sourceId, - const char* pMarkerName, - uint32 markerNameSize) = 0; - - /// Performs the virtual queue handshake. The host queue will do the following: - /// - Wait until the parent kernel is done - /// - Change the parent kernel state - /// - CP waits until the child counter is 0 - /// - CP sends the termination signal to the device queue - /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. 
- /// - /// @param [in] parentState Address of the parent kernel state - /// @param [in] newStateValue The new state value of the parent kernel - /// @param [in] parentChildCounter Address of the parent child counter - /// @param [in] signal Address of the virtual queue signal - /// @param [in] dedicatedQueue Runtime uses a dedicated queue for the scheduler - /// - /// @note This function is to support OpenCL AQL submissions. - virtual void CmdVirtualQueueHandshake( - gpusize parentState, - uint32 newStateValue, - gpusize parentChildCounter, - gpusize signal, - bool dedicatedQueue) = 0; - - /// Returns GPU address of the loop start with dispatch templates. The pointer will be passed to the - /// scheduler kernel for the update of dispatch templates. - /// Also initializes common registers for each dispatch template. - /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. - /// - /// @note This function is to support OpenCL AQL submissions. - virtual gpusize CmdVirtualQueueDispatcherStart() = 0; - - /// Programs CP iterator with dispatch templates for device enqueue in OpenCL2.0 - /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. - /// - /// @param [in] signal Address for the termination signal - /// @param [in] loopStart GPU address of the loop start for CP - /// @param [in] numTemplates The number of dispatch templates - /// - /// @note This function is to support OpenCL AQL submissions. - virtual void CmdVirtualQueueDispatcherEnd( - gpusize signal, - gpusize loopStart, - uint32 numTemplates) = 0; - - /// Emulates AQL dispatch with PM4 commands. - /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. - /// - /// @param [in] dispatchInfo Pointer to kernel dispatch info - /// @param [out] pFeedback Pointer to the structure where information about the - /// dispatch can be stored if != nullptr. 
- /// - /// @note This function is to support OpenCL AQL submissions. - void CmdDispatchAql( - const DispatchAqlParams& dispatchInfo, - DispatchAqlFeedback* pFeedback) - { - m_funcTable.pfnCmdDispatchAql(this, dispatchInfo, pFeedback); - } - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 954 - inline void CmdDispatchAql( - const DispatchAqlParams& dispatchInfo) - { - CmdDispatchAql(dispatchInfo, nullptr); - } -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - /// XDMA was retired starting in gfx10 so this function has no use anymore. - inline void CmdXdmaWaitFlipPending() {} -#endif - - /// Starts thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP - /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these - /// functions) - /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. - /// The caller is responsible for setting up valid GPU Profiler panel settings. - virtual void CmdStartGpuProfilerLogging() = 0; - - /// Stops thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP - /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these - /// functions) - /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. - /// The caller is responsible for setting up valid GPU Profiler panel settings. - virtual void CmdStopGpuProfilerLogging() = 0; - - /// Set a mask to control which view instances are enabled for subsequent draws, should only be called on - /// universal command buffers. - /// - /// @param [in] mask The mask to control which view instances are enabled. - virtual void CmdSetViewInstanceMask(uint32 mask) = 0; - - /// Get used size of all chunks in bytes for given CmdAllocType. For CommandDataAlloc with multi-queue scheme, the - /// size reported will be the sum of all command streams associated with the command buffer. 
It's legal to call - /// this function while in the command building state. - /// - /// @param [in] type Allocation type for ICmdAllocator - /// - /// @returns Used allocation data size in bytes for provided CmdAllocType. - virtual uint32 GetUsedSize( - CmdAllocType type) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - - /// Gets the internal unique Id of the command buffer. - /// This function was originally only for internal debugging, and the Id is not unique across different queue types. - /// However, CmdDisassembly::ICmdBufferReporting requires a way to differentiate cmdLists that matches - /// the meaning of UniqueId(), in particular, not requiring uniqueness across different queue types. - /// - /// @returns Unique Id of the command buffer - virtual uint32 UniqueId() const = 0; - - /// Get the number of arrays of tracking data (for correlation) held by this ICmdBuffer. - /// For TrackCmdLocationBefore and TrackCmdLocationAfter below, only (idx < GetNumTrackingArrays()) - /// can yield a non-nullptr result - /// - /// @detail If GetNumTrackingArrays() returns 0 but GetTrackedCmdLocationArraySizeInBytes() does not, it will - /// be possible to create them on this ICmdBuffer using CreateTrackedCmdLocationArray - /// - /// @returns the number of a TrackedCmdLocationArray's held by the ICmdBuffer. - virtual uint32 GetNumTrackingArrays() const = 0; - - /// Get the number of bytes required by CreateTrackedCmdLocationArray. 
- /// - /// @detail The value returned here accomdates the full number of TrackedCmdLocationArray's to be - /// created, from a single contiguous allocation. - /// If allocation has not yet occured, (GetNumTrackingArrays() == 0). - /// If (GetTrackedCmdLocationArraySizeInBytes() > 0) && (GetNumTrackingArrays() == 0) - /// this ICmdBuffer supports TrackedCmdLocationArray's, but has not yet allocated them - /// If (GetTrackedCmdLocationArraySizeInBytes() == 0), this ICmdBuffer does not support - /// TrackedCmdLocationArray's - /// - /// @returns 0 if TrackedCmdLocationArray's are not supported - /// The total number of bytes required requied by CreateTrackedCmdLocationArray otherwise. - virtual uint32 GetTrackedCmdLocationArraySizeInBytes() const = 0; - - /// Uses the memory pMemory to initialize GetNumTrackingArrays() TrackedCmdLocationArray's on this - /// ICmdBuffer. - /// - /// @param [in] pMemory Address of memory allocated for the purpose of creating TrackedCmdLocationArray's - /// This memory should be at larger than GetTrackedCmdLocationArraySizeInBytes() - /// bytes. - /// - /// @returns Result::Success: Indicates creation was successful - /// Result::Unsupported: This ICmdBuffer does not support TrackedCmdLocationArray's - /// Confirm (GetTrackedCmdLocationArraySizeInBytes() > 0) before using this function - /// Result::ErrorInvalidPointer: pMemory == nullptr - /// Result::AlreadyExists: TrackedCmdLocationArray's have already been allocated on - /// this ICmdBuffer - /// other: Error values originating from Util::Vector::Reserve() or - /// Util::Vector::PushBack() - /// - virtual Result CreateTrackedCmdLocationArray( - void* pMemory) = 0; - - /// Executes the destructors for all TrackedCmdLocationArray's owned by this ICmdBuffer. 
This should be - /// called prior to deleting the memory pMemory that was originally provided to CreateTrackedCmdLocationArray - /// - /// @param [out] ppAllocatedMemory - /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() > 0) - /// The original value for pMemory provided in CreateTrackedCmdLocationArray(pMemory) will - /// by returned in *ppAllocatedMemory. ie *ppAllocatedMemory = pMemory - /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() == 0), - /// *ppAllocatedMemory = nullptr; - /// - virtual void DestroyTrackedCmdLocationArray( - void** ppAllocatedMemory) = 0; - - /// Creates a new TrackedCmdLocation in the TrackedCmdLocationArray corresponding to idx - /// of type TrackedCmdLocationMode::Begin. This location will be furnished with a pointer to the address of - /// the next PM4Packet to be created on the CmdBuffer referred to by idx, and m_event == eventId. - /// - /// @detail Note there is two potential valid corner cases. - /// (pBeforeResult->Get()->m_correlateInternal.m_ptr. == 0) - /// Indicates there is a commandstream but it has not begun building PM4Packets - /// This implicitly refers to the baseAddress of the commandstream, o0nce building begins - /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) - /// Indicates there is not yet an associated commandstream. - /// - /// @param [in] idx The idx corresponding to GetTrackingArray(idx). - /// Only (idx < GetNumTrackingArrays()) will yield non-trivial results. 
- /// @param [in] eventId The code for the event being tracked - /// @param [out] pBeforeResult If successful, returns a TrackedCmdLocationRef to a TrackedCmdLocation within - /// the TrackedCmdLocationArray corresponding to idx - /// Otherwise, if (pBeforeResult != nullptr), is initialized to the default for - /// TrackedCmdLocationRef() - /// - /// @returns - /// Pal::Result::Success if successful - /// Pal::Result::ErrorInvalidPointer if (pBeforeResult == nullptr) - /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer - /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) - /// Pal::Result::NotFound if there is no tracking array corresponding to idx - /// This can occur if CreateTrackedCmdLocationArray has not been - /// called - which may mean the feature is disabled - /// Pal::Result ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable - /// to allocate memory - /// - /// - virtual Pal::Result TrackCmdLocationBefore( - uint32 idx, - uint8 eventId, - CmdDisassembly::TrackedCmdLocationRef* pBeforeResult) = 0; - - /// Similar to TrackCmdLocationBefore, TrackCmdLocationAfter creates a TrackedCmdLocation, of type - /// TrackedCmdLocationMode::End or TrackedCmdLocationMode::Delta. This location will be furnished with a - /// pointer to the address of the next PM4Packet to be created on the CmdBuffer referred to by idx. Type - /// TrackedCmdLocationMode::Delta will only occur if parameter before is the last TrackedCmdLocation - /// for the TrackedCmdLocationArray corresponding to idx, has the same m_event == eventId, and the change in - /// pointer address is small enough to be represented in 6 bits. - /// - /// @detail Note there is two potential valid corner cases. - /// (pAfterResult->Get()->m_correlateInternal.m_ptr. 
== 0) - /// Indicates there is a commandstream but it has not begun building PM4Packets - /// This implicitly refers to the baseAddress of the commandstream, o0nce building begins - /// (pAfterResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) - /// Indicates there is not yet an associated commandstream. - /// In both of these cases, in parameter "before" had the same value for m_correlateInternal.m_ptr. - /// and "before" referred to the most recent TrackedCmdLocation, this tracked location will be of - /// type TrackedCmdLocationMode::Delta, with (before.Get()->m_correlateInternal.m_deltaInDWords == 0) - /// - /// @param [in] idx The idx corresponding to GetTrackingArray(idx). - /// Only (idx < GetNumTrackingArrays()) can yield non-trivial results. - /// @param [in] eventId The code for the event being tracked - /// @param [in] before The corresponding location generated by TrackCmdLocationBefore - /// This may be CmdDisassembly::TrackedCmdLocationRef() if no location from - /// TrackCmdLocationBefore before exists (such as on Reset) - /// @param [out] pAfterResult Returns a TrackedCmdLocationRef to a TrackedCmdLocation within the TrackedCmdLocationArray - /// corresponding to idx - /// - /// @returns - /// Pal::Result::Success if successful - /// Pal::Result::ErrorInvalidPointer if (pAfterResult == nullptr) - /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer - /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) - /// Pal::Result::ErrorInvalidValue if (eventId != before.m_correlateInternal.m_event) - /// Pal::Result::NotFound if there is no tracking array corresponding to idx - /// This can occur if CreateTrackedCmdLocationArray has not been - /// called - which may mean the feature is disabled - /// Pal::Result ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable - /// to allocate memory - /// - - virtual Pal::Result TrackCmdLocationAfter( - uint32 
idx, - uint8 eventId, - CmdDisassembly::TrackedCmdLocationRef before, - CmdDisassembly::TrackedCmdLocationRef* pAfterResult) = 0; - - /// An accessor function for the TrackedCmdLocationArray corresponding to idx - /// - /// @param idx There is a CmdDisassembly::TrackedCmdLocationArray* corresponding to each - /// sub-cmdBuffer for this cmdBuffer. This idx indexes these in the same fashion. - /// Only idx < GetNumTrackingArrays() can yield non-nullptr results. - /// - /// @returns the TrackedCmdLocationArray corresponding to idx - virtual CmdDisassembly::TrackedCmdLocationArray* GetTrackingArray( - uint32 idx) const = 0; - - /// TrackClientEvent operates similarly to TrackCmdLocationBefore and TrackCmdLocationAfter above but - /// operates on all TrackedCmdLocationArray's on this CmdBuffer. The TrackedCmdLocation generated here - /// corresponds to type TrackedCmdLocationMode::ClientEvent, where clientId is a value the client is using - /// to track this cmdBuffer, and eventId refers to some event the client is tracking outside of driver. - /// - /// @detail The first call to TrackClientEvent will include a TrackedCmdLocation with - /// (m_mode == TrackedCmdLocationMode::ClientId), to define clientId on this cmdBuffer - /// All calls will generate the TrackedCmdLocationMode::ClientEventId with - /// (m_clientEvent.m_clientEventId == clientEventId) - /// and be followed with a TrackedCmdLocationMode::Delta location with (m_eventId == PostClientEvent (0xFF)) - /// - /// @param [in] clientId - /// @param [in] eventId - /// - /// @returns - /// Result::Success if the tracked client event was successfully recorded - /// Result::Unsupported if the implementation of ICmdBuffer does not support tracking - /// Result::ErrorInvalidPointer if there was an error encountered determining the cmdList correlation - /// requested. This is likely to be an out-of-memory situation. - /// Result::AlreadyExists if registering clientId occured multiple times. 
This should only occur for - /// race conditions, if the code calling TrackClientEvent is not threadsafe - virtual Result TrackClientEvent( - uint64 clientId, - uint64 clientEventId) = 0; - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - ICmdBuffer() : m_pClientData(nullptr) - { - } - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~ICmdBuffer() { } - - /// Structure for function pointers for the ICmdBuffer::Cmd* functions. - struct CmdBufferFnTable - { - /// CmdSetUserData function pointers for each pipeline bind point. - CmdSetUserDataFunc pfnCmdSetUserData[static_cast(PipelineBindPoint::Count)]; - - CmdDrawFunc pfnCmdDraw; ///< CmdDraw function pointer. - CmdDrawOpaqueFunc pfnCmdDrawOpaque; ///< CmdDrawOpaque function pointer. - CmdDrawIndexedFunc pfnCmdDrawIndexed; ///< CmdDrawIndexed function pointer. - CmdDrawIndirectMultiFunc pfnCmdDrawIndirectMulti; ///< CmdDrawIndirectMulti function pointer. - CmdDrawIndexedIndirectMultiFunc pfnCmdDrawIndexedIndirectMulti; ///< CmdDrawIndexedIndirectMulti func pointer. - CmdDispatchFunc pfnCmdDispatch; ///< CmdDispatch function pointer. - CmdDispatchIndirectFunc pfnCmdDispatchIndirect; ///< CmdDispatchIndirect function pointer. - CmdDispatchOffsetFunc pfnCmdDispatchOffset; ///< CmdDispatchOffset function pointer. - CmdDispatchMeshFunc pfnCmdDispatchMesh; ///< CmdDispatchmesh function pointer. - CmdDispatchMeshIndirectMultiFunc pfnCmdDispatchMeshIndirectMulti; ///< CmdDispatchMeshIndirect function pointer. - CmdDispatchAqlFunc pfnCmdDispatchAql; ///< CmdDispatchAql function pointer. - } m_funcTable; ///< Function pointer table for Cmd* functions. - -private: - /// @internal Client data pointer. 
This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 - /// @internal Some back-compat glue for some of the HwPipePoint interfaces in this file. - static constexpr uint32 HwPipePointToStage[] = - { - PipelineStageTopOfPipe, // HwPipeTop = 0x0 - PipelineStagePostPrefetch, // HwPipePostPrefetch = 0x1 - PipelineStageVs, // HwPipePreRasterization = 0x2 - PipelineStagePs, // HwPipePostPs = 0x3 - PipelineStageLateDsTarget, // HwPipePreColorTarget = 0x4 - PipelineStageCs, // HwPipePostCs = 0x5 - PipelineStageBlt, // HwPipePostBlt = 0x6 - PipelineStageBottomOfPipe, // HwPipeBottom = 0x7 - }; -#endif -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdBuffer.h + * @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDevice.h" +#include "palGpuMemory.h" +#include "palImage.h" +#include "palMsaaState.h" +#include "palPipeline.h" +#include "palQueryPool.h" +#include "palCmdTracking.h" + +/// HSA kernel dispatch packet typedef +typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t; +/// AMD kernel code typedef +typedef struct amd_kernel_code_s amd_kernel_code_t; +/// AMD kernel descriptor +namespace llvm {namespace amdhsa {struct kernel_descriptor_t;} } + +namespace Util +{ +class VirtualLinearAllocator; +class Event; +} + +namespace Pal +{ + +// Forward declarations. +class IBorderColorPalette; +class ICmdAllocator; +class ICmdBuffer; +class IColorBlendState; +class IColorTargetView; +class IDepthStencilState; +class IDepthStencilView; +class IGpuEvent; +class IGpuMemory; +class IIndirectCmdGenerator; +class IMsaaState; +class IPerfExperiment; +class IQueue; +class IQueryPool; +enum class PerfTraceMarkerType : uint32; +enum class PointOrigin : uint32; + +struct VideoCodecInfo; +struct VideoCodecAuxInfo; + +/// Specifies a pipeline bind point (i.e., compute or graphics). 
+enum class PipelineBindPoint : uint32 +{ + Compute = 0x0, + Graphics = 0x1, + Count +}; + +/// Fully specifies a type of graphics primitive and vertex ordering for geometry. +enum class PrimitiveTopology : uint8 +{ + PointList = 0x0, + LineList = 0x1, + LineStrip = 0x2, + TriangleList = 0x3, + TriangleStrip = 0x4, + RectList = 0x5, ///< Each rect is three 2D axis-aligned rectangle vertices. + QuadList = 0x6, + QuadStrip = 0x7, + LineListAdj = 0x8, + LineStripAdj = 0x9, + TriangleListAdj = 0xA, + TriangleStripAdj = 0xB, + Patch = 0xC, + TriangleFan = 0xD, + LineLoop = 0xE, + Polygon = 0xF, + TwoDRectList = 0x10, ///< Each rect is the bounding box of an arbitrary 2D triangle. + /// Support is optional, see support2DRectList in DeviceProperties. + Count +}; + +/// Specifies how triangle primitives should be rasterized. +enum class FillMode : uint8 +{ + Points = 0x0, + Wireframe = 0x1, + Solid = 0x2, + Count +}; + +/// Specifies the triangle face direction that should result in culled primitives. +enum class CullMode : uint8 +{ + _None = 0x0, ///< All triangles are rasterized. + Front = 0x1, ///< Front facing triangles are culled. + Back = 0x2, ///< Back facing triangles are culled. + FrontAndBack = 0x3, ///< All triangles are culled. + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< All triangles are rasterized. +#endif +}; + +/// Specifies vertex winding order corresponding to a front facing triangle. @see CullMode. +enum class FaceOrientation : uint8 +{ + Ccw = 0x0, ///< Counter-clockwise vertex winding primitives are front facing. + Cw = 0x1 ///< Clockwise vertex winding primitives are front facing. +}; + +/// Specifies which vertex of a primitive is the _provoking vertex_. This impacts which vertex's "flat" VS outputs +/// are passed to the PS (i.e., flat shading). 
+enum class ProvokingVertex : uint8 +{ + First = 0x0, + Last = 0x1 +}; + +/// Specifies bit size of each element in an index buffer. +enum class IndexType : uint32 +{ + Idx8 = 0x0, + Idx16 = 0x1, + Idx32 = 0x2, + Count +}; + +/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic(). +enum class AtomicOp : uint32 +{ + AddInt32 = 0x00, + SubInt32 = 0x01, + MinUint32 = 0x02, + MaxUint32 = 0x03, + MinSint32 = 0x04, + MaxSint32 = 0x05, + AndInt32 = 0x06, + OrInt32 = 0x07, + XorInt32 = 0x08, + IncUint32 = 0x09, + DecUint32 = 0x0A, + AddInt64 = 0x0B, + SubInt64 = 0x0C, + MinUint64 = 0x0D, + MaxUint64 = 0x0E, + MinSint64 = 0x0F, + MaxSint64 = 0x10, + AndInt64 = 0x11, + OrInt64 = 0x12, + XorInt64 = 0x13, + IncUint64 = 0x14, + DecUint64 = 0x15, + Count +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 +/// Specifies the point in the GPU pipeline where an action should take place. +/// +/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps. +/// +/// @note The numeric value of these enums are ordered such that a "newState < oldState" comparison will generally yield +/// true if a stall is necessary to resolve a hazard between those two pipe points. This guideline does not +/// hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with +/// graphics shader work. +/// +/// @see ICmdBuffer::CmdSetEvent() +/// @see ICmdBuffer::CmdResetEvent() +/// @see ICmdBuffer::CmdPredicateEvent() +/// @see ICmdBuffer::CmdBarrier() +/// @see ICmdBuffer::CmdWriteTimestamp() +/// @see ICmdBuffer::CmdWriteImmediate() +enum HwPipePoint : uint32 +{ + HwPipeTop = 0x0, ///< Earliest possible point in the GPU pipeline (CP PFP), can be + /// used as wait point for indirect args and index buffer fetch. + HwPipePostPrefetch = 0x1, ///< Indirect arguments have been fetched for all prior + /// draws/dispatches (CP ME). 
+ HwPipePreRasterization = 0x2, ///< All prior generated VS/HS/DS/GS waves have completed, can be + /// used as release point for VB/IB fetch and streamout target. + HwPipePostPs = 0x3, ///< All prior generated PS waves have completed. + /// Only valid as a pipe point to wait on (release point). + HwPipePreColorTarget = 0x4, ///< Represents the same point in pipe to HwPipePostPs, but provides + /// clients with a better option to accurately specify the pipeline + /// sync request. And PAL uses it as entry-point to add partial + /// flushes to prevent write-after-read hazard from corner cases. + /// Only valid as a wait point (acquire point). + HwPipePreIndexBuffer = HwPipeTop, ///< As late as possible before index buffer fetches (CP PFP). + HwPipePostIndexBuffer = HwPipePreRasterization,///< All prior index buffer fetches have completed. + + // The following points apply to compute-specific work: + HwPipePreCs = HwPipePostPrefetch, ///< As late as possible before CS waves are launched (CP ME). + HwPipePostCs = 0x5, ///< All prior generated CS waves have completed. + + // The following points apply to BLT-specific work: + HwPipePreBlt = HwPipePostPrefetch, ///< As late as possible before BLT operations are launched. + HwPipePostBlt = 0x6, ///< All prior requested BLTs have completed. + + HwPipeBottom = 0x7, ///< All prior GPU work (graphics, compute, or BLT) has completed. + HwPipePointCount +}; +#endif + +/// Bitmask values that can be OR'ed together to specify a synchronization scope. See srcStageMask and dstStageMask in +/// @ref AcquireReleaseInfo. +/// +/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before* +/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior +/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs. 
+/// +/// Note that flag numerical order does not indicate any happens-before or happens-after relationships. Clients should +/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering. +enum PipelineStageFlag : uint32 +{ + PipelineStageTopOfPipe = 0x00000001, + PipelineStageFetchIndirectArgs = 0x00000002, + PipelineStagePostPrefetch = 0x00000004, + PipelineStageFetchIndices = 0x00000008, + PipelineStageStreamOut = 0x00000010, + PipelineStageVs = 0x00000020, + PipelineStageHs = 0x00000040, + PipelineStageDs = 0x00000080, + PipelineStageGs = 0x00000100, + PipelineStagePs = 0x00000200, + PipelineStageSampleRate = 0x00000400, + PipelineStageEarlyDsTarget = 0x00000800, + PipelineStageLateDsTarget = 0x00001000, + PipelineStageColorTarget = 0x00002000, + PipelineStageCs = 0x00004000, + PipelineStageBlt = 0x00008000, + PipelineStageBottomOfPipe = 0x00010000, + PipelineStageDsTarget = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget, + PipelineStageAllStages = 0x0001FFFF +}; + +/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time. Such a +/// mask should be specified in the usages field of ImageLayout. These combined usages can be examined by PAL to infer +/// the layout (i.e., compression state) of the image. +/// +/// @note There is no layout corresponding to CmdClear*(). The layout flags passed to those functions will determine +/// the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the +/// layout the same. +enum ImageLayoutUsageFlags : uint32 +{ + LayoutUninitializedTarget = 0x00000001, ///< Initial state of any image that can be used as a color or + /// depth/stencil target. A layout transition out of this state will + /// likely result in a mask RAM initialization BLT. If this bit is + /// set, no other bits may be set. + LayoutColorTarget = 0x00000002, ///< Color target bound via CmdBindTargets(). 
This bit is exclusive + /// with LayoutDepthStencilTarget. + LayoutDepthStencilTarget = 0x00000004, ///< Depth/stencil target bound via CmdBindTargets(). This bit is + /// exclusive with LayoutColorTarget. + LayoutShaderRead = 0x00000008, ///< Any shader read state including texture, UAV, constant buffer, + /// vertex buffer. + LayoutShaderFmaskBasedRead = 0x00000010, ///< Images in this state support the load_fptr AMD IL instruction, + /// which will read decompressed fmask in order to access compressed + /// MSAA color data from a shader. + LayoutShaderWrite = 0x00000020, ///< Writeable UAV. + LayoutCopySrc = 0x00000040, ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage or + /// CmdCopyTiledImageToMemory() source image. + LayoutCopyDst = 0x00000080, ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage or + /// CmdCopyMemoryToTiledImage() destination image. + LayoutResolveSrc = 0x00000100, ///< CmdResolveImage() source. + LayoutResolveDst = 0x00000200, ///< CmdResolveImage() destination. + LayoutPresentWindowed = 0x00000400, ///< Windowed-mode IQueue::Present(). + LayoutPresentFullscreen = 0x00000800, ///< Fullscreen (flip) present. Layout must be supported by the + /// display engine. + LayoutUncompressed = 0x00001000, ///< Metadata fully decompressed/expanded layout + LayoutSampleRate = 0x00002000, ///< CmdBindSampleRateImage() source. + LayoutAllUsages = 0x00003FFF +}; + +/// Bitmask values that can be ORed together to specify all potential engines an image might be used on. Such a +/// mask should be specified in the engines field of ImageLayout. +/// +/// If the client API is unable to determine which engines might be used, it should specify all possible engines +/// corresponding to the usage flags. 
+enum ImageLayoutEngineFlags : uint32 +{ + LayoutUniversalEngine = 0x1, + LayoutComputeEngine = 0x2, + LayoutDmaEngine = 0x4, + LayoutVideoEncodeEngine = 0x8, + LayoutVideoDecodeEngine = 0x10, + LayoutVideoJpegDecodeEngine = 0x20, + LayoutAllEngines = 0x3F +}; + +/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or +/// GPU memory in a ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages. +enum CacheCoherencyUsageFlags : uint32 +{ + CoherCpu = 0x00000001, ///< Data read or written by CPU. + CoherShaderRead = 0x00000002, ///< Data read by a GPU shader. + CoherShaderWrite = 0x00000004, ///< Data written by a GPU shader. + CoherCopySrc = 0x00000008, ///< Source of a ICmdBuffer::CmdCopy*() call. + CoherCopyDst = 0x00000010, ///< Destination of a ICmdBuffer::CmdCopy*() call. + CoherColorTarget = 0x00000020, ///< Color target. + CoherDepthStencilTarget = 0x00000040, ///< Depth stencil target. + CoherResolveSrc = 0x00000080, ///< Source of a CmdResolveImage() call. + CoherResolveDst = 0x00000100, ///< Destination of a CmdResolveImage() call. + CoherClear = 0x00000200, ///< Destination of a CmdClear() call. + CoherIndirectArgs = 0x00000400, ///< Source argument data read by CmdDrawIndirect() and similar functions. + CoherIndexData = 0x00000800, ///< Index buffer data. + CoherQueueAtomic = 0x00001000, ///< Destination of a CmdMemoryAtomic() call. + CoherTimestamp = 0x00002000, ///< Destination of a CmdWriteTimestamp() call. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914 + CoherStreamOut = 0x00004000, ///< Data written as stream output. + CoherMemory = 0x00008000, ///< Data read or written directly from/to memory + CoherSampleRate = 0x00010000, ///< CmdBindSampleRateImage() source. + CoherPresent = 0x00020000, ///< Source of present. 
+ CoherCp = 0x00080000, ///< HW Command Processor (CP) encompassing the front - end command + CoherAllUsages = 0x000FFFFF, ///< processing of any queue, including SDMA. +#else + CoherCeLoad = 0x00004000, ///< Source of a CmdLoadCeRam() call. + CoherCeDump = 0x00008000, ///< Destination of CmdDumpCeRam() call. + CoherStreamOut = 0x00010000, ///< Data written as stream output. + CoherMemory = 0x00020000, ///< Data read or written directly from/to memory + CoherSampleRate = 0x00040000, ///< CmdBindSampleRateImage() source. + CoherPresent = 0x00080000, ///< Source of present. + CoherCp = 0x00200000, ///< HW Command Processor (CP) encompassing the front - end command + CoherAllUsages = 0x003FFFFF, ///< processing of any queue, including SDMA. +#endif + + CoherShader = CoherShaderRead | CoherShaderWrite, + CoherCopy = CoherCopySrc | CoherCopyDst, + CoherResolve = CoherResolveSrc | CoherResolveDst, +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage(). +enum ClearColorImageFlags : uint32 +{ + ColorClearAutoSync = 0x00000001, ///< PAL will automatically insert required barrier synchronization before + /// and after the clear assuming all subresources to be cleared are currently + /// ready for rendering as a color target (as is required by API convention in + /// DX12). Allows reduced sync costs in some situations since PAL knows + /// the details of how the clear will be performed. + ColorClearForceSlow = 0x00000002, ///< Force these to use slow clears. + ColorClearSkipIfSlow = 0x00000004, ///< Only issue the clear if it is a fast clear. + ColorClearAllFlags = 0x00000007 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil(). 
+enum ClearDepthStencilFlags : uint32 +{ + DsClearAutoSync = 0x00000001, ///< PAL will automatically insert required barrier synchronization before + /// and after the clear assuming all subresources to be cleared are currently + /// ready for rendering as a depth/stencil target (as is required by API convention + /// in DX12). Allows reduced sync costs in some situations since PAL knows the + /// details of how the clear will be performed. + DsClearAllFlags = 0x00000001 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage(). +enum ResolveImageFlags : uint32 +{ + ImageResolveInvertY = 0x00000001, ///< PAL will invert the y-axis (flip upside down) of the resolved region to + /// the destination image. + ImageResolveDstAsSrgb = 0x00000002, ///< If set, a non-srgb destination image will be treated as srgb format. + /// The flag cannot be set when @ref ImageResolveDstAsNorm is set. + ImageResolveDstAsNorm = 0x00000004, ///< If set, a srgb destination image will be treated as non-srgb format. + /// The flag cannot be set when @ref ImageResolveDstAsSrgb is set. + ImageResolveSrcAsNorm = 0x00000008, ///< If set, a srgb source image will be treated as non-srgb format. + ImageResolveAllFlags = 0x0000000F ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Specifies properties for creation of an ICmdBuffer object. Input structure to IDevice::CreateCmdBuffer(). +struct CmdBufferCreateInfo +{ + ICmdAllocator* pCmdAllocator; ///< The command buffer will use this command allocator to allocate all GPU memory + /// If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset + /// with a non-null pCmdAllocator before calling ICmdBuffer::Begin. + QueueType queueType; ///< Type of queue commands in this command buffer will target. + /// This defines the set of allowed actions in the command buffer. 
+ QueuePriority queuePriority; ///< Priority level of the queue this command buffer will target. + EngineType engineType; ///< Type of engine the queue commands will run on. + + union + { + struct + { + /// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root" + /// command buffer. Nested command buffers differ from root command buffers in how they are sent to the + /// GPU for execution: root command buffers must be submitted to the hardware by calling + /// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root + /// command buffer. + /// + /// Currently, only Universal and Compute command buffers can be nested. Nesting DMA command buffers is + /// meaningless and unsupported. It is an error to attempt to create a nested DMA command buffer. + /// + /// @see ICmdBuffer::CmdExecuteNestedCmdBuffers. + uint32 nested : 1; + + /// Dedicated CUs are reserved for this queue. Thus we have to skip CU mask programming. + uint32 realtimeComputeUnits : 1; + + /// Target queue uses dispatch tunneling. + uint32 dispatchTunneling : 1; + + /// Indicates that each subsequent Dispatch command is desired to be executed in alternating + /// order of forward and reverse workgroup walk order. This can improve cache locality when + /// subsequent Dispatches consume data from the previous Dispatch and the overall footprint + /// does not fit in cache. + /// This is a best effort as not all implementations or Queues may support this. + uint32 dispatchPingPongWalk : 1; + + /// Reserved for future use. + uint32 reserved : 28; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; + + } flags; ///< Command buffer creation flags. +}; + +/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling +/// root-level command buffer. 
+union InheritedStateFlags +{ + struct + { + /// Color and depth target views are inherited from the root-level command buffer. The nested command buffer + /// should not modify this state. + uint32 targetViewState : 1; + + /// Occlusion query is inherited from the root-level command buffer. The nested command buffer + /// should not modify this state. + uint32 occlusionQuery : 1; + + /// Predication is inherited from the root-level command buffer. The nested command buffer should not modify + /// this state. + uint32 predication : 1; + + /// Reserved for future usage. + uint32 reserved : 29; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies parameters inherited from primary command buffer into nested command buffer. +struct InheritedStateParams +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891 + uint32 colorTargetCount; ///< Number of color targets bound in the + /// root-level command buffer. + SwizzledFormat colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color + /// target. + uint32 sampleCount[MaxColorTargets]; ///< Sample count for each color target. +#endif + InheritedStateFlags stateFlags; ///< States that are inherited from the + /// calling root-level command buffer. +}; + +/// Specifies optional hints to control command buffer building optimizations. +union CmdBufferBuildFlags +{ + struct + { + /// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end + /// limited. These optimizations include removing redundant PM4 commands and reducing the VGT prim group size. + /// This flag might increase the CPU overhead of building command buffers. + uint32 optimizeGpuSmallBatch : 1; + + /// Optimize command buffer building for exclusive command buffer submission. Command buffers built with this + /// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that + /// they are no longer in use. 
This flag allows PAL to modify the contents of command buffers during + /// submission. + uint32 optimizeExclusiveSubmit : 1; + + /// Optimize command buffer building for single command buffer submission. Command buffers built with this flag + /// cannot be submitted more than once. This flag allows PAL to modify the contents of command buffers during + /// submission. This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set + /// optimizeExclusiveSubmit if this flag is set. + uint32 optimizeOneTimeSubmit : 1; + + /// Indicates that the client is providing custom tessellation distribution settings. If set, it is the client's + /// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided. + uint32 optimizeTessDistributionFactors : 1; + + /// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound + /// pipeline object. This optimization might increase the CPU overhead of building command buffers and/or + /// introduce additional front-end GPU bottlenecks. + uint32 prefetchShaders : 1; + + /// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end. + /// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes. + uint32 prefetchCommands : 1; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(), + /// or CmdWriteCeRam() + uint32 usesCeRamCmds : 1; +#else + uint32 placeholder914 : 1; +#endif + + /// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet. + /// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if + /// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers.
+ uint32 disallowNestedLaunchViaIb2 : 1; + + /// placeholder + uint32 placeholder1 : 2; + + /// Enable TMZ mode to allow reading TMZ protected allocations. If this command buffer attempts to write + /// non-TMZ memory, the results are undefined. Only valid for graphics and compute. + uint32 enableTmz : 1; + + uint32 placeholder3 : 1; + + /// If set, internal operations such as blits, copies, etc. will not affect active Query results. + /// Otherwise they may affect the results. + uint32 disableQueryInternalOps : 1; + + uint32 optimizeContextStatesPerBin : 1; + uint32 optimizePersistentStatesPerBin : 1; + + /// Reserved for future use. + uint32 reserved : 16; + }; + + /// Flags packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies tessellation accum factors. +union TessDistributionFactors +{ + struct + { + /// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for + /// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a + /// different VGT. + uint32 isoDistributionFactor : 8; + uint32 triDistributionFactor : 8; ///< Recommended to be higher than quad factor. + uint32 quadDistributionFactor : 8; + /// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is + /// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still + /// increments the accumulator for the Patch distribution factor. + uint32 donutDistributionFactor : 5; + /// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch + /// are compared against this value to determine whether this donut gets split up into trapezoids (needs the patch to + /// be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken + /// into trapezoids.
+ uint32 trapDistributionFactor : 3; + }; + + /// Values packed as 32-bit uint. + uint32 u32All; +}; + +/// Specifies options that direct command buffer building. +struct CmdBufferBuildInfo +{ + /// Command buffer build flags, specifies optional hints to control command buffer build optimizations. + CmdBufferBuildFlags flags; + + /// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested + /// command buffer should not modify the software states. Any software params that may be needed within nested + /// command buffer needs to be provided here. + const InheritedStateParams* pInheritedState; + + /// If non-null, the command buffer will begin with all states set as they are in this previously built command + /// buffer. Any state specified in pInheritedState is excluded if it is also provided. + const ICmdBuffer* pStateInheritCmdBuffer; + + /// Optional allocator for PAL to use when allocating temporary memory during command buffer building. PAL will + /// stop using this allocator once command building ends. If no allocator is provided PAL will use an internally + /// managed allocator instead which may be less efficient. PAL will use this allocator in two ways: + /// + Temporary storage within a single command building call. PAL will rewind the allocator before returning to + /// free all memory allocated within the call. + /// + Temporary storage for the entire command building period. When Begin() is called, PAL will save the current + /// position of the allocator and rewind the allocator to that point when End() is called. If the client also + /// wishes to allocate temporary storage that lasts between command building function calls they must allocate it + /// before calling Begin() or PAL will accidentally free it. + Util::VirtualLinearAllocator* pMemAllocator; + + /// Optional tessellation distribution factors that will overwrite PAL set defaults. 
Clients must also set the + /// optimizeTessDistributionFactors flag for these custom factors to take effect. + /// Nested command buffers inherit this value from the primary. + TessDistributionFactors clientTessDistributionFactors; + + /// Number of context states per PBB bin. + /// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect. + uint8 contextStatesPerBin; + + /// Number of persistent states per PBB bin. + /// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect. + uint8 persistentStatesPerBin; + + /// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer. + uint64 execMarkerClientHandle; +}; + +/// Specifies info on how a compute shader should use resources. +struct DynamicComputeShaderInfo +{ + float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively + /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a + /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] + /// and specify the maximum number of waves per compute unit. If the hardware has one wave + /// limit control for multiple shader stages PAL will select the most strict limit. + /// This option is converted internally to set HW WavesPerSh setting and the non-integer + /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for + /// example specify less number of waves than number of CUs per shader array. + + uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on, + /// throttling it, to enable more graphics work to complete. 0 disables the limit. + + uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before + /// moving to the next compute unit. 0 selects optimal default.
+ + uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes. + /// Zero indicates that the LDS size determined at pipeline-compilation time will be used. +}; + +/// Specifies info on how a graphics shader should use resources. +struct DynamicGraphicsShaderInfo +{ + float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively + /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a + /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] + /// and specify the maximum number of waves per compute unit. If the hardware has one wave + /// limit control for multiple shader stages PAL will select the most strict limit. + /// This option is converted internally to set HW WavesPerSh setting and the non-integer + /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for + /// example specify less number of waves than number of CUs per shader array. +}; + +/// Specifies dynamic states of a graphics pipeline +struct DynamicGraphicsState +{ + uint32 colorWriteMask; ///< Color target write mask. 4b / RT (8 count) + struct + { + uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation. + uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate. + uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate. + uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage. + uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to + /// axis-aligned line end caps during line rasterization. + uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels. 
+ uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend + uint32 vertexBufferCount : 6; ///< Number of vertex buffer slots accessed by this pipeline + LogicOp logicOp : 4; ///< Logic operation to perform. + DepthRange depthRange : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport + /// transform: 0 to 1 or -1 to 1). + DepthClampMode depthClampMode : 2; ///< Depth clamping behavior. + uint32 reserved1 : 7; ///< Reserved + uint32 reserved : 5; ///< Reserved for future use. + }; + + union + { + struct + { + uint32 depthClampMode : 1; ///< Whether to enable dynamic state depthClampMode. + uint32 depthRange : 1; ///< Whether to enable dynamic state depthRange. + uint32 logicOp : 1; ///< Whether to enable dynamic state logicOp. + uint32 colorWriteMask : 1; ///< Whether to enable dynamic state colorWriteMask. + uint32 switchWinding : 1; ///< Whether to enable dynamic state switchWinding. + uint32 depthClipMode : 1; ///< Whether to enable dynamic state depthClipNear/FarEnable. + uint32 alphaToCoverageEnable : 1; ///< Whether to enable dynamic state alphaToCoverageEnable. + uint32 perpLineEndCapsEnable : 1; ///< Whether to enable dynamic state perpLineEndCapsEnable. + uint32 rasterizerDiscardEnable : 1; ///< Whether to enable dynamic state rasterizerDiscardEnable. + uint32 dualSourceBlendEnable : 1; ///< Whether to enable dynamic state dualSourceBlendEnable + uint32 vertexBufferCount : 1; ///< Whether to enable dynamic state vertexBufferCount. + uint32 reserved1 : 1; ///< Reserved. + uint32 reserved : 20; ///< Reserved for future use. + }; + uint32 u32All; + } enable; +}; + +/// Specifies info on how graphics shaders should use resources. +struct DynamicGraphicsShaderInfos +{ + union + { + // VS/HS/DS/GS or TS/MS are active + struct + { + DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information. + DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information. + DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information.
+ DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information. + }; + struct + { + DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information. + DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information. + }; + }; + + DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information. + + union + { + struct + { + uint8 vs : 1; // If set, there is dynamic VS shader info. + uint8 hs : 1; // If set, there is dynamic HS shader info. + uint8 ds : 1; // If set, there is dynamic DS shader info. + uint8 gs : 1; // If set, there is dynamic GS shader info. + uint8 ps : 1; // If set, there is dynamic PS shader info. + uint8 ts : 1; // If set, there is dynamic TS shader info. + uint8 ms : 1; // If set, there is dynamic MS shader info. + uint8 reserved : 1; // Reserved. + }; + uint8 u8All; + } enable; +}; + +/// Specifies parameters for binding a pipeline. +/// @see ICmdBuffer::CmdBindPipeline +struct PipelineBindParams +{ + PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics). + const IPipeline* pPipeline; ///< New pipeline to be bound. Can be null in order to unbind a previously + /// bound pipeline without binding a new one. + uint64 apiPsoHash; ///< 64-bit identifier provided by client driver based on the Pipeline State + /// Object. There exists a many-to-one correlation for ApiPsoHash to + /// internalPipelineHash to map the two. + union + { + DynamicComputeShaderInfo cs; ///< Dynamic Compute shader information. + + struct + { + DynamicGraphicsShaderInfos gfxShaderInfo; + DynamicGraphicsState gfxDynState; + }; + }; +}; + +/// Specifies per-MRT color target view and current image state. Used as input to ICmdBuffer::CmdBindTargets(). +struct ColorTargetBindInfo +{ + const IColorTargetView* pColorTargetView; ///< Color target view to bind. 
+ ImageLayout imageLayout; ///< Specifies the current image layout based on bitmasks of currently + /// allowed operations and engines that may perform those operations. + /// At minimum, the LayoutColorTarget usage flag and + /// LayoutUniversalEngine engine flag must be set. +}; + +/// Specifies depth/stencil view and current image state of the depth and stencil planes. Used as input to +/// ICmdBuffer::CmdBindTargets(). +struct DepthStencilBindInfo +{ + const IDepthStencilView* pDepthStencilView; ///< Depth/stencil target view to bind. + ImageLayout depthLayout; ///< Specifies the current image layout of the depth plane based on + /// bitmasks of currently allowed operations and engines that may + /// perform those operations. At minimum, the + /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine + /// engine flag must be set. Ignored if the specified view does not + /// have a depth plane. + ImageLayout stencilLayout; ///< Specifies the current image layout of the stencil plane based on + /// bitmasks of currently allowed operations and engines that may + /// perform those operations. At minimum, the + /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine + /// engine flag must be set. Ignored if the specified view does not + /// have a stencil plane. +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 +/// Represents a GPU memory or image transition as part of a barrier. +/// +/// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the +/// specified set of destination caches. The source and destination designation is relative to the barrier itself +/// and does not indicate whether a particular cache is a read or write cache. +/// +/// Typically a transition flushes written data from the source caches into the destination caches and thus the source +/// cache mask typically only contains write caches. 
However, the client is encouraged to include flags for any prior +/// read-only cache accesses as PAL may be able to optimize its cache operations. +/// +/// If both cache masks are zero the client is indicating that no cache coherency operations are required but PAL +/// may still issue cache operations for internal reasons. +/// +/// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt. +/// +/// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches. +struct BarrierTransition +{ + + uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose + /// results need to be visible for subsequent operations. Flags for prior read operations + /// may be included as well and may be used for internal optimizations. + uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read + /// and/or write data flushed from the caches indicated by the srcCacheMask. + + struct + { + const IImage* pImage; ///< If non-null, indicates this transition only applies to the specified image. + /// The remaining members of this structure are ignored if this member is null. + SubresRange subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref + /// LayoutUninitializedTarget this range must cover all subresources of pImage + /// unless the perSubresInit image create flag was specified. + ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and + /// engines up to this point. These masks imply the previous compression state. No + /// usage flags should ever be set in oldLayout.usages that correspond to usages + /// that are not supported by the engine that is performing the transition. The + /// queue type performing the transition must be set in oldLayout.engines.
+ ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and + /// engines after this point. These masks imply the upcoming compression state. + /// point. This usage mask implies the upcoming compressions state. A difference + /// between oldLayoutUsageMask and newLayoutUsageMask may result in a + /// decompression. + + /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a + /// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum + /// valid position (not quite to the bottom/right border of the pixel). + /// Specifies a custom sample pattern over a 2x2 pixel quad. Can be left null for non-MSAA images or when + /// a valid MsaaQuadSamplePattern is bound prior to the CmdBarrier call. + const MsaaQuadSamplePattern* pQuadSamplePattern; + + } imageInfo; ///< Image-specific transition information. +}; + +/// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier(). +/// +/// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate +/// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how +/// the GPU will use the image. +/// +/// This structure directly specifies how #1 is performed. #2 and #3 are managed by the list of @ref BarrierTransition +/// structures passed in pTransitions. +struct BarrierInfo +{ + /// Determine at what point the GPU should stall until all specified waits and transitions have completed. If the + /// specified wait point is unavailable, PAL will wait at the closest available earlier point. + HwPipePoint waitPoint; + + uint32 pipePointWaitCount; ///< Number of entries in pPipePoints. + const HwPipePoint* pPipePoints; ///< The barrier will stall until the hardware pipeline has cleared + /// up to each point specified in this array. 
One entry in this + /// array is typically enough, but CS and GFX operate in parallel + /// at certain stages. + + uint32 gpuEventWaitCount; ///< Number of entries in ppGpuEvents. + const IGpuEvent** ppGpuEvents; ///< The barrier will stall until each GPU event in this array is + /// in the set state. + + uint32 rangeCheckedTargetWaitCount; ///< Number of entries in ppTargets. + const IImage** ppTargets; ///< The barrier will stall until all previous rendering with any + /// color or depth/stencil image in this list bound as a target + /// has completed. If one of the targets is a nullptr it will + /// perform a full range sync. + + uint32 transitionCount; ///< Number of entries in pTransitions. + const BarrierTransition* pTransitions; ///< List of image/memory transitions to process. See + /// @ref BarrierTransition. The same subresource should never + /// be specified more than once in the list of transitions. + /// PAL assumes that all specified subresources are unique. + + uint32 globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined + /// (bitwise logical union) with the @ref srcCacheMask field belonging to every + /// element in @ref pTransitions. If this is zero or if there are no transitions, + /// then no global cache flags are applied during every transition. + + uint32 globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined + /// (bitwise logical union) with the @ref dstCacheMask field belonging to every + /// element in @ref pTransitions. If this is zero or if there are no transitions, + /// then no global cache flags are applied during every transition. + + uint32 reason; ///< The reason that the barrier was invoked. +}; +#endif + +/// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory +/// object that does not contain valid IImage data. 
PAL may assume image data is not present and skip certain +/// cache operations. +/// +/// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of +/// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier. +/// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution +/// dependencies. +/// +/// PAL specifies these operations using pairs of access scope bitmasks of @ref CacheCoherencyUsageFlags values. +/// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made +/// available (i.e., written back from local caches to the LLC). The destination mask (named dstAccessMask or +/// dstGlobalAccessMask) describes which upcoming read/write operations that need visibility (i.e., invalidate +/// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed. +/// +/// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions +/// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero. +/// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations. +/// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations. +/// +/// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations +/// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional +/// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations. +/// If they are provided PAL may detect cases where future read operations use the same caches as the prior read +/// operations and thus can skip the usual visibility operations. 
+/// +/// Note that, +/// 1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags +/// were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise. +/// 2. One @ref MemBarrier or @ref ImgBarrier object can only be applied to a single resource; otherwise PAL's internal +/// optimization may be incorrect. Don't OR multiple resource transitions' stage or access mask into one +/// @ref MemBarrier or @ref ImgBarrier when making a PAL barrier call. However, you are allowed to OR multiple resource +/// transitions' stage or access mask into the global transition mask. +/// +/// This struct is used by @ref AcquireReleaseInfo. +struct MemBarrier +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + union + { + struct + { + uint32 globallyAvailable : 1; ///< Normally, data made available is in the GPU LLC. When this bit is + /// set, available means in memory, available to all clients in the + /// system. This is useful for rare cases like mid command buffer + /// synchronization with the CPU or another external device. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit uint. + } flags; ///< Flags controlling the memory barrier. +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 + GpuMemSubAllocInfo memory; ///< Specifies a portion of an IGpuMemory object this memory barrier affects. + /// Zero values of memory structure indicate full range barrier operations. +#endif + + uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope that must be confirmed complete as part of a release. Must be + /// 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope of operations to be performed after the acquire. Must be + /// 0 when passed in to CmdRelease or CmdReleaseEvent.
+ + uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// availability operation, as defined in the struct comment header. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// visibility operation, as defined in the struct comment header. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. +}; + +/// Specifies required layout transition, execution dependencies, *availability*, and/or *visibility* operations on a +/// subresource of an IImage object. +/// +/// See the header comment on @ref MemBarrier for a full description of the execution dependencies, availability and +/// visibility operations, including what rules the clients must follow when filling out srcAccessMask and +/// dstAccessMask. +/// +/// This struct is used by @ref AcquireReleaseInfo. +struct ImgBarrier +{ + const IImage* pImage; ///< Relevant image resource for this barrier. + SubresRange subresRange; ///< Selects a range of planes/slices/mips the barrier affects. If newLayout + /// includes @ref LayoutUninitializedTarget this range must cover all subresources of + /// pImage unless the perSubresInit image create flag was specified. + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 + Box box; ///< Restricts the barrier to a sub-section of each subresource. The Z offset/extent + /// must be 0 for 1D/2D images, and the Y offset/extent must be 0 for 1D images. A + /// box with zero extents will be ignored, and the barrier will affect the entire + /// subresource range. This box may be used to restrict ranges of cache flushes or + /// invalidations, or may restrict what data is decompressed. However, the + /// implementation may not be able to optimize particular cases and may expand the + /// barrier to cover the entire subresource range. 
Specifying a subregion with a box + /// when newLayout includes @ref LayoutUninitializedTarget is not supported. +#endif + + uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope that must be confirmed complete as part of a release. Must be + /// 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization + /// scope of operations to be performed after the acquire. Must be + /// 0 when passed in to CmdRelease or CmdReleaseEvent. + + uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// availability operation, as defined in the struct comment header. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the + /// visibility operation, as defined in the struct comment header. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. + + ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and + /// engines up to this point. These masks imply the previous compression state. No + /// usage flags should ever be set in oldLayout.usages that correspond to usages + /// that are not supported by the engine that is performing the transition. The + /// engine type performing the transition must be set in oldLayout.engines. Can set + /// both oldLayout and newLayout to zero value for no layout transition case. + ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and + /// engines after this point. These masks imply the upcoming compression state. + /// point. A difference between oldLayoutUsageMask and newLayoutUsageMask may result + /// in a decompression. 
PAL's implementation will ensure the results of any layout + /// operations are consistent with the requested availability and visibility + /// operations. Can set both oldLayout and newLayout to zero value for no layout + /// transition case. + + /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a grid + /// where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum valid + /// position (not quite to the bottom/right border of the pixel). + /// Can be left null for non-MSAA images or when a valid IMsaaState is bound prior to the barrier + /// call. + const MsaaQuadSamplePattern* pQuadSamplePattern; +}; + +/// Input structure to CmdRelease(), CmdReleaseEvent(), CmdAcquire(), CmdAcquireEvent(), and CmdReleaseThenAcquire(). +/// It describes the execution dependencies, memory dependencies, and image layout transitions that must be resolved. +/// +/// Global transition doesn't have buffer or image info so it will assume the worst case and the barrier operations may +/// not be optimal (e.g. metadata may be misaligned and need to issue LLC flush/invalidation). It's suggested that if +/// clients know the buffer or image info, try setting up the barrier call with the full buffer or image transition +/// info (including stageMask and accessMask) instead of global transition for optimal performance. +/// +/// Clients may OR multiple MemBarrier into a single MemBarrier on full range barrier cases for simplicity and to save CPU +/// overhead. To allow more optimization chances (e.g. skip unnecessary stalls for read only transitions) in PAL, +/// it's suggested to split the single grouped MemBarrier into two separate grouped MemBarriers: one is read only +/// MemBarrier and the other is writeable MemBarrier; both are then passed together to the barrier call.
+struct AcquireReleaseInfo +{ + uint32 srcGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global + /// synchronization scope that must be confirmed complete as part of a + /// release. Must be 0 when passed in to CmdAcquire or CmdAcquireEvent. + uint32 dstGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global + /// synchronization scope of operations to be performed after the + /// acquire. Must be 0 when passed in to CmdRelease or CmdReleaseEvent. + + uint32 srcGlobalAccessMask; ///< *Access scope* for the global availability operation. Serves the + /// same purpose as srcAccessMask in @ref MemBarrier, but will cause + /// all relevant caches to be flushed without range checking. + /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. + uint32 dstGlobalAccessMask; ///< *Access scope* for the global visibility operation. Serves the + /// same purpose as dstAccessMask in @ref MemBarrier, but will cause + /// all relevant caches to be invalidated without range checking. + /// This must be 0 when passed to CmdRelease or CmdReleaseEvent. + + uint32 memoryBarrierCount; ///< Number of entries in pMemoryBarriers. + const MemBarrier* pMemoryBarriers; ///< Describes memory dependencies specific to a range of a particular + /// IGpuMemory object. + + uint32 imageBarrierCount; ///< Number of entries in pImageBarriers. + const ImgBarrier* pImageBarriers; ///< Describes memory dependencies and image layout transitions required + /// for a subresource range of a particular IImage object. + uint32 reason; ///< The reason that the barrier was invoked. + /// See @ref Developer::BarrierReason for internal reason codes, though + /// clients may define their own as well. +}; + +/// Specifies barrier type, global (potentially mixed cases of buffer and image), buffer or image. +enum class BarrierType : uint32 +{ + Global, + Buffer, + Image +}; + +/// Number of all HW opaque release token types.
+constexpr uint32 NumReleaseTokenTypes = 4; + +/// Synchronization token structure for CmdRelease() and CmdAcquire(). +/// +/// Clients should pass the ReleaseToken returned by CmdRelease() to CmdAcquire() directly without changing the value. +/// If a resource with given subresource range has multiple ReleaseToken, all related ReleaseToken should be passed to +/// CmdAcquire(). +/// +/// Passing ReleaseToken { .fenceValue = N; .type = T } into CmdAcquire() will wait for all prior releases with +/// .fenceValue <= N for .type == T. A resource with a large number of subresources may introduce lots of ReleaseToken +/// potentially (e.g. released per subresource). No need to track all ReleaseToken for each resource since clients +/// can optimize this based on the fact that release type and fenceValue are exposed for each ReleaseToken: define a +/// ReleaseToken array with size @ref NumReleaseTokenTypes, only track ReleaseToken with the largest fenceValue per +/// each release type; and then passing the tracked array ReleaseToken values to CmdAcquire() is enough. +union ReleaseToken +{ + struct + { + uint32 fenceValue : 24; ///< Release fence value per token type. + uint32 type : 8; ///< Release token type (HW opaque). Increase the number of bits if + /// it can't hold all types; see @ref NumReleaseTokenTypes for details. + }; + + uint32 u32All; ///< Token fields packed as a 32-bit uint. +}; + +/// Specifies parameters for a copy from one range of a source GPU memory allocation to a range of the same size in a +/// destination GPU memory allocation. Used as an input to ICmdBuffer::CmdCopyMemory(). +struct MemoryCopyRegion +{ + gpusize srcOffset; ///< Offset in bytes into the source GPU memory allocation to copy data from. + gpusize dstOffset; ///< Offset in bytes into the destination GPU memory allocation to copy data to. + gpusize copySize; ///< Amount of data to copy in bytes.
+}; + +/// Specifies parameters for an image copy from one region in a source image subresource to a region of the same size in +/// a destination image subresource. Used as input to ICmdBuffer::CmdCopyImage(). +/// If the region describes a copy between a 2D and a 3D image, extent.depth and numSlices must be equal and may be +/// larger than 1. +struct ImageCopyRegion +{ + SubresId srcSubres; ///< Selects the source subresource. + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + SubresId dstSubres; ///< Selects the destination subresource. + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination + /// subresource. + Extent3d extent; ///< Size of the copy region in pixels. + uint32 numSlices; ///< Number of slices the copy will span. +}; + +/// Specifies parameters for a copy between an image and a GPU memory allocation. The same structure is used regardless +/// of direction, an input for both ICmdBuffer::CmdCopyImageToMemory() and ICmdBuffer::CmdCopyMemoryToImage(). +struct MemoryImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in pixels. + uint32 numSlices; ///< Number of slices the copy will span. + gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + SwizzledFormat swizzledFormat;///< If not Undefined, reinterpret both subresources using this format and swizzle. +}; + +/// Specifies parameters for a copy between a PRT and a GPU memory allocation. 
The same structure is used regardless +/// of direction, an input for both ICmdBuffer::CmdCopyTiledImageToMemory() and ICmdBuffer::CmdCopyMemoryToTiledImage(). +struct MemoryTiledImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource; must not be a part of the packed mip tail. + Offset3d imageOffset; ///< Tile offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in tiles. + uint32 numSlices; ///< Number of slices the copy will span. + gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// Used by copy operations to temporarily interpret a range of GPU memory as a "typed buffer". A typed buffer is +/// essentially a linear image with a caller-defined row pitch and depth pitch. Typed buffer copies do not require +/// the GPU memory objects to be created with the "typedBuffer" flag. +struct TypedBufferInfo +{ + SwizzledFormat swizzledFormat; ///< The pixels in this buffer have this format. + gpusize offset; ///< Offset in bytes to the start of the copy region in the buffer's GPU memory + /// allocation. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// Specifies parameters for a copy from one region of a typed buffer to a region of the same size in a destination +/// typed buffer. Used as an input to ICmdBuffer::CmdCopyTypedBuffer(). +struct TypedBufferCopyRegion +{ + TypedBufferInfo srcBuffer; ///< How to interpret the source GPU memory allocation as a typed buffer. 
+ TypedBufferInfo dstBuffer; ///< How to interpret the destination GPU memory allocation as a typed buffer. + Extent3d extent; ///< Size of the copy region in pixels. +}; + +/// Specifies parameters for a scaled copy between an image and a typed buffer. The same structure is used regardless +/// of direction, an input for ICmdBuffer::CmdScaledCopyTypedBufferToImage(). +struct TypedBufferImageScaledCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset2d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent2d imageExtent; ///< Size of the image region in pixels. + TypedBufferInfo bufferInfo; ///< How to interpret the GPU memory allocation as a typed buffer. + Extent2d bufferExtent; ///< Size of the typed buffer region in pixels. + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. +}; + +/// Specifies parameters for a scaled image copy from one region in a source image subresource to a region in the +/// destination image subresource. Used as an input to ICmdBuffer::CmdScaledCopyImage. +struct ImageScaledCopyRegion +{ + SubresId srcSubres; ///< Selects the source subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 + uint32 srcSlices; ///< Number of source image slices to read across. +#endif + union + { + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + Offset3dFloat srcOffsetFloat; ///< Alternative representation in floating point. + }; + union + { + SignedExtent3d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates + /// a copy in the reverse direction. + Extent3dFloat srcExtentFloat; ///< Alternative representation in floating point. + }; + + SubresId dstSubres; ///< Selects the destination subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 + uint32 dstSlices; ///< Number of destination image slices to write. 
+#endif + union + { + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. + Offset3dFloat dstOffsetFloat; ///< Alternative representation in floating point. + }; + union + { + SignedExtent3d dstExtent; ///< Signed size of the destination region in pixels. A negative size + /// indicates a copy in the reverse direction. + Extent3dFloat dstExtentFloat; ///< Alternative representation in floating point. + }; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 887 + uint32 numSlices; ///< Number of slices the copy will span. +#endif + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. + /// The specified format needs to have been included in the "pViewFormats" list + /// specified at image-creation time, otherwise the result might be incorrect. +}; + +/// Specifies parameters for a color-space-conversion copy from one region in a source image subresource to a region in +/// a destination image subresource. Used as an input to ICmdBuffer::CmdColorSpaceConversionCopy. +struct ColorSpaceConversionRegion +{ + Offset2d srcOffset; ///< Offset to the start of the chosen region in the source subresource(s). + SignedExtent2d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates a copy + /// in the reverse direction. + Offset2d dstOffset; ///< Offset to the start of the chosen region in the destination subresource(s). + SignedExtent2d dstExtent; ///< Signed size of the destination region in pixels. A negative size indicates a + /// copy in the reverse direction. + SubresId rgbSubres; ///< Selects the first subresource of the RGB image where the copy will begin. This + /// can either be the source or destination of the copy, depending on whether the + /// copy is performing an RGB->YUV or YUV->RGB conversion. + uint32 yuvStartSlice; ///< Array slice of the YUV image where the copy will begin. 
All planes of planar + /// YUV images will be implicitly involved in the copy. This can either be the + /// source or destination of the copy, depending on whether the copy is performing + /// an RGB->YUV or YUV->RGB conversion. + uint32 sliceCount; ///< Number of slices the copy will span. +}; + +/// Specifies the color-space-conversion table used when converting between YUV and RGB Image formats. Used as an input +/// to ICmdBuffer::CmdColorSpaceConversionCopy. +struct ColorSpaceConversionTable +{ + float table[3][4]; ///< Values forming the conversion table matrix, which has three rows and four columns. For RGB + /// to YUV conversions, the conversion shader uses the following expressions to evaluate the + /// YUV color: + /// Y = dot( [R G B 1], [row #0] ) + /// U = dot( [R G B 1], [row #1] ) + /// V = dot( [R G B 1], [row #2] ) + /// For YUV to RGB conversions, the conversion shader uses the following expressions to + /// evaluate the RGB color: + /// R = dot( [Y U V 1], [row #0] ) + /// G = dot( [Y U V 1], [row #1] ) + /// B = dot( [Y U V 1], [row #2] ) + /// A fourth row is not needed because alpha is copied directly between the RGB and YUV colors. +}; + +/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy +/// to perform a YUV to RGB color space conversion. Represents the BT.601 standard (standard-definition TV). +extern const ColorSpaceConversionTable DefaultCscTableYuvToRgb; + +/// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy +/// to perform a RGB to YUV color space conversion. Represents the BT.601 standard (standard-definition TV). +extern const ColorSpaceConversionTable DefaultCscTableRgbToYuv; + +/// Specifies flags controlling GPU copy behavior. Format related flags are ignored by DMA queues. +enum CopyControlFlags : uint32 +{ + CopyFormatConversion = 0x1, ///< Requests that the copy convert between two compatible formats.
This is ignored + /// unless both formats support @ref FormatFeatureFormatConversion. + CopyRawSwizzle = 0x2, ///< If possible, raw copies will swizzle from the source channel format into the + /// destination channel format (e.g., RGBA to BGRA). + CopyEnableScissorTest = 0x4, ///< If set, do scissor test using the specified scissor rectangle. + CopyControlAllFlags = 0x7 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single +/// sample destination image. Used as an input to ICmdBuffer::CmdResolveImage(). +struct ImageResolveRegion +{ + uint32 srcPlane; ///< The source color, depth, or stencil plane. + uint32 srcSlice; ///< Selects the source starting slice + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. + uint32 dstPlane; ///< The destination color, depth, or stencil plane. + uint32 dstMipLevel; ///< Selects destination mip level. + uint32 dstSlice; ///< Selects the destination starting slice + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. + /// The format must match both subresource's native formats. + + const MsaaQuadSamplePattern* pQuadSamplePattern; ///< Specifies sample pattern for MSAA depth image. It must be a + /// valid pointer if image was created with sampleLocsAlwaysKnown + /// flag set. +}; + +/// A list of the types of PRT+ resolves that can be performed. +enum class PrtPlusResolveType : uint32 +{ + Decode = 0x0, ///< Translate from AMD HW format to format of destination image. 
+ Encode = 0x1, ///< Translate from source image to AMD HW format + Count = 0x2, +}; + +/// Input structure to the CmdResolvePrtPlusImage function +struct PrtPlusImageResolveRegion +{ + Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 + SubresId srcSubresId; ///< Selects the source subresource +#else + uint32 srcMipLevel; ///< Selects source mip level + uint32 srcSlice; ///< Selects the source starting slice +#endif + + Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938 + SubresId dstSubresId; ///< Selects the destination subresource +#else + uint32 dstMipLevel; ///< Selects destination mip level + uint32 dstSlice; ///< Selects the destination starting slice +#endif + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Input structure to ICmdBuffer::CmdResolvePrtPlusImageToBuffer() +struct PrtPlusImageToBufferResolveRegion +{ + SubresId srcSubresId; ///< Selects the source subresource + + gpusize dstOffset; ///< Offset into destination subresource + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Input structure to ICmdBuffer::CmdResolvePrtPlusBufferToImage() +struct PrtPlusBufferToImageResolveRegion +{ + gpusize srcOffset; ///< Offset into source subresource + + SubresId dstSubresId; ///< Selects the destination subresource + + Extent3d extent; ///< Size of the resolve region in pixels. + uint32 numSlices; ///< Number of slices to be resolved +}; + +/// Specifies how the individual samples of an MSAA source image are combined into the resolve result +/// (average, minimum, or maximum). Used as an input to ICmdBuffer::CmdResolveImage().
+enum class ResolveMode : uint32 +{ + Average = 0x0, ///< Resolve result is an average of all the individual samples + Minimum = 0x1, ///< Resolve result is the minimum value of all individual samples + Maximum = 0x2, ///< Resolve result is the maximum value of all individual samples + Count = 0x4, +}; + +/// Specifies width of immediate data to be written out. +enum class ImmediateDataWidth : uint32 +{ + ImmediateData32Bit = 0x0, + ImmediateData64Bit = 0x1, + + Count = 0x2, +}; + +/// Specifies flags controlling GPU query behavior. +union QueryControlFlags +{ + struct + { + /// Controls accuracy of query data collection. Available only for occlusion queries. If set, occlusion query + /// is guaranteed to return imprecise non-zero value if any samples pass the depth and stencil test. Using + /// imprecise occlusion query results could improve rendering performance while an occlusion query is active. + uint32 impreciseData : 1; + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies layout of GPU memory used as an input to CmdDrawIndirectMulti. +struct DrawIndirectArgs +{ + uint32 vertexCount; ///< Number of vertices to draw. + uint32 instanceCount; ///< Number of instances to draw. + uint32 firstVertex; ///< Starting index value for the draw. Indices passed to the vertex shader will range from + /// firstVertex to firstVertex + vertexCount - 1. + uint32 firstInstance; ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from + /// firstInstance to firstInstance + instanceCount - 1. +}; + +/// Specifies layout of GPU memory used as an input to CmdDrawIndexedIndirectMulti. +/// +/// Indices passed to the vertex shader will be: +/// +/// + IndexBuffer[firstIndex] + vertexOffset, +/// + IndexBuffer[firstIndex + 1] + vertexOffset, +/// + ...
+/// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset +struct DrawIndexedIndirectArgs +{ + uint32 indexCount; ///< Number of indices to draw. + uint32 instanceCount; ///< Number of instances to draw. + uint32 firstIndex; ///< Starting index buffer slot for the draw. + int32 vertexOffset; ///< Offset added to the index fetched from the index buffer before it is passed to the + /// vertex shader. + uint32 firstInstance; ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from + /// firstInstance to firstInstance + instanceCount - 1. +}; + +/// Specifies layout of GPU memory used as an input to CmdDispatchIndirect. +struct DispatchIndirectArgs +{ + uint32 x; ///< Threadgroups to dispatch in the X dimension. + uint32 y; ///< Threadgroups to dispatch in the Y dimension. + uint32 z; ///< Threadgroups to dispatch in the Z dimension. +}; + +/// Specifies layout of GPU memory used as an input to CmdDispatchMeshIndirect. +using DispatchMeshIndirectArgs = DispatchIndirectArgs; + +/// Specifies the CPU virtual address of an array and the stride in bytes between array elements. +struct CpuVirtAddrAndStride +{ + const void* pCpuVirtAddr; ///< CPU virtual address of the 0th array element. + struct + { + uint64 stride : 32; ///< Distance between array elements in bytes. + uint64 _pad : 32; ///< Padding for structure alignment. + }; +}; + +/// Specifies the GPU virtual address of an array and the stride in bytes between array elements. +struct GpuVirtAddrAndStride +{ + gpusize gpuVirtAddr; ///< GPU virtual address of the 0th array element. + struct + { + uint64 stride : 32; ///< Distance between array elements in bytes. + uint64 _pad : 32; ///< Padding for structure alignment. + }; +}; + +/// Flags to describe a dispatch +union DispatchInfoFlags +{ + struct + { + uint32 devDriverOverlay : 1; ///< Flag indicates this dispatch draws the DevDriver overlay + uint32 reserved : 31; ///< Reserved for future use.
+ }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies the different stages at which a combiner can choose between different shading rates. +enum class VrsCombinerStage : uint32 +{ + ProvokingVertex, ///< Chooses between the shading rate specified by the VrsRateParams struct and the shader + /// rate provided by the provoking vertex. + Primitive, ///< Chooses between previous combiner stage and the shader rate associated with the primitive + Image, ///< Chooses between previous combiner stage and the shader rate associated with an image + PsIterSamples, ///< Chooses between previous combiner stage and the PS_ITER_SAMPLES rate. + Max +}; + +/// Specifies the different possible shading rates. Not all are supported on all HW; see the supportedVrsRates +/// entry in the gfxipProperties structure. +enum class VrsShadingRate : uint32 +{ + _16xSsaa = 0x0, + _8xSsaa = 0x1, + _4xSsaa = 0x2, + _2xSsaa = 0x3, + _1x1 = 0x4, + _1x2 = 0x5, + _2x1 = 0x6, + _2x2 = 0x7, + Count +}; + +/// Indices into the centerOffset array member of the VrsCenterState structure. +enum class VrsCenterRates : uint32 +{ + _1x1 = 0x0, + _1x2 = 0x1, + _2x1 = 0x2, + _2x2 = 0x3, + Max = 0x4, +}; + +/// Specifies the different ways in which a combiner can choose between two different shading rate inputs. +enum class VrsCombiner : uint32 +{ + Passthrough = 0, ///< Keep previous shading rate. + Override = 1, ///< C.xy = B.xy + Min = 2, ///< min(A.xy, B.xy) + Max = 3, ///< max(A.xy, B.xy) + Sum = 4, ///< min(maxRate, A.xy + B.xy) + Count +}; + +/// Structure for defining parameters to the CmdSetPerDrawVrsRate function. +struct VrsRateParams +{ + /// The shading rate to be bound to the render state. + VrsShadingRate shadingRate; + + /// The state of all the combiners, one entry per @ref VrsCombinerStage. + VrsCombiner combinerState[static_cast<uint32>(VrsCombinerStage::Max)]; + + union + { + struct + { + uint32 exposeVrsPixelsMask : 1; ///< Controls how the shader input mask of a coarse pixel is generated.
+ /// 0 : Bitwise OR of all fine pixel`s mask + /// 1 : Pack fine pixels` coverage mask into iMask. Layout based + /// on VRS rate + uint32 reserved : 31; + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags controlling VRS rate parameters +}; + +/// Structure for defininig paramters to the CmdSetVrsCenterState function. +struct VrsCenterState +{ + /// The offset is scaled by the coarse pixel size and then added to the center location + /// Center offsets are specified as two 4 bits signed integer value representing a location on a 16x16 grid gd. + /// The offset is scaled by the coarse pixel size and then added to the center location + /// 1x1, 1x2, 2x1 and 2x2 shading rates can all have their own unique offsets + Offset2d centerOffset[static_cast(VrsCenterRates::Max)]; + + union + { + struct + { + uint32 overrideCenterSsaa : 1; ///< Override center interpolants to be evaluated at the sample + /// position. + uint32 overrideCentroidSsaa : 1; ///< Override centroid interpolants to be evaluated at the centroid + /// of each sample group being iterated (simply the sample position + /// in the typical case of 1-sample groups). + uint32 alwaysComputeCentroid : 1; ///< Don't assume the centroid of a fully covered shading region is + /// the center. It is possible all samples could be lit but the + /// center is not lit for certain combinations of centerOffset[] + /// values and programmable sample positions + uint32 reserved : 29; ///< Reserved for future HW + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags controlling VRS center state +}; + +/// @internal +/// Function pointer type definition for setting pipeline-accessible user data entries to the specified values. Each +/// command buffer object has one such callback per pipeline bind point, so the bind point is implicit. +/// +/// @see ICmdBuffer::CmdSetUserData(). 
+using CmdSetUserDataFunc = void (PAL_STDCALL *)(
+    ICmdBuffer*   pCmdBuffer,
+    uint32        firstEntry,
+    uint32        entryCount,
+    const uint32* pEntryValues);
+
+/// @internal Callback signature used to issue a non-indexed draw.
+///
+/// @see ICmdBuffer::CmdDraw().
+using CmdDrawFunc = void (PAL_STDCALL *)(
+    ICmdBuffer* pCmdBuffer,
+    uint32      firstVertex,
+    uint32      vertexCount,
+    uint32      firstInstance,
+    uint32      instanceCount,
+    uint32      drawId);
+
+/// @internal Callback signature used to issue an auto ("opaque") draw.
+///
+/// @see ICmdBuffer::CmdDrawOpaque().
+using CmdDrawOpaqueFunc = void (PAL_STDCALL *)(
+    ICmdBuffer* pCmdBuffer,
+    gpusize     streamOutFilledSizeVa,
+    uint32      streamOutOffset,
+    uint32      stride,
+    uint32      firstInstance,
+    uint32      instanceCount);
+
+/// @internal Callback signature used to issue an indexed draw.
+///
+/// @see ICmdBuffer::CmdDrawIndexed().
+using CmdDrawIndexedFunc = void (PAL_STDCALL *)(
+    ICmdBuffer* pCmdBuffer,
+    uint32      firstIndex,
+    uint32      indexCount,
+    int32       vertexOffset,
+    uint32      firstInstance,
+    uint32      instanceCount,
+    uint32      drawId);
+
+/// @internal Callback signature used to issue (multi) indirect draws.
+///
+/// @see ICmdBuffer::CmdDrawIndirectMulti().
+using CmdDrawIndirectMultiFunc = void (PAL_STDCALL *)(
+    ICmdBuffer*          pCmdBuffer,
+    GpuVirtAddrAndStride gpuVirtAddrAndStride,
+    uint32               maximumCount,
+    gpusize              countGpuAddr);
+
+/// @internal Callback signature used to issue (multi) indexed, indirect draws.
+///
+/// @see ICmdBuffer::CmdDrawIndexedIndirectMulti().
+using CmdDrawIndexedIndirectMultiFunc = void (PAL_STDCALL *)(
+    ICmdBuffer*          pCmdBuffer,
+    GpuVirtAddrAndStride gpuVirtAddrAndStride,
+    uint32               maximumCount,
+    gpusize              countGpuAddr);
+
+/// @internal Function pointer type definition for issuing direct dispatches.
+///
+/// @see ICmdBuffer::CmdDispatch().
+typedef void (PAL_STDCALL *CmdDispatchFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims size, + DispatchInfoFlags infoFlags); + +/// @internal Function pointer type definition for issuing indirect dispatches. +/// +/// @see ICmdBuffer::CmdDispatchIndirect(). +typedef void (PAL_STDCALL *CmdDispatchIndirectFunc)( + ICmdBuffer* pCmdBuffer, + gpusize gpuVirtAddr); +/// @internal Function pointer type definition for issuing direct dispatches with threadgroup offsets. +/// +/// @see ICmdBuffer::CmdDispatchOffset(). +typedef void (PAL_STDCALL *CmdDispatchOffsetFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims offset, + DispatchDims launchSize, + DispatchDims logicalSize); + +/// @internal Function pointer type definition for issuing direct mesh dispatches. +/// +/// @see ICmdBuffer::CmdDispatchMesh(). +typedef void (PAL_STDCALL *CmdDispatchMeshFunc)( + ICmdBuffer* pCmdBuffer, + DispatchDims size); + +/// @internal Function pointer type definition for issuing indirect mesh dispatches. +/// +/// @see ICmdBuffer::CmdDispatchMeshIndirectMulti(). +typedef void (PAL_STDCALL *CmdDispatchMeshIndirectMultiFunc)( + ICmdBuffer* pCmdBuffer, + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr); + +/// This struct provides the parameters of all the supported features for kernel dispatch +struct DispatchAqlParams +{ + const hsa_kernel_dispatch_packet_t* pAqlPacket; ///< Pointer to AQL packet contains the essential + /// information (size of workgroup, grid, data + /// segments, handle of kernel code object, kernel + /// arguments) of the kernel to be dispatched. 
+ gpusize scratchAddr; ///< GPU VM scratch buffer address + uint32 scratchSize; ///< Scratch buffer size + uint32 scratchOffset; ///< Scratch buffer offset from the base for generic + /// address space +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 920 + const llvm::amdhsa::kernel_descriptor_t* pCpuAqlCode; ///< AMD kernel descriptor on CPU for PM4 emulation +#else + const amd_kernel_code_t* pCpuAqlCode; ///< AMD kernel code object on CPU for PM4 emulation +#endif + gpusize hsaQueueVa; ///< GPU VM address where amd_queue_t is allocated + uint32 wavesPerSh; ///< Waves Per Shade Array + bool useAtc; ///< Indicates whether ATC bit in registers should be set + /// ATC bit is used for indicating if the address + /// is GPUVM(=0) or SVM(=1). Valid only in gfx6 and older + uint64 kernargSegmentSize; ///< The size of kerarg segment that holds the + /// values of the arguments to the kernels + uint32 workitemPrivateSegmentSize; ///< The amount of fixed private address + /// space memory required for a workitem. + uint32 aqlPacketIndex; ///< AQL ID in QueueCreateInfo.aqlPacketList for debugger. + /// Note: debugger support enabled for Navi3x+ + +}; + +/// @internal Function pointer type definition for issuing AQL dispatches. +/// +/// @see ICmdBuffer::CmdDispatchAql(). +typedef void (PAL_STDCALL *CmdDispatchAqlFunc)( + ICmdBuffer* pCmdBuffer, + const DispatchAqlParams& dispatchInfo); + +/// Specifies input assembler state for draws. +/// @see ICmdBuffer::CmdSetInputAssemblyState +struct InputAssemblyStateParams +{ + PrimitiveTopology topology; ///< Defines how vertices should be interpretted and rendered by + /// the graphics pipeline. + uint8 patchControlPoints; ///< # of control points per patch. [0-32] valid. Should be set to + /// 0 by clients if topology is not PrimitiveTopology::Patch. 
+ bool primitiveRestartEnable; ///< Enables the index specified by primitiveRestartIndex to _cut_ + /// a primitive (i.e., triangle strip) and begin a new primitive + /// with the next index. + bool primitiveRestartMatchAllBits; ///< Specifies which bits from primitiveRestartIndex to use. + /// false - only check relevant bits based on index type + /// true - check all 32 bits irrespective of index type + uint32 primitiveRestartIndex; ///< When primitiveRestartEnable is true, this is the index value + /// that will restart a primitive. When using a 16-bit index + /// buffer, the upper 16 bits of this value will be ignored. +}; + +/// Specifies parameters for controlling triangle rasterization. +/// @see ICmdBuffer::CmdSetTriangleRasterState +struct TriangleRasterStateParams +{ + struct + { + FillMode frontFillMode : 2; ///< Whether front-facing triangles should be rendered solid or wireframe. + FillMode backFillMode : 2; ///< Whether back-facing triangles should be rendered solid or wireframe. + CullMode cullMode : 2; ///< Specifies which, if any, triangles should be culled based on whether + /// they are front or back facing. + FaceOrientation frontFace : 1; ///< Specifies the vertex winding that results in a front-facing triangle. + ProvokingVertex provokingVertex : 1; ///< Specifies whether the first or last vertex of a primitive is the + }; /// provoking vertex as it affects flat shading. + union + { + struct + { + uint8 frontDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for front-facing + /// triangle-based primitives + uint8 backDepthBiasEnable : 1; ///< Enable depth bias (i.e. polygon offset) for back-facing + /// triangle-based primitives + uint8 reserved : 6; ///< Reserved for future use. + }; + uint8 u8All; ///< Flags packed as 8-bit uint. + } flags; ///< Triangle raster state flags. +}; + +/// Specifies parameters for controlling point and line rasterization. 
+/// @see ICmdBuffer::CmdSetPointLineRasterState
+struct PointLineRasterStateParams
+{
+    float pointSize;    ///< Width of a point primitive in pixels.
+    float lineWidth;    ///< Width of a line primitive in pixels.
+    float pointSizeMin; ///< Minimum width of a point primitive in pixels.
+    float pointSizeMax; ///< Maximum width of a point primitive in pixels.
+};
+
+/// Specifies parameters for controlling line stippling.
+/// @see ICmdBuffer::CmdSetLineStippleState
+struct LineStippleStateParams
+{
+    uint16 lineStippleValue; ///< Line stipple bit pattern.
+    uint32 lineStippleScale; ///< Line stipple repeat factor.
+};
+
+/// Specifies parameters for setting up depth bias. Depth Bias is used to ensure a primitive can properly be displayed
+/// (without Z fighting) in front (or behind) of the previously rendered co-planar primitive. This is useful for decal
+/// or shadow rendering.
+/// @see ICmdBuffer::CmdSetDepthBiasState
+struct DepthBiasParams
+{
+    float depthBias;            ///< Base depth bias to be added to each fragment's Z value. In units of the
+                                ///  minimum delta representable in the bound depth buffer.
+    float depthBiasClamp;       ///< Maximum allowed depth bias result. Prevents polygons viewed at a sharp angle
+                                ///  from generating very large biases.
+    float slopeScaledDepthBias; ///< Factor multiplied by the depth slope (change in Z coord per x/y pixel) to
+                                ///  create more bias for "steep" polygons. This result is applied to the final
+                                ///  Z value in addition to the base depthBias parameter.
+};
+
+/// Specifies parameters for setting the value range to be used for depth bounds testing.
+/// @see ICmdBuffer::CmdSetDepthBounds
+struct DepthBoundsParams
+{
+    float min; ///< Minimum depth value in passing range (closest).
+    float max; ///< Maximum depth value in passing range (farthest).
+};
+
+/// Specifies parameters for setting bit-masks applied to stencil buffer reads and writes.
+/// @see ICmdBuffer::CmdSetStencilRefMasks
+struct StencilRefMaskParams
+{
+
+    uint8 frontRef;       ///< Stencil reference value for front-facing polygons.
+    uint8 frontReadMask;  ///< Bitmask to restrict stencil buffer reads for front-facing polygons.
+    uint8 frontWriteMask; ///< Bitmask to restrict stencil buffer writes for front-facing polygons.
+    uint8 frontOpValue;   ///< Stencil operation value for front-facing polygons.
+                          ///  This is the value used as a parameter for a given stencil operation.
+                          ///  For example: StencilOp::IncWrap will use this value when incrementing the current
+                          ///  stencil contents. Typically, this would be set to one, but on AMD hardware,
+                          ///  this register is 8 bits so there is a greater flexibility.
+
+    uint8 backRef;        ///< Stencil reference value for back-facing polygons.
+    uint8 backReadMask;   ///< Bitmask to restrict stencil buffer reads for back-facing polygons.
+    uint8 backWriteMask;  ///< Bitmask to restrict stencil buffer writes for back-facing polygons.
+    uint8 backOpValue;    ///< Stencil operation value for back-facing polygons - See description of frontOpValue
+                          ///  for further details.
+    union
+    {
+        uint8 u8All; ///< Flags packed as an 8-bit uint.
+        struct
+        {
+            uint8 updateFrontRef       : 1; ///< Updating reference value for front-facing polygons.
+            uint8 updateFrontReadMask  : 1; ///< Updating read mask value for front-facing polygons.
+            uint8 updateFrontWriteMask : 1; ///< Updating write mask value for front-facing polygons.
+            uint8 updateFrontOpValue   : 1; ///< Updating stencil op value for front-facing polygons.
+            uint8 updateBackRef        : 1; ///< Updating reference value for back-facing polygons.
+            uint8 updateBackReadMask   : 1; ///< Updating read mask value for back-facing polygons.
+            uint8 updateBackWriteMask  : 1; ///< Updating write mask value for back-facing polygons.
+            uint8 updateBackOpValue    : 1; ///< Updating stencil op value for back-facing polygons.
+        };
+    } flags; ///< Flags to indicate which of the stencil state values are being updated.
+};
+
+/// HiS always exposes two pretests.
+constexpr uint32 NumHiSPretests = 2;
+
+/// Hierarchical stencil (HiS) allows work to be discarded by the stencil test at tile rate in certain cases.
+/// In order to use HiS, the client will define a set of pretests that will be performed whenever a particular stencil
+/// buffer is written. The stencil image will track the results of the pretest for each 8x8 tile, keeping a record of
+/// whether any pixel in the tile "may-pass" or "may-fail" the specified pretest. When stencil testing is enabled,
+/// the hardware may be able to discard whole tiles early based on what it can glean from the HiS pretest states.
+///
+/// Each stencil image has two pretest slots per mip level. Pretest slots are reset when an initialization barrier
+/// targets their mip level on the stencil plane. The client can then pass this struct to @ref CmdUpdateHiSPretests
+/// to bind one or more valid pretests. It is legal to bind a pretest over a reset slot at any point.
+///
+/// @warning Except in special cases, it is illegal to bind a pretest on top of an existing pretest.
+///
+/// It is only legal to bind a new pretest on top of an existing pretest if:
+/// 1. All array slices within the given mip have been reset using an initialization barrier.
+/// 2. The client guarantees that they will rewrite all stencil values in all array slices within the given mip
+///    before the next draw with stencil testing enabled by doing either:
+///    a. One or more calls to @ref CmdClearDepthStencil.
+///    b. One or more draws with the stencil test disabled and stencil writes enabled.
+///
+/// Once pretests are selected via @ref CmdUpdateHiSPretests the client should keep track of which tests were enabled
+/// on each stencil image and provide them to every call to @ref CmdClearDepthStencil. This is optional but PAL will
+/// not be able to generate HiS optimized clears unless it is given the current pretests.
+///
+/// @warning The pretests provided to @ref CmdUpdateHiSPretests are applied to all mips of all subresource ranges.
+///          If the client varies pretests between mips they must guarantee that the given pretests were bound to all
+///          mips in the given subresource ranges.
+///
+/// This feature works best if the future stencil test behavior is known, either directly told via an API extension
+/// or via an app profile in the client layer. For example, if the application 1) clears stencil, 2) does a pass to
+/// write stencil, 3) then does a final pass that masks rendering based on the stencil value being > 0, ideally we
+/// would choose a pretest of func=Greater, mask=0xFF, and value=0 so that #2 would update the stencil image with
+/// per-tile data that lets #3 be accelerated at maximum efficiency.
+///
+/// In absence of app-specific knowledge, the following algorithm may be a good generic approach:
+/// 1. When the stencil image is cleared, set pretest #0 to func=Equal, mask=0xFF, and value set to the clear value.
+/// 2. On the first draw with stencil writes enabled, set pretest #1 with the mask set to the app's current stencil
+///    mask, and
+///    a. If the stencil op is INC or DEC, set func=GreaterEqual and value the same as in #1.
+///    b. If the stencil op is REPLACE, set func=Equal and set value to the app's current stencil ref value.
+///
+/// Note that HiS can only be beneficial for GPU performance so clients that do not want to implement app profiles or
+/// generic heuristics should at least hard-code both tests to something simple.
+struct HiSPretests
+{
+    struct
+    {
+        CompareFunc func;    ///< This function is used to compare the pretest value with the image's stencil value.
+                             ///  The expression is evaluated with the pretest value as the left-hand operand and the
+                             ///  image's stencil value as the right-hand operand.
+        uint8       mask;    ///< This value is ANDed with both stencil values before evaluating the comparison.
+        uint8       value;   ///< The pretest value, used as the left-hand operand in the comparison.
+        bool        isValid; ///< True if this pretest contains valid information. Set to false to skip this test.
+    } test[NumHiSPretests];  ///< The set of pretest slots.
+};
+
+/// Specifies coordinates for setting up a single user clip plane.
+/// @see ICmdBuffer::CmdSetUserClipPlanes
+struct UserClipPlane
+{
+    float x; ///< Plane coordinate x
+    float y; ///< Plane coordinate y
+    float z; ///< Plane coordinate z
+    float w; ///< Plane coordinate w
+};
+
+/// Specifies parameters for setting the constant factor to be used by the blend hardware when programmed with the
+/// Blend::ConstantColor, Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend
+/// coefficients.
+/// @see ICmdBuffer::CmdSetBlendConst
+struct BlendConstParams
+{
+    float blendConst[4]; ///< 4-component RGBA float specifying the new blend constant.
+};
+
+/// Specifies the parameters for a single viewport
+struct Viewport
+{
+    float       originX;  ///< X coordinate for the viewport's origin.
+    float       originY;  ///< Y coordinate for the viewport's origin.
+    float       width;    ///< Width of the viewport.
+    float       height;   ///< Height of the viewport.
+    float       minDepth; ///< Minimum depth value of the viewport. Must be in the [0..1] range.
+    float       maxDepth; ///< Maximum depth value of the viewport. Must be in the [0..1] range.
+    PointOrigin origin;   ///< Origin of the viewport relative to NDC. UpperLeft or LowerLeft.
+};
+
+/// Specifies the viewport transform parameters for setting one or more viewports (see count).
+/// @see ICmdBuffer::CmdSetViewport
+struct ViewportParams
+{
+    uint32     count;            ///< Number of viewports.
+    float      horzDiscardRatio; ///< The ratio between guardband discard rect width and viewport width.
+                                 ///  For all guard band ratio settings, values less than 1.0f are illegal.
+                                 ///  Value FLT_MAX opens the guardband as wide as the HW supports.
+                                 ///  Value 1.0f disables the guardband.
+    float      vertDiscardRatio; ///< The ratio between guardband discard rect height and viewport height.
+    float      horzClipRatio;    ///< The ratio between guardband clip rect width and viewport width.
+    float      vertClipRatio;    ///< The ratio between guardband clip rect height and viewport height.
+    DepthRange depthRange;       ///< Specifies the target range of Z values
+    // Define viewports array at the end of the structure as it is common to only access the first N from the CPU.
+    Viewport   viewports[MaxViewports]; ///< Array of descriptors for each viewport.
+};
+
+/// Specifies the parameters for specifying the scissor rectangles.
+struct ScissorRectParams
+{
+    uint32 count;                  ///< Number of scissor rectangles.
+    Rect   scissors[MaxViewports]; ///< Array of scissor regions corresponding to each viewport.
+};
+
+/// Specifies parameters for setting the global scissor rectangle.
+/// @see ICmdBuffer::CmdSetGlobalScissor
+struct GlobalScissorParams
+{
+    Rect scissorRegion; ///< Rectangle of the global scissor window.
+};
+
+/// Specifies parameters for binding the color targets and depth target.
+/// @see ICmdBuffer::CmdBindTargets
+struct BindTargetParams
+{
+    uint32               colorTargetCount;              ///< Number of color targets to bind.
+    ColorTargetBindInfo  colorTargets[MaxColorTargets]; ///< Array of color target descriptors.
+    DepthStencilBindInfo depthTarget;                   ///< Describes the depth target bind info.
+};
+
+/// Specifies parameters for binding the stream-output targets.
+/// @see ICmdBuffer::CmdBindStreamOutTargets
+struct BindStreamOutTargetParams
+{
+    struct
+    {
+        gpusize gpuVirtAddr; ///< GPU virtual address of this stream-output target. Must be DWORD-aligned. If
+                             ///  this is zero, 'size' is ignored and the target is considered un-bound.
+        gpusize size;        ///< Size of this stream-output target, in bytes. Must be DWORD-aligned.
+    } target[MaxStreamOutTargets]; ///< Describes the stream-output target for each buffer slot.
+};
+
+/// Specifies the different types of predication ops available.
+enum class PredicateType : uint32
+{
+    Zpass     = 1, ///< Enable occlusion predicate
+    PrimCount = 2, ///< Enable streamout predicate
+    Boolean64 = 3, ///< CP PFP treats memory as a 64bit integer which is either false (0) or true, DX12 style.
+    Boolean32 = 4, ///< CP PFP treats memory as a 32bit integer which is either false (0) or true, Vulkan style.
+    Count
+};
+
+/// Bitfield structure used to specify masks for functions that operate on depth and/or stencil planes of an image.
+union DepthStencilSelectFlags
+{
+    struct
+    {
+        /// Select Depth.
+        uint32 depth : 1;
+
+        /// Select Stencil.
+        uint32 stencil : 1;
+
+        /// Reserved for future usage.
+        uint32 reserved : 30;
+    };
+
+    /// Flags packed as 32-bit uint.
+    uint32 u32All;
+};
+
+/// Specifies information related to clearing a bound color target. Input structure to CmdClearBoundColorTargets().
+struct BoundColorTarget
+{
+    uint32         targetIndex;    ///< Render target index where the target image is currently bound.
+    SwizzledFormat swizzledFormat; ///< Format and swizzle of the target image.
+    uint32         samples;        ///< Sample count for the target.
+    uint32         fragments;      ///< Fragment count for the target.
+    ClearColor     clearValue;     ///< Clear color value.
+};
+
+/// Specifies clear region to clear a bound target. Input structure to CmdClearBoundColorTargets() and
+/// CmdClearBoundDepthStencilTargets()
+struct ClearBoundTargetRegion
+{
+    Rect   rect;       ///< The 2D region to clear.
+    uint32 startSlice; ///< The starting slice to clear.
+    uint32 numSlices;  ///< The number of slices to clear.
+};
+
+/// Specifies flags controlling CmdSaveComputeState and CmdRestoreComputeState. PAL clients must be aware that saving
+/// and restoring specific state in a nested command buffer may not be supported. The rule is simple: if the client
+/// requires that the caller leak the given state to the callee, PAL will not support saving and restoring that state.
+enum ComputeStateFlags : uint32
+{
+    ComputeStatePipelineAndUserData = 0x1, ///< Selects the bound compute pipeline, all non-indirect user data, and all
+                                           ///  kernel arguments (if applicable). Note that the current user data will
+                                           ///  be invalidated on CmdSaveComputeState.
+    ComputeStateBorderColorPalette  = 0x2, ///< Selects the bound border color palette that affects compute pipelines.
+    ComputeStateAll                 = 0x3, ///< Selects all state
+};
+
+/// Provides dynamic command buffer flags during submission
+/// The following flags are used for Frame Pacing when delay time is configured to be calculated by KMD.
+/// (Currently DX clients require this).
+/// For clients that do not need Frame Pacing with KMD calculated delay time, they can ignore these flags:
+///
+/// - frameBegin and frameEnd : Client's presenting queue should track its present state,
+///   and set frameBegin flag on the first command buffer after present,
+///   set frameEnd flag on the last command buffer before present. (Could be the Present command buffer itself.)
+///   We don't need to set them on queues other than the presenting queue.
+/// - P2PCmd : Mark a P2P copy command. KMD could use this flag for adjustments for its frame time calculation.
+///   For the current frame time algorithm, clients should only set this flag on SW compositing copy command.
+///   But KMD may adjust their algorithm, and clients should update the flag depending on KMD needs.
+///
+/// The following flags are used for Direct Capture.
+///
+/// - captureBegin and captureEnd : Direct capture info should be filled if any of these is set. And captureEnd flag
+///   also notifies KMD that the on-screen primary is safe to release.
+struct CmdBufInfo
+{
+    union
+    {
+        struct
+        {
+            uint32 isValid                 : 1; ///< Indicate if this CmdBufInfo is valid and should be submitted
+            uint32 frameBegin              : 1; ///< First command buffer after Queue creation or Present.
+            uint32 dfSpmTraceBegin         : 1; ///< This command buffer begins a DF SPM trace.
+            uint32 dfSpmTraceEnd           : 1; ///< This command buffer ends a DF SPM trace.
+            uint32 frameEnd                : 1; ///< Last command buffer before Present.
+            uint32 p2pCmd                  : 1; ///< Is P2P copy command. See CmdBufInfo comments for details.
+            uint32 captureBegin            : 1; ///< This command buffer begins a Direct Capture frame capture.
+            uint32 captureEnd              : 1; ///< This command buffer ends a Direct Capture frame capture.
+            uint32 rayTracingExecuted      : 1; ///< This command buffer contains ray tracing work.
+            uint32 preflip                 : 1; ///< This command buffer has pre-flip access to DirectCapture resource
+            uint32 postflip                : 1; ///< This command buffer has post-flip access to DirectCapture resource
+            uint32 privateFlip             : 1; ///< Need to flip to a private primary surface for DirectCapture feature
+            uint32 vpBltExecuted           : 1; ///< This command buffer contains VP Blt work.
+            uint32 disableDccRejected      : 1; ///< Reject KMD's DisableDcc request to avoid writing to front buffer.
+            uint32 noFlip                  : 1; ///< No flip when DirectCapture access submission completes
+            uint32 frameGenIndex           : 4; ///< Index of the DirectCapture feature generated frames
+            uint32 noRenderPresent         : 1; ///< Whether the last command buffer before present is a no-render present
+            uint32 motionVectorPropChanged : 1; ///< Indicates whether motion vector properties changed
+            uint32 depthPropChanged        : 1; ///< Indicates whether depth properties changed
+            uint32 cameraPropChanged       : 1; ///< Indicates whether camera matrix properties changed
+            uint32 capturePrimary          : 1; ///< Has Direct Capture primary surface capture
+            uint32 captureMotionVector     : 1; ///< Has Direct Capture motion vector capture
+            uint32 captureDepth            : 1; ///< Has Direct Capture depth capture
+            uint32 captureCamera           : 1; ///< Has Direct Capture camera matrix capture
+            uint32 hudLessImagePropChanged : 1; ///< Indicates whether HUD less image properties changed
+            uint32 captureHudLessImage     : 1; ///< Has Direct Capture HUD less image capture
+            uint32 reserved                : 3; ///< Reserved for future usage.
+        };
+        uint32 u32All; ///< Flags packed as uint32.
+    };
+
+    const IGpuMemory*  pPrimaryMemory;      ///< The primary's gpu memory object used for passing its allocation handle
+                                            ///  to KMD for pre-flip primary access (PFPA). If frame metadata flags
+                                            ///  specifies that primaryHandle should be sent, clients should set this to
+                                            ///  current frame pending primary's IGpuMemory object on the creating GPU
+                                            ///  for the frameEnd command. Otherwise set this to nullptr.
+    const IGpuMemory*  pDirectCapMemory;    ///< The Direct Capture gpu memory object. It should be set if flag
+                                            ///  captureBegin or captureEnd is set. Otherwise set this to nullptr.
+    const IGpuMemory*  pPrivFlipMemory;     ///< The gpu memory object of the private flip primary surface for the
+                                            ///  DirectCapture feature.
+    const Util::Event* pEarlyPresentEvent;  ///< The 'early present' event object. This variable can be nullptr.
+    uint64             frameIndex;          ///< The frame index of this command buffer. It is only required for the
+                                            ///  DirectCapture feature
+    uint32             vidPnSourceId;       ///< The display source id for the DirectCapture feature. Clients must set
+                                            ///  a valid vidPnSourceId when privateFlip flag is set and pDirectCapMemory
+                                            ///  is nullptr.
+    uint64             frameId;             ///< Present frame index, incremented at each present
+    const IGpuMemory*  pMotionVectorMemory; ///< The motion vector gpu memory object for the DirectCapture feature.
+    const IGpuMemory*  pDepthMemory;        ///< The depth gpu memory object for the DirectCapture feature.
+    const IGpuMemory*  pCameraMemory;       ///< The camera gpu memory object for the DirectCapture feature.
+    const IGpuMemory*  pHudLessImageMemory; ///< The HUD less image gpu memory object for DirectCapture.
+};
+
+/// Specifies rotation angle between two images. Used as input to ICmdBuffer::CmdScaledCopyImage.
+enum class ImageRotation : uint32
+{
+    Ccw0   = 0x0, ///< Counter clockwise degree 0
+    Ccw90  = 0x1, ///< Counter clockwise degree 90
+    Ccw180 = 0x2, ///< Counter clockwise degree 180
+    Ccw270 = 0x3, ///< Counter clockwise degree 270
+    Count
+};
+
+/// Describes a color-key value which controls whether a pixel gets copied or ignored during a CmdScaledCopyImage
+/// operation.
+struct ColorKey
+{
+    uint32 u32Color[4]; ///< The color value for each channel
+};
+
+/// Uniquely identifies the target of a Present operation (swap chain / destination window / etc.) so that PAL's debug
+/// layers can track frames-per-second or other statistics correctly when applications render to multiple displays or
+/// windows. Client drivers which don't care about this can always specify a key value of 0.
+using UniquePresentKey = uint64;
+
+/// Convert an OS window handle to a unique present key.
+inline UniquePresentKey PresentKeyFromOsWindowHandle(OsWindowHandle handle) +#if defined(_WIN32) + { return reinterpret_cast(handle); } +#else + { return handle.win; } +#endif +/// Convert any pointer to a unique present key. +template +constexpr inline UniquePresentKey PresentKeyFromPointer(T* ptr) { return reinterpret_cast(ptr); } + +/// Specifies the input parameters for debug overlay's visual confirm. This struct is not functional. +/// The client is expected to default initialize this struct and then fill out any state that makes +/// sense under its presentation model. PAL will process any valid input and ignore fields that are +/// default initialized. +struct CmdPostProcessDebugOverlayInfo +{ + PresentMode presentMode; ///< The Presentation Mode of the application. + WsiPlatform wsiPlatform; ///< The WsiPlatform that Swap Chain works upon + UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. +}; + +/// Specifies the input parameters for ICmdBuffer::CmdPostProcessFrame. +struct CmdPostProcessFrameInfo +{ + union + { + struct + { + uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. + uint32 reserved : 31; ///< Reserved for future usage. + }; + uint32 u32All; ///< Flags packed as uint32. + } flags; + + union + { + const IImage* pSrcImage; ///< The image to postprocess (prior to presenting). + const IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to postprocess. + /// Must have been created as a typed buffer. + }; + + CmdPostProcessDebugOverlayInfo debugOverlay; + FullScreenFrameMetadataControlFlags fullScreenFrameMetadataControlFlags; + + Pal::ImageLayout srcImageLayout; +}; + +/// External flags for ScaledCopyImage. +union ScaledCopyFlags +{ + struct + { + uint32 srcColorKey : 1; ///< If set, enables source color-keying by using the value in the ColorKey member. 
+ /// That is, any pixel in the source image that matches the color key should not be + /// copied to the destination image, and all of the source pixels that do not match + /// the color key should be copied. Mutually exclusive with dstColorKey. + uint32 dstColorKey : 1; ///< If set, enables destination color-keying by using the value in the ColorKey + /// member. That is, any pixel in the destination image that matches the color key + /// should be replaced with the corresponding pixel from the source image, and all of + /// the destination pixels that do not match the color key should not be replaced. + /// Mutually exclusive with srcColorKey. + uint32 srcAlpha : 1; ///< If set, use alpha channel in source surface as blend factor. + /// color = src alpha * src color + (1.0 - src alpha) * dst color. + uint32 dstAsSrgb : 1; ///< If set, a non-srgb destination image will be treated as srgb format. + /// Cannot be set if @ref dstAsNorm is set. + uint32 dstAsNorm : 1; ///< If set, a srgb destination image will be treated as non-srgb format. + /// Cannot be set if @ref dstAsSrgb is set. + uint32 scissorTest : 1; ///< If set, do scissor test using the specified scissor rectangle. + uint32 coordsInFloat : 1; ///< If set, copy regions are represented in floating point type. + uint32 srcAsNorm : 1; ///< If set, an srgb source image will be treated as non-srgb format. + /// Cannot be set if @ref srcAsSrgb is set. + uint32 srcAsSrgb : 1; ///< If set, a non-srgb source image will be treated as srgb format. + /// Cannot be set if @ref srcAsNorm is set. + uint32 reserved : 23; ///< reserved for future usage. + }; + uint32 u32All; ///< Flags packed as uint32. +}; + +/// Input structure to @ref ICmdBuffer::CmdScaledCopyImage. Specifies parameters needed to execute CmdScaledCopyImage. +struct ScaledCopyInfo +{ + const IImage* pSrcImage; ///< The source image to blt from. + ImageLayout srcImageLayout; ///< The source image layout. 
+ const IImage* pDstImage; ///< The dest image to blt to. + ImageLayout dstImageLayout; ///< The dest image layout. + uint32 regionCount; ///< Copy region array size. + const ImageScaledCopyRegion* pRegions; ///< Region array to copy. + TexFilter filter; ///< Controls how a given texture is sampled. + ImageRotation rotation; ///< Rotation option between two images. + const ColorKey* pColorKey; ///< Color key value. + const Rect* pScissorRect; ///< Scissor test rectangle. + ScaledCopyFlags flags; ///< Copy flags, identifies the type of blt to perform. +}; + +/// Input structure to @ref ICmdBuffer::CmdGenerateMipmaps. Specifies parameters needed to execute CmdGenerateMipmaps. +struct GenMipmapsInfo +{ + const IImage* pImage; ///< Populate mips in this image by reading from existing higher-level mips. + ImageLayout baseMipLayout; ///< The layout of all slices in the read-only base mip; must include LayoutCopySrc. + ImageLayout genMipLayout; ///< The layout of all slices and mips that will be generated; must include + /// LayoutCopySrc and LayoutCopyDst. + SubresRange range; ///< Which subresources should be generated from earlier mips. The starting mipLevel + /// must never be zero because there would be no larger mip to read. + TexFilter filter; ///< Controls texture sampling during mip generation. Linear texture filtering is + /// only supported for images with non-integer formats. + SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret all subresources using this format and swizzle. + /// The specified format needs to have been included in the "pViewFormats" list + /// specified at image-creation time, otherwise the result might be incorrect. +}; + +/// Defines a single memory range to prefetch using CmdPrimeGpuCaches. +struct PrimeGpuCacheRange +{ + gpusize gpuVirtAddr; ///< Base GPU virtual address to be prefetched. + gpusize size; ///< Number of bytes to prefetch.
Clients should keep range sizes small relative + /// to the GPU caches (e.g., tccSizeInBytes); the PAL implementation may clamp + /// prefetched ranges if they are too large for the cache being prefetched. + uint32 usageMask; ///< Bitmask of CacheCoherencyUsageFlags defining the usage to prefetch for. + /// E.g., if the mask includes CoherShader, then PAL will attempt to prefetch + /// into caches that are on the shader core's data path. This mask must be a + /// subset of the dstCacheMask specified in the last barrier operation executed + /// on this memory range. Performing the cache prefetch is considered a read + /// operation of the specified usage, and so must be properly accounted for + /// in future barrier memory dependencies for this range. + bool addrTranslationOnly; ///< If set, only the address translation caches (i.e., TLB) will be primed; + /// no data caches will be affected. If this is set, the prefetch operation + /// has no bearing on barrier execution or memory dependencies. +}; + +/// Magic number tag for payloads in command buffer dumps +constexpr uint32 CmdBufferPayloadSignature = 0x1337F77D; + +/// Maximum size, in DWORDs, of payload data in command buffer dumps. +constexpr uint32 MaxPayloadSize = 254; + +/// Payload types used in special embedded NOP packets. +enum class CmdBufferPayloadType : uint32 +{ + Integer = 0, ///< Payload consists of a single 32-bit signed integer. + UnsignedInteger = 1, ///< Payload consists of a single 32-bit unsigned integer. + Integer64 = 2, ///< Payload consists of a single 64-bit signed integer. + UnsignedInteger64 = 3, ///< Payload consists of a single 64-bit unsigned integer. + Float = 4, ///< Payload consists of a single 32-bit floating point number. + Double = 5, ///< Payload consists of a single 64-bit double precision floating point number. + Pointer = 6, ///< Payload consists of a single 64-bit pointer address. + String = 7, ///< Payload consists of a variable length string. 
Must contain null-terminator. + Binary = 8, ///< Payload consists of DWORD-aligned binary data. +}; + +/// Structure layout for embedded CmdBuffer payloads. This can be embedded into the command stream with the +/// @ref ICmdBuffer::CmdNop() function. +struct CmdBufferPayload +{ + uint32 signature; ///< Magic number tag indicating the structure to follow. + uint32 payloadSize; ///< Size of the NOP packet (one DWORD) plus the sizeof this structure and the + /// payload data to follow. + /// This value is in DWORDs. Payload size is expected to be under + /// MaxPayloadSize. + CmdBufferPayloadType type; ///< The type of payload. + uint32 payload[1]; ///< Initial DWORD of payload data with the other data to follow. +}; + +/// Flags controlling which sub-queue(s) of a command buffer should insert an RGP trace marker. Zeroing out this +/// union is invalid, because RGP markers must be sent to at least one sub-queue. +union RgpMarkerSubQueueFlags +{ + struct + { + uint32 includeMainSubQueue : 1; ///< If set, includes the main sub-queue in the RGP marker. + uint32 includeGangedSubQueues : 1; ///< If set, includes any ganged sub-queues in the RGP marker. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed into a uint32 +}; + +/** + *********************************************************************************************************************** + * @interface ICmdBuffer + * @brief Contains GPU rendering and other commands recorded by PAL on the client's behalf. + * + * A command buffer can be executed by the GPU multiple times and recycled, provided the command buffer is not pending + * execution on the GPU when it is recycled. + * + * Command buffers are fully independent and there is no persistence of GPU state between submitted command buffers. + * When a new command buffer is recorded, the state is undefined. 
All relevant state must be explicitly set by the + * client before state-dependent operations such as draws and dispatches. + * + * @see IDevice::CreateCmdBuffer() + *********************************************************************************************************************** + */ +class ICmdBuffer : public IDestroyable +{ +public: + /// Resets the command buffer's previous contents and state, then puts it in the _building_ _state_, allowing new + /// commands to be recorded. + /// + /// If this is a root command buffer, the state will be reset to a "clean slate" with nothing bound. If this is a + /// nested command buffer, the state is set to an "undefined" state so that all render state can be inherited from + /// any root command buffer which executes this one. + /// + /// @param [in] info Controls how PAL will generate commands for this command buffer. E.g., specifies whether the + /// command buffer may be submitted more than once, and controls options for optimizing PM4, etc. + /// + /// @returns Success if the command buffer was successfully reset and put into the _building_ _state_. Otherwise, + /// one of the following error codes may be returned: + /// + ErrorInvalidFlags if invalid flags are set in the flags parameter. + /// + ErrorIncompleteCommandBuffer if the command buffer is already in the _building_ _state_. + virtual Result Begin( + const CmdBufferBuildInfo& info) = 0; + + /// Completes recording of a command buffer in the _building_ _state_, making it _executable_. + /// + /// @returns Success if the command buffer was successfully made _executable_. Otherwise, one of the following + /// errors may be returned: + /// + ErrorIncompleteCommandBuffer if the command buffer is not in the _building_ _state_. + /// + ErrorBuildingCommandBuffer if some error occurred while building the command buffer, and it could not + /// be made _executable_. If this error is returned, the command buffer can not be submitted. 
+ virtual Result End() = 0; + + /// Explicitly resets a command buffer, releasing any internal resources associated with it. + /// + /// This call must be used to reset command buffers that have previously reported an ErrorIncompleteCommandBuffer + /// error. + /// + /// @note @ref Begin will implicitly cause a command buffer to be reset in addition to putting it in the + /// _building_ _state_. This method just gives a way to release resources between when the client knows + /// it is done with the command buffer and when it is ready to reuse this command buffer object for + /// recording new commands. + /// + /// @param [in] pCmdAllocator If non-null, all future GPU memory allocations will be done using this allocator. + /// Otherwise the command buffer will continue to use its current command allocator. + /// + /// @param [in] returnGpuMemory If true then all GPU memory associated with this command buffer will be returned + /// to the allocator upon reset. If false, data chunks will be retained and reused. + /// Note: This flag must be true if changing command allocators. + /// + /// @warning If returnGpuMemory is false, the client must guarantee that this command buffer is not queued for + /// execution, is not currently being executed, and that all other command buffers that have referenced + /// this command buffer in a @ref CmdExecuteNestedCmdBuffers call have also been reset. + /// + /// @returns Success if the command buffer was successfully reset. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result Reset(ICmdAllocator* pCmdAllocator, bool returnGpuMemory) = 0; + + /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to CmdAllocateEmbeddedData. + /// + /// This is a property of the command buffer and its associated command allocator; it may change if the caller + /// specifies a different command allocator on Reset().
+ /// + /// @returns How many DWORDs of embedded data the command buffer can allocate at once. + virtual uint32 GetEmbeddedDataLimit() const = 0; + + /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to + /// CmdAllocateLargeEmbeddedData. + /// + /// @returns Number of DWORDs that can be allocated in one call to CmdAllocateLargeEmbeddedData + virtual uint32 GetLargeEmbeddedDataLimit() const = 0; + + /// Binds a graphics or compute pipeline to the current command buffer state. + /// + /// Graphics pipelines must be compiled for the PAL ABI. Compute pipelines must either be compiled for the PAL ABI + /// or the HSA ABI, if it's supported. HSA ABI support is indicated by supportHsaAbi in @ref DeviceProperties. + /// + /// PAL ABI pipelines and HSA ABI pipelines use different mechanisms to bind inputs and outputs. PAL ABI pipelines + /// use user data entries set by @ref CmdSetUserData. HSA ABI pipelines use kernel arguments set by @ref + /// CmdSetKernelArguments. Binding or unbinding a compute pipeline can implicitly modify the user data and kernel + /// argument state, please read the @ref CmdSetUserData and @ref CmdSetKernelArguments documentation for details. + /// + /// @param [in] params Parameters necessary to manage dynamic pipeline shader information. + virtual void CmdBindPipeline( + const PipelineBindParams& params) = 0; + + /// Binds the specified MSAA state object to the current command buffer state. + /// + /// @param [in] pMsaaState New MSAA state to be bound. Can be null in order to unbind a previously bound MSAA state + /// object without binding a new one. + virtual void CmdBindMsaaState( + const IMsaaState* pMsaaState) = 0; + + /// Saves a copy of all of the current command buffer state that is used by graphics workloads. This feature is + /// intended to give PAL clients a convenient way to issue their own internal graphics workloads without modifying + /// the application-facing state. 
+ /// + /// PAL cannot save multiple layers of state, each call to CmdSaveGraphicsState must be followed by a call to + /// CmdRestoreGraphicsState before the next call to CmdSaveGraphicsState. Any barriers, resolves, blits, etc. are not + /// allowed while the state is pushed. + /// + /// This function can only be called on command buffers that support graphics workloads. All query counters will be + /// disabled until CmdRestoreGraphicsState is called. + virtual void CmdSaveGraphicsState() = 0; + + /// Restores all of the command buffer state that is used by graphics workloads. This feature is intended to + /// give PAL clients a convenient way to issue their own internal graphics workloads without modifying the + /// application-facing state. + /// + /// A call to this function must be preceded by a call to CmdSaveGraphicsState. + /// + /// This function can only be called on command buffers that support graphics workloads. All previously disabled + /// query counters will be reactivated. + virtual void CmdRestoreGraphicsState() = 0; + + /// Sets the shading rate in the command buffer along with the state of the various combiners. + /// + /// @param [in] rateParams New VRS shading rate parameters to be bound. + virtual void CmdSetPerDrawVrsRate( + const VrsRateParams& rateParams) = 0; + + /// Setup parameters regarding how pixel center will be evaluated with VRS. + /// + /// @param [in] centerState New VRS parameters to be bound that control how pixel center is defined. + virtual void CmdSetVrsCenterState( + const VrsCenterState& centerState) = 0; + + /// Binds the shading rate data in the specified image into the pipeline for use with VRS. Only relevant if the + /// combiner stage for VrsCombinerStage is set to something other than Passthrough.
+ /// + /// This binding point requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageSampleRate + /// - CacheCoherency: @ref CoherSampleRate + /// - ImageLayout: @ref LayoutSampleRate + /// + /// @param [in] pImage Image that contains sample rate data. Pointer can be NULL to force 1x1 shading rate. + virtual void CmdBindSampleRateImage( + const IImage* pImage) = 0; + + /// Binds the specified color/blend state object to the current command buffer state. + /// + /// @param [in] pColorBlendState New color/blend state to be bound. Can be null in order to unbind a previously + /// bound color/blend state object without binding a new one. + virtual void CmdBindColorBlendState( + const IColorBlendState* pColorBlendState) = 0; + + /// Binds the specified depth/stencil state object to the current command buffer state. + /// + /// @param [in] pDepthStencilState New depth/stencil state to be bound. Can be null in order to unbind a previously + /// bound depth/stencil state object without binding a new one. + virtual void CmdBindDepthStencilState( + const IDepthStencilState* pDepthStencilState) = 0; + + /// Sets the value range to be used for depth bounds testing. + /// + /// The depth bounds test is enabled in the graphics pipeline. When enabled, an additional check will be done that + /// will reject a pixel if the pre-existing depth value stored at its destination location is outside of the + /// specified bounds. Applications would typically use this feature to optimize shadow volume rendering. + /// + /// @param [in] params Parameters necessary to set the depth bounds (such as min/max depth). + virtual void CmdSetDepthBounds( + const DepthBoundsParams& params) = 0; + + /// Sets pipeline-accessible user data to the specified values. + /// + /// The values set in user data entries will be interpreted based on the resource mapping specified for each shader + /// in the currently bound pipeline. 
For example, the client can write virtual addresses of tables containing + /// SRDs, immediate SRDs that can be loaded without an indirection, or even a small number of immediate ALU + /// constants. + /// + /// The user data values are only used by PAL ABI pipelines. Almost all pipelines used by PAL clients are compiled + /// for the PAL ABI, but PAL also supports HSA ABI compute pipelines which use @ref CmdSetKernelArguments instead. + /// When an HSA ABI pipeline is bound the current compute user data entries are saved and will be restored if the + /// client later binds a PAL ABI compute pipeline. + /// + /// @warning It's illegal to set compute user data if an HSA ABI pipeline is currently bound. + /// + /// If no compute pipeline is currently bound PAL assumes the client will bind a PAL ABI pipeline and thus accepts + /// user data bindings. Graphics user data are unaffected by all of this because graphics pipelines can only use + /// the PAL ABI. + /// + /// @see PipelineShaderInfo + /// @see ResourceMappingNode + /// @ingroup ResourceBinding + /// + /// @param [in] bindPoint Specifies which type of user-data is to be set (i.e., compute or graphics). + /// @param [in] firstEntry First user data entry to be updated. + /// @param [in] entryCount Number of user data entries to update; size of the pEntryValues array. Must be greater + /// than zero, and (firstEntry + entryCount) must not extend beyond MaxUserDataEntries. + /// @param [in] pEntryValues Array of 32-bit values to be copied into user data. + void CmdSetUserData( + PipelineBindPoint bindPoint, + uint32 firstEntry, + uint32 entryCount, + const uint32* pEntryValues) + { (m_funcTable.pfnCmdSetUserData[static_cast<uint32>(bindPoint)])(this, firstEntry, entryCount, pEntryValues); } + + /// Copies all pipeline-accessible user-data from one bind point to another. It is invalid if the source and + /// dest parameters refer to the same bind point.
+ /// + /// @see CmdSetUserData for how the user-data entries will be interpreted by the pipeline. + /// + /// @param [in] source Specifies which bind point to copy from. + /// @param [in] dest Specifies which bind point to copy into. + virtual void CmdDuplicateUserData( + PipelineBindPoint source, + PipelineBindPoint dest) = 0; + + /// Sets one or more HSA code object kernel argument values. + /// + /// If the currently bound compute pipeline was compiled using the HSA compute ABI this function must be used to + /// bind that pipeline's arguments. The argument position and value types are static properties of the pipeline + /// and must be known by the client. + /// + /// @note Calling @ref CmdBindPipeline invalidates all prior kernel argument bindings, even if the new pipeline + /// also uses the HSA ABI. Any kernel arguments that the client intends to share between pipelines must + /// be manually rebound. + /// + /// @warning It's illegal to call this function if no compute pipeline is bound or if the bound compute pipeline + /// uses a different ABI (e.g., the PAL compute ABI). + /// + /// @ingroup ResourceBinding + /// + /// @param [in] firstArg The zero-based position of the first kernel argument to bind. + /// @param [in] argCount Number of kernel arguments this call binds. + /// @param [in] ppValues Array of pointers to kernel argument values. + virtual void CmdSetKernelArguments( + uint32 firstArg, + uint32 argCount, + const void*const* ppValues) = 0; + + /// Changes one or more of the command buffer's active vertex buffers. + /// + /// @note If bufferViews.offsetMode is false, PAL will construct SRDs for each bound vertex buffer which are + /// equivalent to the client calling @ref IDevice::CreateUntypedBufferViewSrd on each element of the + /// pBuffers parameter.
+ /// + /// Note that vertex buffers require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageVs + /// - CacheCoherency: @ref CoherShaderRead + /// + /// @param [in] bufferViews Vertex buffer view descriptors. This parameter defines which vertex mode is used through + /// @ref VertexBufferViews::offsetMode. VertexBufferViews::pVertexBufferViews or + /// VertexBufferViews::pBufferViewInfos must not be nullptr. + virtual void CmdSetVertexBuffers( + const VertexBufferViews& bufferViews) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 877 + void CmdSetVertexBuffers( + uint32 firstBuffer, + uint32 bufferCount, + const BufferViewInfo* pBuffers) + { + const VertexBufferViews bufferViews = + { + .firstBuffer = firstBuffer, + .bufferCount = bufferCount, + .offsetMode = false, + .pBufferViewInfos = pBuffers + }; + CmdSetVertexBuffers(bufferViews); + } +#endif + + /// Binds a range of memory for use as index data (i.e., binds an index buffer). + /// + /// The GPU virtual address must be index element aligned: 2-byte aligned for 16-bit indices or 4-byte aligned for + /// 32-bit indices. + /// + /// The index buffer binding point requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageFetchIndices + /// - CacheCoherency: @ref CoherIndexData + /// + /// @param [in] gpuAddr GPU virtual address of the index data. Can be zero to unbind the previously bound data. + /// @param [in] indexCount Maximum number of indices in the index data; the GPU may read fewer indices. + /// @param [in] indexType Specifies whether to use 8-bit, 16-bit or 32-bit index data. + virtual void CmdBindIndexData( + gpusize gpuAddr, + uint32 indexCount, + IndexType indexType) = 0; + + /// Binds color and depth/stencil targets to the current command buffer state. + /// + /// The current layout of each target must also be specified.
+ /// + /// The color target binding points require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// + /// The depth and stencil target binding points require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// + /// @param [in] params Parameters representing the color and depth/stencil targets to bind to the command buffer. + virtual void CmdBindTargets( + const BindTargetParams& params) = 0; + + /// Binds stream-output target buffers to the current command buffer state. + /// + /// At draw-time, the stream-output targets must be consistent with the soState parameters specified by the + /// currently bound graphics pipeline. + /// + /// The stream-output target buffers require use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageStreamOut + /// - CacheCoherency: @ref CoherStreamOut + /// + /// @param [in] params Parameters representing the stream-output target buffers to bind to the command buffer. + virtual void CmdBindStreamOutTargets( + const BindStreamOutTargetParams& params) = 0; + + /// Sets the constant factor to be used by the blend hardware when programmed with the Blend::ConstantColor, + /// Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend coefficients. + /// + /// @param [in] params Parameters representing the blend constant factor. + virtual void CmdSetBlendConst( + const BlendConstParams& params) = 0; + + /// Sets input assembly state for upcoming draws in this command buffer. + /// + /// At draw-time, the topology specified with this method must be consistent with the _topologyInfo_ parameters + /// specified by the currently bound graphics pipeline. 
+ /// + /// @param [in] params Parameters representing the input assembly state for upcoming draws. + virtual void CmdSetInputAssemblyState( + const InputAssemblyStateParams& params) = 0; + + /// Sets parameters controlling triangle rasterization. + /// + /// @param [in] params Parameters to set the triangle raster state (such as fill/cull mode). + virtual void CmdSetTriangleRasterState( + const TriangleRasterStateParams& params) = 0; + + /// Sets parameters controlling point and line rasterization. + /// + /// @param [in] params Parameters to set the point and line rasterization state (such as pointSize and lineWidth). + virtual void CmdSetPointLineRasterState( + const PointLineRasterStateParams& params) = 0; + + /// Sets parameters controlling line stippling. + /// + /// @param [in] params Parameters to set the line stipple state. + virtual void CmdSetLineStippleState( + const LineStippleStateParams& params) = 0; + + /// Sets depth bias parameters. + /// + /// Depth bias is used to ensure a primitive can properly be displayed (without Z fighting) in front (or behind) + /// of the previously rendered co-planar primitive. This is useful for decal or shadow rendering. + /// + /// @param [in] params Parameters for setting the depth bias (such as depth bias, depth bias clamp, and slope + /// scaled depth bias). + virtual void CmdSetDepthBiasState( + const DepthBiasParams& params) = 0; + + /// Sets stencil reference values and mask buffer reads and writes in upcoming draws. Separate reference values + /// can be specified for front-facing and back-facing polygons. Update flags should be set for state which needs to + /// be updated. All other state will be preserved. + /// Setting all the values (reference, read/write masks and stencil op) in the StencilRefMaskParams together + /// takes the faster path. + /// Setting either the ref value, read/write masks or the stencil op value individually takes the slower + /// read-modify-write path. 
+ /// + /// @param [in] params Parameters for setting the stencil read and write masks. + virtual void CmdSetStencilRefMasks( + const StencilRefMaskParams& params) = 0; + + /// Sets user defined clip planes, should only be called on universal command buffers. + /// + /// @param [in] firstPlane The index of first plane in user define clip plane array. + /// @param [in] planeCount The count of planes in plane array. + /// @param [in] pPlanes Pointer to plane array. + virtual void CmdSetUserClipPlanes( + uint32 firstPlane, + uint32 planeCount, + const UserClipPlane* pPlanes) = 0; + + /// Sets clip rects, should only be called on universal command buffers. + /// + /// @param [in] clipRule 16-bit clip rule bits are used to determine if pixel shall be discarded or retained. + /// For each pixel, a 4-bit index is computed based on which clip rects the pixel is + /// inside (bitN represents rectN). Then uses this index to check the corresponding bit + /// in clip rule for this pixel - 0 for discarded, 1 for retained. + /// @param [in] rectCount The count of rectangles in rect list. This must be less than or equal to + /// MaxClipRects (4). + /// @param [in] pRectList Pointer to the rect list. + virtual void CmdSetClipRects( + uint16 clipRule, + uint32 rectCount, + const Rect* pRectList) = 0; + + /// Sets user defined MSAA quad-pixel sample pattern, should only be called on universal command buffers + /// This should be called before clearing, rendering, barriering and resolving of MSAA DepthStencil image. + /// + /// @param [in] numSamplesPerPixel Number of samples per pixel + /// @param [in] quadSamplePattern The input msaa sample pattern + virtual void CmdSetMsaaQuadSamplePattern( + uint32 numSamplesPerPixel, + const MsaaQuadSamplePattern& quadSamplePattern) = 0; + + /// Sets the specified viewports to the current command buffer state. + /// + /// @param [in] params Parameters for setting the specified number of viewports. 
+ virtual void CmdSetViewports( + const ViewportParams& params) = 0; + + /// Sets the scissor regions corresponding to each viewport to the current command buffer state. + /// + /// @param [in] params Parameters for setting the specified number of scissor regions. + virtual void CmdSetScissorRects( + const ScissorRectParams& params) = 0; + + /// Sets the global scissor rectangle. + /// + /// @param [in] params Parameters for setting the global scissor rectangle from the top left to bottom right + /// coordinate. + virtual void CmdSetGlobalScissor( + const GlobalScissorParams& params) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 + /// Inserts a barrier in the current command stream that can stall GPU execution, flush/invalidate caches, or + /// decompress images before further, dependent work can continue in this command buffer. + /// + /// This operation does not honor the command buffer's predication state, if active. + /// + /// @param [in] barrierInfo See @ref BarrierInfo for detailed information. + virtual void CmdBarrier( + const BarrierInfo& barrierInfo) = 0; +#endif + /// Performs source pipeline stage and cache access optimization based on the acquire/release interface. + /// + /// @param [in] barrierType Barrier transition type @ref BarrierType. + /// @param [in] pImage Image pointer for image transition, required when @ref BarrierType is + /// BarrierType::Image. + /// @param [in/out] pSrcStageMask A source mask of ORed @ref PipelineStageFlag to optimize, can't be null. + /// @param [in/out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null. + /// @param [in/out] pDstStageMask A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null. + /// @param [in/out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize. + /// + /// @returns True if the GL2 cache needs to be flushed and invalidated.
+ /// + /// @note PipelineStageBlt will be converted to more accurate stage(s) based on the underlying implementation of + /// outstanding BLTs, but will be left as PipelineStageBlt if the internal outstanding BLTs can't be expressed + /// as a client-facing PipelineStage (e.g., if there are CP DMA BLTs in flight). + virtual bool OptimizeAcqRelReleaseInfo( + BarrierType barrierType, + const IImage* pImage, + uint32* pSrcStageMask, + uint32* pSrcAccessMask, + uint32* pDstStageMask, + uint32* pDstAccessMask) const = 0; + + /// Performs the release portion of an acquire/release-based barrier. This releases a set of resources from their + /// current usage, while CmdAcquire() is expected to be called to acquire access to the resources for future, + /// different usage. + /// + /// Conceptually, this method will: + /// - Ensure the specified source synchronization scope has completed. + /// - Ensure all specified resources are available in memory. The availability operation will flush all + /// write-back caches to the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// Once all of these operations are complete, the release issues a timestamp event that signals the operation + /// completion. The event type and timestamp value is returned to caller in a packed uint32 token. A corresponding + /// CmdAcquire() call is expected to wait on one or a list of such synchronization tokens and perform any necessary + /// visibility operations and/or layout transitions that could not be predicted at release-time. + /// + /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout + /// transitions. + /// @returns Synchronization token for the release operation. Pass this token to CmdAcquire to confirm completion. 
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 + virtual uint32 CmdRelease( +#else + virtual ReleaseToken CmdRelease( +#endif + const AcquireReleaseInfo& releaseInfo) = 0; + + /// Performs the acquire portion of an acquire/release-based barrier. This acquires a set of resources for a new + /// set of usages, assuming CmdRelease() was called to release access for the resource's past usage. + /// + /// Conceptually, this method will: + /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all + /// relevant caches above the last-level-cache. + /// - Perform any requested layout transitions. + /// - Ensure the release(s) have completed by waiting on the synchronization token of the release operation. + /// + /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required + /// layout transitions. + /// @param [in] syncTokenCount Number of entries in pSyncTokens, can be zero if no valid release token. + /// @param [in] pSyncTokens Array of synchronization tokens, as returned from CmdRelease, to confirm completion. + /// The token value(s) must have been returned by a CmdRelease call in the same command + /// buffer. pSyncTokens can be null if syncTokenCount is 0. + virtual void CmdAcquire( + const AcquireReleaseInfo& acquireInfo, + uint32 syncTokenCount, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 + const uint32* pSyncTokens) = 0; +#else + const ReleaseToken* pSyncTokens) = 0; +#endif + + /// Performs the release portion of an acquire/release event-based barrier. This releases a set of resources from + /// their current usage, while CmdAcquireEvent() is expected to be called to acquire access to the resources for + /// future, different usage. + /// + /// Conceptually, this method will: + /// - Ensure the specified source synchronization scope has completed. + /// - Ensure all specified resources are available in memory.
The availability operation will flush all + /// write-back caches to the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// Once all of these operations are complete, the specified IGpuEvent object will be signaled. A corresponding + /// CmdAcquireEvent() call is expected to wait on this event and perform any necessary visibility operations and/or + /// layout transitions that could not be predicted at release-time. + /// + /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout + /// transitions. + /// @param [in] pGpuEvent Event to be signaled once the release has completed. Must be a valid (non-null) GPU + /// event pointer. Use the CmdRelease()/CmdAcquire() pair instead if you want to release/acquire + /// something but no GPU event is available. + virtual void CmdReleaseEvent( + const AcquireReleaseInfo& releaseInfo, + const IGpuEvent* pGpuEvent) = 0; + + /// Performs the acquire portion of an acquire/release event-based barrier. This acquires a set of resources for a + /// new set of usages, assuming CmdReleaseEvent() was called to release access for the resource's past usage. + /// + /// Conceptually, this method will: + /// - Ensure the release(s) have completed by waiting for the specified IGpuEvent early enough in the pipeline to + /// support the specified destination synchronization scope. + /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all + /// relevant caches above the last-level-cache. + /// - Perform any requested layout transitions. + /// + /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required + /// layout transitions. + /// @param [in] gpuEventCount Number of entries in ppGpuEvents. + /// @param [in] ppGpuEvents Array of one or more events to wait on. 
Typically these will be set via + /// CmdReleaseEvent(), but it's valid to wait on an event set through a different means + /// like CmdSetEvent() from the CPU side. Must be a valid (non-null) pointer to an array of + /// gpuEventCount valid GPU event pointers. Call CmdReleaseThenAcquire() instead if you want + /// to acquire something but no GPU event is available. + virtual void CmdAcquireEvent( + const AcquireReleaseInfo& acquireInfo, + uint32 gpuEventCount, + const IGpuEvent* const* ppGpuEvents) = 0; + + /// Conceptually equivalent to calling CmdRelease() followed immediately by CmdAcquire(), but it potentially has + /// better performance than calling CmdRelease()/CmdAcquire() directly. Can be called in cases where the client/ + /// application cannot detect separate release and acquire points for a transition. + /// + /// @param [in] barrierInfo Describes the synchronization scopes, availability/visibility operations, and the + /// required layout transitions. + virtual void CmdReleaseThenAcquire( + const AcquireReleaseInfo& barrierInfo) = 0; + + /// Issues an instanced, non-indexed draw call using the command buffer's currently bound graphics state. Results + /// in instanceCount * vertexCount vertices being processed. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// @param [in] firstVertex Starting index value for the draw. Indices passed to the vertex shader will range + /// from firstVertex to firstVertex + vertexCount - 1. + /// @param [in] vertexCount Number of vertices to draw. If zero, the draw will be discarded. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range + /// from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + /// @param [in] drawId Draw index for the draw. 
+ void CmdDraw( + uint32 firstVertex, + uint32 vertexCount, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId) + { + m_funcTable.pfnCmdDraw(this, firstVertex, vertexCount, firstInstance, instanceCount, drawId); + } + + /// Issues draw opaque call using the command buffer's currently bound graphics state. + /// Uses the stream-out target of a previous draw as the input vertex data. + /// the number of vertices = (streamOutFilledSize (value of streamOutFilledSizeVa) - streamOutOffset) / stride + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// @param [in] streamOutFilledSizeVa gpuAddress of streamOut filled size for streamOut buffer. + /// @param [in] streamOutOffset the offset of begin of streamOut as vertex. + /// @param [in] stride stride for stream data as vertex. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader + /// will range from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + void CmdDrawOpaque( + gpusize streamOutFilledSizeVa, + uint32 streamOutOffset, + uint32 stride, + uint32 firstInstance, + uint32 instanceCount) + { + m_funcTable.pfnCmdDrawOpaque(this, + streamOutFilledSizeVa, + streamOutOffset, + stride, + firstInstance, + instanceCount); + } + + /// Issues an instanced, indexed draw call using the command buffer's currently bound graphics state. Results in + /// instanceCount * indexCount vertices being processed. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// Indices passed to the vertex shader will be: + /// + /// + IndexBuffer[firstIndex] + vertexOffset + /// + IndexBuffer[firstIndex + 1] + vertexOffset, + /// + ... + /// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset + /// + /// @param [in] firstIndex Starting index buffer slot for the draw. 
+ /// @param [in] indexCount Number of vertices to draw. If zero, the draw will be discarded. + /// @param [in] vertexOffset Offset added to the index fetched from the index buffer before it is passed to the + /// vertex shader. + /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range + /// from firstInstance to firstInstance + instanceCount - 1. + /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. + /// @param [in] drawId Draw index for the draw. + void CmdDrawIndexed( + uint32 firstIndex, + uint32 indexCount, + int32 vertexOffset, + uint32 firstInstance, + uint32 instanceCount, + uint32 drawId) + { + m_funcTable.pfnCmdDrawIndexed(this, firstIndex, indexCount, vertexOffset, firstInstance, instanceCount, drawId); + } + + /// Issues instanced, non-indexed draw calls using the command buffer's currently bound graphics state. The draw + /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find + /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its vertexCount or + /// instanceCount is zero. + /// + /// The layout of the argument data is defined in the @ref DrawIndirectArgs structure. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDraw + /// @see DrawIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. 
If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. + void CmdDrawIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDrawIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + + /// Issues instanced, indexed draw calls using the command buffer's currently bound graphics state. The draw + /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find + /// the next indirect args structure in gpuMemory. Each draw call will be discarded if its indexCount or + /// instanceCount is zero. + /// + /// The layout of the argument data is defined in the @ref DrawIndexedIndirectArgs structure. + /// + /// It is an error if the currently bound pipeline contains a mesh and/or task shader. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDrawIndexed + /// @see DrawIndexedIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. 
+ void CmdDrawIndexedIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDrawIndexedIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909 + /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + inline void CmdDispatch( + DispatchDims size) + { + m_funcTable.pfnCmdDispatch(this, size, {}); + } +#endif + + /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + /// @param [in] infoFlags Additional information about the dispatch. + void CmdDispatch( + DispatchDims size, + DispatchInfoFlags infoFlags) + { + m_funcTable.pfnCmdDispatch(this, size, infoFlags); + } + + /// Dispatches a compute workload using the command buffer's currently bound compute state. The dimensions of the + /// workload come from GPU memory. The dispatch will be discarded if any of its dimensions are zero. + /// + /// The layout of the argument data is defined in the @ref DispatchIndirectArgs structure. + /// + /// @warning Does not support HSA ABI pipelines. 
+ /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDispatch + /// @see DispatchIndirectArgs + /// + /// @param [in] gpuVirtAddr GPU virtual memory address where the indirect argument data is located. + /// The virtual address must be 4-byte aligned. + void CmdDispatchIndirect( + gpusize gpuVirtAddr) + { + m_funcTable.pfnCmdDispatchIndirect(this, gpuVirtAddr); + } + + /// Dispatches a compute workload of the given dimensions and offsets using the command buffer's currently bound + /// compute state. This command allows targeting regions of thread groups without adding the offset computations in + /// the shader. + /// + /// The caller may also provide a logical thread group count which is larger than the number of groups actually + /// launched. If the shader reads the dispatch's thread group count from PAL metadata it will see the logical size, + /// not the launch size. + /// + /// The combination of an offset, launch size, and logical size give the caller enough flexibility to take an + /// incoming dispatch, split it up into sub-dispatches, and execute those sub-dispatches using multiple + /// CmdDispatchOffset calls in whatever execution pattern they would like. Note that such an optimization + /// would not work if the shader has global logic that does make assumptions about thread group launch order. + /// + /// The thread group size is defined in the compute shader. + /// + /// Supports PAL ABI and HSA ABI pipelines. + /// + /// @param [in] offset The thread groups offsets. Set them to zero if you don't want an offset. + /// @param [in] launchSize Thread groups to dispatch. If any components are zero the dispatch will be discarded. + /// @param [in] logicalSize The thread group dimensions reported to the shader via metadata. 
+ void CmdDispatchOffset( + DispatchDims offset, + DispatchDims launchSize, + DispatchDims logicalSize) + { + m_funcTable.pfnCmdDispatchOffset(this, offset, launchSize, logicalSize); + } + + /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if + /// the currently bound graphics pipeline does not contain a mesh and/or task shader. + /// + /// The thread group size is defined in the mesh shader or task shader. + /// + /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. + void CmdDispatchMesh( + DispatchDims size) + { + m_funcTable.pfnCmdDispatchMesh(this, size); + } + + /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if + /// the currently bound graphics pipeline does not contain a mesh shader. The dimensions of the workload come from + /// GPU memory. The dispatch will be discarded if any of its dimensions are zero. + /// + /// The layout of the argument data is defined in the @ref DispatchMeshIndirectArgs structure. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @see CmdDispatchMesh + /// @see DispatchMeshIndirectArgs + /// + /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in + /// memory from one structure to another. + /// The virtual address must be 4 byte aligned. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr + /// is nonzero, the value at that memory location is clamped to + /// this maximum. If countGpuAddr is zero, then the number of draws + /// issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. + /// Must be 4-byte aligned. 
+ void CmdDispatchMeshIndirectMulti( + GpuVirtAddrAndStride gpuVirtAddrAndStride, + uint32 maximumCount, + gpusize countGpuAddr) + { + m_funcTable.pfnCmdDispatchMeshIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); + } + + /// Copies multiple regions from one GPU memory allocation to another. + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping + /// will cause undefined results. + /// + /// This call should be used for buffer memory copy only; don't use it for image memory. + /// + /// For best performance, offsets and copy sizes should be 4-byte aligned. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuMemory GPU memory allocation where the source regions are located. + /// @param [in] dstGpuMemory GPU memory allocation where the destination regions are located. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, and + /// copy size. + virtual void CmdCopyMemory( + const IGpuMemory& srcGpuMemory, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryCopyRegion* pRegions) = 0; + + /// Copies multiple regions from one GPU memory virtual address to another. + /// + /// @note The CmdCopyMemory() path should be preferred because it contains more optimizations due to more + /// knowledge about the memory itself that is lost when only virtual addresses are passed in. 
+ /// + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory virtual addresses are the same. Any illegal + /// overlapping will cause undefined results. + /// + /// For best performance, addresses, offsets, and copy sizes should be 4-byte aligned. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuVirtAddr GPU memory virtual address where the source regions are located. + /// @param [in] dstGpuVirtAddr GPU memory virtual address where the destination regions are located. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, + /// and copy size. + virtual void CmdCopyMemoryByGpuVa( + gpusize srcGpuVirtAddr, + gpusize dstGpuVirtAddr, + uint32 regionCount, + const MemoryCopyRegion* pRegions) = 0; + + /// Copies multiple regions from one image to another. + /// + /// The source and destination subresource of a particular region are not allowed to be the same, and will produce + /// undefined results. Additionally, destination subresources cannot be present more than once per CmdCopyImage() + /// call. + /// + /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image + /// extents are specified in compression blocks. + /// + /// The source and destination images must be of the same type (1D, 2D or 3D), or optionally 2D and 3D with the + /// number of slices matching the depth. MSAA source and destination images must have the same number of samples. + /// + /// Each region must satisfy these restrictions. 
+ /// - srcOffset >= 0 and dstOffset >= 0 + /// - srcOffset + extent <= srcSubres's extent + /// - dstOffset + extent <= dstSubres's extent + /// + /// Images copied via this function must have x/y/z offsets and width/height/depth extents aligned to the minimum + /// tiled copy alignment specified in @ref DeviceProperties for the engine this function is executed on. Note that + /// the DMA engine supports tiled copies regardless of the alignment; the reported minimum tiled copy alignments + /// are an indication of the minimum alignments for which the copy will be performant. + /// + /// When the per-engine capability flag supportsMismatchedTileTokenCopy (@see DeviceProperties) is false, + /// CmdCopyImage is only valid between two subresources that share the same tileToken (@see SubresLayout). + /// + /// Note that the copy can go through clone copy automatically if: + /// - Both source and destination images are created with @ref ImageCreateInfo::flags::cloneable = 1 + /// - Both source and destination images have the same @ref ImageCreateInfo + /// - Source image's layout is compatible with destination image's layout + /// - This is a full image copy + /// - Copy flags (@ref CopyControlFlags) are 0. + /// + /// A clone copy clones all subresources' data of one image object into another while preserving the image + /// layout. It does a raw copy of image data and metadata, and tries to keep the metadata (like DCC/HiZ/HiS) + /// unchanged, but this may not hold due to different HW designs. + /// E.g., client compression (fragment and ZPlane compression) will be missed during the compute-based raw copy. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. 
+ /// + /// @param [in] srcImage Image where source regions reside. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Image where destination regions reside. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source subresource, destination + /// subresource, source x/y/z offset, destination x/y/z offset, and copy size in the + /// x/y/z dimensions. + /// @param [in] pScissorRect Rectangle for scissor test. + /// @param [in] flags A mask of ORed @ref CopyControlFlags that can be used to control copy behavior. + virtual void CmdCopyImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const ImageCopyRegion* pRegions, + const Rect* pScissorRect, + uint32 flags) = 0; + + /// Copies data directly (without format conversion) from a GPU memory object to an image. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The size of the data copied from memory is implicitly derived from the image extents. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdCopyMemoryToImage() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination + /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyMemoryToImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const MemoryImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from an image to a GPU memory object. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The size of the data copied to memory is implicitly derived from the image extents. + /// + /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A + /// destination region cannot be present more than once per CmdCopyImageToMemory() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. 
+ /// - ImageLayout: @ref LayoutCopySrc + /// + /// @param [in] srcImage Image where source data will be read from. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source + /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyImageToMemory( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a GPU memory object to a PRT. + /// + /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. + /// This function always copies entire tiles, even if parts of the tile are internal padding. + /// + /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that + /// operate in texels like the generic CmdCopyMemoryToImage() should be used instead. + /// + /// The size of the data copied from memory is implicitly derived from the image extents. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdCopyMemoryToTiledImage() call. + /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. Must have the "prt" flag set. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination + /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyMemoryToTiledImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const MemoryTiledImageCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a PRT to a GPU memory object. + /// + /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. + /// This function always copies entire tiles, even if parts of the tile are internal padding. + /// + /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that + /// operate in texels like the generic CmdCopyImageToMemory() should be used instead. + /// + /// The size of the data copied to memory is implicitly derived from the image extents. + /// + /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A + /// destination region cannot be present more than once per CmdCopyTiledImageToMemory() call. 
+ /// + /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc + /// + /// @param [in] srcImage Image where source data will be read from. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source + /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. + virtual void CmdCopyTiledImageToMemory( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const MemoryTiledImageCopyRegion* pRegions) = 0; + + /// Copies multiple regions directly (without format conversion) from one typed buffer to another. + /// + /// For compressed formats, the extents are specified in compression blocks. + /// + /// The buffer memory offsets have to be aligned to the smaller of their texel sizes or 4 bytes. + /// + /// None of the destination regions are allowed to overlap each other, nor are destination and source regions + /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping + /// will cause undefined results. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source offset, + /// and copy size in the x/y/z dimensions. + virtual void CmdCopyTypedBuffer( + const IGpuMemory& srcGpuMemory, + const IGpuMemory& dstGpuMemory, + uint32 regionCount, + const TypedBufferCopyRegion* pRegions) = 0; + + /// Copies data directly (without format conversion) from a 2D typed buffer to a 2D image. + /// + /// For compressed images, the extents are specified in compression blocks. + /// + /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination + /// subresource cannot be present more than once per CmdScaledCopyTypedBufferToImage() call. + /// + /// MSAA resource is unsupported. The client must resolve both resources before calling this function. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopyDst + /// + /// @param [in] srcGpuMemory GPU memory where the source data is located. + /// @param [in] dstImage Image where destination data will be written. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. 
+ /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, copy size of source + /// region, a destination offset, destination subresource, and copy size of destination + /// region. + virtual void CmdScaledCopyTypedBufferToImage( + const IGpuMemory& srcGpuMemory, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const TypedBufferImageScaledCopyRegion* pRegions) = 0; + + /// Copies the contents of a GPU register to a GPU memory location. + /// + /// The destination memory offset has to be aligned to 4 bytes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherMemory + /// + /// @param [in] srcRegisterOffset Source register offset in bytes + /// @param [in] dstGpuMemory GPU memory where the destination data will be written. + /// @param [in] dstOffset Destination memory offset in bytes. + virtual void CmdCopyRegisterToMemory( + uint32 srcRegisterOffset, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + + /// Copies multiple scaled regions from one image to another. + /// + /// The source and destination subresource of a particular region are not allowed to be the same, and will produce + /// undefined results. Additionally, destination subresources cannot be present more than once per + /// CmdScaledCopyImage() call. + /// + /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image + /// extents are specified in compression blocks. + /// + /// The source and destination images must be of the same type (1D, 2D or 3D). Both single sampled images and + /// MSAA images are supported. + /// + /// Linear texture filtering is only supported for images with non-integer formats.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. + /// + /// @param [in] copyInfo Specifies parameters needed to execute CmdScaledCopyImage. See + /// @ref ScaledCopyInfo for more information. + virtual void CmdScaledCopyImage( + const ScaledCopyInfo& copyInfo) = 0; + + /// Automatically generates texture data for a range of subresources such that they may be used as intermediate + /// images in a mipmap chain. The existing values in mip N are used to generate mip N+1. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the base mip, @ref CoherCopySrc and @ref CoherCopyDst for the others. + /// - ImageLayout: @ref LayoutCopySrc for the base mip, @ref LayoutCopySrc and @ref LayoutCopyDst for the others. + /// + /// @param [in] genInfo The parameters for CmdGenerateMipmaps. See @ref GenMipmapsInfo for more information. + virtual void CmdGenerateMipmaps( + const GenMipmapsInfo& genInfo) = 0; + + /// Copies multiple scaled regions from one image to another, converting between RGB and YUV color spaces during + /// the copy. The exact conversion between YUV and RGB is controlled by a caller-specified color-space-conversion + /// table. + /// + /// The source and destination images must both be of the 2D type. Only single-sampled images are supported. + /// One of the two images involved must have an RGB color format, and the other must have a YUV color format. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. 
+ /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. + /// + /// @param [in] srcImage Image where source regions reside. If this is a YUV image, the destination must be + /// RGB, and this copy will convert YUV to RGB. Otherwise, the destination must be YUV, + /// and the copy will convert RGB to YUV. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Image where destination regions reside. If this is a YUV image, the source must be + /// RGB, and this copy will convert RGB to YUV. Otherwise, the source must be YUV and + /// the copy will convert YUV to RGB. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] regionCount Number of regions to copy; size of the pRegions array. + /// @param [in] pRegions Array of conversion-copy regions, each entry specifying a source x/y/z offset, source + /// x/y/z extent, destination x/y/z offset, destination x/y/z extent, RGB subresource and + /// YUV subresource(s). + /// @param [in] filter Texture filtering for shader sample instruction. + /// @param [in] cscTable Color-space-conversion table which controls how YUV data is converted to a specific + /// RGB representation and vice-versa. + virtual void CmdColorSpaceConversionCopy( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const ColorSpaceConversionRegion* pRegions, + TexFilter filter, + const ColorSpaceConversionTable& cscTable) = 0; + + /// Clones data of one image object into another while preserving the image layout.
+ /// + /// The source and destination images must be created with identical creation parameters and must specify the + /// cloneable flag. The clone operation clones all subresources. + /// + /// Both resources can be in any layout before the clone operation. After the clone, the source image state is left + /// intact and the destination image layout becomes the same as the source. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. + /// + /// @param [in] srcImage Source image. + /// @param [in] dstImage Destination image. + virtual void CmdCloneImageData( + const IImage& srcImage, + const IImage& dstImage) = 0; + + /// Directly updates a range of GPU memory with a small amount of host data. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// The client is responsible for choosing the proper method for optimal performance. If the update size is less + /// than or equal to 8 bytes, CmdWriteImmediate() is preferred. + /// + /// @param [in] dstGpuMemory GPU memory object to be updated. + /// @param [in] dstOffset Byte offset into the GPU memory object to be updated. Must be a multiple of 4. + /// @param [in] dataSize Amount of data to write, in bytes. Must be a multiple of 4. + /// @param [in] pData Pointer to host data to be copied into the GPU memory. + virtual void CmdUpdateMemory( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize dataSize, + const uint32* pData) = 0; + + /// Updates marker surface with a DWORD value to indicate an event completion.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] dstGpuMemory GPU memory object to be updated. + /// @param [in] offset Byte offset into marker address + /// @param [in] value Marker DWORD value to be copied to the bus addressable or external physical memory. + virtual void CmdUpdateBusAddressableMemoryMarker( + const IGpuMemory& dstGpuMemory, + gpusize offset, + uint32 value) = 0; + + /// Fills a range of GPU memory with the provided 32-bit data. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// @param [in] dstGpuMemory GPU memory object to be filled. + /// @param [in] dstOffset Byte offset into the GPU memory object to be filled. Must be a multiple of 4. + /// @param [in] fillSize Size to fill, in bytes. Must be a multiple of 4. + /// @param [in] data 32-bit value to be repeated in the filled range. + virtual void CmdFillMemory( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize fillSize, + uint32 data) = 0; + + /// Interprets a range of GPU memory as a color buffer and clears it to the specified clear color. + /// + /// The maximum clear range is determined by the buffer offset and buffer extent; if any Ranges are specified they + /// must be specified in texels with respect to the beginning of the buffer and must not exceed its extent. + /// With 96-bit formats, bufferOffset must be specified in bytes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs is expected but the more general @ref PipelineStageBlt is also OK. + /// - CacheCoherency: @ref CoherShader is expected but the more general @ref CoherClear is also OK. + /// + /// @param [in] gpuMemory GPU memory to be cleared. 
+ /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] bufferFormat The format of the color data in the buffer. + /// @param [in] bufferOffset The offset to the beginning of the buffer, in units of texels + /// (or bytes for 96-bit texels). + /// @param [in] bufferExtent The extent of the buffer, in units of texels. + /// @param [in] rangeCount Number of ranges within the buffer to clear; size of the pRanges array. + /// If zero, the entire view will be cleared and pRanges will be ignored. + /// @param [in] pRanges Array of ranges within the GPU memory to clear. + virtual void CmdClearColorBuffer( + const IGpuMemory& gpuMemory, + const ClearColor& color, + SwizzledFormat bufferFormat, + uint32 bufferOffset, + uint32 bufferExtent, + uint32 rangeCount = 0, + const Range* pRanges = nullptr) = 0; + + /// Clears the currently bound color targets to the specified clear color. + /// + /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. + /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in + /// Vulkan where the color attachments are inherited. + /// + /// This requires regionCount being specified since resource size is for sure to be known. The bound color targets + /// shouldn't have UndefinedSwizzledFormat as their swizzle format. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// + /// @param [in] colorTargetCount Number of bound color targets that need to be cleared. + /// @param [in] pBoundColorTargets Color target information for the bound color targets. + /// @param [in] regionCount Number of volumes within the image to clear; size of the pClearRegions array. + /// This needs to be non-zero.
+ /// @param [in] pClearRegions Array of volumes within the subresources to clear. + virtual void CmdClearBoundColorTargets( + uint32 colorTargetCount, + const BoundColorTarget* pBoundColorTargets, + uint32 regionCount, + const ClearBoundTargetRegion* pClearRegions) = 0; + + /// Clears a color image to the specified clear color. + /// + /// If any Boxes have been specified, all subresource ranges must contain a single, identical mip level. + /// + /// The imageLayout can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick + /// to layouts that are likely to support compression like @ref LayoutColorTarget. + /// + /// This function requires use of the following barrier flags if @ref flags includes @ref ColorClearAutoSync: + /// - PipelineStage: @ref PipelineStageColorTarget + /// - CacheCoherency: @ref CoherColorTarget + /// - ImageLayout: @ref LayoutColorTarget + /// Otherwise the following barrier flags must be used: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherClear + /// + /// @param [in] image Image to be cleared. + /// @param [in] imageLayout Current allowed usages and engines for the target image. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] clearFormat If clearFormat.format is Undefined (e.g. if UndefinedSwizzledFormat is provided), do not + /// reinterpret the subresources' formats. Otherwise, the subresources' formats will be + /// reinterpreted according to this parameter. The specified format needs to have been + /// included in the "pViewFormats" list specified at image-creation time, otherwise + /// corruption may occur. + /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. + /// @param [in] pRanges Array of subresource ranges to clear. + /// @param [in] boxCount Number of volumes within the image to clear; size of the pBoxes array. 
+ /// If zero, entire subresources will be cleared and pBoxes will be ignored. + /// @param [in] pBoxes Array of volumes within the subresources to clear. + /// @param [in] flags Mask of ClearColorImageFlags values controlling behavior of the clear. + virtual void CmdClearColorImage( + const IImage& image, + ImageLayout imageLayout, + const ClearColor& color, + const SwizzledFormat& clearFormat, + uint32 rangeCount, + const SubresRange* pRanges, + uint32 boxCount, + const Box* pBoxes, + uint32 flags) = 0; + + /// Clears the currently bound depth/stencil targets to the specified clear values. + /// + /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. + /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in + /// Vulkan where the depth/stencil attachments are inherited. + /// + /// This requires regionCount being specified since resource size is for sure to be known. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// + /// @param [in] depth Depth clear value. + /// @param [in] stencil Stencil clear value. + /// @param [in] stencilWriteMask Stencil write mask to clear specific stencil planes. + /// @param [in] samples Sample count. + /// @param [in] fragments Fragment count. + /// @param [in] flag Selects whether to clear depth, stencil, or both depth and stencil. + /// @param [in] regionCount Number of volumes within the bound depth/stencil target to clear. + /// @param [in] pClearRegions Array of volumes within the subresources to clear.
+ virtual void CmdClearBoundDepthStencilTargets( + float depth, + uint8 stencil, + uint8 stencilWriteMask, + uint32 samples, + uint32 fragments, + DepthStencilSelectFlags flag, + uint32 regionCount, + const ClearBoundTargetRegion* pClearRegions) = 0; + + /// Clears a depth/stencil image to the specified clear values. + /// + /// If any Rects have been specified, all subresource ranges must contain a single, identical mip level. + /// + /// The layouts can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick to + /// layouts that are likely to support compression like @ref LayoutDepthStencilTarget. + /// + /// This function requires use of the following barrier flags if @ref flags includes @ref DsClearAutoSync: + /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget + /// - CacheCoherency: @ref CoherDepthStencilTarget + /// - ImageLayout: @ref LayoutDepthStencilTarget + /// Otherwise the following barrier flags must be used: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherClear + /// + /// @param [in] image Image to be cleared. + /// @param [in] depth Depth clear value. + /// @param [in] depthLayout Current allowed usages and engines for the depth plane. + /// @param [in] stencil Stencil clear value. + /// @param [in] stencilWriteMask Write-mask to apply to the stencil subresource ranges during the clear. + /// @param [in] stencilLayout Current allowed usages and engines for the stencil plane. + /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. + /// @param [in] pRanges Array of subresource ranges to clear. + /// @param [in] rectCount Number of areas within the image to clear; size of the pRects array. If zero, + /// the entire subresources will be cleared and pRects will be ignored. + /// @param [in] pRects Array of areas within the subresources to clear. 
+ /// @param [in] flags Mask of ClearDepthStencilFlags values controlling behavior of the clear. + virtual void CmdClearDepthStencil( + const IImage& image, + ImageLayout depthLayout, + ImageLayout stencilLayout, + float depth, + uint8 stencil, + uint8 stencilWriteMask, + uint32 rangeCount, + const SubresRange* pRanges, + uint32 rectCount, + const Rect* pRects, + uint32 flags) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 910 + /// Clears a range of GPU memory to the specified clear color using the specified buffer view SRD. + /// + /// The maximum clear range is determined by the view; if any Ranges are specified they must fit within the view's + /// range. The view must support shader writes. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs + /// - CacheCoherency: @ref CoherShader + /// + /// @note You may use the more general @ref PipelineStageBlt and @ref CoherClear if you wish but they may result in + /// higher barrier overhead. + /// + /// @param [in] gpuMemory GPU memory to be cleared. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] pBufferViewSrd The buffer view SRD that will be used to interpret the GPU memory. + /// @param [in] rangeCount Number of ranges within the GPU memory to clear; size of the pRanges array. + /// If zero, the entire view will be cleared and pRanges will be ignored. + /// @param [in] pRanges Array of ranges within the GPU memory to clear. + virtual void CmdClearBufferView( + const IGpuMemory& gpuMemory, + const ClearColor& color, + const void* pBufferViewSrd, + uint32 rangeCount = 0, + const Range* pRanges = nullptr) = 0; + + /// Clears an image to the specified clear color using the specified image view SRD. + /// + /// The clear subresource range is determined by the view; if any Rects have been specified, the image view must + /// contain a single mip level. The view must support shader writes.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageCs but the more general @ref PipelineStageBlt is also OK. + /// - CacheCoherency: @ref CoherShader but the more general @ref CoherClear is also OK. + /// - ImageLayout: @ref LayoutShaderWrite + /// + /// @param [in] image Image to be cleared. + /// @param [in] imageLayout Current allowed usages and engines for the image, must include LayoutShaderWrite. + /// @param [in] color Specifies the clear color data and how to interpret it. + /// @param [in] pImageViewSrd The image view SRD that will be used to interpret the image. + /// @param [in] rectCount Number of volumes within the image to clear; size of the pRects array. + /// If zero, entire subresources will be cleared and pRects will be ignored. + /// @param [in] pRects Array of volumes within the subresources to clear. The begin and end slices to be + /// cleared are from SubresRange in pImageViewSrd. + virtual void CmdClearImageView( + const IImage& image, + ImageLayout imageLayout, + const ClearColor& color, + const void* pImageViewSrd, + uint32 rectCount = 0, + const Rect* pRects = nullptr) = 0; +#endif + + /// Resolves multiple regions of a multisampled image to a single-sampled image. + /// + /// The source image must be a 2D multisampled image and the destination must be a single-sampled image. + /// The formats of the source and destination images must match unless all regions specify a valid format. + /// + /// For color images, if the source image has an integer numeric format, a single sample is copied (sample 0). + /// + /// For depth/stencil images, the resolve is performed by simply copying sample 0 from every source pixel to the + /// destination pixel. + /// + /// The same subresource may not appear more than once in the specified array of regions.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherResolveSrc for the source and @ref CoherResolveDst for the destination. + /// - ImageLayout: @ref LayoutResolveSrc for the source and @ref LayoutResolveDst for the destination. + /// + /// @param [in] srcImage MSAA source image. + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include + /// LayoutResolveSrc and the ImageLayoutEngineFlags corresponding to the engine this + /// function is being called on. + /// @param [in] dstImage Single-sample destination image. + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutResolveDst and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] resolveMode Resolve mode + /// @param [in] pRegions Specifies src/dst subresources and rectangles. + /// @param [in] flags Mask of ResolveImageFlags values controlling behavior of the resolve. + virtual void CmdResolveImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + ResolveMode resolveMode, + uint32 regionCount, + const ImageResolveRegion* pRegions, + uint32 flags) = 0; + + /// Resolves multiple regions of a Sampler Feedback map to another image. + /// + /// The source image must be: + /// Decode: A 2D Feedback map, in which case the destination image will hold the decoded sampled data + /// Encode: A texture, in which case the destination will hold the encoded sampler map + /// + /// The formats of the source and destination images must be 8bpp + /// + /// The same subresource may not appear more than once in the specified array of regions. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. + /// - ImageLayout: @ref LayoutShaderRead for the source and @ref LayoutShaderWrite for the destination. + /// + /// @param [in] srcImage Source image, depends on resolve type + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must + /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] dstImage Destination image, depends on resolve type + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] resolveType Resolve type + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles + virtual void CmdResolvePrtPlusImage( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IImage& dstImage, + ImageLayout dstImageLayout, + PrtPlusResolveType resolveType, + uint32 regionCount, + const PrtPlusImageResolveRegion* pRegions) = 0; + + /// Encodes a buffer into a Sampler Feedback image. + /// + /// This interface only supports encoding raw data from a buffer to an encoded Sampler Feedback map + /// + /// The format of the data in the source buffer and destination image must be 8bpp + /// + /// The same subresource may not appear more than once in the specified array of regions. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. 
+ /// - ImageLayout: @ref LayoutShaderWrite for the destination. + /// + /// @param [in] srcBuffer Source buffer, depends on the resolve type + /// @param [in] dstImage Destination image, depends on resolve type + /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must + /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] regionCount Number of regions to resolve, size of pRegions array + /// @param [in] pRegions Specifies src/dst subresources and rectangles + virtual void CmdResolvePrtPlusBufferToImage( + const IGpuMemory& srcBuffer, + const IImage& dstImage, + ImageLayout dstImageLayout, + uint32 regionCount, + const PrtPlusBufferToImageResolveRegion* pRegions) = 0; + + /// Decodes a Sampler Feedback map to a buffer + /// + /// This interface only supports decoding a Sampler Feedback map to buffer + /// + /// The format of the data in the source image and destination buffer must be 8bpp + /// + /// The same subresource may not appear more than once in the specified array of regions. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. + /// - ImageLayout: @ref LayoutShaderRead for the source + /// + /// @param [in] srcImage Source image, depends on the resolve type + /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must + /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine + /// this function is being called on. + /// @param [in] dstBuffer Destination buffer, depends on resolve type + /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. + /// @param [in] pRegions Specifies src/dst subresources and rectangles. 
+ virtual void CmdResolvePrtPlusImageToBuffer( + const IImage& srcImage, + ImageLayout srcImageLayout, + const IGpuMemory& dstBuffer, + uint32 regionCount, + const PrtPlusImageToBufferResolveRegion* pRegions) = 0; + + /// Puts the specified event into the _set_ state when all prior GPU work has progressed past the given stages. + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] gpuEvent GPU event to be set. + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the event is set. The set + /// will be performed at the earliest possible stage after the prior stages. + virtual void CmdSetEvent( + const IGpuEvent& gpuEvent, + uint32 stageMask) = 0; + + /// Puts the specified event into the _reset_ state when all prior GPU work has progressed past the given stages. + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] gpuEvent GPU event to be reset. + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the event is reset. The + /// reset will be performed at the earliest possible stage after the prior stages. + virtual void CmdResetEvent( + const IGpuEvent& gpuEvent, + uint32 stageMask) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// Puts the specified GPU event into the _set_ state when all previous GPU work reaches the specified point in the + /// pipeline. + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. + /// + /// @param [in] gpuEvent GPU event to be set. 
+ /// @param [in] setPoint Point in the graphics pipeline where the GPU event will be _set_, indicating all prior + /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't + /// support this operation at the exact specified point, the set will be performed at the + /// earliest possible point _after_ the specified point. + inline void CmdSetEvent( + const IGpuEvent& gpuEvent, + HwPipePoint setPoint) + { CmdSetEvent(gpuEvent, HwPipePointToStage[setPoint]); } + + /// Puts the specified GPU event into the _reset_ state when all previous GPU work reaches the specified point in + /// the pipeline. + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. + /// + /// @param [in] gpuEvent GPU event to be reset. + /// @param [in] resetPoint Point in the graphics pipeline where the GPU event will be _reset_, indicating all prior + /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't + /// support this operation at the exact specified point, the reset will be performed at the + /// earliest possible point _after_ the specified point. + inline void CmdResetEvent( + const IGpuEvent& gpuEvent, + HwPipePoint resetPoint) + { CmdResetEvent(gpuEvent, HwPipePointToStage[resetPoint]); } +#endif + + /// Predicate the subsequent jobs in the command buffer if the event is set. + /// + /// @param [in] gpuEvent GPU event to be checked. + virtual void CmdPredicateEvent( + const IGpuEvent& gpuEvent) = 0; + + /// Performs the specified 32- or 64-bit memory operation. These operations are atomic with respect to shader + /// atomic operations. + /// + /// The data size (32-bit or 64-bit) is determined by the operation type. For 32-bit atomics, only the lower + /// 32-bits of srcData is used. + /// + /// The destination GPU memory offset must be 4-byte aligned for 32-bit atomics and 8-byte aligned for 64-bit + /// atomics. 
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherQueueAtomic + /// + /// @param [in] dstGpuMemory Destination GPU memory object. + /// @param [in] dstOffset Offset into the memory object where the atomic will be performed. + /// @param [in] srcData Source data for the atomic operation. Use depends on the atomicOp. + /// @param [in] atomicOp Specifies which atomic operation to perform. @see AtomicOp. + virtual void CmdMemoryAtomic( + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + uint64 srcData, + AtomicOp atomicOp) = 0; + + /// Starts a query operation for the given slot of a query pool. + /// + /// The query slot must have been previously cleared with CmdResetQueryPool() before starting a query. + /// + /// @note Queries may not span multiple command buffers. + /// + /// @param [in] queryPool Query pool for this query. + /// @param [in] queryType The type of query this operation will produce. + /// @param [in] slot Slot in queryPool where the results of this query should be accumulated. + /// @param [in] flags Flags controlling query behavior. @see QueryControlFlags. + virtual void CmdBeginQuery( + const IQueryPool& queryPool, + QueryType queryType, + uint32 slot, + QueryControlFlags flags) = 0; + + /// Stops a query operation for the given slot of a query pool. + /// + /// The query slot must have an open query on it when this is called. + /// + /// @param [in] queryPool Query pool for this query. + /// @param [in] queryType The type of query this operation will produce. + /// @param [in] slot Slot in queryPool where the query is running. + virtual void CmdEndQuery( + const IQueryPool& queryPool, + QueryType queryType, + uint32 slot) = 0; + + /// Resolves the results of a range of queries to the specified query type into the specified GPU memory location.
+ /// + /// This function requires use of the following barrier flags on @ref dstGpuMemory: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// This operation does not honor the command buffer's predication state, if active. + /// + /// @param [in] queryPool Query pool holding the source queries. + /// @param [in] flags Flags that control the result data layout and how the results are retrieved. + /// @param [in] queryType The type of queries this resolve will produce. + /// @param [in] startQuery First slot in queryPool to resolve. + /// @param [in] queryCount Number of query pool slots to resolve. + /// @param [in] dstGpuMemory Destination GPU memory object. + /// @param [in] dstOffset 4-byte aligned offset into dstGpuMemory where the results should be written. + /// @param [in] dstStride 4-byte aligned stride between where results are written into dstGpuMemory. + virtual void CmdResolveQuery( + const IQueryPool& queryPool, + QueryResultFlags flags, + QueryType queryType, + uint32 startQuery, + uint32 queryCount, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset, + gpusize dstStride) = 0; + + /// Resets a range of slots in a query pool. A query slot must be reset each time before a query can be started + /// using that slot. + /// + /// @param [in] queryPool Query pool to be reset. + /// @param [in] startQuery First slot in queryPool to be reset. + /// @param [in] queryCount Number of slots to reset. + virtual void CmdResetQueryPool( + const IQueryPool& queryPool, + uint32 startQuery, + uint32 queryCount) = 0; + + /// Writes a GPU performance timestamp to memory when all prior GPU work has progressed past the given stages. + /// + /// The timestamp data is a 64-bit value that increments once per clock. @ref timestampFrequency in DeviceProperties + /// reports the frequency the timestamps are clocked at. Timestamps are only supported by engines that report + /// @ref supportsTimestamps in DeviceProperties.
+ /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag(s) specified in @ref stageMask. + /// - CacheCoherency: @ref CoherTimestamp + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that + /// restricts which stages of prior GPU work must happen before the timestamp is written. + /// The timestamp will be performed at the earliest possible stage after the prior stages. + /// Note that the SDMA engine only supports bottom-of-pipe timestamps. + /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. + /// @param [in] dstOffset Offset into dstGpuMemory where the timestamp should be written. Must be aligned to + /// minTimestampAlignment in DeviceProperties. + virtual void CmdWriteTimestamp( + uint32 stageMask, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + + /// Writes an immediate value to memory when all prior GPU work has progressed past the given stages. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag(s) specified in @ref stageMask. + /// - CacheCoherency: @ref CoherCp + /// + /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. + /// + /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope + /// that restricts which stages of prior GPU work must happen before the immediate + /// value is written. The write will occur at the earliest possible stage after + /// the prior stages. Note that the SDMA engine only supports bottom-of-pipe writes. + /// @param [in] data Value to be written to gpu address. + /// @param [in] dataSize Size of the data to be written out. + /// @param [in] address GPU address where immediate value should be written.
+ virtual void CmdWriteImmediate( + uint32 stageMask, + uint64 data, + ImmediateDataWidth dataSize, + gpusize address) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// Writes a HwPipePostPrefetch or HwPipeBottom timestamp to the specified memory location. + /// + /// The timestamp data is a 64-bit value that increments once per clock. timestampFrequency in DeviceProperties + /// reports the frequency the timestamps are clocked at. + /// + /// Timestamps are only supported by engines that report supportsTimestamps in DeviceProperties. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag specified in @ref pipePoint. + /// - CacheCoherency: @ref CoherTimestamp + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. + /// + /// @param [in] pipePoint Specifies where in the pipeline the timestamp should be sampled and written. The only + /// valid choices are HwPipePostPrefetch and HwPipeBottom. HwPipePostPrefetch timestamps + /// are not supported on the SDMA engine, so all timestamps will be executed as + /// bottom-of-pipe. + /// @param [in] dstGpuMemory GPU memory object where timestamp should be written. + /// @param [in] dstOffset Offset into pDstGpuMemory where the timestamp should be written. Must be aligned to + /// minTimestampAlignment in DeviceProperties. + inline void CmdWriteTimestamp( + HwPipePoint pipePoint, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) + { CmdWriteTimestamp(HwPipePointToStage[pipePoint], dstGpuMemory, dstOffset); } + + /// Writes a top-of-pipe or bottom-of-pipe immediate value to the specified memory location. + /// + /// This function requires use of the following barrier flags: + /// - PipelineStage: the same flag specified in @ref pipePoint. + /// - CacheCoherency: @ref CoherCp + /// + /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. 
+ /// + /// @param [in] pipePoint Specifies where in the pipeline the immediate value should be written. + /// The only valid choices are HwPipeTop, HwPipePostPrefetch and HwPipeBottom. + /// Top-of-pipe writes are not supported on the SDMA engine, so all writes + /// will be executed as bottom-of-pipe. + /// @param [in] data Value to be written to gpu address. + /// @param [in] dataSize Size of the data to be written out. + /// @param [in] address GPU address where immediate value should be written. + inline void CmdWriteImmediate( + HwPipePoint pipePoint, + uint64 data, + ImmediateDataWidth dataSize, + gpusize address) + { CmdWriteImmediate(HwPipePointToStage[pipePoint], data, dataSize, address); } +#endif + + /// Loads the current stream-out buffer-filled-sizes stored on the GPU from memory, typically from a target of a + /// prior CmdSaveBufferFilledSizes() call. + /// + /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use + /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr Array of GPU virtual addresses to load each counter from. If any of these are zero, + /// the corresponding filled-size counter is not loaded. + virtual void CmdLoadBufferFilledSizes( + const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; + + /// Saves the current stream-out buffer-filled-sizes into GPU memory. + /// + /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use + /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr Array of GPU virtual addresses to save each counter into.
If any of these are zero, + /// the corresponding filled-size counter is not saved. + virtual void CmdSaveBufferFilledSizes( + const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; + + /// Set the offset to buffer-filled-size for a stream-out target. + /// + /// @param [in] bufferId Stream-out buffer ID, it could be in the range [0, MaxStreamOutTargets). + /// @param [in] offset The value to be written into the buffer filled size counter. + /// + virtual void CmdSetBufferFilledSize( + uint32 bufferId, + uint32 offset) = 0; + + /// Binds the specified border color palette for use by samplers. + /// + /// @param [in] pipelineBindPoint Specifies which pipeline type is affected (i.e., graphics or compute). + /// @param [in] pPalette Border color palette object to bind. + virtual void CmdBindBorderColorPalette( + PipelineBindPoint pipelineBindPoint, + const IBorderColorPalette* pPalette) = 0; + + /// Sets predication for this command buffer to use the specified GPU memory location. Any draw, dispatch or copy + /// operation between this command and the corresponding reset/disable call will be skipped if the value in spec- + /// ified location matches the passed-in predicated value + /// + /// This function requires use of the following barrier flags on @ref pGpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] pQueryPool pointer to QueryPool obj, not-nullptr means this is a QueryPool based predication + /// - Zpass/Occlusion based predication + /// - or PrimCount/Streamout based predication + /// @param [in] slot Slot to use for setting occlusion predication, valid when pQueryPool is not nullptr + /// @param [in] pGpuMemory GPU memory object for the predication value, only valid when pQueryPool is nullptr + /// @param [in] offset GPU memory offset for the predication value + /// @param [in] predType Predication type. 
+ /// @param [in] predPolarity Controls the polarity of the predication test + /// true = draw_if_visible_or_no_overflow + /// false = draw_if_not_visible_or_overflow + /// @param [in] waitResults Hint only valid for Zpass/Occlusion. + /// false = wait_until_final_zpass_written + /// true = draw_if_not_final_zpass_written + /// @param [in] accumulateData true(1) = allow_accumulation of Zpass and PrimCount across command buffer boundaries. + /// + /// pQueryPool and pGpuMemory are mutually exclusive. When both are nullptr, the other params are ignored + /// and predication is reset/disabled so that the following commands can perform normally. + virtual void CmdSetPredication( + IQueryPool* pQueryPool, + uint32 slot, + const IGpuMemory* pGpuMemory, + gpusize offset, + PredicateType predType, + bool predPolarity, + bool waitResults, + bool accumulateData) = 0; + + /// Suspend/resume any active predication for this command buffer. + /// + /// @param [in] suspend Controls if predication should be paused + /// true = suspend active predication + /// false = resume active predication + /// + /// Any suspended predication must be resumed prior to disabling predication using CmdSetPredication with pQueryPool + /// and pGpuMemory both set to nullptr. This is only valid on universal and compute command buffers. + virtual void CmdSuspendPredication( + bool suspend) = 0; + + /// Begins a conditional block in the current command buffer. All commands between this and the corresponding + /// CmdEndIf() (or CmdElse() if it is present) command are executed if the specified condition is true. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] offset Offset within the memory object where the tested memory location begins.
+ /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. + virtual void CmdIf( + const IGpuMemory& gpuMemory, + gpusize offset, + uint64 data, + uint64 mask, + CompareFunc compareFunc) = 0; + + /// Begins a conditional block in the current command buffer. All commands between this and the corresponding + /// CmdEndIf() command are executed if the condition specified in the innermost active conditional block is false. + virtual void CmdElse() = 0; + + /// Ends the innermost active conditional block in the current command buffer. + virtual void CmdEndIf() = 0; + + /// Begins a while loop in the current command buffer. All commands between this and the corresponding CmdEndWhile() + /// command are executed repeatedly as long as the specified condition remains true. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] offset Offset within the memory object where the tested memory location begins. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. + virtual void CmdWhile( + const IGpuMemory& gpuMemory, + gpusize offset, + uint64 data, + uint64 mask, + CompareFunc compareFunc) = 0; + + /// Ends the innermost active while loop in the current command buffer.
+ virtual void CmdEndWhile() = 0; + + /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a + /// GPU register. + /// + /// The client (or application) is supposed to do necessary barriers before calling this function, but for now this + /// is only needed to wait on some display- or timer-related registers. + /// + /// @param [in] registerOffset The offset in bytes of GPU register to be tested. + /// @param [in] data Source data to compare against the value of GPU register. + /// @param [in] mask Mask to apply to the GPU register value (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never shouldn't + /// be used as the hardware does not support it. + virtual void CmdWaitRegisterValue( + uint32 registerOffset, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Stalls a command buffer execution based on a condition that compares an immediate value with value coming from a + /// GPU memory location. + /// + /// The client (or application) is expected to transition the memory to the proper state before calling this function. + /// The memory location for the condition must be 4-byte aligned. + /// This function requires use of the following barrier flags on @ref gpuVirtAddr: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuVirtAddr GPU memory address containing the data to be tested. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not + /// be used as the hardware does not support it.
+ virtual void CmdWaitMemoryValue( + gpusize gpuVirtAddr, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Stalls a command buffer execution until an external device writes to the marker surface in the GPU bus + /// addressable memory location. + /// + /// This function requires use of the following barrier flags on @ref gpuMemory: + /// - PipelineStage: @ref PipelineStagePostPrefetch + /// - CacheCoherency: @ref CoherCp + /// + /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. + /// @param [in] data Source data to compare against the value in GPU memory. + /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. + /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not + /// be used as the hardware does not support it. + virtual void CmdWaitBusAddressableMemoryMarker( + const IGpuMemory& gpuMemory, + uint32 data, + uint32 mask, + CompareFunc compareFunc) = 0; + + /// Begins the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to begin. + virtual void CmdBeginPerfExperiment( + IPerfExperiment* pPerfExperiment) = 0; + + /// Updates the sqtt token mask on the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to update. + /// @param [in] tokenConfig Updated token and reg mask to apply. + /// + /// @note This function is only valid to call if pPerfExperiment is a thread trace experiment that is currently + /// active. + virtual void CmdUpdatePerfExperimentSqttTokenMask( + IPerfExperiment* pPerfExperiment, + const ThreadTraceTokenConfig& tokenConfig) = 0; + + /// Updates the sqtt token mask on all running traces, if any. + /// + /// @note This may overwrite the stall settings (making them more conservative) + /// @param [in] tokenConfig updated token and reg mask to apply.
+ virtual void CmdUpdateSqttTokenMask( + const ThreadTraceTokenConfig& tokenConfig) = 0; + + /// Ends the specified performance experiment. + /// + /// @param [in] pPerfExperiment Performance experiment to end. + virtual void CmdEndPerfExperiment( + IPerfExperiment* pPerfExperiment) = 0; + + /// Inserts a trace marker into the command buffer. + /// + /// A trace marker can be inserted to mark particular points of interest in a command buffer to be viewed with the + /// trace data collected in a performance experiment. + /// + /// @param [in] markerType Selects one of two generic marker categories ("A" or "B"). + /// @param [in] markerData 32-bit marker value to be inserted. + virtual void CmdInsertTraceMarker( + PerfTraceMarkerType markerType, + uint32 markerData) = 0; + + /// Inserts a set of SQ thread trace markers for consumption by the Radeon GPU Profiler (RGP). + /// + /// Only supported on Universal and Compute engines. + /// + /// @param [in] numDwords Number of dwords in pData to be inserted as SQTT markers. + /// @param [in] pData SQTT marker data. See the RGP SQTT Instrumentation Specification for details on how this + /// data should be formatted. + virtual void CmdInsertRgpTraceMarker( + RgpMarkerSubQueueFlags subQueueFlags, + uint32 numDwords, + const void* pData) = 0; + + /// This function is to be used to copy the DF SPM (MALL SPM) data from the output buffers to an accessible buffer. + /// The buffer that HW outputs to is allocated with a special KMD flag and therefore cannot be the same as the + /// normal IPerfExperiment buffer so we need a special command to get the data. + /// + /// The bulk of the implementation for this is done by the KMD. They are in charge of starting and stopping the + /// trace as well as all of the register programming. When KMD receives a dfSpmTraceEnd bit from a CmdBufInfo + /// flag, they will wait for the command buffer to be completely idle before stopping the trace.
Therefore, a + /// CmdEndPerfExperiment call does not stop this particular sample, the end of a command buffer with a + /// dfSpmTraceEnd does. This means that calling CmdCopyDfSpmTraceData in the same command buffer as + /// dfSpmTraceEnd will give you incorrect data. The sample will still be in progress when the copy happens. + /// You must call CmdCopyDfSpmTraceData in a separate command buffer after one where the dfSpmTraceEnd bit is + /// set. + /// + /// There is also a metadata buffer that does not need a special KMD flag. It is also stored in a separate buffer + /// and is copied along with the output buffer with this command. It contains a uint32 trace size, a uint32 pad, + /// a uint64 start trace GPU timestamp and a uint64 stop trace GPU timestamp and is placed at the beginning of the + /// dstGpuMemory. + /// + /// The minimum size of the dstGpuMemory should be the size of the metadata struct plus the size of the DF SPM + /// ringSize given to the perf experiment. The SPM data may not fill the entire memory, but the client is + /// responsible for parsing the data. + /// + /// This function requires use of the following barrier flags on @ref dstGpuMemory: + /// - PipelineStage: @ref PipelineStageBlt + /// - CacheCoherency: @ref CoherCopyDst + /// + /// @param [in] perfExperiment The perfExperiment that we will be copying the data from + /// @param [in] dstGpuMemory The memory location that the DF SPM trace data will be copied to. + /// @param [in] dstOffset The offset into the destination memory that the data will be copied to. + virtual void CmdCopyDfSpmTraceData( + const IPerfExperiment& perfExperiment, + const IGpuMemory& dstGpuMemory, + gpusize dstOffset) = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Loads data from the provided GPU Memory object into Constant Engine RAM. + /// + /// @param [in] srcGpuMemory GPU Memory object containing the source data to be loaded to CE RAM. 
+ /// @param [in] memOffset Offset within the memory object where the source data is located, + /// must be 32-byte aligned. + /// @param [in] ramOffset Byte offset destination in CE RAM where the data should be loaded, + /// must be 32-byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be loaded into CE RAM, must be a multiple of 8. + void CmdLoadCeRam( + const IGpuMemory& srcGpuMemory, + gpusize memOffset, + uint32 ramOffset, + uint32 dwordSize) {} + + /// Dumps data from Constant Engine RAM to the provided GPU Memory address which may be located in a GPU ring buffer + /// managed by the CE. The CE can be used to automatically handle the synchronization between the DE and CE when + /// manipulating a GPU ring buffer. In order for PAL to instruct the CE to handle this, we need to know the current + /// position (entry) within the ring buffer being dumped to, as well as the total size (in entries) of the ring. + /// + /// @param [in] dstGpuMemory GPU Memory object destination where the data should be dumped from CE RAM. + /// @param [in] memOffset Offset within the memory object where data should be dumped, must be 4 byte aligned. + /// @param [in] ramOffset Byte offset source in CE RAM for data that should be dumped, must be 4 byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be dumped from CE RAM into GPU Memory + /// @param [in] currRingPos Current position (ring entry) in the GPU ring buffer being managed by the CE which the + /// dump location corresponds to. + /// @param [in] ringSize Number of entries in the GPU ring buffer being managed by the CE. If the memory being + /// dumped into is not managed in a ring-like fashion, this should be set to zero. 
+ void CmdDumpCeRam( + const IGpuMemory& dstGpuMemory, + gpusize memOffset, + uint32 ramOffset, + uint32 dwordSize, + uint32 currRingPos, + uint32 ringSize) {} + + /// Writes CPU data to Constant Engine RAM + /// + /// @param [in] pSrcData Pointer to the source CPU data to be written to CE RAM. + /// @param [in] ramOffset Byte offset in CE RAM where the data should be written, must be 4 byte aligned. + /// @param [in] dwordSize Number of DWORDs that should be written from pSrcData into CE RAM. + virtual void CmdWriteCeRam( + const void* pSrcData, + uint32 ramOffset, + uint32 dwordSize) {} +#endif + + /// Allocates a chunk of command space that the client can use to embed constant data directly in the command + /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address + /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within + /// this command buffer (e.g., as an SRD table address). + /// + /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the + /// value reported by GetEmbeddedDataLimit(). + /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. + /// @param [out] pGpuAddress The GPU address of the embedded space. + /// + /// @returns The DWORD-aligned CPU address of the embedded space. + virtual uint32* CmdAllocateEmbeddedData( + uint32 sizeInDwords, + uint32 alignmentInDwords, + gpusize* pGpuAddress) = 0; + + /// Allocates a chunk of command space that the client can use to embed constant data directly in the command + /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address + /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within + /// this command buffer (e.g., as an SRD table address). 
+ /// + /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the + /// value reported by GetLargeEmbeddedDataLimit(). + /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. + /// @param [out] pGpuAddress The GPU address of the embedded space. + /// + /// @returns The DWORD-aligned CPU address of the embedded space. + virtual uint32* CmdAllocateLargeEmbeddedData( + uint32 sizeInDwords, + uint32 alignmentInDwords, + gpusize* pGpuAddress) = 0; + + /// Get memory from scratch memory and bind to GPU event. For now only GpuEventPool and CmdBuffer's internal + /// GpuEvent use this path to allocate and bind GPU memory. These usecases assume the bound GPU memory is GPU access + /// only, so client is responsible for resetting the event from GPU, and cannot call Set(), Reset(), GetStatus(). + /// + /// @param [in] pGpuEvent The GPU event that needs to bind a memory. Must not be nullptr. + /// + /// @returns Success if the GPU event successfully binds a GPU memory. Otherwise, one of the following errors may + /// be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Result AllocateAndBindGpuMemToEvent( + IGpuEvent* pGpuEvent) = 0; + + /// Issues commands to prime GPU caches shortly before accessing the specified GPU address range(s). The benefit of + /// this prefetching is likely to be platform-dependent based on the GPU's cache hierarchy, memory subsystem, + /// available prefetching tools in hardware, etc., so caller beware. + /// + /// This operation may read data from memory into caches and therefore counts as a general BLT SRC operation with + /// regard to barrier execution and memory dependencies. + /// + /// @param [in] rangeCount Number of entries in pRanges. + /// @param [in] pRanges Array of structs defining a memory range and properties controlling prefetching of that + /// range. 
+ virtual void CmdPrimeGpuCaches( + uint32 rangeCount, + const PrimeGpuCacheRange* pRanges) = 0; + + /// Issues commands which execute the specified group of nested command buffers. The observable behavior of this + /// operation should be indiscernible from directly recording the nested command buffers' commands directly into + /// this command buffer. Naturally, the queue type of the nested command buffers must match this command buffer. + /// + /// Conceptually, executing a nested command buffer is similar to calling a subroutine: the root command buffer is + /// like the "caller", while the nested ones are the "callees". + /// + /// If any nested command buffers were allocated from a @ref ICmdAllocator with @ref autoMemoryReuse enabled, + /// resetting or destroying those nested command buffers will render them retroactively uncallable. This effectively + /// makes the caller command buffer invalid and illegal to submit even if it was otherwise valid and executable in + /// the past. If the nested command allocator has autoMemoryReuse disabled, the calls to reset nested command + /// buffers remain valid until the allocator itself is reset. + /// + /// State inheritance/leakage between the caller and callee(s) has the following behavior: + /// + The callee only inherits the state specified in the callee CmdBufferBuildInfo. It is up to the client to + /// bind any default state necessary when they called @ref ICmdBuffer::Begin() to begin building the callee. + /// By default no state is inherited and all state must be specified by the client. + /// + The callee leaks any render and resource-binding state back into the caller after it completes. It is up to + /// the client to rebind the caller's state after this operation completes if they don't want state leakage. + /// + Both of the above points apply in between callees, if more than one command buffer is being executed by this + /// call. 
+ /// + /// @param [in] cmdBufferCount Number of nested command buffers to execute. (i.e., size of the ppCmdBuffers + /// array). This must be at least one, otherwise making this call is pointless. + /// @param [in,out] ppCmdBuffers Array of nested command buffers to execute. It is an error condition if any + /// of the following are true: (Debug assertions are used to check them.) + /// + ppCmdBuffers is null. + /// + Any member of ppCmdBuffers is null. + /// + Any member of ppCmdBuffers is a root command buffer, or has a different + /// queue type than this command buffer. + virtual void CmdExecuteNestedCmdBuffers( + uint32 cmdBufferCount, + ICmdBuffer*const* ppCmdBuffers) = 0; + + /// Saves a copy of some set of the current command buffer state that is used by compute workloads. This feature is + /// intended to give PAL clients a convenient way to issue their own internal compute workloads without modifying + /// the application-facing state. + /// + /// PAL cannot save multiple layers of state, each call to CmdSaveComputeState must be followed by a call to + /// CmdRestoreComputeState before the next call to CmdSaveComputeState. + /// + /// This function can only be called on command buffers that support compute workloads. All query counters will be + /// disabled until CmdRestoreComputeState is called. + /// + /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to save. + virtual void CmdSaveComputeState( + uint32 stateFlags) = 0; + + /// Restores some set of the command buffer state that is used by compute workloads. This feature is intended to + /// give PAL clients a convenient way to issue their own internal compute workloads without modifying the + /// application-facing state. + /// + /// A call to this function must be preceded by a call to CmdSaveComputeState and the save stateFlags must contain + /// all restore stateFlags, otherwise the values of the restored state are undefined. 
+ /// + /// This function can only be called on command buffers that support compute workloads. All previously disabled + /// query counters will be reactivated. + /// + /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to restore. + virtual void CmdRestoreComputeState( + uint32 stateFlags) = 0; + + /// Issues commands which complete two tasks: using the provided @ref IIndirectCmdGenerator object to translate the + /// indirect argument buffer into a format understandable by the GPU; and then executing the generated commands. + /// + /// The virtual address must be 4-byte aligned. + /// + /// The indirect argument data offset in memory must be 4-byte aligned. The expected layout of the argument data + /// is defined by the @ref IIndirectCmdGenerator object. + /// + /// It is unsafe to call this method on a command buffer which was not begun with either the optimizeOneTimeSubmit + /// or optimizeExclusiveSubmit flags. This is because there is a potential race condition if the same command buffer + /// is generating indirect commands on multiple Queues simultaneously. + /// + /// This function requires use of the following barrier flags on the indirect memory: + /// - PipelineStage: @ref PipelineStageFetchIndirectArgs + /// - CacheCoherency: @ref CoherIndirectArgs + /// + /// @param [in] generator Indirect command generator object which can translate the indirect argument buffer + /// into a command buffer format which the GPU can understand. + /// @param [in] gpuVirtAddr Gpu virtual address where the indirect argument data is located. + /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr is nonzero, the + /// value at that memory location is clamped to this maximum. If countGpuAddr is zero, + /// Then the number of draws issued exactly matches this number. + /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. Must be 4-byte aligned. 
+ virtual void CmdExecuteIndirectCmds( + const IIndirectCmdGenerator& generator, + gpusize gpuVirtAddr, + uint32 maximumCount, + gpusize countGpuAddr) = 0; + + /// Updates one or more HiS pretests bound to the given stencil image within a range of mip levels. + /// See @ref HiSPretests for a summary of HiS. + /// + /// @warning Improper use of pretests can cause corruption. Please see @ref HiSPretests for more information. + /// + /// @param [in] pImage The stencil image that will receive the new pretest(s). + /// @param [in] pretests The new pretest(s). + /// @param [in] firstMip The beginning of the mip range which will receive the new pretest(s). + /// @param [in] numMips The number of mips in the mip range which will receive the new pretest(s). + virtual void CmdUpdateHiSPretests( + const IImage* pImage, + const HiSPretests& pretests, + uint32 firstMip, + uint32 numMips) = 0; + + /// Reserve @ref CommandDataAlloc space for external command packets up to a size of @ref sizeInDwords. + /// This method is only supported on command buffers for the following queue types: + /// + /// @warning @ref CmdCommitSpace must be called once after this function is called. + /// Failing to pair up these function calls will result in undefined behavior. + /// + /// @param [in] sizeInDwords Size of the command buffer space to reserve in dwords. + /// If this param is 0, the default command stream reserve limit will be used. + /// @param [in] reserveInNewChunk Selection to reserve space in a new chunk or current chunk. + /// + /// @returns A pointer to the reserved command space. + virtual uint32* CmdReserveSpace( + uint32 sizeInDwords, + bool reserveInNewChunk) = 0; + + /// Ensure data is committed to the command buffer and unused space is reclaimed. + /// This method is only supported on command buffers for the following queue types: + /// + /// @param [in] pCmdSpace Pointer to the next unused dword in the command buffer.
+ virtual void CmdCommitSpace( + uint32* pCmdSpace) = 0; + + /// Executes any internal postprocessing commands to be performed on a frame, such as drawing the dev driver + /// overlay. Calling this prior to presenting (via any path) is a requirement, and must be prior to or + /// concurrent with frameEnd if FSFM is applicable. This must be called using the image that will be the + /// source of the present. + /// + /// @param [in] postProcessInfo Information about the frame to be postprocessed. + /// @param [out] pAddedGpuWork (Optional) Set to true if commands were added as part of this call. + virtual void CmdPostProcessFrame( + const CmdPostProcessFrameInfo& postProcessInfo, + bool* pAddedGpuWork) = 0; + + /// Inserts a string embedded inside a NOP packet with a signature that is recognized by tools and can be printed + /// inside a command buffer disassembly. Note that this is a real NOP that will really be submitted to the GPU + /// and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug dumps. + /// + /// The maximum length of a string that may be embedded in the command buffer is currently 128 characters, + /// including the NUL-terminator. This is defined in the internal command buffer class in MaxCommentStringLength. + /// + /// @param [in] pComment Pointer to NUL-terminated string that will be inserted into the command buffer. + virtual void CmdCommentString( + const char* pComment) = 0; + + /// Inserts the specified payload embedded inside a NOP packet. Note that this is a real NOP that will be submitted + /// to the GPU and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug + /// dumps. + /// + /// @param [in] pPayload Pointer to binary data to embed. + /// @param [in] payloadSize Size of the payload in DWORDs, expected to be under MaxPayloadSize. 
+ virtual void CmdNop( + const void* pPayload, + uint32 payloadSize) = 0; + + /// Marks the begin or end of a user-defined region of GPU work; analyzed post-mortem in crash-dump analysis tools. + /// Each 'Begin' marker must be paired with a corresponding 'End' marker; however, markers may be nested by + /// inserting multiple 'Begin' markers consecutively. + /// + /// @warning This function is a no-op if Crash Analysis mode is not enabled. + /// + /// @param [in] isBegin Whether this is a 'Begin' marker (true) or an 'End' marker (false). + /// @param [in] sourceId The application layer ID at which the marker is being created: + /// 0x0 => Application + /// 0x1 => API (e.g. DX12, Vulkan, etc.) + /// 0x2 => PAL + /// Developers may use IDs within the range of 10 - 15 to define a custom + /// application layer. + /// @param [in] pMarkerName A NUL-terminated string containing a name for this marker, used for annotation + /// purposes in external tools. Only valid for 'Begin' markers, and will be ignored if + /// isBegin is false. + /// @param [in] markerNameSize Size of the marker string, in bytes. + /// + /// @returns Non-zero counter value of the embedded execution marker. + /// If Crash Analysis mode is disabled, this will always return zero. + virtual uint32 CmdInsertExecutionMarker( + bool isBegin, + uint8 sourceId, + const char* pMarkerName, + uint32 markerNameSize) = 0; + + /// Performs the virtual queue handshake. The host queue will do the following: + /// - Wait until the parent kernel is done + /// - Change the parent kernel state + /// - CP waits until the child counter is 0 + /// - CP sends the termination signal to the device queue + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo.
+ /// + /// @param [in] parentState Address of the parent kernel state + /// @param [in] newStateValue The new state value of the parent kernel + /// @param [in] parentChildCounter Address of the parent child counter + /// @param [in] signal Address of the virtual queue signal + /// @param [in] dedicatedQueue Runtime uses a dedicated queue for the scheduler + /// + /// @note This function is to support OpenCL AQL submissions. + virtual void CmdVirtualQueueHandshake( + gpusize parentState, + uint32 newStateValue, + gpusize parentChildCounter, + gpusize signal, + bool dedicatedQueue) = 0; + + /// Returns GPU address of the loop start with dispatch templates. The pointer will be passed to the + /// scheduler kernel for the update of dispatch templates. + /// Also initializes common registers for each dispatch template. + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @note This function is to support OpenCL AQL submissions. + virtual gpusize CmdVirtualQueueDispatcherStart() = 0; + + /// Programs CP iterator with dispatch templates for device enqueue in OpenCL2.0 + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @param [in] signal Address for the termination signal + /// @param [in] loopStart GPU address of the loop start for CP + /// @param [in] numTemplates The number of dispatch templates + /// + /// @note This function is to support OpenCL AQL submissions. + virtual void CmdVirtualQueueDispatcherEnd( + gpusize signal, + gpusize loopStart, + uint32 numTemplates) = 0; + + /// Emulates AQL dispatch with PM4 commands. + /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. + /// + /// @param [in] dispatchInfo Pointer to kernel dispatch info + /// + /// @note This function is to support OpenCL AQL submissions. 
+ void CmdDispatchAql( + const DispatchAqlParams& dispatchInfo) + { + m_funcTable.pfnCmdDispatchAql(this, dispatchInfo); + } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + /// XDMA was retired starting in gfx10 so this function has no use anymore. + inline void CmdXdmaWaitFlipPending() {} +#endif + + /// Starts thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP + /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these + /// functions) + /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. + /// The caller is responsible for setting up valid GPU Profiler panel settings. + virtual void CmdStartGpuProfilerLogging() = 0; + + /// Stops thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP + /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these + /// functions) + /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. + /// The caller is responsible for setting up valid GPU Profiler panel settings. + virtual void CmdStopGpuProfilerLogging() = 0; + + /// Set a mask to control which view instances are enabled for subsequent draws, should only be called on + /// universal command buffers. + /// + /// @param [in] mask The mask to control which view instances are enabled. + virtual void CmdSetViewInstanceMask(uint32 mask) = 0; + + /// Get used size of all chunks in bytes for given CmdAllocType. For CommandDataAlloc with multi-queue scheme, the + /// size reported will be the sum of all command streams associated with the command buffer. It's legal to call + /// this function while in the command building state. + /// + /// @param [in] type Allocation type for ICmdAllocator + /// + /// @returns Used allocation data size in bytes for provided CmdAllocType. 
+ virtual uint32 GetUsedSize( + CmdAllocType type) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Gets the internal unique Id of the command buffer. + /// This function was originally only for internal debugging, and the Id is not unique across different queue types. + /// However, CmdDisassembly::ICmdBufferReporting requires a way to differentiate cmdLists that matches + /// the meaning of UniqueId(), in particular, not requiring uniqueness across different queue types. + /// + /// @returns Unique Id of the command buffer + virtual uint32 UniqueId() const = 0; + + /// Get the number of arrays of tracking data (for correlation) held by this ICmdBuffer. + /// For TrackCmdLocationBefore and TrackCmdLocationAfter below, only (idx < GetNumTrackingArrays()) + /// can yield a non-nullptr result + /// + /// @detail If GetNumTrackingArrays() returns 0 but GetTrackedCmdLocationArraySizeInBytes() does not, it will + /// be possible to create them on this ICmdBuffer using CreateTrackedCmdLocationArray + /// + /// @returns the number of TrackedCmdLocationArray's held by the ICmdBuffer. + virtual uint32 GetNumTrackingArrays() const = 0; + + /// Get the number of bytes required by CreateTrackedCmdLocationArray. + /// + /// @detail The value returned here accommodates the full number of TrackedCmdLocationArray's to be + /// created, from a single contiguous allocation. + /// If allocation has not yet occurred, (GetNumTrackingArrays() == 0).
+ /// If (GetTrackedCmdLocationArraySizeInBytes() > 0) && (GetNumTrackingArrays() == 0) + /// this ICmdBuffer supports TrackedCmdLocationArray's, but has not yet allocated them + /// If (GetTrackedCmdLocationArraySizeInBytes() == 0), this ICmdBuffer does not support + /// TrackedCmdLocationArray's + /// + /// @returns 0 if TrackedCmdLocationArray's are not supported + /// The total number of bytes required by CreateTrackedCmdLocationArray otherwise. + virtual uint32 GetTrackedCmdLocationArraySizeInBytes() const = 0; + + /// Uses the memory pMemory to initialize GetNumTrackingArrays() TrackedCmdLocationArray's on this + /// ICmdBuffer. + /// + /// @param [in] pMemory Address of memory allocated for the purpose of creating TrackedCmdLocationArray's + /// This memory should be at least GetTrackedCmdLocationArraySizeInBytes() + /// bytes. + /// + /// @returns Result::Success: Indicates creation was successful + /// Result::Unsupported: This ICmdBuffer does not support TrackedCmdLocationArray's + /// Confirm (GetTrackedCmdLocationArraySizeInBytes() > 0) before using this function + /// Result::ErrorInvalidPointer: pMemory == nullptr + /// Result::AlreadyExists: TrackedCmdLocationArray's have already been allocated on + /// this ICmdBuffer + /// other: Error values originating from Util::Vector::Reserve() or + /// Util::Vector::PushBack() + /// + virtual Result CreateTrackedCmdLocationArray( + void* pMemory) = 0; + + /// Executes the destructors for all TrackedCmdLocationArray's owned by this ICmdBuffer. This should be + /// called prior to deleting the memory pMemory that was originally provided to CreateTrackedCmdLocationArray + /// + /// @param [out] ppAllocatedMemory + /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() > 0) + /// The original value for pMemory provided in CreateTrackedCmdLocationArray(pMemory) will + /// be returned in *ppAllocatedMemory.
i.e. *ppAllocatedMemory = pMemory + /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() == 0), + /// *ppAllocatedMemory = nullptr; + /// + virtual void DestroyTrackedCmdLocationArray( + void** ppAllocatedMemory) = 0; + + /// Creates a new TrackedCmdLocation in the TrackedCmdLocationArray corresponding to idx + /// of type TrackedCmdLocationMode::Begin. This location will be furnished with a pointer to the address of + /// the next PM4Packet to be created on the CmdBuffer referred to by idx, and m_event == eventId. + /// + /// @detail Note there are two potential valid corner cases. + /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == 0) + /// Indicates there is a commandstream but it has not begun building PM4Packets + /// This implicitly refers to the baseAddress of the commandstream, once building begins + /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) + /// Indicates there is not yet an associated commandstream. + /// + /// @param [in] idx The idx corresponding to GetTrackingArray(idx). + /// Only (idx < GetNumTrackingArrays()) will yield non-trivial results.
+ /// @param [in] eventId The code for the event being tracked + /// @param [out] pBeforeResult If successful, returns a TrackedCmdLocationRef to a TrackedCmdLocation within + /// the TrackedCmdLocationArray corresponding to idx + /// Otherwise, if (pBeforeResult != nullptr), is initialized to the default for + /// TrackedCmdLocationRef() + /// + /// @returns + /// Pal::Result::Success if successful + /// Pal::Result::ErrorInvalidPointer if (pBeforeResult == nullptr) + /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer + /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) + /// Pal::Result::NotFound if there is no tracking array corresponding to idx + /// This can occur if CreateTrackedCmdLocationArray has not been + /// called - which may mean the feature is disabled + /// Pal::Result::ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable + /// to allocate memory + /// + /// + virtual Pal::Result TrackCmdLocationBefore( + uint32 idx, + uint8 eventId, + CmdDisassembly::TrackedCmdLocationRef* pBeforeResult) = 0; + + /// Similar to TrackCmdLocationBefore, TrackCmdLocationAfter creates a TrackedCmdLocation, of type + /// TrackedCmdLocationMode::End or TrackedCmdLocationMode::Delta. This location will be furnished with a + /// pointer to the address of the next PM4Packet to be created on the CmdBuffer referred to by idx. Type + /// TrackedCmdLocationMode::Delta will only occur if parameter before is the last TrackedCmdLocation + /// for the TrackedCmdLocationArray corresponding to idx, has the same m_event == eventId, and the change in + /// pointer address is small enough to be represented in 6 bits. + /// + /// @detail Note there are two potential valid corner cases. + /// (pAfterResult->Get()->m_correlateInternal.m_ptr
== 0) + /// Indicates there is a commandstream but it has not begun building PM4Packets + /// This implicitly refers to the baseAddress of the commandstream, once building begins + /// (pAfterResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) + /// Indicates there is not yet an associated commandstream. + /// In both of these cases, if parameter "before" had the same value for m_correlateInternal.m_ptr + /// and "before" referred to the most recent TrackedCmdLocation, this tracked location will be of + /// type TrackedCmdLocationMode::Delta, with (before.Get()->m_correlateInternal.m_deltaInDWords == 0) + /// + /// @param [in] idx The idx corresponding to GetTrackingArray(idx). + /// Only (idx < GetNumTrackingArrays()) can yield non-trivial results. + /// @param [in] eventId The code for the event being tracked + /// @param [in] before The corresponding location generated by TrackCmdLocationBefore + /// This may be CmdDisassembly::TrackedCmdLocationRef() if no location from + /// TrackCmdLocationBefore exists (such as on Reset) + /// @param [out] pAfterResult Returns a TrackedCmdLocationRef to a TrackedCmdLocation within the TrackedCmdLocationArray + /// corresponding to idx + /// + /// @returns + /// Pal::Result::Success if successful + /// Pal::Result::ErrorInvalidPointer if (pAfterResult == nullptr) + /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer + /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) + /// Pal::Result::ErrorInvalidValue if (eventId != before.m_correlateInternal.m_event) + /// Pal::Result::NotFound if there is no tracking array corresponding to idx + /// This can occur if CreateTrackedCmdLocationArray has not been + /// called - which may mean the feature is disabled + /// Pal::Result::ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable + /// to allocate memory + /// + + virtual Pal::Result TrackCmdLocationAfter( + uint32
idx, + uint8 eventId, + CmdDisassembly::TrackedCmdLocationRef before, + CmdDisassembly::TrackedCmdLocationRef* pAfterResult) = 0; + + /// An accessor function for the TrackedCmdLocationArray corresponding to idx + /// + /// @param idx There is a CmdDisassembly::TrackedCmdLocationArray* corresponding to each + /// sub-cmdBuffer for this cmdBuffer. This idx indexes these in the same fashion. + /// Only idx < GetNumTrackingArrays() can yield non-nullptr results. + /// + /// @returns the TrackedCmdLocationArray corresponding to idx + virtual CmdDisassembly::TrackedCmdLocationArray* GetTrackingArray( + uint32 idx) const = 0; + + /// TrackClientEvent operates similarly to TrackCmdLocationBefore and TrackCmdLocationAfter above but + /// operates on all TrackedCmdLocationArray's on this CmdBuffer. The TrackedCmdLocation generated here + /// corresponds to type TrackedCmdLocationMode::ClientEvent, where clientId is a value the client is using + /// to track this cmdBuffer, and clientEventId refers to some event the client is tracking outside of driver. + /// + /// @detail The first call to TrackClientEvent will include a TrackedCmdLocation with + /// (m_mode == TrackedCmdLocationMode::ClientId), to define clientId on this cmdBuffer + /// All calls will generate the TrackedCmdLocationMode::ClientEventId with + /// (m_clientEvent.m_clientEventId == clientEventId) + /// and be followed with a TrackedCmdLocationMode::Delta location with (m_eventId == PostClientEvent (0xFF)) + /// + /// @param [in] clientId Value the client is using to track this cmdBuffer. + /// @param [in] clientEventId Identifies the event the client is tracking outside of driver. + /// + /// @returns + /// Result::Success if the tracked client event was successfully recorded + /// Result::Unsupported if the implementation of ICmdBuffer does not support tracking + /// Result::ErrorInvalidPointer if there was an error encountered determining the cmdList correlation + /// requested. This is likely to be an out-of-memory situation. + /// Result::AlreadyExists if registering clientId occurred multiple times.
This should only occur for + /// race conditions, if the code calling TrackClientEvent is not threadsafe + virtual Result TrackClientEvent( + uint64 clientId, + uint64 clientEventId) = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + ICmdBuffer() : m_pClientData(nullptr) + { + } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~ICmdBuffer() { } + + /// Structure for function pointers for the ICmdBuffer::Cmd* functions. + struct CmdBufferFnTable + { + /// CmdSetUserData function pointers for each pipeline bind point. + CmdSetUserDataFunc pfnCmdSetUserData[static_cast(PipelineBindPoint::Count)]; + + CmdDrawFunc pfnCmdDraw; ///< CmdDraw function pointer. + CmdDrawOpaqueFunc pfnCmdDrawOpaque; ///< CmdDrawOpaque function pointer. + CmdDrawIndexedFunc pfnCmdDrawIndexed; ///< CmdDrawIndexed function pointer. + CmdDrawIndirectMultiFunc pfnCmdDrawIndirectMulti; ///< CmdDrawIndirectMulti function pointer. + CmdDrawIndexedIndirectMultiFunc pfnCmdDrawIndexedIndirectMulti; ///< CmdDrawIndexedIndirectMulti func pointer. + CmdDispatchFunc pfnCmdDispatch; ///< CmdDispatch function pointer. + CmdDispatchIndirectFunc pfnCmdDispatchIndirect; ///< CmdDispatchIndirect function pointer. + CmdDispatchOffsetFunc pfnCmdDispatchOffset; ///< CmdDispatchOffset function pointer. + CmdDispatchMeshFunc pfnCmdDispatchMesh; ///< CmdDispatchMesh function pointer. + CmdDispatchMeshIndirectMultiFunc pfnCmdDispatchMeshIndirectMulti; ///< CmdDispatchMeshIndirectMulti function pointer. + CmdDispatchAqlFunc pfnCmdDispatchAql; ///< CmdDispatchAql function pointer. + } m_funcTable; ///< Function pointer table for Cmd* functions. + +private: + /// @internal Client data pointer.
This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 + /// @internal Some back-compat glue for some of the HwPipePoint interfaces in this file. + static constexpr uint32 HwPipePointToStage[] = + { + PipelineStageTopOfPipe, // HwPipeTop = 0x0 + PipelineStagePostPrefetch, // HwPipePostPrefetch = 0x1 + PipelineStageVs, // HwPipePreRasterization = 0x2 + PipelineStagePs, // HwPipePostPs = 0x3 + PipelineStageLateDsTarget, // HwPipePreColorTarget = 0x4 + PipelineStageCs, // HwPipePostCs = 0x5 + PipelineStageBlt, // HwPipePostBlt = 0x6 + PipelineStageBottomOfPipe, // HwPipeBottom = 0x7 + }; +#endif +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h b/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h index 028e10978a..656b412a85 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palCmdTracking.h @@ -1,370 +1,370 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palCmdTracking.h - * @brief Defines a number of support classes used for construction and storage of struct TrackedCmdLocation - * defined in trackedCmdLocation.h - * - * - struct TrackingEventInfo: A single from uint8 to name, used for logging - * - class TrackedCmdSupportBase A set of TrackingEventInfo, maintained outside of Pal - * - class TrackedCmdLocationArray The arrays for TrackedCmdLocation's used for reporting - * correlation data through ICmdBufferReporting::CorrelationReportOnSubmit - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palVector.h" - -#include "trackedCmdLocation.h" - -namespace Pal -{ - -// forward decl -class Platform; - -namespace CmdDisassembly -{ - -// forward definition -class TrackedCmdLocationArray; - -/** -************************************************************************************************************************ -* @brief class TrackedCmdLocationRef -* A copyable reference to a member in a TrackedCmdLocationArray, invariant to that array be -* re-allocated. 
-* -* @detail Is simply a pointer to a TrackedCmdLocationArray, and an index in to that array -* -************************************************************************************************************************ -*/ -class TrackedCmdLocationRef -{ -public: - TrackedCmdLocationRef() - : m_pSourceArray(nullptr), - m_index(0) - { - } - - TrackedCmdLocationRef( - TrackedCmdLocationArray* pSourceArray, - Util::uint32 index) - : m_pSourceArray(pSourceArray), - m_index(index) - { - } - - TrackedCmdLocationRef( - TrackedCmdLocationRef&& other) = default; - TrackedCmdLocationRef( - TrackedCmdLocationRef const& other) = default; - TrackedCmdLocationRef& operator=( - TrackedCmdLocationRef&& other) = default; - TrackedCmdLocationRef& operator=( - TrackedCmdLocationRef const& other) = default; - - bool operator==( - TrackedCmdLocationRef const& other) const - { return (this->m_pSourceArray == other.m_pSourceArray) && (this->m_index == other.m_index); } - bool operator!=( - TrackedCmdLocationRef const& other) const - { return (this->m_pSourceArray != other.m_pSourceArray) || (this->m_index != other.m_index); } - - TrackedCmdLocation* Use(); - const TrackedCmdLocation* Get() const; - - Util::uint32 GetIndex() const - { - return m_index; - } - - /// Helper functions - /// - - /// Clears the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// Result::Success if successful - Result Clear(); - - /// @returns - /// TrackedCmdLocationMode::Invalid if (IsValid() == false) - /// Get()->m_mode otherwise - TrackedCmdLocationMode GetMode() const; - - /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// to mode TrackedCmdLocationMode::Before - /// - /// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it - /// Most likely, a value registered to a TrackedCmdSupportBase - /// @param [in] beforePtr The end pointer for the 
cmdList being tracked before the event referred to by eventId - /// Only 48-bits of beforePtr are used - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// Result::Success if successful - Result SetAsBefore( - uint8 eventId, - uint64 beforePtr); - - /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// to mode TrackedCmdLocationMode::After - /// - /// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it - /// Most likely, a value registered to a TrackedCmdSupportBase - /// @param [in] afterPtr The end pointer for the cmdList being tracked after the event referred to by eventId - /// Only 48-bits of afterPtr are used - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// Result::Success if successful - Result SetAsAfter( - uint8 eventId, - uint64 afterPtr); - - /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// to mode TrackedCmdLocationMode::Delta, with no begin or end (ie, no data can be written to - /// the cmdList being tracked "during" the event referred to be eventId - /// - /// @param [in] eventId Refers to an uint8 event that does not have a begin and/or an end associated with it - /// Such as Pal::CmdDisassembly::TrackedCmdLocation::PostClientEvent - /// @param [in] ptr The end pointer for the cmdList being tracked after the event referred to by eventId - /// Only 48-bits of ptr are used - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// Result::Success if successful - Result SetAsEmptyDelta( - uint8 eventId, - uint64 ptr); - - /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// to mode TrackedCmdLocationMode::ClientId - /// - /// @param [in] clientId A 61-bit bit value used by the client application to identify which cmdList is being - /// tracked - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// 
Result::Success if successful - Result SetAsClientId( - uint64 clientId); - - /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef - /// to mode TrackedCmdLocationMode::ClientEventId - /// - /// @param [in] clientEventId A 61-bit bit value used by the client application to identify - /// a client event relative to the current end position of the cmdList being tracked - /// - /// @returns - /// Result::ErrorInvalidPointer if (IsValid() == false) - /// Result::Success if successful - Result SetAsClientEvent( - uint64 clientEventId); - - /// @brief bool TrackedCmdLocation::TrySetAsDelta(uint64 afterPtr) - /// Will attempt to set this TrackedCmdLocation to type TrackedCmdLocationMode::Delta - /// - /// @detail If GetMode() == TrackedCmdLocationMode::Before and afterPtr - m_correlateInternal.m_ptr is small - /// enough to be encoded in m_correlateInternal.m_deltaInDWords, the mode will be altered to - /// TrackedCmdLocationMode::Delta, with afterPtr - m_correlateInternal.m_ptr encoded in - /// m_correlateInternal.m_deltaInDWords. - /// If this attempt fails, the calling function should instead create a TrackedCmdLocationMode::After - /// TrackedCmdLocation - /// - /// @param [in] afterPtr, the value a TrackedCmdLocationMode::After would have for m_correlateInternal.m_ptr - /// @return Result::Success if it was possible to set this TrackedCmdLocation to type - /// TrackedCmdLocationMode::Delta - /// Result::Unsupported if the conditions described above are not met. 
- Result TrySetAsDelta( - uint64 afterPtr); - -private: - TrackedCmdLocationArray* m_pSourceArray; - Util::uint32 m_index; - - Result SetMode( - TrackedCmdLocationMode mode); -}; - -/// @brief struct TrackingEventInfo -/// Essentially just a name, plus a boolean to indicate whether the name is valid / has been set -struct TrackingEventInfo -{ - Util::StringView name; - bool isValid; - - TrackingEventInfo() - : isValid(false) - {} -}; - -/** -************************************************************************************************************************ -* @brief class TrackedCmdSupportBase translates eventId's to strings for internal correlation events -* -* @detail For use in Pal::Queue when dumping to text files. Corresponds to -* TrackedCmdLocation::m_correlateInternal.m_event for the cases where TrackedCmdLocation::m_mode -* is not TrackedCmdLocationMode::ClientEvent -* -* The implementation for this is in whatever client of Pal that is creating the internal correlation events, -* -************************************************************************************************************************ -*/ -class TrackedCmdSupportBase -{ -public: - virtual ~TrackedCmdSupportBase() = default; - - void SetEventIdName( - uint8 eventId, - const char* name) - { - PAL_ASSERT(static_cast(eventId) < NumUInt8Values); - m_allEventsMap[eventId].name = name; - m_allEventsMap[eventId].isValid = true; - } - - TrackingEventInfo const& GetEventInfo( - uint8 eventId) const - { - PAL_ASSERT(static_cast(eventId) < NumUInt8Values); - return m_allEventsMap[eventId]; - } - -protected: - static constexpr uint32 NumUInt8Values = UINT8_MAX + 1; - - TrackingEventInfo m_allEventsMap[NumUInt8Values]; - - TrackedCmdSupportBase() = default; -}; - -/** -************************************************************************************************************************ -* @brief class TrackedCmdLocationArray is simple a TrackedCmdLocationVec together with a clientId -* and some 
helpers. TrackedCmdLocationArray live on Pal::GfxCmdBuffer -* -* @detail Each Pal::GfxCmdBuffer has at most CmdDisassembly::MaxNumSubCmdBuffers TrackedCmdLocationArray's -* corresponding to Pal::GfxCmdBuffer::NumCmdStreams(); -* -* The clientId used for TrackedCmdLocationArray::m_clientId, corresponds to the client Id used in -* TrackedCmdLocation::m_clientId.m_clientId -* -* For the moment, the underlying implementation used is -* Util::Vector, but could be changed to use a Chunk -* scheme, especially as sizes of cmdLists can become very large. -* The only requirement to a change, is for TrackedCmdLocationRef continues to function as an accessor -* -* Note that the functions in TrackedCmdLocationArray are not designed for thread-safety, as they are -* issued from command-list-building functions that are, in their turn, not thread safe. Adding mutex -* behavior here would potentially hide issues relating to thread-safety. -* -************************************************************************************************************************ -*/ -class TrackedCmdLocationArray -{ -public: - static constexpr uint32 DefaultCapacity = 1024; - static constexpr uint32 BadIndex = UINT32_MAX; - static constexpr uint64 InvalidClientId = UINT64_MAX; - - typedef Util::Vector TrackedCmdLocationVec; - - static uint32 GetTrackedCmdLocationArraySizeInBytes() - { - return sizeof(TrackedCmdLocationArray); - } - - static TrackedCmdLocationArray* CreateTrackedCmdLocationArray( - void* pMemory, - Pal::Platform* pPlatform); - - void Reset() - { - m_lastLocation = TrackedCmdLocationRef(this, BadIndex); - m_clientId = InvalidClientId; - m_locations.Clear(); - } - - void Destroy(); - - uint64 GetClientId() const - { - return m_clientId; - } - - Result SetClientId( - uint64 clientId); - - Util::uint32 GetTotalSize() const - { - return m_locations.size(); - } - - const TrackedCmdLocationVec& GetLocationsVec() const - { - return m_locations; - } - - TrackedCmdLocationVec& 
UseLocationsVec() - { - return m_locations; - } - - Pal::Result MakeNext( - TrackedCmdLocationRef* pResult); - - const TrackedCmdLocationRef GetLast() const - { - return m_lastLocation; - } - - bool IsLast( - TrackedCmdLocationRef const& location) const - { - return location == m_lastLocation; - } - -private: - TrackedCmdLocationVec m_locations; - Pal::Platform* m_pPlatform; - uint64 m_clientId; - TrackedCmdLocationRef m_lastLocation; - - TrackedCmdLocationArray( - Pal::Platform* pPlatform); - - ~TrackedCmdLocationArray() = default; -}; - -} // namespace CmdDisassembly -} // namespace Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palCmdTracking.h + * @brief Defines a number of support classes used for construction and storage of struct TrackedCmdLocation + * defined in trackedCmdLocation.h + * + * - struct TrackingEventInfo: A single from uint8 to name, used for logging + * - class TrackedCmdSupportBase A set of TrackingEventInfo, maintained outside of Pal + * - class TrackedCmdLocationArray The arrays for TrackedCmdLocation's used for reporting + * correlation data through ICmdBufferReporting::CorrelationReportOnSubmit + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palVector.h" + +#include "trackedCmdLocation.h" + +namespace Pal +{ + +// forward decl +class Platform; + +namespace CmdDisassembly +{ + +// forward definition +class TrackedCmdLocationArray; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdLocationRef +* A copyable reference to a member in a TrackedCmdLocationArray, invariant to that array be +* re-allocated. 
+* +* @detail Is simply a pointer to a TrackedCmdLocationArray, and an index in to that array +* +************************************************************************************************************************ +*/ +class TrackedCmdLocationRef +{ +public: + TrackedCmdLocationRef() + : m_pSourceArray(nullptr), + m_index(0) + { + } + + TrackedCmdLocationRef( + TrackedCmdLocationArray* pSourceArray, + Util::uint32 index) + : m_pSourceArray(pSourceArray), + m_index(index) + { + } + + TrackedCmdLocationRef( + TrackedCmdLocationRef&& other) = default; + TrackedCmdLocationRef( + TrackedCmdLocationRef const& other) = default; + TrackedCmdLocationRef& operator=( + TrackedCmdLocationRef&& other) = default; + TrackedCmdLocationRef& operator=( + TrackedCmdLocationRef const& other) = default; + + bool operator==( + TrackedCmdLocationRef const& other) const + { return (this->m_pSourceArray == other.m_pSourceArray) && (this->m_index == other.m_index); } + bool operator!=( + TrackedCmdLocationRef const& other) const + { return (this->m_pSourceArray != other.m_pSourceArray) || (this->m_index != other.m_index); } + + TrackedCmdLocation* Use(); + const TrackedCmdLocation* Get() const; + + Util::uint32 GetIndex() const + { + return m_index; + } + + /// Helper functions + /// + + /// Clears the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result Clear(); + + /// @returns + /// TrackedCmdLocationMode::Invalid if (IsValid() == false) + /// Get()->m_mode otherwise + TrackedCmdLocationMode GetMode() const; + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::Before + /// + /// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it + /// Most likely, a value registered to a TrackedCmdSupportBase + /// @param [in] beforePtr The end pointer for the 
cmdList being tracked before the event referred to by eventId + /// Only 48-bits of beforePtr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsBefore( + uint8 eventId, + uint64 beforePtr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::After + /// + /// @param [in] eventId Refers to an uint8 event that has a begin and/or an end associated with it + /// Most likely, a value registered to a TrackedCmdSupportBase + /// @param [in] afterPtr The end pointer for the cmdList being tracked after the event referred to by eventId + /// Only 48-bits of afterPtr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsAfter( + uint8 eventId, + uint64 afterPtr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::Delta, with no begin or end (ie, no data can be written to + /// the cmdList being tracked "during" the event referred to be eventId + /// + /// @param [in] eventId Refers to an uint8 event that does not have a begin and/or an end associated with it + /// Such as Pal::CmdDisassembly::TrackedCmdLocation::PostClientEvent + /// @param [in] ptr The end pointer for the cmdList being tracked after the event referred to by eventId + /// Only 48-bits of ptr are used + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsEmptyDelta( + uint8 eventId, + uint64 ptr); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::ClientId + /// + /// @param [in] clientId A 61-bit bit value used by the client application to identify which cmdList is being + /// tracked + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// 
Result::Success if successful + Result SetAsClientId( + uint64 clientId); + + /// Sets the TrackedCmdLocation referred to by this TrackedCmdLocationRef + /// to mode TrackedCmdLocationMode::ClientEventId + /// + /// @param [in] clientEventId A 61-bit bit value used by the client application to identify + /// a client event relative to the current end position of the cmdList being tracked + /// + /// @returns + /// Result::ErrorInvalidPointer if (IsValid() == false) + /// Result::Success if successful + Result SetAsClientEvent( + uint64 clientEventId); + + /// @brief bool TrackedCmdLocation::TrySetAsDelta(uint64 afterPtr) + /// Will attempt to set this TrackedCmdLocation to type TrackedCmdLocationMode::Delta + /// + /// @detail If GetMode() == TrackedCmdLocationMode::Before and afterPtr - m_correlateInternal.m_ptr is small + /// enough to be encoded in m_correlateInternal.m_deltaInDWords, the mode will be altered to + /// TrackedCmdLocationMode::Delta, with afterPtr - m_correlateInternal.m_ptr encoded in + /// m_correlateInternal.m_deltaInDWords. + /// If this attempt fails, the calling function should instead create a TrackedCmdLocationMode::After + /// TrackedCmdLocation + /// + /// @param [in] afterPtr, the value a TrackedCmdLocationMode::After would have for m_correlateInternal.m_ptr + /// @return Result::Success if it was possible to set this TrackedCmdLocation to type + /// TrackedCmdLocationMode::Delta + /// Result::Unsupported if the conditions described above are not met. 
+ Result TrySetAsDelta( + uint64 afterPtr); + +private: + TrackedCmdLocationArray* m_pSourceArray; + Util::uint32 m_index; + + Result SetMode( + TrackedCmdLocationMode mode); +}; + +/// @brief struct TrackingEventInfo +/// Essentially just a name, plus a boolean to indicate whether the name is valid / has been set +struct TrackingEventInfo +{ + Util::StringView name; + bool isValid; + + TrackingEventInfo() + : isValid(false) + {} +}; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdSupportBase translates eventId's to strings for internal correlation events +* +* @detail For use in Pal::Queue when dumping to text files. Corresponds to +* TrackedCmdLocation::m_correlateInternal.m_event for the cases where TrackedCmdLocation::m_mode +* is not TrackedCmdLocationMode::ClientEvent +* +* The implementation for this is in whatever client of Pal that is creating the internal correlation events, +* +************************************************************************************************************************ +*/ +class TrackedCmdSupportBase +{ +public: + virtual ~TrackedCmdSupportBase() = default; + + void SetEventIdName( + uint8 eventId, + const char* name) + { + PAL_ASSERT(static_cast(eventId) < NumUInt8Values); + m_allEventsMap[eventId].name = name; + m_allEventsMap[eventId].isValid = true; + } + + TrackingEventInfo const& GetEventInfo( + uint8 eventId) const + { + PAL_ASSERT(static_cast(eventId) < NumUInt8Values); + return m_allEventsMap[eventId]; + } + +protected: + static constexpr uint32 NumUInt8Values = UINT8_MAX + 1; + + TrackingEventInfo m_allEventsMap[NumUInt8Values]; + + TrackedCmdSupportBase() = default; +}; + +/** +************************************************************************************************************************ +* @brief class TrackedCmdLocationArray is simple a TrackedCmdLocationVec together with a clientId +* and some 
helpers. TrackedCmdLocationArray live on Pal::GfxCmdBuffer +* +* @detail Each Pal::GfxCmdBuffer has at most CmdDisassembly::MaxNumSubCmdBuffers TrackedCmdLocationArray's +* corresponding to Pal::GfxCmdBuffer::NumCmdStreams(); +* +* The clientId used for TrackedCmdLocationArray::m_clientId, corresponds to the client Id used in +* TrackedCmdLocation::m_clientId.m_clientId +* +* For the moment, the underlying implementation used is +* Util::Vector, but could be changed to use a Chunk +* scheme, especially as sizes of cmdLists can become very large. +* The only requirement to a change, is for TrackedCmdLocationRef continues to function as an accessor +* +* Note that the functions in TrackedCmdLocationArray are not designed for thread-safety, as they are +* issued from command-list-building functions that are, in their turn, not thread safe. Adding mutex +* behavior here would potentially hide issues relating to thread-safety. +* +************************************************************************************************************************ +*/ +class TrackedCmdLocationArray +{ +public: + static constexpr uint32 DefaultCapacity = 1024; + static constexpr uint32 BadIndex = UINT32_MAX; + static constexpr uint64 InvalidClientId = UINT64_MAX; + + typedef Util::Vector TrackedCmdLocationVec; + + static uint32 GetTrackedCmdLocationArraySizeInBytes() + { + return sizeof(TrackedCmdLocationArray); + } + + static TrackedCmdLocationArray* CreateTrackedCmdLocationArray( + void* pMemory, + Pal::Platform* pPlatform); + + void Reset() + { + m_lastLocation = TrackedCmdLocationRef(this, BadIndex); + m_clientId = InvalidClientId; + m_locations.Clear(); + } + + void Destroy(); + + uint64 GetClientId() const + { + return m_clientId; + } + + Result SetClientId( + uint64 clientId); + + Util::uint32 GetTotalSize() const + { + return m_locations.size(); + } + + const TrackedCmdLocationVec& GetLocationsVec() const + { + return m_locations; + } + + TrackedCmdLocationVec& 
UseLocationsVec() + { + return m_locations; + } + + Pal::Result MakeNext( + TrackedCmdLocationRef* pResult); + + const TrackedCmdLocationRef GetLast() const + { + return m_lastLocation; + } + + bool IsLast( + TrackedCmdLocationRef const& location) const + { + return location == m_lastLocation; + } + +private: + TrackedCmdLocationVec m_locations; + Pal::Platform* m_pPlatform; + uint64 m_clientId; + TrackedCmdLocationRef m_lastLocation; + + TrackedCmdLocationArray( + Pal::Platform* pPlatform); + + ~TrackedCmdLocationArray() = default; +}; + +} // namespace CmdDisassembly +} // namespace Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h b/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h index 3e4465960b..efb4b60668 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDestroyable.h @@ -1,70 +1,70 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palDestroyable.h - * @brief Defines the Platform Abstraction Library (PAL) IDestroyable interface. - *********************************************************************************************************************** - */ - -#pragma once - -namespace Pal -{ - -/** - *********************************************************************************************************************** - * @interface IDestroyable - * @brief Interface inherited by objects that must be explicitly destroyed by the client. - * - * This includes all objects except: - * - * + @ref IColorTargetView, @ref IDepthStencilView - These classes are treated as SRDs by the DX12 runtime. Therefore, - * PAL guarantees that no action needs to be taken at Destroy() - the client should just free the memory backing these - * classes. - * + @ref IDevice - These objects are created during IPlatform::EnumerateDevices() and are automatically destroyed - * along with the Platform object. - * + @ref IPrivateScreen - These objects are created as during IPlatform::EnumerateDevices() based on - * which screens are attached to each device. They are automatically destroyed along with the Platform object. - *********************************************************************************************************************** - */ -class IDestroyable -{ -public: - /// Frees all resources associated with this object. 
- /// - /// It is the client's responsibility to only call this method once there are no more existing references to this - /// object. This method does not free the system memory associated with the object (as specified in pPlacementAddr - /// during creation); the client is responsible for freeing that memory since they allocated it. - virtual void Destroy() = 0; - -protected: - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IDestroyable() { } -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDestroyable.h + * @brief Defines the Platform Abstraction Library (PAL) IDestroyable interface. + *********************************************************************************************************************** + */ + +#pragma once + +namespace Pal +{ + +/** + *********************************************************************************************************************** + * @interface IDestroyable + * @brief Interface inherited by objects that must be explicitly destroyed by the client. + * + * This includes all objects except: + * + * + @ref IColorTargetView, @ref IDepthStencilView - These classes are treated as SRDs by the DX12 runtime. Therefore, + * PAL guarantees that no action needs to be taken at Destroy() - the client should just free the memory backing these + * classes. + * + @ref IDevice - These objects are created during IPlatform::EnumerateDevices() and are automatically destroyed + * along with the Platform object. + * + @ref IPrivateScreen - These objects are created as during IPlatform::EnumerateDevices() based on + * which screens are attached to each device. They are automatically destroyed along with the Platform object. + *********************************************************************************************************************** + */ +class IDestroyable +{ +public: + /// Frees all resources associated with this object. + /// + /// It is the client's responsibility to only call this method once there are no more existing references to this + /// object. This method does not free the system memory associated with the object (as specified in pPlacementAddr + /// during creation); the client is responsible for freeing that memory since they allocated it. 
+ virtual void Destroy() = 0; + +protected: + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IDestroyable() { } +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h b/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h index cdcca271cc..f68d1d730f 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDeveloperHooks.h @@ -1,627 +1,626 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palDeveloperHooks.h - * @brief Common include for PAL developer callbacks. Defines common enums, typedefs, structures, etc. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palCmdBuffer.h" - -namespace Pal -{ - -// Forward declarations. -class ICmdBuffer; -class IImage; -class IPipeline; - -namespace Developer -{ - -/// The type of the developer callback so the callback can properly perform whatever actions it needs. -/// -/// @see Callback -enum class CallbackType : uint32 -{ - AllocGpuMemory = 0, ///< This callback is to inform that GPU memory has been allocated. - FreeGpuMemory, ///< This callback is to inform that GPU memory has been freed. - PresentConcluded, ///< This callback is to inform that a present has concluded. - ImageBarrier, ///< This callback is to inform that a barrier is being executed. - CreateImage, ///< This callback is to inform that an image has been created. - BarrierBegin, ///< This callback is to inform that a barrier is about to be executed. - BarrierEnd, ///< This callback is to inform that a barrier is done being executed. - DrawDispatch, ///< This callback is to inform that a draw or dispatch command is being recorded. - BindPipeline, ///< This callback is to inform that a pipeline (client or internal) has been bound. - SurfRegData, ///< This callback is to inform tools of the register state of a surface. -#if PAL_DEVELOPER_BUILD - DrawDispatchValidation, ///< This callback is to describe the state validation needed by a draw or dispatch. - BindPipelineValidation, ///< This callback is to describe the state validation needed by a pipeline bind. 
- OptimizedRegisters, ///< This callback is to describe the PM4 optimizer's removal of redundant register - /// sets. -#endif - BindGpuMemory, ///< This callback is to inform of a new binding to GPU memory. - SubAllocGpuMemory, ///< This callback is to inform of suballocation from base GPU memory allocation. - SubFreeGpuMemory, ///< This callback is to inform that GPU memory suballocation has been freed. -#if PAL_DEVELOPER_BUILD - RpmBlt, ///< This callback is to describe the internal RPM blt calls. -#endif - Count, ///< The number of info types. -}; - -constexpr uint32 AllCallbackTypesMask = Util::BitfieldGenMask(static_cast(CallbackType::Count)); - -constexpr uint32 DefaultDisabledCallbackTypes = (1 << static_cast(CallbackType::BindGpuMemory)) | - (1 << static_cast(CallbackType::SubAllocGpuMemory)) | - (1 << static_cast(CallbackType::SubFreeGpuMemory)); - -constexpr uint32 DefaultEnabledCallbackTypes = AllCallbackTypesMask & ~DefaultDisabledCallbackTypes; - -/// Definition for developer callback. -/// -/// @param [in] pPrivateData Private data that is installed with the callback for use by the installer. -/// @param [in] deviceIndex Unique index for the device so that the installer can properly dispatch the event. -/// @param [in] infoType Information about the callback so the installer can make informed decisions about -/// what actions to perform. -/// @param [in] pInfoData Additional data related to the particular callback type. -typedef void (PAL_STDCALL *Callback)( - void* pPrivateData, - const uint32 deviceIndex, - CallbackType type, - void* pCbData); - -/// Enumeration describing the different ways GPU memory is allocated. -enum class GpuMemoryAllocationMethod : uint32 -{ - Unassigned = 0, ///< Unassigned allocation method. - Normal, ///< Virtual memory allocation (not pinned/peer). - Pinned, ///< Pinned memory allocation. - Peer, ///< Peer memory allocation. - MultiDevice, ///< MultiDevice memory allocation. - Opened, ///< Shared memory allocation. 
- Svm, ///< Shared virtual memory allocation. -}; - -/// Enumeration describing the different Presentation modes an application can take. -enum class PresentModeType : uint32 -{ - Unknown = 0, ///< When the present mode is not known. - Flip, ///< when the presentation surface is used directly as the front buffer. - Composite, ///< When the flipped image is drawn by a window compositor instead - /// of the application. - Blit, ///< when the presentation surface is copied to the front buffer. -}; - -/// Information about the presentation mode an application is in. -struct PresentationModeData -{ - PresentModeType presentationMode; ///< Information about present mode from above enumeration. - UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. -}; - -/// Information for allocation/deallocation of GPU memory. -struct GpuMemoryData -{ - gpusize size; ///< Size, in bytes, of the allocation. - GpuHeap heap; ///< The first requested heap of the allocation. - - /// Allocation description flags - struct Flags - { - uint32 isClient : 1; ///< This allocation is requested by the client. - uint32 isFlippable : 1; ///< This allocation is marked as flippable. - uint32 isUdmaBuffer : 1; ///< This allocation is for a UDMA buffer. - uint32 isVirtual : 1; ///< This allocation is for virtual memory. - uint32 isCmdAllocator : 1; ///< This allocation is for a CmdAllocator. - uint32 isExternal : 1; ///< This allocation is marked as external. - uint32 buddyAllocated : 1; ///< This allocation is buddy allocated. - uint32 appRequested : 1; ///< This allocation is Pal internal, but application requested - uint32 reserved : 24; ///< Reserved for future use. - } flags; ///< Flags describing the allocation. - - GpuMemoryAllocationMethod allocMethod; ///< Allocation method - const IGpuMemory* pGpuMemory; ///< Handle to the Pal::IGpuMemory object of this GPU memory allocation - gpusize offset; ///< Offset, in bytes, of a suballocation within a base allocation. 
For - /// base allocations, offset is always zero. -}; - -#if PAL_DEVELOPER_BUILD -/// PWS acquire point for barrier logger -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 901 -enum AcquirePoint : uint8 -{ - AcquirePointPfp, - AcquirePointMe, - AcquirePointPreShader, - AcquirePointPreDepth, - AcquirePointPrePs, - AcquirePointPreColor, - AcquirePointEop, - - AcquirePointCount -}; -#else -enum class AcquirePoint : uint8 -{ - Pfp = 0, - Me, - PreShader, - PreDepth, - PrePs, - PreColor, - Eop, // Invalid, for internal optimization purpose. - - Count -}; -#endif -#endif - -/// Information pertaining to the cache flush/invalidations and stalls performed during barrier execution. -struct BarrierOperations -{ - union - { - struct - { - uint16 eopTsBottomOfPipe : 1; ///< Issue an end-of-pipe event that can be waited on. - /// When combined with waitOnTs, makes a full pipeline stall. - uint16 vsPartialFlush : 1; ///< Stall at ME, waiting for all prior VS waves to complete. - uint16 psPartialFlush : 1; ///< Stall at ME, waiting for all prior PS waves to complete. - uint16 csPartialFlush : 1; ///< Stall at ME, waiting for all prior CS waves to complete. - uint16 pfpSyncMe : 1; ///< Stall PFP until ME is at same point in command stream. - /// flushed/invalidated are specified in the caches bitfield. - uint16 syncCpDma : 1; ///< Issue dummy cpDma command to confirm all prior cpDmas have - /// completed. - uint16 eosTsPsDone : 1; ///< Issue an end-of-pixel-shader event that can be waited on. - uint16 eosTsCsDone : 1; ///< Issue an end-of-compute-shader event that can be waited on - uint16 waitOnTs : 1; ///< Wait on an timestamp event (EOP or EOS) at the ME. - /// Which event is not necesarily specified here, though any - /// that are specified here would be waited on. - uint16 reserved : 7; ///< Reserved for future use. - }; - - uint16 u16All; ///< Unsigned integer containing all the values. - - } pipelineStalls; ///< Information about pipeline stalls performed. 
- - union - { - struct - { - uint16 depthStencilExpand : 1; ///< Decompression of depth/stencil image. - uint16 htileHiZRangeExpand : 1; ///< Expansion of HTile's HiZ range. - uint16 depthStencilResummarize : 1; ///< Resummarization of depth stencil. - uint16 dccDecompress : 1; ///< DCC decompress BLT for color images. - uint16 fmaskDecompress : 1; ///< Fmask decompression for shader readability. - uint16 fastClearEliminate : 1; ///< Expand latest specified clear color into pixel data for the fast - /// cleared color/depth resource. - uint16 fmaskColorExpand : 1; ///< Completely decompresses the specified color resource. - uint16 initMaskRam : 1; ///< Memsets uninitialized memory to prepare it for use as - /// CMask/FMask/DCC/HTile. - uint16 updateDccStateMetadata : 1; ///< DCC state metadata was updated. - uint16 retileGfxDccToDisplayDcc : 1; ///< Gfx dcc is retiled to display dcc. - uint16 reserved : 6; ///< Reserved for future use. - }; - - uint16 u16All; ///< Unsigned integer containing all the values. - - } layoutTransitions; ///< Information about layout translation performed. - - union - { - struct - { - uint16 invalTcp : 1; ///< Invalidate vector caches. - uint16 invalSqI$ : 1; ///< Invalidate the SQ instruction caches. - uint16 invalSqK$ : 1; ///< Invalidate the SQ constant caches (scalar caches). - uint16 flushTcc : 1; ///< Flush L2 cache. - uint16 invalTcc : 1; ///< Invalidate L2 cache. - uint16 flushCb : 1; ///< Flush CB caches. - uint16 invalCb : 1; ///< Invalidate CB caches. - uint16 flushDb : 1; ///< Flush DB caches. - uint16 invalDb : 1; ///< Invalidate DB caches. - uint16 invalCbMetadata : 1; ///< Invalidate CB meta-data cache. - uint16 flushCbMetadata : 1; ///< Flush CB meta-data cache. - uint16 invalDbMetadata : 1; ///< Invalidate DB meta-data cache. - uint16 flushDbMetadata : 1; ///< Flush DB meta-data cache. - uint16 invalTccMetadata : 1; ///< Invalidate L2 meta-data cache (also called the GLM). 
- uint16 invalGl1 : 1; ///< Invalidate the global L1 cache - uint16 placeholder : 1; ///< Reserved for future use. - }; - - uint16 u16All; ///< Unsigned integer containing all the values. - - } caches; ///< Information about cache operations performed for the barrier. - -#if PAL_DEVELOPER_BUILD - AcquirePoint acquirePoint; -#endif -}; - -/// Enumeration for PAL barrier reasons -enum BarrierReason : uint32 -{ - BarrierReasonInvalid = 0, ///< Invalid barrier reason - - BarrierReasonFirst = 0x80000000, ///< The first valid barrier reason value - /// The only value that can smaller than this is the - /// invalid value. - BarrierReasonLast = 0xbfffffff, ///< The last valid barrier reason value - /// The only value that can larger than this is the - /// unknown value. - - BarrierReasonPreComputeColorClear = BarrierReasonFirst, ///< Barrier issued before a color clear - BarrierReasonPostComputeColorClear, ///< Barrier issued after a color clear - BarrierReasonPreComputeDepthStencilClear, ///< Barrier issued before a depth/stencil clear - BarrierReasonPostComputeDepthStencilClear, ///< Barrier issued after a depth/stencil clear - BarrierReasonMlaaResolveEdgeSync, ///< Barrier issued to sync mlaa edge calculations - BarrierReasonAqlWaitForParentKernel, ///< Barrier issued to wait for the parent kernel to - /// complete in an AQL submission - BarrierReasonAqlWaitForChildrenKernels, ///< Barrier issued to wait for the children kernels to - /// complete in an AQL submission - BarrierReasonP2PBlitSync, ///< Barrier issued to synchronize peer-to-peer blits - BarrierReasonTimeGraphGrid, ///< Barrier issued to wait for the time graph grid - BarrierReasonTimeGraphGpuLine, ///< Barrier issued to wait for the time graph gpu line - BarrierReasonDebugOverlayText, ///< Barrier issued to wait for the debug overlay text - BarrierReasonDebugOverlayGraph, ///< Barrier issued to wait for the debug overlay graph - BarrierReasonDevDriverOverlay, ///< Barrier issued to wait for developer driver 
overlay - BarrierReasonDmaImgScanlineCopySync, ///< Barrier issued to synchronize between image scanline - /// copies on the dma hardware - BarrierReasonPostSqttTrace, ///< Barrier issued to wait for work from an sqtt trace - BarrierReasonPrePerfDataCopy, ///< Barrier issued to wait for perf data to become - /// available for copy - BarrierReasonFlushL2CachedData, ///< Barrier issued to flush L2 cached data to main memory - BarrierReasonResolveImage, ///< Barrier issued before and after resolve image shader - BarrierReasonPerPixelCopy, ///< Barrier issued between CS copy and per-pixel copy steps - BarrierReasonGenerateMipmaps, ///< Barrier issued between generating mip levels - - /// Newly defined barrier reasons should be before this one. - BarrierReasonInternalLastDefined, ///< Only used for asserts. - BarrierReasonUnknown = 0xFFFFFFFF, ///< Unknown barrier reason - - /// Backwards compatibility reasons - BarrierReasonPreSyncClear = BarrierReasonPreComputeColorClear, - BarrierReasonPostSyncClear = BarrierReasonPostComputeColorClear -}; - -/// Style of barrier -enum class BarrierType : uint32 -{ - Full = 0, ///< A traditional blocking barrier. - Release, ///< A pipelined barrier that flushes caches and starts transitions. - Acquire, ///< A barrier that waits on previous 'Release' barriers. - - Count -}; - -/// Information for barrier executions. -struct BarrierData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the barrier. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 902 - ImgBarrier transition; ///< The particular image barrier with layout transition blt that is currently - /// executing, only used during a CallbackType::ImageBarrier. -#else - BarrierTransition transition; ///< The particular transition with layout transition blt that is currently - /// executing, only used during a CallbackType::ImageBarrier. -#endif - bool hasTransition; ///< Whether or not the transition structure is populated. 
- BarrierOperations operations; ///< Detailed cache and pipeline operations performed during this barrier execution - uint32 reason; ///< Reason that the barrier was invoked. Only filled at BarrierBegin. - BarrierType type; ///< What style of barrier this is. Only filled at BarrierBegin. -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 -/// Enumeration describing the different types of tile mode dimensions -enum class Gfx6ImageTileModeDimension : uint32 -{ - Linear = 0, ///< Linear tile mode. - Dim1d, ///< 1D tile mode. - Dim2d, ///< 2D tile mode. - Dim3d, ///< 3D tile mode. -}; - -/// Tile mode information -struct Gfx6ImageTileMode -{ - Gfx6ImageTileModeDimension dimension; ///< Dimensionality of tile mode. - - union - { - struct - { - uint32 prt : 1; ///< Image is a PRT. - uint32 thin : 1; ///< Thin tiled. - uint32 thick : 1; ///< Thick tiled. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } properties; ///< Bitfield of properties -}; - -/// Enumeration describing the different tile types -enum class Gfx6ImageTileType : uint32 -{ - Displayable = 0, ///< Displayable tiling. - NonDisplayable, ///< Non-displayable tiling. - DepthSampleOrder, ///< Same as non-displayable plus depth-sample-order. - Rotated, ///< Rotated displayable tiling. - Thick, ///< Thick micro-tiling. -}; -#endif - -/// Meta-data-related properties -struct ImageMetaDataInfo -{ - union - { - struct - { - uint32 color : 1; ///< Flag indicates this is a color buffer. - uint32 depth : 1; ///< Flag indicates this is a depth/stencil buffer. - uint32 stencil : 1; ///< Flag indicates this is a stencil buffer. - uint32 texture : 1; ///< Flag indicates this is a texture. - uint32 cube : 1; ///< Flag indicates this is a cubemap. - uint32 volume : 1; ///< Flag indicates this is a volume texture. - uint32 fmask : 1; ///< Flag indicates this is an fmask. - uint32 compressZ : 1; ///< Flag indicates z buffer is compressed. 
- uint32 overlay : 1; ///< Flag indicates this is an overlay surface. - uint32 noStencil : 1; ///< Flag indicates this depth has no separate stencil. - uint32 display : 1; ///< Flag indicates this should match display controller req. - uint32 opt4Space : 1; ///< Flag indicates this surface should be optimized for space - /// i.e. save some memory but may lose performance. - uint32 prt : 1; ///< Flag for partially resident texture. - uint32 tcCompatible : 1; ///< Image's metadata is TC-compatible. This reduces the maximum - /// compression levels, but allows the shader to read the data without - /// an expensive decompress operation. - uint32 dccCompatible : 1; ///< GFX 8: whether to make MSAA surface support dcc fast clear. - uint32 dccPipeWorkaround : 1; ///< GFX 8: whether to workaround the HW limit that - /// dcc can't be enabled if pipe config of tile mode - /// is different from that of ASIC. - uint32 disableLinearOpt : 1; ///< Disable tile mode optimization to linear. - uint32 reserved : 15; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } properties; ///< Bitfield of properties -}; - -/// Information for allocation of a PAL Image - AddrLib surface info. -struct ImageDataAddrMgrSurfInfo -{ -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - union - { - struct - { - Gfx6ImageTileMode mode; ///< Tile mode. - Gfx6ImageTileType type; ///< Micro tiling type. - } gfx6; - struct - { - uint32 swizzle; ///< Swizzle mode. - } gfx9; - } tiling; -#endif - - ImageMetaDataInfo flags; ///< Metadata info. - uint32 swizzle; ///< HW-specific swizzle mode. - uint64 size; ///< Surface size, in bytes. - uint32 bpp; ///< Bits per pixel. - uint32 width; ///< Width. - uint32 height; ///< Height. - uint32 depth; ///< Depth. -}; - -/// Type of surface for which the register data is being provided -enum class SurfRegDataType : uint32 -{ - RenderTargetView, ///< Render Target View information. 
-}; - -/// Information for surface addresses for a SurfRegData callback -struct SurfRegDataInfo -{ - SurfRegDataType type; ///< Type of surface to which the register data corresponds. - uint32 regData; ///< Hardware-specific register data for the specific surface type. -}; - -/// Type of draw or dispatch operation for a DrawDispatch callback -enum class DrawDispatchType : uint32 -{ - CmdDraw = 0, ///< Auto-indexed draw. - CmdDrawOpaque, ///< Auto draw. - CmdDrawIndexed, ///< Indexed draw. - CmdDrawIndirectMulti, ///< (Multi) indirect draw. - CmdDrawIndexedIndirectMulti, ///< (Multi) indirect indexed draw. - CmdDispatchMesh, ///< Task/Mesh shader dispatch. - CmdDispatchMeshIndirectMulti, ///< Indirect Task/Mesh shader dispatch. - CmdGenExecuteIndirectDraw, ///< ExecuteIndirect draw. - CmdGenExecuteIndirectDrawIndexed, ///< ExecuteIndirect indexed draw. - CmdGenExecuteIndirectDispatchMesh, ///< ExecuteIndirect Task/Mesh shader dispatch. - CmdDispatch, ///< Direct compute dispatch. - CmdDispatchAce, ///< Direct Compute dispatch through implicit ganged-submit ACE stream. - CmdDispatchIndirect, ///< Indirect compute dispatch. - CmdDispatchOffset, ///< Direct compute dispatch (offsetted start). - CmdGenExecuteIndirectDispatch, ///< ExecuteIndirect dispatch. - CmdDispatchAql, ///< AQL compute dispatch - - Count, - FirstDispatch = CmdDispatch ///< All callbacks with an enum value greater or equal than this are dispatches -}; - -/// Draw-specific information for DrawDispatch callbacks -struct DrawDispatchDrawArgs -{ - /// Contains information about user data register indices for certain draw parameter state. - /// Some of these values may not be available for all draws on all clients, and in such - /// cases the value will be UINT_MAX. 
- struct - { - uint32 firstVertex; ///< Vertex offset (first vertex) user data register index - uint32 instanceOffset; ///< Instance offset (start instance) user data register index - uint32 drawIndex; ///< Draw ID SPI user data register index - } userDataRegs; -}; - -/// Dispatch-specific information for DrawDispatch callbacks -struct DrawDispatchDispatchArgs -{ - DispatchDims groupStart; ///< Thread/workgroup start offsets in X/Y/Z dimensions. Only valid for CmdDispatchOffset. - DispatchDims groupDims; ///< Thread/workgroup counts in X/Y/Z dimensions. Only valid for CmdDispatch[Offset]. - DispatchDims logicalSize; ///< Thread/workgroup counts as seen by the shader. Only valid for CmdDispatchOffset. - /// Optional flags to help the client driver understand the dispatch. - /// For example, if the dispatch originated in PAL rather than the client driver. - DispatchInfoFlags infoFlags; -}; - -/// Information for DrawDispatch callbacks -struct DrawDispatchData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command - DrawDispatchType cmdType; ///< Draw/dispatch command type. This influences which sub-structure below is valid. - - union - { - /// Draw-specific parameters. Valid when cmdType is CmdDraw*. - DrawDispatchDrawArgs draw; - - /// Dispatch-specific parameters. Valid when cmdType is CmdDispatch* - DrawDispatchDispatchArgs dispatch; - }; - - /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), - /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert - /// the marker. 
- RgpMarkerSubQueueFlags subQueueFlags; -}; - -/// Information for BindPipeline callbacks -struct BindPipelineData -{ - const IPipeline* pPipeline; ///< The currently-bound pipeline - const PipelineInfo* pPipelineInfo; ///< General information about the bound pipeline - ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command - uint64 apiPsoHash; ///< The hash to correlate APIs and corresponding PSOs. - PipelineBindPoint bindPoint; ///< The bind point of the pipeline within a queue. - - /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), - /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert - /// the marker. - RgpMarkerSubQueueFlags subQueueFlags; -}; - -#if PAL_DEVELOPER_BUILD -/// Information for DrawDispatchValidation callbacks -struct DrawDispatchValidationData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. - uint32 userDataCmdSize; ///< Size of PM4 commands used to validate the current user-data entries (bytes). - uint32 miscCmdSize; ///< Size of PM4 commands for all other draw- or dispatch-time validation (bytes). -}; - -// Information for BindPipelineValidation callbacks -struct BindPipelineValidationData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. - uint32 pipelineCmdSize; ///< Size of PM4 commands used to validate the current pipeline state (bytes). -}; - -/// Information for OptimizedRegisters callbacks -struct OptimizedRegistersData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering PM4 stream. 
- /// Array containing the number of times the PM4 optimizer saw a SET packet which modified each register - const uint32* pShRegSeenSets; - ///< Array containing the number of times the PM4 optimizer kept a SET packet which modified each register - const uint32* pShRegKeptSets; - uint32 shRegCount; ///< Number of SH registers - uint16 shRegBase; ///< Base address of SH registers - /// Array containing the number of times the PM4 optimizer saw a SET or RMW packet which modified each register - const uint32* pCtxRegSeenSets; - ///< Array containing the number of times the PM4 optimizer kept a SET or RMW packet which modified each register - const uint32* pCtxRegKeptSets; - uint32 ctxRegCount; ///< Number of context registers - uint16 ctxRegBase; ///< Base address of context registers -}; - -/// Internal RPM blt type -enum class RpmBltType : uint32 -{ - CpDmaCopy = 0, - CpDmaUpdate, - Draw, - Dispatch, - - Count -}; - -/// Describes the RPM blt call -struct RpmBltData -{ - ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the blt. - RpmBltType bltType; ///< Type of RPM blt, @ref RpmBltType. -}; -#endif - -/// Describes the binding of a GPU Memory object to a resource -struct BindGpuMemoryData -{ - const void* pObj; ///< Opaque pointer to the resource having memory bound to it. - gpusize requiredGpuMemSize; ///< GPU memory size required by pObj. - const IGpuMemory* pGpuMemory; ///< IGpuMemory object being bound to the resource. - gpusize offset; ///< Offset within pGpuMemory where the resource is being bound. - bool isSystemMemory; ///< If true then system memory is being bound to the object. In this case, - /// pGpuMemory and offset should be set to zero. 
-}; - -/// Describes an user marker operation -enum class UserMarkerOpType : uint8 -{ - Invalid = 0, ///< Invalid user marker operation - Push, ///< Push user marker operation - Pop, ///< Pop user marker operation - Set ///< Set user marker operation -}; - -/// Describes an user marker operation, used in UserMarkerHistoryTraceSource -struct UserMarkerOpInfo -{ - union - { - struct - { - uint32 opType : 2; ///< UserMarkerOpType - uint32 strIndex : 30; ///< Index of the user marker in the in corresponding string table - }; - uint32 u32All; - }; -}; - -} // Developer -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDeveloperHooks.h + * @brief Common include for PAL developer callbacks. Defines common enums, typedefs, structures, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palCmdBuffer.h" + +namespace Pal +{ + +// Forward declarations. +class ICmdBuffer; +class IImage; +class IPipeline; + +namespace Developer +{ + +/// The type of the developer callback so the callback can properly perform whatever actions it needs. +/// +/// @see Callback +enum class CallbackType : uint32 +{ + AllocGpuMemory = 0, ///< This callback is to inform that GPU memory has been allocated. + FreeGpuMemory, ///< This callback is to inform that GPU memory has been freed. + PresentConcluded, ///< This callback is to inform that a present has concluded. + ImageBarrier, ///< This callback is to inform that a barrier is being executed. + CreateImage, ///< This callback is to inform that an image has been created. + BarrierBegin, ///< This callback is to inform that a barrier is about to be executed. + BarrierEnd, ///< This callback is to inform that a barrier is done being executed. + DrawDispatch, ///< This callback is to inform that a draw or dispatch command is being recorded. + BindPipeline, ///< This callback is to inform that a pipeline (client or internal) has been bound. + SurfRegData, ///< This callback is to inform tools of the register state of a surface. +#if PAL_DEVELOPER_BUILD + DrawDispatchValidation, ///< This callback is to describe the state validation needed by a draw or dispatch. + BindPipelineValidation, ///< This callback is to describe the state validation needed by a pipeline bind. 
+ OptimizedRegisters, ///< This callback is to describe the PM4 optimizer's removal of redundant register + /// sets. +#endif + BindGpuMemory, ///< This callback is to inform of a new binding to GPU memory. + SubAllocGpuMemory, ///< This callback is to inform of suballocation from base GPU memory allocation. + SubFreeGpuMemory, ///< This callback is to inform that GPU memory suballocation has been freed. +#if PAL_DEVELOPER_BUILD + RpmBlt, ///< This callback is to describe the internal RPM blt calls. +#endif + Count, ///< The number of info types. +}; + +constexpr uint32 AllCallbackTypesMask = Util::BitfieldGenMask(static_cast(CallbackType::Count)); + +constexpr uint32 DefaultDisabledCallbackTypes = (1 << static_cast(CallbackType::BindGpuMemory)) | + (1 << static_cast(CallbackType::SubAllocGpuMemory)) | + (1 << static_cast(CallbackType::SubFreeGpuMemory)); + +constexpr uint32 DefaultEnabledCallbackTypes = AllCallbackTypesMask & ~DefaultDisabledCallbackTypes; + +/// Definition for developer callback. +/// +/// @param [in] pPrivateData Private data that is installed with the callback for use by the installer. +/// @param [in] deviceIndex Unique index for the device so that the installer can properly dispatch the event. +/// @param [in] infoType Information about the callback so the installer can make informed decisions about +/// what actions to perform. +/// @param [in] pInfoData Additional data related to the particular callback type. +typedef void (PAL_STDCALL *Callback)( + void* pPrivateData, + const uint32 deviceIndex, + CallbackType type, + void* pCbData); + +/// Enumeration describing the different ways GPU memory is allocated. +enum class GpuMemoryAllocationMethod : uint32 +{ + Unassigned = 0, ///< Unassigned allocation method. + Normal, ///< Virtual memory allocation (not pinned/peer). + Pinned, ///< Pinned memory allocation. + Peer, ///< Peer memory allocation. + MultiDevice, ///< MultiDevice memory allocation. + Opened, ///< Shared memory allocation. 
+ Svm, ///< Shared virtual memory allocation. +}; + +/// Enumeration describing the different Presentation modes an application can take. +enum class PresentModeType : uint32 +{ + Unknown = 0, ///< When the present mode is not known. + Flip, ///< when the presentation surface is used directly as the front buffer. + Composite, ///< When the flipped image is drawn by a window compositor instead + /// of the application. + Blit, ///< when the presentation surface is copied to the front buffer. +}; + +/// Information about the presentation mode an application is in. +struct PresentationModeData +{ + PresentModeType presentationMode; ///< Information about present mode from above enumeration. + UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. +}; + +/// Information for allocation/deallocation of GPU memory. +struct GpuMemoryData +{ + gpusize size; ///< Size, in bytes, of the allocation. + GpuHeap heap; ///< The first requested heap of the allocation. + + /// Allocation description flags + struct Flags + { + uint32 isClient : 1; ///< This allocation is requested by the client. + uint32 isFlippable : 1; ///< This allocation is marked as flippable. + uint32 isUdmaBuffer : 1; ///< This allocation is for a UDMA buffer. + uint32 isVirtual : 1; ///< This allocation is for virtual memory. + uint32 isCmdAllocator : 1; ///< This allocation is for a CmdAllocator. + uint32 isExternal : 1; ///< This allocation is marked as external. + uint32 buddyAllocated : 1; ///< This allocation is buddy allocated. + uint32 appRequested : 1; ///< This allocation is Pal internal, but application requested + uint32 reserved : 24; ///< Reserved for future use. + } flags; ///< Flags describing the allocation. + + GpuMemoryAllocationMethod allocMethod; ///< Allocation method + const IGpuMemory* pGpuMemory; ///< Handle to the Pal::IGpuMemory object of this GPU memory allocation + gpusize offset; ///< Offset, in bytes, of a suballocation within a base allocation. 
For + /// base allocations, offset is always zero. +}; + +#if PAL_DEVELOPER_BUILD +/// PWS acquire point for barrier logger +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 901 +enum AcquirePoint : uint8 +{ + AcquirePointPfp, + AcquirePointMe, + AcquirePointPreShader, + AcquirePointPreDepth, + AcquirePointPrePs, + AcquirePointPreColor, + AcquirePointEop, + + AcquirePointCount +}; +#else +enum class AcquirePoint : uint8 +{ + Pfp = 0, + Me, + PreShader, + PreDepth, + PrePs, + PreColor, + Eop, // Invalid, for internal optimization purpose. + + Count +}; +#endif +#endif + +/// Information pertaining to the cache flush/invalidations and stalls performed during barrier execution. +struct BarrierOperations +{ + union + { + struct + { + uint16 eopTsBottomOfPipe : 1; ///< Issue an end-of-pipe event that can be waited on. + /// When combined with waitOnTs, makes a full pipeline stall. + uint16 vsPartialFlush : 1; ///< Stall at ME, waiting for all prior VS waves to complete. + uint16 psPartialFlush : 1; ///< Stall at ME, waiting for all prior PS waves to complete. + uint16 csPartialFlush : 1; ///< Stall at ME, waiting for all prior CS waves to complete. + uint16 pfpSyncMe : 1; ///< Stall PFP until ME is at same point in command stream. + /// flushed/invalidated are specified in the caches bitfield. + uint16 syncCpDma : 1; ///< Issue dummy cpDma command to confirm all prior cpDmas have + /// completed. + uint16 eosTsPsDone : 1; ///< Issue an end-of-pixel-shader event that can be waited on. + uint16 eosTsCsDone : 1; ///< Issue an end-of-compute-shader event that can be waited on + uint16 waitOnTs : 1; ///< Wait on an timestamp event (EOP or EOS) at the ME. + /// Which event is not necesarily specified here, though any + /// that are specified here would be waited on. + uint16 reserved : 7; ///< Reserved for future use. + }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } pipelineStalls; ///< Information about pipeline stalls performed. 
+ + union + { + struct + { + uint16 depthStencilExpand : 1; ///< Decompression of depth/stencil image. + uint16 htileHiZRangeExpand : 1; ///< Expansion of HTile's HiZ range. + uint16 depthStencilResummarize : 1; ///< Resummarization of depth stencil. + uint16 dccDecompress : 1; ///< DCC decompress BLT for color images. + uint16 fmaskDecompress : 1; ///< Fmask decompression for shader readability. + uint16 fastClearEliminate : 1; ///< Expand latest specified clear color into pixel data for the fast + /// cleared color/depth resource. + uint16 fmaskColorExpand : 1; ///< Completely decompresses the specified color resource. + uint16 initMaskRam : 1; ///< Memsets uninitialized memory to prepare it for use as + /// CMask/FMask/DCC/HTile. + uint16 updateDccStateMetadata : 1; ///< DCC state metadata was updated. + uint16 reserved : 7; ///< Reserved for future use. + }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } layoutTransitions; ///< Information about layout translation performed. + + union + { + struct + { + uint16 invalTcp : 1; ///< Invalidate vector caches. + uint16 invalSqI$ : 1; ///< Invalidate the SQ instruction caches. + uint16 invalSqK$ : 1; ///< Invalidate the SQ constant caches (scalar caches). + uint16 flushTcc : 1; ///< Flush L2 cache. + uint16 invalTcc : 1; ///< Invalidate L2 cache. + uint16 flushCb : 1; ///< Flush CB caches. + uint16 invalCb : 1; ///< Invalidate CB caches. + uint16 flushDb : 1; ///< Flush DB caches. + uint16 invalDb : 1; ///< Invalidate DB caches. + uint16 invalCbMetadata : 1; ///< Invalidate CB meta-data cache. + uint16 flushCbMetadata : 1; ///< Flush CB meta-data cache. + uint16 invalDbMetadata : 1; ///< Invalidate DB meta-data cache. + uint16 flushDbMetadata : 1; ///< Flush DB meta-data cache. + uint16 invalTccMetadata : 1; ///< Invalidate L2 meta-data cache (also called the GLM). + uint16 invalGl1 : 1; ///< Invalidate the global L1 cache + uint16 placeholder : 1; ///< Reserved for future use. 
+ }; + + uint16 u16All; ///< Unsigned integer containing all the values. + + } caches; ///< Information about cache operations performed for the barrier. + +#if PAL_DEVELOPER_BUILD + AcquirePoint acquirePoint; +#endif +}; + +/// Enumeration for PAL barrier reasons +enum BarrierReason : uint32 +{ + BarrierReasonInvalid = 0, ///< Invalid barrier reason + + BarrierReasonFirst = 0x80000000, ///< The first valid barrier reason value + /// The only value that can smaller than this is the + /// invalid value. + BarrierReasonLast = 0xbfffffff, ///< The last valid barrier reason value + /// The only value that can larger than this is the + /// unknown value. + + BarrierReasonPreComputeColorClear = BarrierReasonFirst, ///< Barrier issued before a color clear + BarrierReasonPostComputeColorClear, ///< Barrier issued after a color clear + BarrierReasonPreComputeDepthStencilClear, ///< Barrier issued before a depth/stencil clear + BarrierReasonPostComputeDepthStencilClear, ///< Barrier issued after a depth/stencil clear + BarrierReasonMlaaResolveEdgeSync, ///< Barrier issued to sync mlaa edge calculations + BarrierReasonAqlWaitForParentKernel, ///< Barrier issued to wait for the parent kernel to + /// complete in an AQL submission + BarrierReasonAqlWaitForChildrenKernels, ///< Barrier issued to wait for the children kernels to + /// complete in an AQL submission + BarrierReasonP2PBlitSync, ///< Barrier issued to synchronize peer-to-peer blits + BarrierReasonTimeGraphGrid, ///< Barrier issued to wait for the time graph grid + BarrierReasonTimeGraphGpuLine, ///< Barrier issued to wait for the time graph gpu line + BarrierReasonDebugOverlayText, ///< Barrier issued to wait for the debug overlay text + BarrierReasonDebugOverlayGraph, ///< Barrier issued to wait for the debug overlay graph + BarrierReasonDevDriverOverlay, ///< Barrier issued to wait for developer driver overlay + BarrierReasonDmaImgScanlineCopySync, ///< Barrier issued to synchronize between image scanline + /// 
copies on the dma hardware + BarrierReasonPostSqttTrace, ///< Barrier issued to wait for work from an sqtt trace + BarrierReasonPrePerfDataCopy, ///< Barrier issued to wait for perf data to become + /// available for copy + BarrierReasonFlushL2CachedData, ///< Barrier issued to flush L2 cached data to main memory + BarrierReasonResolveImage, ///< Barrier issued before and after resolve image shader + BarrierReasonPerPixelCopy, ///< Barrier issued between CS copy and per-pixel copy steps + BarrierReasonGenerateMipmaps, ///< Barrier issued between generating mip levels + + /// Newly defined barrier reasons should be before this one. + BarrierReasonInternalLastDefined, ///< Only used for asserts. + BarrierReasonUnknown = 0xFFFFFFFF, ///< Unknown barrier reason + + /// Backwards compatibility reasons + BarrierReasonPreSyncClear = BarrierReasonPreComputeColorClear, + BarrierReasonPostSyncClear = BarrierReasonPostComputeColorClear +}; + +/// Style of barrier +enum class BarrierType : uint32 +{ + Full = 0, ///< A traditional blocking barrier. + Release, ///< A pipelined barrier that flushes caches and starts transitions. + Acquire, ///< A barrier that waits on previous 'Release' barriers. + + Count +}; + +/// Information for barrier executions. +struct BarrierData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the barrier. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 902 + ImgBarrier transition; ///< The particular image barrier with layout transition blt that is currently + /// executing, only used during a CallbackType::ImageBarrier. +#else + BarrierTransition transition; ///< The particular transition with layout transition blt that is currently + /// executing, only used during a CallbackType::ImageBarrier. +#endif + bool hasTransition; ///< Whether or not the transition structure is populated. 
+ BarrierOperations operations; ///< Detailed cache and pipeline operations performed during this barrier execution + uint32 reason; ///< Reason that the barrier was invoked. Only filled at BarrierBegin. + BarrierType type; ///< What style of barrier this is. Only filled at BarrierBegin. +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Enumeration describing the different types of tile mode dimensions +enum class Gfx6ImageTileModeDimension : uint32 +{ + Linear = 0, ///< Linear tile mode. + Dim1d, ///< 1D tile mode. + Dim2d, ///< 2D tile mode. + Dim3d, ///< 3D tile mode. +}; + +/// Tile mode information +struct Gfx6ImageTileMode +{ + Gfx6ImageTileModeDimension dimension; ///< Dimensionality of tile mode. + + union + { + struct + { + uint32 prt : 1; ///< Image is a PRT. + uint32 thin : 1; ///< Thin tiled. + uint32 thick : 1; ///< Thick tiled. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } properties; ///< Bitfield of properties +}; + +/// Enumeration describing the different tile types +enum class Gfx6ImageTileType : uint32 +{ + Displayable = 0, ///< Displayable tiling. + NonDisplayable, ///< Non-displayable tiling. + DepthSampleOrder, ///< Same as non-displayable plus depth-sample-order. + Rotated, ///< Rotated displayable tiling. + Thick, ///< Thick micro-tiling. +}; +#endif + +/// Meta-data-related properties +struct ImageMetaDataInfo +{ + union + { + struct + { + uint32 color : 1; ///< Flag indicates this is a color buffer. + uint32 depth : 1; ///< Flag indicates this is a depth/stencil buffer. + uint32 stencil : 1; ///< Flag indicates this is a stencil buffer. + uint32 texture : 1; ///< Flag indicates this is a texture. + uint32 cube : 1; ///< Flag indicates this is a cubemap. + uint32 volume : 1; ///< Flag indicates this is a volume texture. + uint32 fmask : 1; ///< Flag indicates this is an fmask. + uint32 compressZ : 1; ///< Flag indicates z buffer is compressed. 
+ uint32 overlay : 1; ///< Flag indicates this is an overlay surface. + uint32 noStencil : 1; ///< Flag indicates this depth has no separate stencil. + uint32 display : 1; ///< Flag indicates this should match display controller req. + uint32 opt4Space : 1; ///< Flag indicates this surface should be optimized for space + /// i.e. save some memory but may lose performance. + uint32 prt : 1; ///< Flag for partially resident texture. + uint32 tcCompatible : 1; ///< Image's metadata is TC-compatible. This reduces the maximum + /// compression levels, but allows the shader to read the data without + /// an expensive decompress operation. + uint32 dccCompatible : 1; ///< GFX 8: whether to make MSAA surface support dcc fast clear. + uint32 dccPipeWorkaround : 1; ///< GFX 8: whether to workaround the HW limit that + /// dcc can't be enabled if pipe config of tile mode + /// is different from that of ASIC. + uint32 disableLinearOpt : 1; ///< Disable tile mode optimization to linear. + uint32 reserved : 15; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } properties; ///< Bitfield of properties +}; + +/// Information for allocation of a PAL Image - AddrLib surface info. +struct ImageDataAddrMgrSurfInfo +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + union + { + struct + { + Gfx6ImageTileMode mode; ///< Tile mode. + Gfx6ImageTileType type; ///< Micro tiling type. + } gfx6; + struct + { + uint32 swizzle; ///< Swizzle mode. + } gfx9; + } tiling; +#endif + + ImageMetaDataInfo flags; ///< Metadata info. + uint32 swizzle; ///< HW-specific swizzle mode. + uint64 size; ///< Surface size, in bytes. + uint32 bpp; ///< Bits per pixel. + uint32 width; ///< Width. + uint32 height; ///< Height. + uint32 depth; ///< Depth. +}; + +/// Type of surface for which the register data is being provided +enum class SurfRegDataType : uint32 +{ + RenderTargetView, ///< Render Target View information. 
+}; + +/// Information for surface addresses for a SurfRegData callback +struct SurfRegDataInfo +{ + SurfRegDataType type; ///< Type of surface to which the register data corresponds. + uint32 regData; ///< Hardware-specific register data for the specific surface type. +}; + +/// Type of draw or dispatch operation for a DrawDispatch callback +enum class DrawDispatchType : uint32 +{ + CmdDraw = 0, ///< Auto-indexed draw. + CmdDrawOpaque, ///< Auto draw. + CmdDrawIndexed, ///< Indexed draw. + CmdDrawIndirectMulti, ///< (Multi) indirect draw. + CmdDrawIndexedIndirectMulti, ///< (Multi) indirect indexed draw. + CmdDispatchMesh, ///< Task/Mesh shader dispatch. + CmdDispatchMeshIndirectMulti, ///< Indirect Task/Mesh shader dispatch. + CmdGenExecuteIndirectDraw, ///< ExecuteIndirect draw. + CmdGenExecuteIndirectDrawIndexed, ///< ExecuteIndirect indexed draw. + CmdGenExecuteIndirectDispatchMesh, ///< ExecuteIndirect Task/Mesh shader dispatch. + CmdDispatch, ///< Direct compute dispatch. + CmdDispatchAce, ///< Direct Compute dispatch through implicit ganged-submit ACE stream. + CmdDispatchIndirect, ///< Indirect compute dispatch. + CmdDispatchOffset, ///< Direct compute dispatch (offsetted start). + CmdGenExecuteIndirectDispatch, ///< ExecuteIndirect dispatch. + CmdDispatchAql, ///< AQL compute dispatch + + Count, + FirstDispatch = CmdDispatch ///< All callbacks with an enum value greater than or equal to this are dispatches +}; + +/// Draw-specific information for DrawDispatch callbacks +struct DrawDispatchDrawArgs +{ + /// Contains information about user data register indices for certain draw parameter state. + /// Some of these values may not be available for all draws on all clients, and in such + /// cases the value will be UINT_MAX. 
+ struct + { + uint32 firstVertex; ///< Vertex offset (first vertex) user data register index + uint32 instanceOffset; ///< Instance offset (start instance) user data register index + uint32 drawIndex; ///< Draw ID SPI user data register index + } userDataRegs; +}; + +/// Dispatch-specific information for DrawDispatch callbacks +struct DrawDispatchDispatchArgs +{ + DispatchDims groupStart; ///< Thread/workgroup start offsets in X/Y/Z dimensions. Only valid for CmdDispatchOffset. + DispatchDims groupDims; ///< Thread/workgroup counts in X/Y/Z dimensions. Only valid for CmdDispatch[Offset]. + DispatchDims logicalSize; ///< Thread/workgroup counts as seen by the shader. Only valid for CmdDispatchOffset. + /// Optional flags to help the client driver understand the dispatch. + /// For example, if the dispatch originated in PAL rather than the client driver. + DispatchInfoFlags infoFlags; +}; + +/// Information for DrawDispatch callbacks +struct DrawDispatchData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command + DrawDispatchType cmdType; ///< Draw/dispatch command type. This influences which sub-structure below is valid. + + union + { + /// Draw-specific parameters. Valid when cmdType is CmdDraw*. + DrawDispatchDrawArgs draw; + + /// Dispatch-specific parameters. Valid when cmdType is CmdDispatch* + DrawDispatchDispatchArgs dispatch; + }; + + /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), + /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert + /// the marker. 
+ RgpMarkerSubQueueFlags subQueueFlags; +}; + +/// Information for BindPipeline callbacks +struct BindPipelineData +{ + const IPipeline* pPipeline; ///< The currently-bound pipeline + const PipelineInfo* pPipelineInfo; ///< General information about the bound pipeline + ICmdBuffer* pCmdBuffer; ///< The command buffer that is recording this command + uint64 apiPsoHash; ///< The hash to correlate APIs and corresponding PSOs. + PipelineBindPoint bindPoint; ///< The bind point of the pipeline within a queue. + + /// If the handler of this callback inserts an RGP trace marker using ICmdBuffer::CmdInsertRgpTraceMarker(), + /// these flags should be passed to that call to control which sub-queue(s) in the command buffer should insert + /// the marker. + RgpMarkerSubQueueFlags subQueueFlags; +}; + +#if PAL_DEVELOPER_BUILD +/// Information for DrawDispatchValidation callbacks +struct DrawDispatchValidationData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. + uint32 userDataCmdSize; ///< Size of PM4 commands used to validate the current user-data entries (bytes). + uint32 miscCmdSize; ///< Size of PM4 commands for all other draw- or dispatch-time validation (bytes). +}; + +/// Information for BindPipelineValidation callbacks +struct BindPipelineValidationData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering draw or dispatch. + uint32 pipelineCmdSize; ///< Size of PM4 commands used to validate the current pipeline state (bytes). +}; + +/// Information for OptimizedRegisters callbacks +struct OptimizedRegistersData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer which is recording the triggering PM4 stream. 
+ /// Array containing the number of times the PM4 optimizer saw a SET packet which modified each register + const uint32* pShRegSeenSets; + /// Array containing the number of times the PM4 optimizer kept a SET packet which modified each register + const uint32* pShRegKeptSets; + uint32 shRegCount; ///< Number of SH registers + uint16 shRegBase; ///< Base address of SH registers + /// Array containing the number of times the PM4 optimizer saw a SET or RMW packet which modified each register + const uint32* pCtxRegSeenSets; + /// Array containing the number of times the PM4 optimizer kept a SET or RMW packet which modified each register + const uint32* pCtxRegKeptSets; + uint32 ctxRegCount; ///< Number of context registers + uint16 ctxRegBase; ///< Base address of context registers +}; + +/// Internal RPM blt type +enum class RpmBltType : uint32 +{ + CpDmaCopy = 0, + CpDmaUpdate, + Draw, + Dispatch, + + Count +}; + +/// Describes the RPM blt call +struct RpmBltData +{ + ICmdBuffer* pCmdBuffer; ///< The command buffer that is executing the blt. + RpmBltType bltType; ///< Type of RPM blt, @ref RpmBltType. +}; +#endif + +/// Describes the binding of a GPU Memory object to a resource +struct BindGpuMemoryData +{ + const void* pObj; ///< Opaque pointer to the resource having memory bound to it. + gpusize requiredGpuMemSize; ///< GPU memory size required by pObj. + const IGpuMemory* pGpuMemory; ///< IGpuMemory object being bound to the resource. + gpusize offset; ///< Offset within pGpuMemory where the resource is being bound. + bool isSystemMemory; ///< If true then system memory is being bound to the object. In this case, + /// pGpuMemory and offset should be set to zero. 
+}; + +/// Describes a user marker operation +enum class UserMarkerOpType : uint8 +{ + Invalid = 0, ///< Invalid user marker operation + Push, ///< Push user marker operation + Pop, ///< Pop user marker operation + Set ///< Set user marker operation +}; + +/// Describes a user marker operation, used in UserMarkerHistoryTraceSource +struct UserMarkerOpInfo +{ + union + { + struct + { + uint32 opType : 2; ///< UserMarkerOpType + uint32 strIndex : 30; ///< Index of the user marker in the corresponding string table + }; + uint32 u32All; + }; +}; + +} // Developer +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h b/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h index 65648148e7..9d22946b6d 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palDevice.h @@ -1,5930 +1,5896 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palDevice.h - * @brief Defines the Platform Abstraction Library (PAL) IDevice interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palCmdAllocator.h" -#include "palDestroyable.h" -#include "palFence.h" -#include "palFile.h" -#include "palGpuMemory.h" -#include "palImage.h" -#include "palInlineFuncs.h" -#include "palLib.h" -#include "palPerfExperiment.h" -#include "palPipeline.h" -#include "palQueue.h" -#include - -#if PAL_KMT_BUILD -struct _SECURITY_ATTRIBUTES; -#endif - -namespace Util -{ - class Event; -} - -namespace Pal -{ - -// Forward declarations. 
-class IBorderColorPalette; -class ICmdAllocator; -class ICmdBuffer; -class IColorBlendState; -class IColorTargetView; -class IDepthStencilState; -class IDepthStencilView; -class IDevice; -class IFence; -class IGpuEvent; -class IGpuMemory; -class IImage; -class IIndirectCmdGenerator; -class IMsaaState; -class IPerfExperiment; -class IPipeline; -class IPrivateScreen; -class IQueryPool; -class IQueue; -class IQueueSemaphore; -class IShaderLibrary; -class ISwapChain; -struct BorderColorPaletteCreateInfo; -struct CmdAllocatorCreateInfo; -struct CmdBufferCreateInfo; -struct ColorBlendStateCreateInfo; -struct ColorTargetViewCreateInfo; -struct ComputePipelineCreateInfo; -struct DepthStencilStateCreateInfo; -struct DepthStencilViewCreateInfo; -struct ExternalImageOpenInfo; -struct ExternalGpuMemoryOpenInfo; -struct ExternalQueueSemaphoreOpenInfo; -struct ExternalResourceOpenInfo; -struct GpuEventCreateInfo; -struct GpuMemoryCreateInfo; -struct GpuMemoryOpenInfo; -struct GpuMemoryRef; -struct GraphicsPipelineCreateInfo; -struct ImageCreateInfo; -struct IndirectCmdGeneratorCreateInfo; -struct MsaaStateCreateInfo; -struct MsaaQuadSamplePattern; -struct PeerGpuMemoryOpenInfo; -struct PeerImageOpenInfo; -struct PerfExperimentCreateInfo; -struct PinnedGpuMemoryCreateInfo; -struct PresentableImageCreateInfo; -struct PrivateScreenCreateInfo; -struct PrivateScreenNotifyInfo; -struct QueryPoolCreateInfo; -struct QueueCreateInfo; -struct QueueSemaphoreCreateInfo; -struct QueueSemaphoreOpenInfo; -struct ShaderLibraryCreateInfo; -struct SwapChainCreateInfo; -struct SwapChainProperties; -struct SvmGpuMemoryCreateInfo; -struct GraphicPipelineViewInstancingInfo; -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 881 -enum class WsiPlatform : uint32; -#else -enum WsiPlatform : uint32; -#endif -enum class PipelineBindPoint : uint32; -enum class VaRange : uint32; -enum class SwapChainMode : uint32; -namespace CmdDisassembly -{ - class ICmdBufferReporting; -} - -/// Maximum string length for GPU 
names. @see DeviceProperties. -constexpr uint32 MaxDeviceName = 256; - -/// Maximum number of indirect user-data tables managed by PAL's command buffer objects. @see DeviceFinalizeInfo. -constexpr uint32 MaxIndirectUserDataTables = 1; - -/// Maximum number of supported entries in the MSAA sample pattern palette. See IDevice::SetSamplePatternPalette(). -constexpr uint32 MaxSamplePatternPaletteEntries = 16; - -/// Maximum number of supported units in the gpu. These can be much larger than the actual values, but useful for arrays. -constexpr uint32 MaxShaderEngines = 32; -/// Maximum number of supported subunits each Shader Engine splits into (SH or SA, depending on generation) -constexpr uint32 MaxShaderArraysPerSe = 2; - -/// Size of the Active Pixel Packer Mask in DWORDs -constexpr uint32 ActivePixelPackerMaskDwords = 4; - -/// Maximum number of pixel packers per SE expected by PAL -constexpr uint32 MaxPixelPackerPerSe = 4; - -/// Defines host flags for Semaphore/Fence Array wait -enum HostWaitFlags : uint32 -{ - HostWaitAny = 0x1, ///< if set this bit, return after any signle semaphore/fence in the array has - /// completed. if not set, wait for completion of all semaphores/fences in the - /// array before returning. -}; - -/// Specifies what type of GPU a particular IDevice is (i.e., discrete vs. integrated). -enum class GpuType : uint32 -{ - Unknown = 0x0, ///< The GPU type can't be determined and is unknown. - Integrated = 0x1, ///< Integrated GPU (i.e., APU). - Discrete = 0x2, ///< Discrete GPU. - Virtual = 0x3, ///< Virtualized GPU. - Count -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 -/// Specifies which operating-system-support IP level (OSSIP) this device has. -enum class OssIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an OSSIP block, or its level cannot be determined - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. 
Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. -#ifndef None - None = _None, ///< The device does not have an OSSIP block, or its level cannot be determined -#endif - - OssIp2_4 = 0x3, - OssIp4 = 0x4, -}; - -/// Specifies which VCE IP level this device has. -enum class VceIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an VCEIP block, or its level cannot be determined - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. -#ifndef None - None = _None, ///< The device does not have an VCEIP block, or its level cannot be determined -#endif - - VceIp1 = 0x1, - VceIp2 = 0x2, - VceIp3 = 0x3, - VceIp3_1 = 0x4, - VceIp3_4 = 0x5, - VceIp4 = 0x6, -}; - -/// Specifies which UVD IP level this device has. -enum class UvdIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an UVDIP block, or its level cannot be determined - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. -#ifndef None - None = _None, ///< The device does not have an UVDIP block, or its level cannot be determined -#endif - - UvdIp3_2 = 0x1, - UvdIp4 = 0x2, - UvdIp4_2 = 0x2, - UvdIp5 = 0x3, - UvdIp6 = 0x4, - UvdIp6_2 = 0x5, - UvdIp6_3 = 0x6, - UvdIp7 = 0x7, - UvdIp7_2 = 0x8, -}; -#endif - -/// Specifies which VCN IP level this device has. -enum class VcnIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an VCNIP block, or its level cannot be determined - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. 
-#ifndef None - None = _None, ///< The device does not have an VCNIP block, or its level cannot be determined -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 -#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - VcnIp1 = 0x1, -#endif -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 -/// Specifies which SPU IP level this device has. -enum class SpuIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an SPUIP block, or its level cannot be determined -#ifndef None - None = _None, ///< The device does not have an SPUIP block, or its level cannot be determined -#endif - SpuIp = 0x1, -}; -#endif - -/// Specifies which PSP IP level this device has. -enum class PspIpLevel : uint32 -{ - _None = 0x0, ///< @internal The device does not have an PSPIP block, or its level cannot be determined -#ifndef None - None = _None, ///< The device does not have an PSPIP block, or its level cannot be determined -#endif - PspIp10 = 0x1, -}; - -/// Specified video decode type -enum class VideoDecodeType : uint32 -{ - H264 = 0x0, ///< H264 VLD - Vc1 = 0x1, ///< VC1 VLD - Mpeg2Idct = 0x2, ///< Partial MPEG2 decode (IT+MP) - Mpeg2Vld = 0x3, ///< Full MPEG2 decode (RE+IT+MP+DB) - Mpeg4 = 0x4, ///< MPEG4 - Wmv9 = 0x5, ///< WMV9 IDCT - Mjpeg = 0x6, ///< Motion JPEG - Hevc = 0x7, ///< HEVC - Vp9 = 0x8, ///< VP9 - Hevc10Bit = 0x9, ///< HEVC 10bit - Vp910Bit = 0xa, ///< VP9 10bit - Av1 = 0xb, ///< AV1 8/10bit - Av112Bit = 0xc, ///< AV1 12bit - Count, -}; - -/// Video CODEC to use for encoding -enum class VideoEncodeCodec : uint32 -{ - H264 = 0x0, ///< H.264 - H265 = 0x1, ///< H.265 - Av1 = 0x2, ///< AV1 - Count -}; - -/// Specifies a virtual address range memory should be allocated in. -enum class VaRange : uint32 -{ - Default, ///< Default VA range. Choose this for most allocations. - DescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for descriptor tables. 
Knowing - /// an allocation is allocated in this range, only one user data entry is required to - /// specify a descriptor table. @see ResourceMappingNodeType. - ShadowDescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for "shadow" descriptor tables. - /// A shadow descriptor table is an additional table with the same layout as its parent - /// descriptor table that can hold infrequently needed data like fmask SRDs or UAV counter - /// data. This scheme allows the client and SC to work out a known location for - /// infrequently needed data without wasting a user data entry or wasting half of every - /// descriptor cache line. - /// Only supported if DeviceProperties::gpuMemoryProperties::flags::shadowDescVaSupport is - /// set. - Svm, ///< Place the allocation in a VA range reserved by PAL for shared virtual memory(SVM). - /// This is a GPU VA range that is reserved also on the CPU-side. - /// The size of reserved VA is set by PAL client by calling CreatePlatform. - CaptureReplay, ///< Place the allocation in a VA range reserved for capture and playback. - Count, -}; - -/// Enumerates tmz(trusted memory zone) support level. -enum class TmzSupportLevel : uint32 -{ - None = 0, ///< TMZ not supported. - PerQueue = 1, ///< Enable TMZ mode per queue. - PerSubmission = 2, ///< Enable TMZ mode per submission. - PerCommandOp = 3 ///< Enable TMZ mode per command operation. -}; - -/// How to interpret a single bit in a swizzle equation. -union SwizzleEquationBit -{ - struct - { - uint8 valid : 1; ///< Indicates whether this channel setting is valid. - uint8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel. - uint8 index : 5; ///< The channel index. - }; - uint8 u8All; ///< The above values packed in an 8-bit uint. -}; - -constexpr uint32 SwizzleEquationMaxBits = 20; ///< Swizzle equations will consider no more than this many bits. 
-constexpr uint8 InvalidSwizzleEqIndex = 0xFF; ///< Indicates an invalid swizzle equation index in the equation table. -constexpr uint8 LinearSwizzleEqIndex = 0xFE; ///< An invalid eq. index indicating a row-major, linear memory layout. - -/// Texture fetch meta-data capabilities bitfield definition, used with tcCompatibleMetaData setting -enum TexFetchMetaDataCaps : uint32 -{ - TexFetchMetaDataCapsNoAaColor = 0x00000001, - TexFetchMetaDataCapsMsaaColor = 0x00000002, - TexFetchMetaDataCapsFmask = 0x00000004, - TexFetchMetaDataCapsNoAaDepth = 0x00000008, - TexFetchMetaDataCapsMsaaDepth = 0x00000010, - TexFetchMetaDataCapsAllowStencil = 0x00000020, - TexFetchMetaDataCapsAllowZ16 = 0x00000040, -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 -/// Catalyst AI setting enums -enum CatalystAiSettings : uint32 -{ - CatalystAiDisable = 0, - CatalystAiEnable = 1, - CatalystAiMaximum = 2, -}; -#endif - -/// Texture Filter optimization enum values -enum TextureFilterOptimizationSettings : uint32 -{ - TextureFilterOptimizationsDisabled = 0, - TextureFilterOptimizationsEnabled = 1, - TextureFilterOptimizationsAggressive = 2, -}; - -/// Distribution Tess Mode enum values -enum DistributionTessMode : uint32 -{ - DistributionTessOff = 0, - DistributionTessDefault = 1, - DistributionTessPatch = 2, - DistributionTessDonut = 3, - DistributionTessTrapezoid = 4, - DistributionTessTrapezoidOnly = 5, -}; - -/// Defines the context roll optimization flags -enum ContextRollOptimizationFlags : uint32 -{ - OptFlagNone = 0x00000000, - PadParamCacheSpace = 0x00000001, -}; - -/// Defines the initial value to use for DCC metadata -enum class DccInitialClearKind { - Uncompressed = 0x0, - OpaqueBlack = 0x1, - OpaqueWhite = 0x2, - ForceBit = 0x10, - ForceOpaqueBlack = (ForceBit | OpaqueBlack), - ForceOpaqueWhite = (ForceBit | OpaqueWhite), -}; - -/// Enum defining the different scopes (i.e. 
registry locations) where settings values are stored -enum InternalSettingScope : uint32 -{ - PrivateDriverKey = 0x0, - PublicPalKey = 0x1, - PrivatePalKey = 0x2, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - PublicCatalystKey = 0x3, - PrivatePalGfx9Key = 0x4, - PrivatePalGfx12Key = 0x5, -#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - PrivatePalGfx6Key = 0x3, - PrivatePalGfx9Key = 0x4, - PublicCatalystKey = 0x5, - PrivatePalGfx12Key = 0x7, -#endif - PublicPalFile = 0x9, - RootDriverKey = 0xa, -}; - -/// Enum defining override states for feature settings. -enum class FeatureOverride : uint32 -{ - Default = 0, ///< Default setting state. - Enabled = 1, ///< (Force) enabled state. Default may change itself to this state. - Disabled = 2 ///< (Force) disabled state. Default may change itself to this state. -}; - -/// Enum bitmask defining externally-controlled (e.g. by Radeon Settings/KMD) driver feature settings. -enum RsFeatureType : uint32 -{ - RsFeatureTypeTurboSync = (1u << 0), - RsFeatureTypeChill = (1u << 1), - RsFeatureTypeDelag = (1u << 2), - RsFeatureTypeBoost = (1u << 4), - RsFeatureTypeProVsr = (1u << 5), -}; - -/// Output structure containing information about the requested RsFeatureType (singular). -union RsFeatureInfo -{ - /// Global TurboSync settings. - struct - { - bool enabled; ///< Specifies whether TurboSync is enabled globally. - } turboSync; - - /// Global Chill settings. - struct - { - bool enabled; ///< Specifies whether Chill is enabled globally. - uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Chill. - uint32 minFps; ///< Specifies the global Chill minimum FPS limit. - uint32 maxFps; ///< Specifies the global Chill maximum FPS limit. - } chill; - - /// Global Delag settings. - struct - { - bool enabled; ///< Specifies whether Delag is enabled globally. - uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Delag. 
- uint32 hotkeyInd;///< If nonzero, specifies the virtual key code assigned to Delag's indicator. - uint32 limitFps; ///< Specifies the global Delag FPS limit. - uint32 level; ///< Specifies the global Delag level. - } delag; - - /// Global Boost settings. - struct - { - bool enabled; ///< Specifies whether Boost is enabled globally. - uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Boost. - uint32 hotkeyInd; ///< If nonzero, specifies the virtual key code assigned to Boost's indicator. - uint32 minRes; ///< Specifies the global Boost minimum resolution. - bool adaptiveVrsEnabled; ///< Specifies whether BoostAdaptiveVrs is enabled globally. - } boost; - - /// Global ProVsr settings. - struct - { - bool enabled; ///< Specifies whether ProVsr is enabled globally. - uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to ProVsr. - } proVsr; - -}; - -/// High-dynamic range (HDR) surface display modes. Used to indicate the HDR display standard for a particular swap -/// chain texture format and screen colorspace/transfer function combination. -enum class HdrDisplayMode : uint32 -{ - Sdr = 0, ///< Standard dynamic range; non-HDR compatible (default). - Hdr10 = 1, ///< HDR10 PQ. Requires 10:10:10:2 swap chain. - ScRgb = 2, ///< scRGB HDR (Microsoft and FreeSync2 linear mode). 1.0 = 80 nits, 125.0 = 10000 nits. - /// Requires FP16 swapchain. - FreeSync2 = 3, ///< FreeSync2 HDR10 Gamma 2.2. Requires 10:10:10:2 swap chain. -}; - -static constexpr uint32 MaxMiscStrLen = 61; - -/// Whether to use graphics or compute for performing fast clears on depth stencil views. 
-enum class FastDepthStencilClearMode : uint8 -{ - Default, ///< Compute or graphics will be chosen at the driver's discretion - Graphics, ///< Graphics will always be used - Compute ///< Compute will always be used -}; - -enum DeferredBatchBinMode : uint32 -{ -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 875 - DeferredBatchBinCustom = 0, - DeferredBatchBinAccurate = 1 -#else - DeferredBatchBinDisabled = 0, - DeferredBatchBinCustom = 1, - DeferredBatchBinAccurate = 2 -#endif -}; - -/// PWS enable mode: e.g. disabled, fully enabled or partially enabled. -enum class PwsMode : uint32 -{ - Disabled = 0, ///< PWS feature is disabled - Enabled = 1, ///< PWS feature is fully enabled if HW supports. - NoLateAcquirePoint = 2 ///< PWS feature is enabled with PWS counter only if HW supports, no late acquire points. -}; - -#if defined(__unix__) -/// Defines the supported VM always valid modes. -enum VmAlwaysValidEnable : uint32 -{ - VmAlwaysValidDefaultEnable = 0, ///< Only enable the optimization when kernel driver can support - VmAlwaysValidForceDisable = 1, ///< Force the optimization always disabled - VmAlwaysValidForceEnable = 2 ///< Force the optimization always enabled -}; -#endif - -/// Alignment mode for accessing graphics/compute buffers. -/// This doesn't change the alignment of the whole buffer, just the alignment at which accesses must be made. -enum class BufferAlignmentMode : uint8 -{ - Default, ///< KMD (and therefore PAL) picks alignment requirement. Client should be prepared for anything. - Dword, ///< Hardware will automatically align requests to the smaller of: element-size or DWORD. - Unaligned ///< Any request alignment is allowed. -}; - -enum TemporalHintsMrtBehavior : uint8 -{ - TemporalHintsDynamicRt = 0x0, ///< Enable Dynamic RT Temporal hints. PAL chooses NT vs RT based on heuristics. - TemporalHintsStaticRt = 0x1, ///< Regular temporal for both near and far read/write caches. 
- TemporalHintsStaticNt = 0x2, ///< Non-temporal (re-use not expected) for both near and far read/write caches. -}; - -/// Client-controllable behavior for Gfx12-specific software workaround to HiSZ hardware bug. -/// Allows the client to override PAL defaults for performance or profiled reasons. -enum class HiSZWorkaroundBehavior : uint8 -{ - Default = 0x0, ///< Let PAL decide what the default is. - ForceDisableAllWar, ///< Force disable all workarounds. - /// Note: This should rarely be chosen and carries an extremely high risk of issue. - /// Should only be used when application has been profiled to guarantee no - /// risk of issue. - ForceHiSZDisableBasedWar, ///< Force the disable HiZ/S based workaround behavior. - ForceHiSZEventBasedWar, ///< Force the event-after-draw workaround behavior. - /// Note: This carries a risk that the hang may still be seen. - ForceHiSZDisableBaseWarWithReZ ///< Force the disable HiZ/S based workaround behavior, but with an added - /// optimization to force ZOrder mode to EarlyZThenReZ to reclaim some performance. - /// This will apply to all graphics pipelines that trigger the workaround - /// condition, except those pipelines that have set - /// GraphicsPipelineCreateInfo::noForceReZ. This ZOrder change will occur only when - /// HiZ/S is forcibly disabled by the workaround; otherwise the ZOrder remains what - /// was chosen by the compiler. -}; - -/// Pal settings that are client visible and editable. -struct PalPublicSettings -{ - /// Maximum border color palette size supported by any queue. - uint32 borderColorPaletteSizeLimit; - - /// Whether to use graphics or compute for performing fast clears on depth stencil views. - FastDepthStencilClearMode fastDepthStencilClearMode; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 - /// Forces all serialized loads (LoadPipeline or LoadCompoundState) to fail. 
- bool forceLoadObjectFailure; -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 956 - /// Controls the distribution mode for tessellation, which affects how patches are processed by different VGT - /// units. 0: None - No distribution across VGTs (legacy mode). 1: Default - Optimal settings are chosen depending - /// on the gfxip. 2: Patch - Individual patches are distributed to different VGTs. 3: Donut - Patches are split - /// into donuts and distributed to different VGTs. 4: Trapezoid - Patches from donuts are split into trapezoids and - /// distributed to different VGTs. Falls back to donut mode if HW does not support this mode. 5: Trapezoid only - - /// Distribution turned off if HW does not support this mode. - uint32 distributionTessMode; -#endif - - /// Flags that control PAL optimizations to reduce context rolls. 0: Optimization disabled. 1: Pad parameter cache - /// space. Sets VS export count and PS interpolant number to per-command buffer maximum value. Reduces context rolls - /// at the expense of parameter cache space. - uint32 contextRollOptimizationFlags; - - /// The number of unbound descriptor debug srds to allocate. To detect reads of unbound descriptor within arrays, - /// multiple debug srds can be allocated. - uint32 unboundDescriptorDebugSrdCount; - - /// Disables compilation of internal PAL shaders. It can be enabled only if a PAL client won't use any of PAL blit - /// functionalities on gfx/compute engines. - bool disableResourceProcessingManager; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 - /// Controls app detect and image quality altering optimizations exposed by CCC. - uint32 catalystAI; -#endif - - /// Controls texture filtering optimizations exposed by CCC. - uint32 textureOptLevel; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 - /// Disables SC initialization. It can be enabled only if a PAL client won't use SC for shader compilation and - /// provide direct ISA binaries(usually AQL path). 
- bool disableScManager; -#endif - - /// Information about the client performing the rendering. For example: Rendered By PAL (0.0.1) - char renderedByString[MaxMiscStrLen]; - - /// Debug information that the client or tester might want reported. - char miscellaneousDebugString[MaxMiscStrLen]; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 - /// Allows SC to make optimizations at the expense of IEEE compliance. - bool allowNonIeeeOperations; - - /// Controls whether shaders should execute one atomic instruction per wave for UAV append/consume operations. - /// If false, one atomic will be executed per thread. - bool appendBufPerWaveAtomic; -#endif - - /// Bitmask of cases where texture compatible meta data will be used Single-sample color surface: 0x00000001 MSAA - /// color surface: 0x00000002 FMask data: 0x00000004 Single-sample depth surface: 0x00000008 MSAA depth surface: - /// 0x00000010 Allow stencil: 0x00000020 Allow Z-16 surfs 0x00000040 - uint32 tcCompatibleMetaData; - - /// Specifies the threshold below which CmdCopyMemory() is executed via a CpDma BLT, in bytes. CPDMA copies have - /// lower overhead than CS/Gfx copies, but less throughput for large copies. - uint32 cpDmaCmdCopyMemoryMaxBytes; - - /// Forces high performance state for allocated queues. Note: currently supported in Windows only. - bool forceHighClocks; - - /// When submitting multiple command buffers in a single grQueueSubmit call, the ICD will patch the command streams - /// so that the command buffers are chained together instead of submitting through KMD multiple times. This setting - /// limits the number of command buffers that will be chained together; reduce to prevent problems due to long - /// running submits. - uint32 cmdBufBatchedSubmitChainLimit; - - /// Flags that control PAL's command allocator residency optimizations. If a command allocation isn't optimized PAL - /// will wait for it to become resident at creation. 
0x1 - Wait for command data to become resident at Submit-time. - /// 0x2 - Wait for embedded data to become resident at Submit-time. 0x4 - Wait for marker data to become resident at - /// Submit-time. - uint32 cmdAllocResidency; - - /// Overrides max queued frames allowed - uint32 maxQueuedFrames; - - /// Maximum number of presentable images per adapter(including LDA chain) which is recommended. If app exceeds the - /// presentable image number threshold, awarning may be reported. - uint32 presentableImageNumberThreshold; - - /// Provides a hint to PAL that client knows that every individual depth stencil surfaces are always cleared with - /// same values.If TRUE, per-tile tracking of exp/clear will be enabled (requires HTile). - bool hintInvariantDepthStencilClearValues; - - /// Provides a hint to PAL that PAL should disable color compression on surfaces that are smaller than or equal to - /// this setting (setting * setting) in size. - uint32 hintDisableSmallSurfColorCompressionSize; - - /// Disables Escape call to KMD. This is a temporary setting for experimentation that is expected to break features - /// that currently needs Escape call. - bool disableEscapeCall; - - /// A hint to the Windows OS that this application has submissions that are expected to run for a long time. This - /// tells the OS that checking elapsed execution time is not a good way to judge if the GPU is hung. - /// - /// If this flag is set, Windows shouldn't TDR long submissions on uncontested hardware queues. Submissions on - /// contested queues must preempt within the time limit to avoid a TDR. Setting this may also extend the TDR timer - /// on compute queues. - bool longRunningSubmissions; - - /// Disables MCBP on demand. This is a temporary setting until ATOMIC_MEM packet issue with MCBP is resolved. - bool disableCommandBufferPreemption; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 956 - /// Disable the fast clear eliminate skipping optimization. 
This optimization will conservatively track the usage - /// of clear values to allow the vast majority of images that never clear to a value that isn't TC-compatible to - /// skip the CPU and front-end GPU overhead of issuing a predicated fast clear eliminate BLT. - bool disableSkipFceOptimization; -#endif - - /// Sets the minimum BPP of surfaces which will have DCC enabled - uint32 dccBitsPerPixelThreshold; - - /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have - /// PAL automatically pad allocation starting virtual address alignments to enable this optimization. By - /// default, PAL will use the KMD-reported limit. - gpusize largePageMinSizeForVaAlignmentInBytes; - - /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have - /// PAL automatically pad allocation sizes to fill an integral number of large pages. By default, PAL will - /// use the KMD-reported limit. - gpusize largePageMinSizeForSizeAlignmentInBytes; - - /// Makes the unbound descriptor debug srd 0 so the hardware drops the load and ignores it instead of pagefaulting. - /// Used to workaround incorrect app behavior. - bool zeroUnboundDescDebugSrd; - - /// Preferred heap for uploading client pipelines. Default is set to @ref GpuHeap::GpuHeapInvisible. Setting is - /// ignored for internal pipelines and are uploaded to @ref GpuHeap::GpuHeapLocal. - GpuHeap pipelinePreferredHeap; - - bool depthClampBasedOnZExport; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 - /// Force the PreColorTarget to an earlier PreRasterization point if used as a wait point. This is to prevent a - /// write-after-read hazard for a corner case: shader exports from distinct packers are not ordered. Advancing - /// wait point from PreColorTarget to PostPrefetch could cause over-sync due to extra VS/PS_PARTIAL_FLUSH - /// inserted. 
It is default to false, but client drivers may choose to app-detect to enable if see corruption. - bool forceWaitPointPreColorToPostPrefetch; -#endif - - /// Allows the client to disable debug overlay visual confirm after DebugOverlay::Platform is created when the - /// panel setting DebugOverlayEnabled is globally set but a certain application might need to turn off visual - /// confirm to make the screen not too noisy. - bool disableDebugOverlayVisualConfirm; - - bool enableExecuteIndirectPacket; - - /// Offers flexibility to the client to choose Graphics vs Compute engine for Indirect Command Generation - /// (Shader path) based on performance and other factors. The default is false since we have seen perf gains using - /// the ACE. - bool disableExecuteIndirectAceOffload; - - /// Value to initialize metadata for DCC surfaces to, if they are compressable. This has no effect on non-DCC - /// images. Images whose initial layout is not compressable are only affected if this is "forced". - /// 0x00 - Uncompressed (default) - /// 0x01 - Opaque Black - /// 0x02 - Opaque White - /// 0x11 - Forced Opaque Black - /// 0x12 - Forced Opaque White - uint32 dccInitialClearKind; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 956 - /// Allows the client to not create internal VrsImage. Pal internal will create a 16M image as vrsImageSize. - bool disableInternalVrsImage; -#endif - - /// Allows the client to control binning persistent and context states per bin. - /// A value of 0 tells PAL to pick the number of states per bin. - uint32 binningPersistentStatesPerBin; - uint32 binningContextStatesPerBin; - - /// This key controls if binning will be disabled when the PS may kill pixels. - OverrideMode disableBinningPsKill; - - /// Controls GS LateAlloc val (for pos/prim allocations NOT param cache) on NGG pipelines. Can be no more than 127. 
- uint32 nggLateAllocGs; - - /// Bitmask of cases where RPM view memory accesses will bypass the MALL - /// RpmViewsBypassMallOff (0x0): Disable MALL bypass - /// RpmViewsBypassMallOnRead (0x1): Skip MALL for read access of views created in RPM - /// RpmViewsBypassMallOnWrite (0x2): Skip MALL for write access of views created in RPM - /// RpmViewsBypassMallOnCbDbWrite (0x4): Control the RPM CB/DB behavior - RpmViewsBypassMall rpmViewsBypassMall; - - /// Optimize color export format for depth only rendering. Only applicable for RB+ parts - bool optDepthOnlyExportRate; - - /// Controls whether or not we should expand Hi-Z to full range rather than doing fine-grain resummarize - /// operations. Expanding Hi-Z leaves the Hi-Z data in a less optimal state but is a much faster operation - /// than the fine-grain resummarize. - bool expandHiZRangeForResummarize; - - /// Control whether to have command buffer emit SQTT marker events. Useful for client driver to perform SQTT - /// dump without the involvement of dev driver. - bool enableSqttMarkerEvent; - - /// Controls the value of CB_COLOR0_ATTRIB.LIMIT_COLOR_FETCH_TO_256B_MAX. This bit limits CB fetch to 256B on cache - /// miss, regardless of sector size. - bool limitCbFetch256B; - - /// Controls whether or not deferred batch binning is enabled 0 : Batch binning always disabled 1 : Use custom bin - /// sizes 2 : Optimal. - DeferredBatchBinMode binningMode; - - /// Controls the custom batch bin size.Only used when deferredBatchBinMode == 1 High word is for x, low word is for - /// y. Default is 128x128. Values must be power of two between 16 and 512. - uint32 customBatchBinSize; - - /// Maximum number of primitives per batch. The maximum value is 1024. - uint32 binningMaxPrimPerBatch; - - /// Controls PWS enable mode: disabled, fully enabled or partially enabled. Only take effect if HW supports PWS. 
- PwsMode pwsMode; - - /// Controls the MaxScratchRingSizeBaseline, which is really just the maximum size of the scratch ring - gpusize maxScratchRingSizeBaseline; - - /// Controls the maximum size of the scratch ring allocation - uint32 maxScratchRingSizeScalePct; - -#if defined(__unix__) - /// Whether enable vm-always-valid feature on Linux while allocating Bo - VmAlwaysValidEnable enableVmAlwaysValid; -#endif - - /// Alignment mode for graphics and compute buffers. - /// Clients must check Pal::DeviceProperties::osProperties::flags::forceAlignmentSupported - /// to see if anything other than default will work. - BufferAlignmentMode hardwareBufferAlignmentMode; - - // Disallows putting the shader ring in system memory for performance purposes - // This is done by un-listing GpuHeapGartUswc as a possible heap for the shader rings. - // Instead only allowing GpuHeapInvisible and GpuHeapLocal - bool forceShaderRingToVMem; - - /// If the client sets this to true they promise they've done exhaustive testing on every ASIC to prove that this - /// application can use AC01 fast clears safely. This should never be forced to true unconditionally. - bool ac01WaNotNeeded; - - /// Toggles whether or not image copies will prefer using the graphics pipeline. This setting does not force all - /// copies to use graphics or compute, it changes what method will be selected in cases where either could be used. - bool preferGraphicsImageCopy; - - /// Bitmask to control adding Waits around Flush events - /// This is public setting to allow AppDetect to override of 'WaitOnFlush' setting. - /// Setting 'waitOnFlush' for actual workarounds is strongly discouraged because: - /// 1) It has a negative performance impact. - /// 2) Waits effect the timing and pipeline execution which can hide underlying hw/fw/sw bugs. - /// Issues resolved by added waits should be root caused. 
- uint32 waitOnFlush; - - /// Provides the ability for mall to be alloc-ed/noalloc-ed using the dynamic or static behavior. - TemporalHintsMrtBehavior temporalHintsMrtBehavior; - - /// Allows the client to change the behavior of the Gfx12 HW Bug that impacts HiS and HiZ. - /// This should never be forced away from default unconditionally. - HiSZWorkaroundBehavior hiSZWorkaroundBehavior; - - /// For event-based HiSZ workarounds (Gfx12), specifies the timeouts supported by the Scan Converter and Depth Block - /// for their tile summarizer controller. - /// Ignored unless @ref hiSZWorkaroundBehavior is set to ForceHiSZEventBasedWar. - /// If set to 0, PAL picks a default value. - /// Note: Choosing any value other than 0 carries with it **significant risk** of hangs, as the timeout value - /// determined by PAL is the most optimal to avoid the most hangs. Any deviation from the default must be - /// thoroughly tested and is not guaranteed to be safe! - uint32 tileSummarizerTimeout; - -#if PAL_KMT_BUILD - /// If client sets it to false, native fence will be disabled in OS wide no matter OS or hardware supports - /// native fence or not. - bool enableNativeFence; -#endif - - /// Whether HiZ/HiS is valid after depth/stencil image is copied as destination. - /// - If false, this is not guaranteed, PAL will issue necessary HiZ/HiS resummarization at later client barrier - /// call time when transition to another HiZ/HiS valid layout like DepthStencilTarget. - /// - If true, PAL does necessary HiZ/HiS fixup after copy if needed and HiZ/HiS will be always valid after copy. - /// At later client barrier call time, no HiZ/HiS resummarization will happen. - bool makeLayoutCopyDstHiSZCompatible; -}; - -/// Defines the modes that the GPU Profiling layer can use when its buffer fills. 
-enum GpuProfilerStallMode : uint32 -{ - GpuProfilerStallAlways = 0, ///< Always stall to get accurate trace data - GpuProfilerStallLoseDetail = 1, ///< Lose register-level detail if under pressure to avoid stalls - GpuProfilerStallNever = 2, ///< Never stall, miss trace packets -}; - -/// Describes the equations needed to interpret the raw memory of a tiled texture. -struct SwizzleEquation -{ - SwizzleEquationBit addr[SwizzleEquationMaxBits]; ///< Address setting: each bit is the result of addr ^ xor ^ xor2. - SwizzleEquationBit xor1[SwizzleEquationMaxBits]; ///< xor setting. - SwizzleEquationBit xor2[SwizzleEquationMaxBits]; ///< xor2 setting. - uint32 numBits; ///< The number of bits in the equation. - bool stackedDepthSlices; ///< True if depth slices are treated as being stacked vertically - /// prior to swizzling. -}; - -/// Specifies the hardware features supported for PRT (sparse images). -enum PrtFeatureFlags : uint32 -{ - PrtFeatureBuffer = 0x00000001, ///< Indicates support for sparse buffers - PrtFeatureImage2D = 0x00000002, ///< Indicates support for sparse 2D images - PrtFeatureImage3D = 0x00000004, ///< Indicates support for sparse 3D images - PrtFeatureImageMultisampled = 0x00000008, ///< Indicates support for sparse multisampled images - PrtFeatureImageDepthStencil = 0x00000010, ///< Indicates support for sparse depth/stencil images - PrtFeatureShaderStatus = 0x00000020, ///< Indicates support for residency status in shader instructions - PrtFeatureShaderLodClamp = 0x00000040, ///< Indicates support for LOD clamping in shader instructions - PrtFeatureUnalignedMipSize = 0x00000080, ///< Indicates support for non-miptail levels with dimensions that - /// aren't integer multiples of the tile size as long as they are - /// at least as large as a single tile - PrtFeaturePerSliceMipTail = 0x00000100, ///< Indicates support for per-slice miptail (slice-major order) - - PrtFeatureTileAliasing = 0x00000200, ///< Indicates support for aliasing tiles (without 
metadata) - PrtFeatureStrictNull = 0x00000400, ///< Indicates whether reads of unmapped tiles always return zero -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 - PrtFeatureNonStandardImage3D = 0x00000800, ///< Indicates support for sparse 3D images restricted to - /// non-standard tile shapes that match the tile mode block depth - PrtFeaturePrtPlus = 0x00001000, ///< Indicates that this image supports use of residency maps. -#else - PrtFeaturePrtPlus = 0x00000800, ///< Indicates that this image supports use of residency maps. -#endif -}; - -/// Describe the settings' scope accessible by clients. -enum class SettingScope -{ - Driver, ///< For settings specific to a UMD - Global, ///< For global settings controlled by CCC - File, ///< For settings that are only read from a file - Root, ///< For settings under HKR -}; - -/// Big Software (BigSW) Release information structure -/// Software release management uses this version # to control a rollout of big SW features together. -struct BigSoftwareReleaseInfo -{ - uint32 majorVersion; ///< BigSW Release Major version - uint32 minorVersion; ///< BigSW Release Minor version. - uint32 miscControl; ///< BigSW Release miscellaneous control. -}; - -/// Virtual display capabilities as determined by the OS. The reported values bound the valid ranges of values supported -/// by the @ref VirtualDisplayInfo structure passed in to @ref IDevice::CreateVirtualDisplay. -struct VirtualDisplayCapabilities -{ - uint32 maxVirtualDisplays; ///< The maximum number of virtual display supported - Rational minRefreshRate; ///< The minimum refresh rate - Rational maxRefreshRate; ///< The maximum refresh rate -}; - -/// The properties of a specific virtual display -struct VirtualDisplayProperties -{ - bool isVirtualDisplay; ///< True, if it's a virtual display -}; - -/// Enumerates all of the types of local video memory which could be associated with a GPU. 
-enum class LocalMemoryType : uint32 -{ - Unknown = 0, - Ddr2, - Ddr3, - Ddr4, - Gddr5, - Gddr6, - Hbm, - Hbm2, - Hbm3, - Lpddr4, - Lpddr5, - Ddr5, - Count -}; - -/// Bitmask of all MSAA/EQAA types supported, in terms of samples (S) and shaded fragments (F) -enum MsaaFlags : uint16 -{ - MsaaS1F1 = 0x0001, - MsaaS2F1 = 0x0002, - MsaaS4F1 = 0x0004, - MsaaS8F1 = 0x0008, - MsaaS16F1 = 0x0010, - MsaaAllF1 = 0x001F, - - MsaaS2F2 = 0x0020, - MsaaS4F2 = 0x0040, - MsaaS8F2 = 0x0080, - MsaaS16F2 = 0x0100, - MsaaAllF2 = 0x01E0, - - MsaaS4F4 = 0x0200, - MsaaS8F4 = 0x0400, - MsaaS16F4 = 0x0800, - MsaaAllF4 = 0x0E00, - - MsaaS8F8 = 0x1000, - MsaaS16F8 = 0x2000, - MsaaAllF8 = 0x3000, - - MsaaAll = 0x3FFF, -}; - -/// Supported RTIP version enumeration -enum class RayTracingIpLevel : uint32 -{ - _None = 0x0, ///< The device does not have an RayTracing Ip Level -#ifndef None - None = _None, ///< The device does not have an RayTracing Ip Level -#endif - RtIp1_0 = 0x1, ///< First Implementation of HW RT - RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW - RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, - /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 - RtIp3_1 = 0x5, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, - /// 128 Byte Quantized box node, obb support, wide sort) -#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, - /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 - RtIp3_1 = 0x6, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, - /// 128 Byte Quantized box node, obb support, wide sort) -#endif -}; - -/// Which ip version a component 
has reported -struct IpTriple -{ - uint32 major : 8; /// Major revision value - uint32 minor : 8; /// Minor revision value - uint32 stepping : 16; /// Stepping value - - constexpr operator uint32() const - { - return ((stepping ) | - (minor << 16) | - (major << 24)); - } -}; - -/// Reports various properties of a particular IDevice to the client. @see IDevice::GetProperties. -struct DeviceProperties -{ - uint32 vendorId; ///< Vendor ID (should always be 0x1002 for AMD). - uint32 deviceId; ///< GPU device ID (e.g., Hawaii XT = 0x67B0). - uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different - /// SKUs or revisions. Corresponds to one of the PRID_* revision IDs. - uint32 eRevId; ///< GPU emulation/internal revision ID. - AsicRevision revision; ///< ASIC revision. - GpuType gpuType; ///< Type of GPU (discrete vs. integrated) - uint16 gpuPerformanceCapacity; ///< Portion of GPU assigned in virtualized system (SRIOV) - ///< 0-65535, 0 invalid (not virtualized), 1 min, 65535 max - GfxIpLevel gfxLevel; ///< IP level of this GPU's GFX block -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - OssIpLevel ossLevel; ///< IP level of this GPU's OSS block - VceIpLevel vceLevel; ///< IP level of this GPU's VCE block - UvdIpLevel uvdLevel; ///< IP level of this GPU's UVD block -#endif - VcnIpLevel vcnLevel; ///< IP level of this GPU's VCN block -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - SpuIpLevel spuLevel; ///< IP level of this GPU's SPU block -#endif - PspIpLevel pspLevel; ///< IP level of this GPU's PSP block - uint32 gfxStepping; ///< Stepping level of this GPU's GFX block - IpTriple gfxTriple; ///< Full GFX IP level (major.minor.step) of this GPU - char gpuName[MaxDeviceName]; ///< Null terminated string identifying the GPU. - uint32 gpuIndex; ///< Device's index in a linked adapter chain. - uint32 maxGpuMemoryRefsResident; ///< Maximum number of GPU memory references that can be resident - /// at any time. 
Memory references set both via IQueue and IDevice - /// (via AddGpuMemoryReferences() or Submit()) count against this limit. - uint64 timestampFrequency; ///< Frequency of the device's timestamp counter in Hz. - /// @see ICmdBuffer::CmdWriteTimestamp. - uint32 attachedScreenCount; ///< Number of screen attached to the device. - uint32 maxSemaphoreCount; ///< Queue semaphores cannot have a signal count higher than this value. - /// For example, one indicates that queue semaphores are binary. - PalPublicSettings settings; ///< Public settings that the client has the option of overriding -#if PAL_CLIENT_EXAMPLE - AddrHandle hAddrlib; ///< Handle to addrlib for directed image tests. -#endif - - struct - { - union - { - struct - { - /// This engine supports timestamps (ICmdBuffer::CmdWriteTimestamp()). - uint32 supportsTimestamps : 1; - - /// This engine supports ICmdBuffer::CmdSetPredication() based on Streamout/Occlusion query - uint32 supportsQueryPredication : 1; - - /// This engine supports ICmdBuffer::CmdSetPredication() based on a 32-bit GPU memory allocation - uint32 supports32bitMemoryPredication : 1; - - /// This engine supports ICmdBuffer::CmdSetPredication() based on a 64-bit GPU memory allocation - uint32 supports64bitMemoryPredication : 1; - - /// This engine supports ICmdBuffer::If(), Else() and EndIf() calls. - uint32 supportsConditionalExecution : 1; - - /// This engine supports ICmdBuffer::While() and EndWhile() calls. - uint32 supportsLoopExecution : 1; - - /// This engine supports ICmdBuffer::CmdWaitRegisterValue(), WaitMemoryValue() and - /// CopyRegisterToMemory() calls. - uint32 supportsRegMemAccess : 1; - - /// This engine supports ICmdBuffer::CmdCopyImage() between optimally tiled images with - /// mismatched tiling tokens. - uint32 supportsMismatchedTileTokenCopy : 1; - - /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref - /// LayoutUninitializedTarget layout. 
- uint32 supportsImageInitBarrier : 1; - - /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref - /// LayoutUninitializedTarget layout for individual subresources. If this is not set and - /// supportsImageInitBarrier is set, the subresource range must span the entire image. - uint32 supportsImageInitPerSubresource : 1; - - /// This engine does not support any virtual memory features. IQueue::RemapVirtualMemoryPages and - /// IQueue::CopyVirtualPageMappings are not supported on Queues using this engine. - uint32 runsInPhysicalMode : 1; - - /// Indicates whether this engine can do virtual memory remap or not. - uint32 supportVirtualMemoryRemap : 1; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - /// Indicates whether this Queues using this engine can maintain the contents of CE RAM across - /// consecutive submissions. If this is not set, the client must not specify a nonzero value for - /// either @ref QueueCreateInfo::persistentCeRamSize or @ref QueueCreateInfo::persistentCeRamOffset. - uint32 supportPersistentCeRam : 1; -#else - uint32 reserved914 : 1; -#endif - - /// Indicates whether the engine supports the command allocator tracks which chunk is idle. - uint32 supportsTrackBusyChunks : 1; - - /// Indicates whether the engine can safely access non-resident ranges of resources. - uint32 supportsUnmappedPrtPageAccess : 1; - - /// This engine supports clear or copy with MSAA depth-stencil destination - uint32 supportsClearCopyMsaaDsDst : 1; - - /// Reserved for future use. - uint32 reserved : 16; - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Engines property flags. - - struct - { - union - { - struct - { - uint32 exclusive : 1; ///< Engine is exclusively owned by one client at a time. - uint32 mustUseDispatchTunneling : 1; ///< Queues created on this engine must use dispatch - /// tunneling. - /// Indicates whether this engine instance can be used for gang submission workloads via - /// a multi-queue. 
- /// @see IDevice::CreateMultiQueue. - uint32 supportsMultiQueue : 1; - uint32 hwsEnabled : 1; - uint32 isHighPriority : 1; - uint32 reserved : 27; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Capabilities property flags. - - uint32 queuePrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue - /// priority levels are supported by this engine. - uint32 dispatchTunnelingPrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue - /// priority levels support dispatch tunneling on this engine. - uint32 maxFrontEndPipes; ///< Up to this number of IQueue objects can be consumed in - /// parallel by the front-end of this engine instance. It will - /// only be greater than 1 on hardware scheduled engine backed - /// by multiple hardware pipes/threads. - } capabilities[MaxAvailableEngines]; ///< Lists each engine of this type (up to engineCount) and their - /// properties. - - uint32 engineCount; ///< Number available engines of this type. - uint32 queueSupport; ///< Mask of QueueTypeSupport flags indicating which queues are - /// supported by this engine. - uint32 maxBorderColorPaletteSize; ///< Maximum size of a border color palette on this engine. - uint32 controlFlowNestingLimit; ///< Maximum depth of command-buffer control flow nesting on this - /// engine. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - uint32 ceRamSizeAvailable; ///< Size, in bytes, of constant engine RAM available on this engine. -#endif - Extent3d minTiledImageCopyAlignment; ///< Minimum alignments (pixels) for X/Y/Z/Width/Height/Depth for - /// ICmdBuffer::CmdCopyImage() between optimally tiled images. - Extent3d minTiledImageMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for - /// ICmdBuffer::CmdCopyImage() with an optimally tiled image and a - /// linearly tiled image. 
Also applies to - /// ICmdBuffer::CmdCopyImageToMemory() or - /// ICmdBuffer::CmdCopyMemoryToImage() with an optimally tiled image. - Extent3d minLinearMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for - /// ICmdBuffer::CmdCopyTypedBuffer(). - uint32 minTimestampAlignment; ///< If supportsTimestamps is set, this is the minimum address alignment - /// in bytes of the dstOffset in ICmdBuffer::CmdWriteTimestamp(). - uint32 maxNumDedicatedCu; ///< The maximum number of dedicated CUs for the real time audio queue - uint32 maxNumDedicatedCuPerQueue; ///< The maximum number of dedicated CUs per queue - uint32 dedicatedCuGranularity; ///< The granularity at which compute units can be dedicated to a queue - /// Specifies the suggested heap preference clients should use when creating an @ref ICmdAllocator that will - /// allocate command space for this engine type. These heap preferences should be specified in the allocHeap - /// parameter of @ref CmdAllocatorCreateInfo. Clients are free to ignore these defaults and use their own - /// heap preferences, but may suffer a performance penalty. - GpuHeap preferredCmdAllocHeaps[CmdAllocatorTypeCount]; - - /// Indicate which queue supports per-command, per-submit, or per-queue TMZ based on the queue type. - TmzSupportLevel tmzSupportLevel; - } engineProperties[EngineTypeCount]; ///< Lists available engines on this device and their properties. - - struct - { - union - { - struct - { - /// This queue supports IQueue::PresentSwapChain() calls. Note that a queue may support swap chain - /// presents even if the supportedDirectPresentModes flags below indicate no support for direct - /// presents; instead swap chain PresentMode support is queried via GetSwapChainInfo. - uint32 supportsSwapChainPresents : 1; - uint32 reserved744 : 1; -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 927 - /// Set if the queue supports additional split barrier feature on top of basic acquire/release - /// interface support. 
This provides CmdAcquire() and CmdRelease() to implement split barriers. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 - /// Note: supportReleaseAcquireInterface is a prerequisite to supportSplitReleaseAcquire. -#endif - uint32 supportSplitReleaseAcquire : 1; -#else - uint32 reserved927 : 1; -#endif - - /// Reserved for future use. - uint32 reserved : 29; - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Queue property flags. - - uint32 supportedDirectPresentModes; ///< A mask of PresentModeSupport flags indicating support for various - /// PresentModes when calling IQueue::PresentDirect(). - } queueProperties[QueueTypeCount]; ///< Lists the properties of all queues supported by PAL. - - struct - { - union - { - struct - { - /// Indicates support for virtual GPU memory allocations. @see IQueue::RemapVirtualMemoryPages. - uint32 virtualRemappingSupport : 1; - - /// Indicates support for pinning system memory for access as GPU memory. - /// @see IDevice::PinSystemMemory. - uint32 pinningSupport : 1; - - /// Indicates support pinned memory which is host-mapped from foreign device. - uint32 supportHostMappedForeignMemory : 1; - - /// Indicates whether specifying memory references at Submit time is supported. If not supported - /// all memory references must be manged via IDevice or IQueue AddGpuMemoryReferences() - uint32 supportPerSubmitMemRefs : 1; - - /// Indicates support for GPU virtual addresses that are visible to all devices. - uint32 globalGpuVaSupport : 1; - - /// Indicates support for Shared Virtual Memory VA range. - uint32 svmSupport : 1; - - /// Indicates support for shadow desc VA range. - uint32 shadowDescVaSupport : 1; - - /// Indicates support for IOMMUv2. Fine grain SVM is not supported without IOMMU. - /// PAL client needs to check this flag before using fine grain SVM. - /// IOMMU is a memory management unit (MMU) that connects a direct-memory-access-capable - /// (DMA-capable) I/O bus to the main memory. 
- uint32 iommuv2Support : 1; - - /// Indiciates that the platform supports automatic GPU memory priority management. - uint32 autoPrioritySupport : 1; - - /// Indicates KMD has enabled HBCC(High Bandwidth Cache Controller) page migration support. This means - /// shaders must be compiled such that all memory clauses can be replayed in response to an XNACK. - uint32 pageMigrationEnabled : 1; - /// Indicates TMZ (or HSFB) protected memory allocations are supported. - uint32 supportsTmz : 1; - - /// Memory allocations on this device support MALL (memory access last level); essentially - /// the lowest level cache possible. - uint32 supportsMall : 1; - - /// Support for querying page fault information - uint32 supportPageFaultInfo : 1; - - /// Indicates if this device supports GFX12-style distributed compression. Client can control - /// whether distributed compression is enabled or not per IGpuMemory object using the - /// distributedCompression field in @ref GpuMemoryCreateInfo. - uint32 supportDistributedCompression : 1; - - /// All GPU memory allocations in all heaps are always initialized to zero on creation. - /// The @ref initializeToZero flag is not needed when this is true. - uint32 alwaysInitializedToZero : 1; - - /// Reserved for future use. - uint32 reserved : 17; - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory property flags. - - gpusize realMemAllocGranularity; ///< The addresses and sizes of "real" GPU memory objects must be aligned - /// to at least this many bytes. - gpusize virtualMemAllocGranularity; ///< The addresses and sizes of virtual GPU memory objects must be aligned - /// to at least this many bytes. - gpusize virtualMemPageSize; ///< Size in bytes of a virtual GPU memory page. - /// @see IQueue::RemapVirtualMemoryPages. - gpusize fragmentSize; ///< Size in bytes of a video memory fragment. 
If GPU memory object - /// addresses and sizes are aligned to at least this value, VA translation - /// will be a bit faster. It is aligned to the allocation granularities. - gpusize largePageSizeInBytes; ///< The large page optimization will allow compatible allocations to - /// potentially be upgraded to a page size larger than 64KiB to reduce TLB - /// pressure. PAL will automatically pad the size and alignment of some - /// allocations to enable this optimization; - /// see largePageMinSizeForAlignmentInBytes in PalPublicSettings. - gpusize maxVirtualMemSize; ///< Total virtual GPU memory available (total VA space size). - gpusize maxPhysicalMemSize; ///< Total VRAM available (Local + Invisible + non-Local heap sizes). - gpusize vaStart; ///< Starting address of the GPU's virtual address space. - gpusize vaEnd; ///< Ending address of the GPU's virtual address space. - gpusize descTableVaStart; ///< Starting address of the descriptor table's virtual address space - gpusize shadowDescTableVaStart; ///< Starting address of the shadow descriptor table's virtual address space - gpusize privateApertureBase; ///< Private memory base address for generic address space (Windows only). - gpusize sharedApertureBase; ///< Shared memory base address for generic address space (Windows only). - - gpusize busAddressableMemSize; ///< SDI/DirectGMA GPU aperture size set in CCC - gpusize maxLocalMemSize; ///< Total VRAM available on the GPU (Local + Invisible heap sizes). - LocalMemoryType localMemoryType; ///< Type of local memory used by the GPU. - gpusize maxCaptureReplaySize; ///< Total virtual GPU available for Capture/Replay - gpusize barSize; ///< Total VRAM which can be accessed by the CPU. - - struct - { - float maxMemClock; ///< Maximum GPU memory clock in MHz. For DX builds this value is valid only after - /// the device has been finalized. - uint32 memPerfRating; ///< Precomputed performance rating of memory operations. - uint32 vramBusBitWidth; ///< Memory bus width. 
- uint32 memOpsPerClock; ///< Memory operations per clock. - } performance; ///< Performance-related memory properties. - - } gpuMemoryProperties; ///< Memory properties for this device. - - struct - { - union - { - struct - { - /// Images created on this device supports AQBS stereo mode, this AQBS stereo mode doesn't apply to the - /// array-based stereo feature supported by Presentable images. - uint32 supportsAqbsStereoMode : 1; - - /// Set if images created on this device support being created with corner sampling. - uint32 supportsCornerSampling : 1; - - /// Load Bearing placeholder, do not touch. - uint32 placeholder0 : 1; - - /// Reserved for future use. - uint32 reserved : 29; - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory property flags. - - Extent3d maxDimensions; ///< Maximum supported width/height/depth for an image. - uint32 maxArraySlices; ///< Maximum supported number of array slices for a 1D or 2D image. - PrtFeatureFlags prtFeatures; ///< PRT features supported by the hardware. - gpusize prtTileSize; ///< Size, in bytes, of a PRT tile. - MsaaFlags msaaSupport; ///< Bitflags for MSAA sample/fragment count support. - uint8 maxMsaaFragments; ///< Max number of MSAA fragments per pixel (may have more samples). - uint8 numSwizzleEqs; ///< How many swizzle equations are in pSwizzleEqs. - Extent2d vrsTileSize; ///< Pixel dimensions of a VRS tile. 0x0 indicates image-based shading rate - /// is not supported. - const SwizzleEquation* pSwizzleEqs; ///< These describe how to interpret device-dependent tiling modes. - - bool tilingSupported[static_cast(ImageTiling::Count)]; ///< If each image tiling is supported. - } imageProperties; ///< Image properties for this device. - - struct - { - /// Maximum number of available shader-accessible user data entries. @see PipelineShaderInfo. - uint32 maxUserDataEntries; - uint32 maxThreadGroupSize; ///< Per-device limit on threads per threadgroup for compute shaders. 
- /// Some hardware supported by PAL has a bug which can cause a GPU hang if async compute enginesare used while - /// compute shaders with > maxAsyncComputeThreadGroupSize are in flight on any queue. This reports the - /// maximum "safe" limit on threads per threadgroup for compute shaders for this device if the client wishes to - /// use async compute engines. Note that the bug can occur if the following conditions are met: - /// (a) Async compute workloads are running *somewhere* on the GPU, in any process; - /// (b) Some compute workloads on either the async compute engine or on the universal engine have a threads per - /// threadgroup amount which exceeds maxAsyncComputeThreadGroupSize. - /// - /// It is up to the client to choose how to work around this bug. They are free to either limit applications to - /// only creating compute shaders with <= maxAsyncComputeThreadGroupSize threads per group, or to avoid using - /// the async compute engines at all. - /// - /// If this value equals maxThreadGroupSize, then the device does not have this bug and the client can use - /// any compute shader on any queue. 
- uint32 maxAsyncComputeThreadGroupSize; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 951 - DispatchDims maxComputeThreadGroupCount; ///< Maximum number of thread groups supported for compute pipelines - DispatchDims maxTaskMeshThreadGroupCount; ///< Maximum number of thread groups supported for task+mesh pipelines - DispatchDims maxMeshThreadGroupCount; ///< Maximum number of thread groups supported for mesh-only pipelines - - uint32 maxTaskPayloadSize; ///< Maximum size in bytes of payload passed from task shader to mesh shader -#else - uint32 maxComputeThreadGroupCountX; ///< Maximum number of thread groups supported - uint32 maxComputeThreadGroupCountY; ///< Maximum number of thread groups supported - uint32 maxComputeThreadGroupCountZ; ///< Maximum number of thread groups supported -#endif - - uint32 maxBufferViewStride; ///< Maximum stride, in bytes, that can be specified in a buffer view. - - uint32 hardwareContexts; ///< Number of distinct state contexts available for graphics workloads. Mostly - /// irrelevant to clients, but may be useful to tools. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - uint32 ceRamSize; ///< Maximum on-chip CE RAM size in bytes. -#endif - uint32 maxPrimgroupSize; ///< Maximum primitive group size. - uint32 supportedVrsRates; ///< Bitmask of VrsShadingRate enumerations indicating which modes are supported. - - uint32 mallSizeInBytes; ///< Size of total MALL (Memory Attached Last Level - L3) cache in bytes. - - uint32 gl2UncachedCpuCoherency; ///< If supportGl2Uncached is set, then this is a bitmask of all - /// CacheCoherencyUsageFlags that will be coherent with CPU reads/writes. - /// Note that reporting CoherShader only means that GLC accesses will be - /// CPU coherent. - /// Note: Only valid if @ref supportGl2Uncached is true. - - uint32 maxGsOutputVert; ///< Maximum number of GS output vertices. - uint32 maxGsTotalOutputComponents; ///< Maximum number of GS output components totally. 
- uint32 maxGsInvocations; ///< Maximum number of GS prim instances, corresponding to geometry shader - /// invocation in glsl. - - RayTracingIpLevel rayTracingIp; ///< HW RayTracing IP version - - uint32 cpUcodeVersion; ///< Command processor feature version. - uint32 pfpUcodeVersion; ///< Command processor, graphics prefetch firmware version. - - union - { - struct - { - uint64 support8bitIndices : 1; ///< Hardware natively supports 8bit indices - uint64 support16BitInstructions : 1; ///< Hardware supports FP16 and INT16 instructions - uint64 supportBorderColorSwizzle : 1; ///< Hardware supports border color swizzle - uint64 supportDoubleRate16BitInstructions : 1; ///< Hardware supports double rate packed math - uint64 supportFp16Fetch : 1; ///< Hardware supports FP16 texture fetches - uint64 supportFp16Dot2 : 1; ///< Hardware supports a paired FP16 dot product. - uint64 supportConservativeRasterization : 1; ///< Hardware supports conservative rasterization - uint64 supportImplicitPrimitiveShader : 1; ///< Device supports implicit compiling of the - /// hardware vertex shader as a primitive shader to - /// perform culling and compaction optimizations in - /// the shader. - uint64 supportMeshShader : 1; ///< Indicates support for mesh shaders. - uint64 supportTaskShader : 1; ///< Indicates support for task shaders. - uint64 supportMsFullRangeRtai : 1; ///< HW supports full range render target array - /// index for Mesh Shaders. - uint64 supportPrtBlendZeroMode : 1; ///< Blend zero mode support. - uint64 supports2BitSignedValues : 1; ///< Hardware natively supports 2-bit signed values. - uint64 supportPrimitiveOrderedPs : 1; ///< Hardware supports primitive ordered UAV - /// accesses in the PS. - uint64 supportPatchTessDistribution : 1; ///< Hardware supports patch level tessellation - /// distribution among VGTs. - uint64 supportDonutTessDistribution : 1; ///< Hardware supports donut granularity of - /// tessellation distribution among VGTs. 
- uint64 supportTrapezoidTessDistribution : 1; ///< Hardware supports trapezoid granularity of - /// tessellation distribution among VGTs. - uint64 supportSingleChannelMinMaxFilter : 1; ///< Hardware supports min/max filtering that can - /// return one channel at a time. - uint64 supportPerChannelMinMaxFilter : 1; ///< Hardware returns min/max value on a per-channel - /// basis. - uint64 supportRgpTraces : 1; ///< Hardware supports RGP traces. - uint64 supportMsaaCoverageOut : 1; ///< Set if HW supports MSAA coverage feature - uint64 supportPostDepthCoverage : 1; ///< Set if HW supports post depth coverage feature - uint64 supportSpiPrefPriority : 1; ///< Set if HW supports preference priority. - uint64 supportWaveBreakSize : 1; ///< The HW supports specifying the wavebreak size - /// in the pixel shader pipeline. - uint64 supportsPerShaderStageWaveSize : 1; ///< If set, the "waveSize" setting in the - /// @ref PipelineShaderInfo structure is meaningful. - uint64 placeholder2 : 1; ///< Placeholder for backward compatibility, no use it. - uint64 supportSpp : 1; ///< Hardware supports Shader Profiling for Power. - uint64 timestampResetOnIdle : 1; ///< GFX timestamp resets after idle between - /// submissions. The client cannot assume that - /// timestamps will increase monotonically across - /// command buffer submissions. - uint64 support1xMsaaSampleLocations : 1; ///< HW supports 1xMSAA custom quad sample patterns -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 - uint64 supportReleaseAcquireInterface : 1; ///< Set if HW supports the basic functionalities of - /// acquire/release-based barrier interface. This - /// provides CmdReleaseThenAcquire() as a convenient - /// way to replace the legacy barrier interface's - /// CmdBarrier() to handle single point barriers. -#else - uint64 placeholder4 : 1; ///< Placeholder for backward compatibility, no use it. 
-#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 883 - uint64 supportSplitReleaseAcquire : 1; ///< Set if HW supports additional split barrier feature - /// on top of basic acquire/release interface support. - /// This provides CmdAcquire() and CmdRelease() to - /// implement split barriers. - /// Note: supportReleaseAcquireInterface is a - /// prerequisite to supportSplitReleaseAcquire. -#else - uint64 placeholder3 : 1; ///< Placeholder for backward compatibility, no use it. -#endif - uint64 supportGl2Uncached : 1; ///< Indicates support for the allocation of GPU L2 - /// un-cached memory. @see gl2UncachedCpuCoherency - uint64 supportOutOfOrderPrimitives : 1; ///< HW supports higher throughput for out of order - uint64 supportIntersectRayBarycentrics : 1; ///< HW supports the ray intersection mode which - /// returns triangle barycentrics. - uint64 supportFloat32BufferAtomics : 1; ///< Hardware supports float32 buffer atomics - uint64 supportFloat32ImageAtomics : 1; ///< Hardware supports float32 image atomics - uint64 supportFloat32BufferAtomicAdd : 1; ///< Hardware supports float32 buffer atomic add - uint64 supportFloat32ImageAtomicAdd : 1; ///< Hardware supports float32 image atomic add - uint64 supportFloat64Atomics : 1; ///< Hardware supports float64 atomics - uint64 supportFloat32ImageAtomicMinMax : 1; ///< Hardware supports float32 image atomic min and max - uint64 supportFloat64BufferAtomicMinMax : 1; ///< Hardware supports float64 buffer atomic min and max - uint64 supportFloat64SharedAtomicMinMax : 1; ///< Hardware supports float64 shared atomic min and max - uint64 support64BitInstructions : 1; ///< Hardware supports 64b instructions - uint64 supportShaderSubgroupClock : 1; ///< HW supports clock functions across subgroup. - uint64 supportShaderDeviceClock : 1; ///< HW supports clock functions across device. 
- uint64 supportAlphaToOne : 1; ///< HW supports forcing PS output alpha channel to 1 - uint64 supportCaptureReplay : 1; ///< HW supports captureReplay - uint64 supportSortAgnosticBarycentrics : 1; ///< HW supports sort-agnostic Barycentrics for PS - uint64 supportVrsWithDsExports : 1; ///< If true, asic support coarse VRS rates - /// when z or stencil exports are enabled - uint64 supportRayTraversalStack : 1; ///< HW assisted ray tracing traversal stack support - uint64 supportPointerFlags : 1; ///< Ray tracing HW supports flags embedded in the node - /// pointer bits - uint64 supportTextureGatherBiasLod : 1; ///< HW supports SQ_IMAGE_GATHER4_L_O - uint64 supportInt8Dot : 1; ///< Hardware supports a dot product 8bit. - uint64 supportInt4Dot : 1; ///< Hardware supports a dot product 4bit. - uint64 supportMixedSignIntDot : 1; ///< Hardware supports a integer dot product with mixed - /// sign inputs. - uint64 support2DRectList : 1; ///< HW supports PrimitiveTopology::TwoDRectList. - uint64 supportHsaAbi : 1; ///< PAL supports HSA ABI compute pipelines. - uint64 supportImageViewMinLod : 1; ///< Indicates image srd supports min_lod. - uint64 supportStaticVmid : 1; ///< Indicates support for static-VMID - uint64 support3dUavZRange : 1; ///< HW supports read-write ImageViewSrds of 3D images - /// with zRange specified. - uint64 supportCooperativeMatrix : 1; ///< HW supports cooperative matrix - uint64 support1dDispatchInterleave : 1; ///< Indicates support for 1D Dispatch Interleave. - uint64 support2dDispatchInterleave : 1; ///< Indicates support for 2D Dispatch Interleave. - uint64 supportBFloat16 : 1; ///< HW supports bf16 instructions. - uint64 supportFloat8 : 1; ///< HW supports float 8-bit instructions. - uint64 supportInt4 : 1; ///< HW supports integer 4-bit instructions. - uint64 supportCooperativeMatrix2 : 1; ///< HW supports Gfx12 extension cooperative matrix. - uint64 placeholder14 : 2; - uint64 reserved : 59; ///< Reserved for future use. 
- }; - uint64 u64All[2]; ///< Flags packed as 32-bit uint. - } flags; ///< Device IP property flags. - - struct - { -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 - union - { - uint32 bufferView; -#endif - uint32 typedBufferView; ///< Size in bytes (and required alignment) of a typed buffer view SRD. - /// @see IDevice::CreateTypedBufferViewSrds(). -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 - }; -#endif - uint32 untypedBufferView; ///< Size in bytes (and required alignment) of a untyped buffer view SRD. - /// @see IDevice::CreateUntypedBufferViewSrds(). - uint32 imageView; ///< Size in bytes (and required alignment) of an image view SRD. - /// @see IDevice::CreateImageViewSrds(). - uint32 fmaskView; ///< Size in bytes (and required alignment) of an fmask view SRD. - /// @see IDevice::CreateFmaskViewSrds(). This value can be zero to denote - /// a lack of fMask support. - uint32 sampler; ///< Size in bytes (and required alignment) of a sampler SRD. - /// @see IDevice::CreateSamplerSrds(). - uint32 bvh; ///< Size in bytes (and required alignment) of a BVH SRD - /// Will be zero if HW doesn't support ray-tracing capabilities. - /// @see IDevice::CreateBvhSrds(). - } srdSizes; ///< Sizes for various types of _shader resource descriptor_ (SRD). - - struct - { - const void* pNullBufferView; ///< Pointer to null buffer view srd - const void* pNullImageView; ///< Pointer to null image view srd - const void* pNullFmaskView; ///< Pointer to null fmask view srd. This pointer can be nullptr to - /// indicate a lack of fMask support. - const void* pNullSampler; ///< Pointer to null sampler srd - } nullSrds; ///< Null SRDs are used to drop shader writes or read 0 - - struct - { - float maxGpuClock; ///< Maximum GPU engine clock in MHz. For DX builds this value is valid only after - /// the device has been finalized. - float aluPerClock; ///< Maximum shader ALU operations per clock. - float texPerClock; ///< Maximum texture fetches per clock. 
- float primsPerClock; ///< Maximum primitives processed per clock. - float pixelsPerClock; ///< Maximum pixels processed per clock. - uint32 gfxipPerfRating; ///< Precomputed performance rating of the GfxIp block. - } performance; ///< Performance-related device properties. - - struct - { - union - { - struct - { - uint32 eccProtectedGprs : 1; ///< Whether or not the GPU has ECC protection - ///< on its VGPR's - uint32 placeholder0 : 1; - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as a 32-bit unsigned integer. - } flags; - - uint32 numShaderEngines; ///< Number of non-harvested shader engines. - uint32 numShaderArrays; ///< Number of shader arrays. - uint32 numCusPerShaderArray; ///< Number of CUs per shader array that are actually usable. - uint32 maxCusPerShaderArray; ///< Maximum number of CUs per shader array. Count of physical CUs prior to - ///< harvesting CUs for yield in certain variants of ASICs (ex: Fiji PRO). - uint32 numSimdsPerCu; ///< Number of SIMDs per compute unit. - uint32 numWavefrontsPerSimd; ///< Number of wavefront slots in each SIMD. - uint32 numActiveRbs; ///< Number of active Renderbackends - uint32 nativeWavefrontSize; ///< The native wavefront size. - uint32 minWavefrontSize; ///< The smallest supported wavefront size. - uint32 maxWavefrontSize; ///< All powers of two between the min size and max size are supported. - uint32 numAvailableSgprs; ///< Number of available SGPRs. - uint32 sgprsPerSimd; ///< Number of physical SGPRs per SIMD. - uint32 minSgprAlloc; ///< Minimum number of SGPRs that can be allocated by a wave. - uint32 sgprAllocGranularity; ///< SGPRs are allocated in groups of this size. Meaning, if your shader - /// only uses 1 SGPR, you will still end up reserving this number of - /// SGPRs. - uint32 numAvailableVgprs; ///< Number of available VGPRs. - uint32 vgprsPerSimd; ///< Number of physical VGPRs per SIMD. 
- uint32 minVgprAlloc; ///< Minimum number of VGPRs that can be allocated by a wave. - uint32 vgprAllocGranularity; ///< VGPRs are allocated in groups of this size. Meaning, if your shader - /// only uses 1 VGPR, you will still end up reserving this number of - /// VGPRs. On hardware where wave32 is available, the granularity for a - /// wave64 shader is half of this value, but the VGPR allocation is - /// double. The same number of total physical registers is allocated for - /// each unit of allocation with either wave size. - uint32 ldsSizePerCu; ///< Local Data Store size available in bytes per CU. - uint32 ldsSizePerThreadGroup; ///< Local Data Store size available in bytes per thread-group. - uint32 ldsGranularity; ///< Local Data Store allocation granularity expressed in bytes. - uint32 gsPrimBufferDepth; ///< Hardware configuration for the GS prim buffer depth. - uint32 gsVgtTableDepth; ///< Hardware configuration for the GS VGT table depth. - uint32 numOffchipTessBuffers; ///< Number of offchip buffers that are used for offchip tessellation to - /// pass data between shader stages. - uint32 offchipTessBufferSize; ///< Size of each buffer used for passing data between shader stages when - /// tessellation passes data using off-chip memory. - uint32 tessFactorBufSizePerSe; ///< Size of GPU's the tessellatio-factor buffer, per shader engine. - uint32 tccSizeInBytes; ///< Size of total L2 TCC cache in bytes. - uint32 tcpSizeInBytes; ///< Size of one L1 TCP cache in bytes. There is one TCP per CU. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 959 - uint32 maxLateAllocVsLimit; ///< Maximum number of VS waves that can be in flight without - /// having param cache and position buffer space. -#endif - uint32 shaderPrefetchBytes; ///< Number of bytes the SQ will prefetch, if any. - uint32 gl1cSizePerSa; ///< Size in bytes of GL1 cache per SA. - uint32 instCacheSizePerCu; ///< Size in bytes of instruction cache per CU/WGP. 
- uint32 scalarCacheSizePerCu; ///< Size in bytes of scalar cache per CU/WGP. - uint32 numAvailableCus; ///< Total number of CUs that are actually usable. - uint32 numPhysicalCus; ///< Count of physical CUs prior to harvesting. - /// Mask of active pixel packers. The mask is 128 bits wide, assuming a max of 32 SEs and a max of 4 pixel - /// packers (indicated by a single bit each) per SE. - uint32 activePixelPackerMask[ActivePixelPackerMaskDwords]; - /// Mask of present, non-harvested CUs (Virtual Layout) - uint32 activeCuMask[MaxShaderEngines][MaxShaderArraysPerSe]; - } shaderCore; ///< Properties of computational power of the shader engine. - - } gfxipProperties; - - struct - { - union - { - struct - { - uint32 supportTurboSync : 1; ///< Whether TurboSync is supported by KMD - uint32 enableUmdFpsCap : 1; ///< Whether UMD FPS CAP enabled - uint32 isCwgSupported : 1; ///< KMD supports Creator Who Game (CWG) feature - uint32 isGamingDriver : 1; ///< KMD works in gaming mode -#if defined(_WIN32) - uint32 canShareSemaphoreKmtHandle : 1; ///< OS supports sharing queue semaphores by KMT handle. -#else - uint32 placeholder0 : 1; -#endif - uint32 ifhModeEnabled : 1; ///< Whether the IFH mode is enabled - uint32 requireFrameEnd : 1; ///< If the client must tag the last command buffer - /// submission in each frame with a @ref CmdBufInfo with - /// the frameEnd flag set. - uint32 supportNativeHdrWindowing : 1; ///< Support HDR presentation that does not require FSE. - uint32 flipQueueSupportsDecodeDst : 1; ///< If set, Decode destination images are supported - /// in the OS flip-queue. - uint32 supportFreeMux : 1; ///< Whether FreeMux is supported by KMD - uint32 isDataCenterBoard : 1; ///< Whether the current board in use is a Data Center board. - /// This is meant to support a unified VDI/CG driver package. -#if defined(__unix__) - uint32 hasPrimaryDrmNode : 1; ///< Set if the device has a primary DRM node. 
- uint32 hasRenderDrmNode : 1; ///< Set if the device has a render DRM node. -#else - uint32 placeholder1 : 2; -#endif - uint32 forceAlignmentSupported : 1; ///< If PalPublicSettings::hardwareBufferAlignmentMode - /// has any effect. - uint32 reserved : 18; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< OS-specific property flags. - -#if (PAL_KMT_BUILD) - int32 luidHighPart; ///< High 32 bits of the LUID (locally unique identifier) for this device. - uint32 luidLowPart; ///< Low 32 bits of the LUID (locally unique identifier) for this device. - bool supportNtHandle; ///< Support export/import resource with NT handle. - bool supportNativefence; ///< Expose the capability of native fence support to client in DeviceProperties. -#endif - - union - { - struct - { - uint32 support : 1; ///< Support Timeline type semaphore. - uint32 supportHostQuery : 1; ///< Support Timeline type semaphore host query. - uint32 supportHostWait : 1; ///< Support Timeline type semaphore host wait. - uint32 supportHostSignal : 1; ///< Support Timeline type semaphore host signal. - uint32 supportWaitBeforeSignal : 1; ///< Support Timeline type semaphore wait before signal. - - uint32 reserved : 27; ///< Reserved for future use. - }; - uint32 u32All; - } timelineSemaphore; - -#if defined(__unix__) - bool supportOpaqueFdSemaphore; ///< Support export/import semaphore as opaque fd in linux KMD. - bool supportSyncFileSemaphore; ///< Support export/import semaphore as sync file in linux KMD. - bool supportSyncFileFence; ///< Support export/import fence as sync file in linux KMD. 
-#endif - - bool supportQueuePriority; ///< Support create queue with priority - bool supportDynamicQueuePriority; ///< Support set the queue priority through IQueue::SetExecutionPriority - -#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) - bool supportMemoryBudgetQuery; ///< Support memory budget query through IDevice::QueryGpuMemoryBudgetInfo -#endif - -#if defined(_WIN32) - bool supportArbitaryPrtMapUnmap; ///< Support arbitary prt map unmap operation. -#endif - - uint32 umdFpsCapFrameRate; ///< The frame rate of the UMD FPS CAP - VirtualDisplayCapabilities virtualDisplayCaps; ///< Capabilities of virtual display, it's provided by KMD - - union - { - struct - { - uint32 supportDevice : 1; ///< GPU time domain - uint32 supportClockMonotonic : 1; ///< POSIX CLOCK_MONOTONIC time domain - uint32 supportClockMonotonicRaw : 1; ///< POSIX CLOCK_MONOTONIC_RAW time domain - uint32 supportQueryPerformanceCounter : 1; ///< Windows Query Performance Counter time domain - - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; - } timeDomains; - -#if defined(_WIN32) - char driverStorePath[Util::MaxPathStrLen]; ///< Driver store path -#endif - -#if defined(__unix__) - int64 primaryDrmNodeMajor; ///< DRM primary node major number. - int64 primaryDrmNodeMinor; ///< DRM primary node minor number. - int64 renderDrmNodeMajor; ///< DRM render node major number. - int64 renderDrmNodeMinor; ///< DRM render node minor number. -#endif - union - { - struct - { - uint32 supportPostflip : 1; ///< KMD support DirectCapture post-flip access - uint32 supportPreflip : 1; ///< KMD support DirectCapture pre-flip access - uint32 supportRSync : 1; ///< KMD support RSync - uint32 maxFrameGenRatio : 4; ///< Maximum frame generation ratio or zero if not supported - uint32 supportNonPrimary : 1; ///< KMD support non-primary DirectCapture auxiliary data - uint32 reserved : 24; ///< Reserved for future use. 
- }; - uint32 u32All; - } directCapture; - } osProperties; ///< OS-specific properties of this device. - - struct - { - uint32 domainNumber; ///< PCI bus number. - uint32 busNumber; ///< PCI bus number. - uint32 deviceNumber; ///< PCI device number. - uint32 functionNumber; ///< PCI function number. - - union - { - struct - { - uint32 gpuConnectedViaThunderbolt : 1; ///< Device is an externally housed GPU connected to the system - /// via Thunderbolt. This will drastically impact CPU read and - /// write performance of memory in the @ref GpuHeapLocal heap. - uint32 gpuEmulatedInSoftware : 1; ///< Device is really a software package which emulates the - /// GPU. This is meant for pre-silicon development. - uint32 gpuEmulatedInHardware : 1; ///< Device is a hardware emulated GPU. This is meant for - /// pre-silicon development. - uint32 gpuVirtualization : 1; ///< Set if running under VM. - uint32 atomicOpsSupported : 1; ///< Set if pcie atomic is supported. - uint32 reserved : 27; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< PCI bus property flags. - } pciProperties; ///< PCI bus properties of this device. - - BigSoftwareReleaseInfo bigSoftwareReleaseInfo; ///< Big Software (BigSW) Release Version information -}; - -/// Defines callback function to notify client of private screen changes. -typedef void (PAL_STDCALL *TopologyChangeNotificationFunc)(void* pClient); - -/// Defines callback function to notify client of the private screen removal. -typedef void (PAL_STDCALL *DestroyNotificationFunc)(void* pOwner); - -/// Specifies the private screen topology change notification data. -struct PrivateScreenNotifyInfo -{ - void* pClient; ///< Pointer to client, PAL use this pointer as parameter when PAL - /// calls callback pfnOnTopology. - TopologyChangeNotificationFunc pfnOnTopology; ///< Pointer to client provided function. 
PAL should call this when - /// the topology change happens and let the client handle the change. - DestroyNotificationFunc pfnOnDestroy; ///< Pointer to client provdided function. PAL should call this when - /// a private screen object is to be destroyed. The pOwner data is - /// passed at @ref IPrivateScreen::BindOwner() time. -}; - -/// Specifies fullscreen frame metadata control flags. Used for the KMD to notify clients about which types of frame -/// metadata it needs to send to KMD. The meaning depends on the context: -/// - During device finalization, client can set the flags indicating the specified metadata 'is supported' by client. -/// - During present, client can query these flags that indicate which metadata 'is enabled' currently so that the -/// client should send them to the KMD. -union FullScreenFrameMetadataControlFlags -{ - struct - { - uint32 timerNodeSubmission : 1; ///< Timer node submission, used for cases such as FRTC/FP/PFPA. - uint32 frameBeginFlag : 1; ///< FrameBegin flag on CmdBufInfo, see CmdBufInfo for details. - uint32 frameEndFlag : 1; ///< FrameEnd flag on CmdBufInfo, see CmdBufInfo for details. - uint32 primaryHandle : 1; ///< Pending primary handle for pre-flip primary access (PFPA) - uint32 p2pCmdFlag : 1; ///< P2P copy command. See CmdBufInfo comments for details. - uint32 forceSwCfMode : 1; ///< Force software crossfire mode. - uint32 postFrameTimerSubmission : 1; ///< It indicates whether the timer node submission at frame N is to - /// synchronize the flip of frame N (postFrameTimerSubmission == TRUE) - /// or N+1 (postFrameTimerSubmission == FALSE). - /// It's only valid when timerNodeSubmission is also set. - uint32 useHp3dForDwm : 1; ///< KMD Informs (DX11) UMD to use HP3D for DWM or not (Output only). - uint32 expandDcc : 1; ///< KMD notifies UMD to expand DCC (Output only). - uint32 enableTurboSyncForDwm : 1; ///< Indicates DWM should turn on TurboSync(Output only). 
- uint32 enableDwmFrameMetadata : 1; ///< When cleared, no frame metadata should be sent for DWM(Output only). - uint32 flipIntervalOverride : 3; ///< KMD-UMD interface FLIP_INTERVAL_OVERRIDE, for KMD to request flip - /// interval override from UMD. - uint32 disableFreeMux : 1; ///< KMD notifies UMD to disable FreeMux. - uint32 maxFrameLatency : 2; ///< KMD can notify UMD to override the frame latency of an app. - uint32 sendMotionVectors : 1; ///< Send the motion vector in CmdBufInfo once per frame - uint32 sendDepth : 1; ///< Send the depth buffer in CmdBufInfo once per frame - uint32 sendCameraMatrix : 1; ///< Send the camera matrix in CmdBufInfo once per frame - uint32 sendHudLessImage : 1; ///< Send the HUD less image in CmdBufInfo once per frame - uint32 reserved : 11; ///< Reserved for future use. - - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Indicates the desired UMD behavior with timer node submission. -/// This is used to distinguish FP Vsync On + FreeSync Off case from HSync or FreeSync cases, the former case doesn't -/// hold flip while the later cases do. -enum class TimerNodeMode : uint32 -{ - Unspecified, ///< Unspecified, client can decide what to do with the timer submission. - ForceFlipHold, ///< Client must hold flip with the timer submission -}; - -/// Specifies fullscreen frame metadata control data. Including FullScreenFrameMetadataControlFlags plus extended data. -/// According to KMD's design, the difference is that 'flags' can be used to indicate 'client caps' during device -/// initialization, while the 'data' is only passed from KMD to UMD. -struct PerSourceFrameMetadataControl -{ - FullScreenFrameMetadataControlFlags flags; ///< The frame metadata control flags - TimerNodeMode timerNodeSubmissionMode; ///< Desired UMD behavior with timer node submission -}; - -/// Specifies the texture optimization level to use for an image. 
-/// -/// @ingroup ResourceBinding -enum class ImageTexOptLevel : uint32 -{ - Default = 0, ///< Use device default setting - Disabled, ///< Disable texture filter optimization - Enabled, ///< Enable texture filter optimization - Maximum, ///< Maximum texture filter optimization - Count -}; - -/// Specifies properties for @ref IDevice finalization. Input structure to IDevice::Finalize(). -struct DeviceFinalizeInfo -{ - union - { - struct - { - uint32 supportPrivateScreens : 1; ///< Initializes private screen support. - uint32 requireFlipStatus : 1; ///< Requires to initialize flip status shared memory - uint32 requireFrameMetadata : 1; ///< Requires to initialize frame metadata flags shared memory. - /// Clients should only set this flag on the master device in an - /// LDA chain. - uint32 internalGpuMemAutoPriority : 1; ///< Forces internal GPU memory allocation priorities to be - /// determined automatically. It is an error to set this flag - /// if the device does not report that it supports this feature. - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Device finalization flags. - - /// Specifies which engines of each type should be created for the device. - struct - { - uint32 engines; ///< A mask of which engines are requested. - } requestedEngineCounts[EngineTypeCount]; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - /// Bytes of CE RAM to be used by the client for each engine type. This value must be <= ceRamSizeAvailable reported - /// for that engine type. In the case where more than one engine of a given type is requested it is assumed each - /// engine of that type will use this amount of CE RAM so the total size of (ceRamSizeUsed * queueCounts) must be <= - /// ceRamSizeAvailable for that engine type. Each entry must be either zero or a multiple of 32 bytes. 
- size_t ceRamSizeUsed[EngineTypeCount]; -#endif - - /// @see PrivateScreenNotifyInfo - /// Private screen notify info, must be filled when supportPrivateScreens=1. The client pointer and callback are to - /// be saved in device. PAL should call the callback when there is any topology (hotplug) change with the client - /// pointer as parameter. - PrivateScreenNotifyInfo privateScreenNotifyInfo; - - /// Fullscreen frame metadata control flags indicating the types of metadata that the client supports. - /// During adapter initialization, capable KMD notifies clients that it supports frame metadata, - /// clients should then set these flags on device finalization info, indicating which types of metadata the client - /// supports. - FullScreenFrameMetadataControlFlags supportedFullScreenFrameMetadata; - - /// Specify the texture optimization level which only applies to internally-created views by PAL (e.g., for BLTs), - /// client-created views must use the texOptLevel parameter in ImageViewInfo. - ImageTexOptLevel internalTexOptLevel; -}; - -/// Reports the compatibility and available features when using two particular devices in a multi-GPU system. Output -/// structure from IDevice::GetMultiGpuCompatibility(). -struct GpuCompatibilityInfo -{ - union - { - struct - { - uint32 gpuFeatures : 1; ///< The devices have an exact feature match: same internal tiling, same - /// pipeline binary data, etc. - uint32 iqMatch : 1; ///< Devices produce images with same precision. - uint32 peerTransferWrite : 1; ///< Peer-to-peer transfers write are supported. See - /// IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). - uint32 peerTransferRead : 1; ///< Peer-to-peer transfers based on xmgi are supported. - /// See IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). - uint32 sharedMemory : 1; ///< Devices can share memory objects with. IDevice::OpenSharedMemory(). 
- uint32 sharedSync : 1; ///< Devices can share queue semaphores with - /// IDevice::OpenSharedQueueSemaphore(). -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 - uint32 shareThisGpuScreen : 1; ///< Either device can present to this device. Means that the device - /// indicated by the otherDevice param in - /// IDevice::GetMultiGpuCompatibility() can present to the device the - /// method was called on. - uint32 shareOtherGpuScreen : 1; ///< Either device can present to the other device. Means that the - /// device IDevice::GetMultiGpuCompatibility() was called on can present - /// to the GPU indicated by the otherGpu param. -#else - uint32 reserved1 : 2; -#endif - uint32 peerEncode : 1; ///< whether encoding HW can access FB memory of remote GPU in chain - uint32 peerDecode : 1; ///< whether decoding HW can access FB memory of remote GPU in chain - uint32 peerTransferProtected : 1; ///< whether protected content can be transferred over P2P - uint32 crossGpuCoherency : 1; ///< whether remote FB memory can be accessed without need for cache flush - uint32 reserved : 20; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU compatibility flags. -}; - -/// Reports properties of a GPU memory heap. -/// -/// @note The performance ratings represent an approximate memory throughput for a particular access scenario, but -/// should not be taken as an absolute performance metric. -struct GpuMemoryHeapProperties -{ - union - { - struct - { - uint32 cpuVisible : 1; ///< Accessible with IGpuMemory::Map() - uint32 cpuGpuCoherent : 1; ///< Cache coherent between the CPU and GPU. - uint32 cpuUncached : 1; ///< Not cached by CPU, but could still be GPU cached. - uint32 cpuWriteCombined : 1; ///< CPU write-combined memory. - uint32 holdsPinned : 1; ///< GPU memory objects created by IDevice::CreatePinnedGpuMemory() are in - /// this heap. 
- uint32 shareable : 1; ///< GPU memory objects in this heap can be shared between multiple devices. - uint32 supportsTmz : 1; ///< This heap supports TMZ allocations. - uint32 reserved : 25; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory heap property flags. - - gpusize logicalSize; ///< Size of the heap in bytes. If HBCC is enabled, certain heaps may be - /// virtualized and the logical size will exceed the physical size. - gpusize physicalSize; ///< Physical size of the heap in bytes -}; - -/// Reports properties of a specific GPU block required for interpretting performance experiment data from that block. -/// See @ref PerfExperimentProperties. -struct GpuBlockPerfProperties -{ - bool available; ///< If performance data is available for this block. - uint32 instanceCount; ///< How many instances of this block are in the device. - uint32 maxEventId; ///< Maximum event ID for this block. - uint32 maxGlobalOnlyCounters; ///< Number of counters available only for global counts. - uint32 maxGlobalSharedCounters; ///< Total counters available including state shared between global and SPM. - uint32 maxSpmCounters; ///< Counters available for streaming only. - - /// If the instance group size is equal to one, every block instance has its own independent counter hardware. - /// PAL guarantees this is true for all non-DF blocks. - /// - /// Otherwise the instance group size will be a value greater than one which indicates how many sequential - /// instances share the same counter hardware. The client must take care to not enable too many counters within - /// each of these groups. - /// - /// For example, the DfMall block may expose 16 instances with 8 global counters but define a group size of 16. - /// In that case all instances are part of one massive group which uses one pool of counter state such that no - /// combination of DfMall counter configurations can exceed 8 global counters. 
- uint32 instanceGroupSize; -}; - -/// Reports performance experiment capabilities of a device. Returned by IDevice::GetPerfExperimentProperties(). -struct PerfExperimentProperties -{ - PerfExperimentDeviceFeatureFlags features; ///< Performance experiment device features. - - size_t maxSqttSeBufferSize; ///< SQTT buffer size per shader engine. - size_t sqttSeBufferAlignment; ///< SQTT buffer size and base address alignment. - uint32 shaderEngineCount; ///< Number of shader engines. - - /// Reports availability and properties of each device block. - GpuBlockPerfProperties blocks[static_cast(GpuBlock::Count)]; -}; - -/// Reports maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images' -/// elements are no larger than maxElementSize. -struct LinearImageAlignments -{ - uint16 maxElementSize; ///< Maximum element size in bytes. - uint16 baseAddress; ///< Minimum required base address alignment in bytes. - uint16 rowPitch; ///< Minimum required row pitch alignment in bytes. - uint16 depthPitch; ///< Minimum required depth pitch alignment in bytes. -}; - -/// Specifies image view type (i.e., 1D, 2D, 3D, or cubemap). 
-/// -/// @ingroup ResourceBinding -enum class ImageViewType : uint32 -{ - Tex1d = 0x0, - Tex2d = 0x1, - Tex3d = 0x2, - TexCube = 0x3, - - Count -}; - -/// Enumeration which defines the mode for magnification and minification sampling -/// -/// @ingroup ResourceBinding -enum XyFilter : uint32 -{ - XyFilterPoint = 0, ///< Use single point sampling - XyFilterLinear, ///< Use linear sampling - XyFilterAnisotropicPoint, ///< Use anisotropic with single point sampling - XyFilterAnisotropicLinear, ///< Use anisotropic with linear sampling - XyFilterCount -}; - -/// Enumeration which defines the mode for volume texture sampling -/// -/// @ingroup ResourceBinding -enum ZFilter : uint32 -{ - ZFilterNone = 0, ///< Disable Z filtering - ZFilterPoint, ///< Use single point sampling - ZFilterLinear, ///< Use linear sampling - ZFilterCount -}; - -/// Enumeration which defines the mode for mip-map texture sampling -/// -/// @ingroup ResourceBinding -enum MipFilter : uint32 -{ - MipFilterNone = 0, ///< Disable Mip filtering - MipFilterPoint, ///< Use single point sampling - MipFilterLinear, ///< Use linear sampling - MipFilterCount -}; - -/// Specifies parameters for an image view descriptor controlling how a given texture is sampled -/// -/// @ingroup ResourceBinding -struct TexFilter -{ - union - { - struct - { - uint32 magnification : 2; ///< Used with enum XyFilter for Plane magnification filtering - uint32 minification : 2; ///< Used with enum XyFilter for Plane minification filtering - uint32 zFilter : 2; ///< Used with enum ZFilter for volume texture filtering - uint32 mipFilter : 2; ///< Used with enum MipFilter for mip-map filtering - uint32 reserved : 24; ///< Reserved for future use - }; - uint32 u32All; ///< Value of flags bitfield - }; -}; - -/// Determines if "TexFilter" should be ignored or not. 
-enum class TexFilterMode : uint32 -{ - Blend = 0x0, ///< Use the filter method specified by the TexFilter enumeration - Min = 0x1, ///< Use the minimum value returned by the sampler, no blending op occurs - Max = 0x2, ///< Use the maximum value returned by the sampler, no blending op occurs - Count -}; - -/// Specifies how texture coordinates outside of texture boundaries are interpreted. -/// -/// @ingroup ResourceBinding -enum class TexAddressMode : uint32 -{ - Wrap = 0x0, ///< Repeat the texture. - Mirror = 0x1, ///< Mirror the texture by flipping it at every other coordinate interval. - Clamp = 0x2, ///< Clamp the texture to the texture's edge pixel. - MirrorOnce = 0x3, ///< Mirror the texture once then clamp. - ClampBorder = 0x4, ///< Clamp the texture to the border color specified in the sampler. - MirrorClampHalfBorder = 0x5, ///< Mirror the texture once then clamp the texture to half of the edge color. - ClampHalfBorder = 0x6, ///< Clamp the texture to half of the edge color. - MirrorClampBorder = 0x7, ///< Mirror the texture once then clamp the texture to the samler's border color. - Count -}; - -/// Specifies how a border color should be chosen when the TexAddressClampBorder texture addressing is used by a -/// sampler. -/// -/// @ingroup ResourceBinding -enum class BorderColorType : uint32 -{ - White = 0x0, ///< White border color (1.0, 1.0, 1.0, 1.0). - TransparentBlack = 0x1, ///< Transparent black border color (0.0, 0.0, 0,0, 0.0). - OpaqueBlack = 0x2, ///< Opaque black border color (0.0, 0.0, 0.0, 1.0). - PaletteIndex = 0x3, ///< Fetch border color from the border color palette. - Count -}; - -/// Residency maps are helper surfaces used in conjunction with PRT+. They reflect the resident mip levels -/// associated with a given UV region of the parent image. -enum class PrtMapAccessType : uint32 -{ - Raw = 0x0, ///< Read / write the map image as a normal image. 
- Read = 0x1, ///< Read the residency map as floating point data - WriteMin = 0x2, ///< Write the residency map with min(existing,new) - WriteMax = 0x3, ///< Write the residency map with max(existing,new) - WriteSamplingStatus = 0x4, ///< Write to the sampling status map. - Count -}; - -/// Specifies compression behavior for an IImage or image/buffer view. -enum class CompressionMode : uint32 -{ - Default = 0, ///< Let PAL choose behavior. - ReadEnableWriteEnable = 1, ///< Override default to force read and write compression on. - ReadEnableWriteDisable = 2, ///< Support reading compressed data, but force any writes to be uncompressed (keeping - /// physical metadata consistent). - ReadBypassWriteDisable = 3, ///< Bypass physical metadata on reads (assume decompressed), all writes will be - /// uncompressed and will write physical metatdata marking updated blocks as being - /// uncompressed. This mode is intended to handle placed resources that do not - /// want compression in memory allocations that have distributed compression enabled. - /// WARNING: Using this mode to read compressed data will result in corruption. - Count, -}; - -/// Specifies client compression behavior for an IImage. -enum class ClientCompressionMode : uint32 -{ - Default = 0, ///< Let implementation decide whether to enable or disable - Enable = 1, ///< Force enable - Disable = 2, ///< Force disable - DisableClearOnly = 3, ///< Force enable for all image views except for image clears - Count, -}; - -/// Specifies parameters for a buffer view descriptor that control how a range of GPU memory is viewed by a shader. -/// -/// Input to either CreateTypedBufferViewSrds() or CreateUntypedBufferViewSrds(). Used for any buffer descriptor, -/// including read-only shader resources, UAVs, vertex buffers, etc. The usage of stride and format depends on the -/// expected shader instruction access: -/// -/// + _Typed buffer_ access must set a valid format and channel mapping. 
-/// + _Raw buffer_ access is indicated by setting an invalid format and setting stride to 1. -/// + _Structured buffer_ access is indicated by setting an invalid format and setting stride to any value except 1. A -/// stride of 0 maps all view accesses to the first structure stored in memory. -/// -/// _Typed buffer_ SRD's must be created using @ref IDevice::CreateTypedBufferViewSrds(). -/// _Raw buffer_ and _structured buffer_ SRD's must be created using @ref IDevice::CreateUntypedBufferViewSrds(). -/// -/// If necessary, PAL will adjust the out of bounds read/write behavior to match the client's API requirements based on -/// the client defines - PAL_CLIENT_VULKAN, etc. -/// -/// @ingroup ResourceBinding -struct BufferViewInfo -{ - gpusize gpuAddr; ///< GPU memory virtual address where the buffer view starts, in bytes. - /// Must be aligned to bytes-per-element for typed access. - gpusize range; ///< Restrict the buffer view to this many bytes. Will be rounded down to a - ///< multiple of the stride. - gpusize stride; ///< Stride in bytes. Must be aligned to bytes-per-element for typed access. - SwizzledFormat swizzledFormat; ///< Format and channel swizzle for typed access. Must be Undefined for structured - /// or raw access. - CompressionMode compressionMode; ///< Specify GFX12-style distributed compression mode override for this view. - /// Only relevant if the backing memory pages enable compression. - - union - { - struct - { - /// Set to have this surface independently bypass the MALL for read and / or write operations. - /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful - /// only on GPUs that have supportsMall set in DeviceProperties. - uint32 bypassMallRead : 1; - uint32 bypassMallWrite : 1; - uint32 reserved : 30; ///< Reserved for future use - }; - uint32 u32All; ///< Value of flags bitfield - } flags; -}; - -/// D3D12_VERTEX_BUFFER_VIEW structure. Match vertex offset mode. 
-/// @ref ICmdBuffer::CmdSetVertexBuffers function. -struct VertexBufferView -{ - gpusize gpuva; ///< GPU virtual address to vertex buffer. - uint32 sizeInBytes; ///< The size in bytes of the buffer. - uint32 strideInBytes; ///< The size in bytes of each vertex entry. -}; - -/// Specifies an array of vertex buffer view descriptors. PAL supports multiple vertex buffer modes which change how -/// the GPU memory is viewed by a shader. This struct wraps up the choice between these modes. -/// -/// If offsetMode is true PAL will create "vertex offset mode" SRDs using the @ref VertexBufferView array. This new SRD -/// has stronger robustness. It avoids that the last vertex is removed when buffer size is not multiple of the stride. -/// -/// @see ICmdBuffer::CmdSetVertexBuffers -struct VertexBufferViews -{ - uint32 firstBuffer; ///< First vertex buffer slot to change. Must be less than @ref MaxVertexBuffers. - uint32 bufferCount; ///< The length of the buffer view array. - bool offsetMode; ///< If true, pVertexBufferViews must be used, otherwise pBufferViewInfos is used. - - union - { - const VertexBufferView* pVertexBufferViews; ///< Offset mode vertex buffer views. Valid if offsetMode is true. - const BufferViewInfo* pBufferViewInfos; ///< Index mode vertex buffer views. Valid if offsetMode is false. - }; -}; - -/// Specifies parameters for an image view descriptor controlling how a shader will view the specified image. -/// -/// Input to CreateImageViewSrd(). Used for any image view descriptor, including read-only shader resources and UAVs. -/// -/// @ingroup ResourceBinding -struct ImageViewInfo -{ - const IImage* pImage; ///< Image associated with the view. - ImageViewType viewType; ///< 1D, 2D, 3D, or Cubemap. Typically this should match the image type, but a - /// Cubemap view can be imposed on a 2D array image. - SwizzledFormat swizzledFormat; ///< Specifies the image view format and channel swizzle. 
Must be compatible (same - /// bit-widths per channel) with the image's base format. - /// @note: YUV formats are invalid for an ImageView. A format should be chosen to be - /// compatible with either the luma or chroma plane(s) of the YUV format. - SubresRange subresRange; ///< Specifies a subset of subresources to include in the view. If the base Image - /// has a YUV planar format, the number of array slices in the range must be 1. - /// If zRange feature is used, the number of mips in the range must be 1. - float minLod; ///< Minimum mip level of detail to use for this view. - - uint32 samplePatternIdx; ///< Index into the currently bound MSAA sample pattern palette to be - /// read/evaluated when samplepos shader instructions are executed on this - /// view. Can be ignored if the samplepos shadinstruction will not be used. - /// Must be less than MaxSamplePatternPaletteEntries. See - /// IDevice::SetSamplePatternPalette(). - Range zRange; ///< Specifies the z offset and z range. - - ImageTexOptLevel texOptLevel; ///< Specific the texture optimization level. - - const IImage* pPrtParentImg; ///< Meaningful only if "mapAccess" is not "raw". - PrtMapAccessType mapAccess; ///< Type of access to be done if "pImage" is a PRT+ meta-data image. - /// See @ref ImageCreateInfo - - ImageLayout possibleLayouts; ///< Union of all possible layouts this view can be in while accessed by this view. - /// (ie. what can be done with this SRD without having a layout transition?) - /// In DX, for example, it's possible that a texture SRV could be accessed in a state - /// with all other read-only usages allowed, but a UAV must exclusively be accessed - /// in the UNORDERED_ACCESS state. - /// The primary purpose of this flag is to avoid compressed shader writes if a - /// different usage does not support compression and PAL won't get an opportunity to - /// decompress it (ie. 
a transition in a barrier) - - CompressionMode compressionMode; ///< Specify GFX12-style distributed compression mode override for this view. - /// Only relevant if the backing IImage resource and its bound memory pages - /// enable compression. ReadBypassWriteDisable is only valid if compressionMode - /// in ImageCreateInfo disables compressed write. - - union - { - struct - { - /// Set to have this surface independently bypass the MALL for read and / or write operations. - /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful - /// only on GPUs that have supportsMall set in DeviceProperties. - uint32 bypassMallRead : 1; - uint32 bypassMallWrite : 1; - - uint32 zRangeValid : 1; ///< whether z offset/ range value is valid. - uint32 includePadding : 1; ///< Whether internal padding should be included in the view range. - - uint32 reserved : 28; ///< Reserved for future use - }; - uint32 u32All; ///< Value of flags bitfield - } flags; ///< Image view flags. -}; - -/// Specifies parameters controlling execution of sample instructions in a shader. Input to CreateSamplerSrd(). -/// -/// @ingroup ResourceBinding -struct SamplerInfo -{ - TexFilterMode filterMode; ///< Min/max filtering modes - TexFilter filter; ///< Filtering to apply to texture fetches. - TexAddressMode addressU; ///< Addressing mode for U texture coords outside of the [0..1] range. - TexAddressMode addressV; ///< Addressing mode for V texture coords outside of the [0..1] range. - TexAddressMode addressW; ///< Addressing mode for W texture coords outside of the [0..1] range. - float mipLodBias; ///< Bias for mipmap level of detail selection. - uint32 maxAnisotropy; ///< Anisotropy value clamp when the filter mode is TexFilterAnisotropic. - CompareFunc compareFunc; ///< Comparison function to apply to fetched data. - float minLod; ///< High-resolution mipmap LOD clamp. - float maxLod; ///< Low-resolution mipmap LOD clamp. 
- BorderColorType borderColorType; ///< Selects border color when an address mode is TexAddressClampBorder. - uint32 borderColorPaletteIndex; ///< Choose color from the border color palette when borderColorType is - /// BorderColorPalette. - float anisoThreshold; ///< Opt-in, flags.useAnisoThreshold == 1 and flags.preciseAniso == 0. - /// The value should be computed taking account the maxAnisotropy - /// setting. This is a high resolution value which is quantized and - /// clamped down to 3 bits to the domain [0.0, 0.875] for current Hw. - /// We can interpret the functioning of the threshold value as follows. - /// maxAnisotropy per-pixel can be 1, 2, 4, 8 or 16 (N). - /// During sampling, the initial count (or S) is computed in Hw for each - /// quad and the domain for current Hw is [0-16]. - /// Final sample count = min(pow(2, ceil(log2(S - anisoThreshold))), N) - /// Note: when flags.useAnisoThreshold == 0, Pal will ignore this value - /// and instead use a maximum of 0.25 at the highest anisotropic setting. - /// It is important to be aware that this feature tunes quality vs - /// performance, so care should be taken to not degrade image quality - /// 'noticeably' when enabling using this feature - uint32 perfMip; ///< Controls the value of the PERF_MIP field in Sampler SRD's. - /// This field basically controls the Fractional part of the LOD - /// calculation. if LOD is fractional so let us say 1.23, in this case - /// you must avg.out your samples from both MIP 1 and 2.But if PERF_MIP - /// is set to nonzero the HW will perform an optimization and may fetch - /// from only 1 MIP. - - // These values are used to define a filtering line used when sampling a residency map. The defined - // slopes in both the X (U) and Y (V) directions are to avoid visible disconnects when sampling between - // different samples. - Offset2d uvOffset; ///< u/v offset value selectors. Values specified are in - /// log2 of fractions of pixel. i.e., 1 / (1 << x). 
Not all values - /// are supported by all HW. - Offset2d uvSlope; ///< u/v slope value selectors. Supported slope values are - /// specified in degrees. In the case of a 3D image, the supplied - /// uvSlope.y is interpreted as wSlope. - /// 0 2.5 - /// 1 3 - /// 2 4 - /// 3 5 - /// 4 8 - /// 5 16 - /// 6 32 - /// 7 64 - /// other values: unsupported - - union - { - struct - { - uint32 mgpuIqMatch : 1; ///< Enables image compatibility for MGPU scenarios where paired devices - /// come from different hardware families. - uint32 preciseAniso : 1; ///< Anisotropic filtering should prefer precision over speed. - uint32 unnormalizedCoords : 1; ///< If set then always use unnormalized texture coordinates instead of - /// zero to one. Only works under certain conditions (no mip filtering, - /// no computed LOD, no offsets, only edge or border clamp address modes) - uint32 truncateCoords : 1; ///< If set then hardware will truncate mantissa instead of - /// rounding to nearest even in float point to fixed point - /// texture coordinate conversion - uint32 seamlessCubeMapFiltering : 1; ///< If set then there's filtering across the edges of the cube map. - uint32 prtBlendZeroMode : 1; ///< Allow unmapped PRT texels to be treated as zero and blended with - /// mapped texels. If set to 0, the destination of the sample instruction - /// is written with all 0s when TFE == 0; if set to 1, Treat unmapped - /// texels as zeros and blend them with other mapped texels, write the - /// result of this sample instruction to the destination GPRs. - uint32 useAnisoThreshold : 1; ///< If set, Hw will use the value assigned in anisoThreshold, but - /// only if preciseAniso is set to 0, also. - - /// This allows the sampler to turn off overriding anisotropic filtering when the resource view contains a - /// single mipmap level. Not all graphics IP supports overriding anisotropic filtering, and this flag will - /// be ignored for such GPUs. 
- uint32 disableSingleMipAnisoOverride : 1; - - uint32 forResidencyMap : 1; ///< Set if the surface being sampled is a residency map used in PRTs. - /// Only meaningful if the corresponding ImageView's mapAccess is set to - /// "read". Only valid for devices that report the "PrtFeaturePrtPlus" - /// flag. - uint32 reserved : 23; ///< Reserved for future use - }; - uint32 u32All; ///< Value of flags bitfield - } flags; -}; - -/// Specifies which heuristic should be utilized for sorting children when box sorting is enabled -enum class BoxSortHeuristic : uint32 -{ - ClosestFirst = 0x0, ///< Traversal is ordered to enter the children that - ///< intersect the ray closer to the ray origin first. - ///< This is good baseline option. Default option for RT IP 1.x. - LargestFirst = 0x1, ///< Traversal is ordered to enter the children that have the largest - ///< interval where the box intersects the ray first. - ///< Good for shadow rays with terminate on first hit. - ClosestMidPoint = 0x2, ///< Traversal is ordered to enter the children that have a midpoint in the interval - ///< where the box intersects that has the lowest intersection time before clamping( - ///< Good for reflection rays. - Disabled = 0x3, ///< Box sort and heuristic are disabled. - Count -}; - -/// Specifies parameter for creating a BvH (bounding volume hierarchy, used by ray-trace) descriptor -struct BvhInfo -{ - const IGpuMemory* pMemory; ///< Memory object holding the BVH nodes - gpusize offset; ///< Offset from memory address specified by pMemory. Combination of - /// pMemory address and the offset must be 256 byte aligned. - gpusize numNodes; ///< Number of nodes in the view - uint32 boxGrowValue; ///< Number of ULPs (unit in last place) to be added during ray-box test. 
- - BoxSortHeuristic boxSortHeuristic; ///< Specifies which heuristic should be utilized for - ///< sorting children when box sorting is enabled - union - { - struct - { - uint32 useZeroOffset : 1; ///< If set, SRD address is programmed to zero - uint32 returnBarycentrics : 1; ///< When enabled, ray intersection will return triangle barycentrics. - /// Note: Only valid if @see supportIntersectRayBarycentrics is true. - - /// Set to have this surface independently bypass the MALL for read and / or write operations. - /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful - /// only on GPUs that have supportsMall set in DeviceProperties. - uint32 bypassMallRead : 1; - uint32 bypassMallWrite : 1; - uint32 pointerFlags : 1; ///< If set, flags are encoded in the node pointer bits - uint32 highPrecisionBoxNode : 1; ///< If set, enable 64-byte high precision box node - uint32 wideSort : 1; ///< If set, enable wide sort - uint32 hwInstanceNode : 1; ///< If set, enable hardware instance node - uint32 sortTrianglesFirst : 1; ///< If set, triangle nodes are treated specially during child sorting - uint32 compressedFormatEn : 1; ///< If set, enable compressed format support. This include enable - /// support for compressed primitive packets, BVH8-128B box nodes, - /// and changes to triangle intersection test return data. - uint32 reserved : 22; ///< Reserved for future HW - }; - - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< BVH creation flags. -}; - -/// Specifies parameters for an fmask view descriptor. -/// -/// Input to CreateFmaskViewSrd(). Allows the client to access fmask from a shader using the load_fptr IL instruction. -/// -/// @ingroup ResourceBinding -struct FmaskViewInfo -{ - const IImage* pImage; ///< Image associated with the fmask view. - uint32 baseArraySlice; ///< First slice in the view. - uint32 arraySize; ///< Number of slices in the view. 
- - union - { - struct - { - uint32 shaderWritable : 1; ///< True if used with an image that has been transitioned to a shader- - /// writable image state (e.g. [Graphics|Compute][WriteOnly|ReadWrite]) - uint32 reserved : 31; ///< Reserved for future use - }; - uint32 u32All; ///< Value of flags bitfield - } flags; ///< Fmask view flags -}; - -/// Element of the multisample pattern representing a sample position (X, Y), type of SamplePatternPalette, which -/// matches the layout defined by SC. -struct SamplePos -{ - float x; ///< x coordinate of sample position. - float y; ///< y coordinate of sample position. - uint32 reserved1; ///< reserved for future use - uint32 reserved2; ///< reserved for future use -}; - -/// Specifies a palette of MSAA sample patterns used by the client. Input to SetSamplePatternPalette, which is used -/// to implement samplepos shader instruction support. -typedef SamplePos SamplePatternPalette[MaxSamplePatternPaletteEntries][MaxMsaaRasterizerSamples]; - -/// Provides a GPU timestamp along with the corresponding CPU timestamps, for use in calibrating CPU and GPU timelines. -struct CalibratedTimestamps -{ - uint64 gpuTimestamp; ///< GPU timestamp value compatible with ICmdBuffer::CmdWriteTimestamp(). 
- uint64 cpuClockMonotonicTimestamp; ///< POSIX CLOCK_MONOTONIC timestamp - uint64 cpuClockMonotonicRawTimestamp; ///< POSIX CLOCK_MONOTONIC_RAW timestamp - uint64 cpuQueryPerfCounterTimestamp; ///< Windows QueryPerformanceCounter timestamp - uint64 maxDeviation; ///< Maximum deviation in nanoseconds between the GPU and CPU timestamps -}; - -/// Specifies connector types -enum class DisplayConnectorType : uint32 -{ - Unknown = 0, ///< Unknown connector type - Vga, ///< VGA - DviD, ///< DVI_D - DviI, ///< DVI_I - Hdmi, ///< HDMI - Dp, ///< DP - Edp, ///< EDP - Minidp, ///< MINI_DP - Count -}; - -/// Specifies properties for display connectors connected to GPU -struct DisplayConnectorProperties -{ - DisplayConnectorType type; ///< Connector type - VGA, DVI, HDMI, DP etc -}; - -/// Specifies pre-defined power profile which is used to communicate with KMD/PPLib and set correspond power states. -enum class PowerProfile : uint32 -{ - Default = 0, ///< Default power profile. - VrCustom = 1, ///< Power profile used by custom VR scenario. - VrDefault = 2, ///< Power profile used by default VR scenario. - Idle = 3, ///< Power profile used for forced DPM0, in case HMD is taken off but the game is still running. - Count -}; - -/// Fine-grain power switch info. -struct PowerSwitchInfo -{ - uint32 time; ///< Time in microseconds, relative to the frame start at V-sync. Clients should consider the - /// powerSwitchLatency value reported in @ref PrivateScreenProperties when specifying - /// switch times. - - uint32 performance; ///< Performance to be set (between 0-100), which is mapped to a certain DPM level by KMD. -}; - -/// Maximum number of power switch info allowed in one custom power profile. -static constexpr uint32 MaxNumPowerSwitchInfo = 5; - -/// Fine-grain power management for dynamic power mode. This structure specifies multiple DPM states to be cycled -/// through each frame. 
-struct CustomPowerProfile -{ - IPrivateScreen* pScreen; ///< Dynamic power mode needs V-sync so a private screen object is needed. - uint32 numSwitchInfo; ///< Number of discrete DPM states to cycle through per frame. Number entries - /// in switchInfo[] and actualSwitchInfo[]. - - PowerSwitchInfo switchInfo[MaxNumPowerSwitchInfo]; ///< Specifies the set of power states to cycle through each - /// frame. Each entry specifies an offset into the frame where - /// the DPM state should be switched, and a rough performance - /// requirement value which will be translated into an - /// appropriate DPM state by KMD. - PowerSwitchInfo actualSwitchInfo[MaxNumPowerSwitchInfo]; ///< The actual set of power states that KMD/PPLib sets. -}; - -/// Flags for IDevice::AddGpuMemoryReferences(). Depending on their residency model, a client may set these flags as -/// directed by the application or hard-code them to a single value. Driver-internal memory references should be marked -/// as CantTrim unless the client explicitly handles trim support. -/// -/// Note that the CantTrim and MustSucceed flags are based on the same WDDM flags; it is expected that PAL will ignore -/// them on non-WDDM platforms. -enum GpuMemoryRefFlags : uint32 -{ - GpuMemoryRefCantTrim = 0x1, ///< The caller can't or won't free this allocation on OS request. - GpuMemoryRefMustSucceed = 0x2, ///< Hint to the OS that we can't process a failure here, this may result in a TDR. - GpuMemoryRefAllFlags = 0x3 ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/// Specifies input arguments for IDevice::GetPrimaryInfo(). Client must specify a display ID and properties of the -/// primary surface that will drive that display in order to query capabilities. -struct GetPrimaryInfoInput -{ - uint32 vidPnSrcId; ///< Video present source id. - uint32 width; ///< Primary surface width. - uint32 height; ///< Primary surface height. 
- SwizzledFormat swizzledFormat; ///< Format and swizzle of the primary surface. - Rational refreshRate; ///< Video refresh rate, this is only valid if refreshRateValid is set. - union - { - struct - { - uint32 qbStereoRequest : 1; ///< Going to set a stereo mode. - uint32 refreshRateValid : 1; ///< Refresh rate is valid. - uint32 freeSyncInCrossFireSupport : 1; ///< True if client supports FreeSync in CrossFire. - uint32 useKmdCalcFramePacing : 1; ///< True if client uses KMD frame pacing. If so, the client - /// creates a timer queue to delay the present, and the delay - /// value is calculated by KMD. - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< get primary surface info input flags. -}; - -/// Specifies output arguments for IDevice::GetStereoDisplayModes(), returning supported stereo mode -struct StereoDisplayModeOutput -{ - Extent2d extent; ///< Dimensions in pixels WxH. - Rational refreshRate; ///< Refresh rate. - SwizzledFormat format; ///< Format and swizzle of the primary surface. -}; - -/// Specifies output arguments for IDevice::GetActive10BitPackedPixelMode(), returning which, if any, 10-bit -/// display mode is active. -struct Active10BitPackedPixelModeOutput -{ - bool isInWs10BitMode; ///< Whether the workstation 10-bit feature is enabled. - bool notifyKmd10bitsPresent; ///< When in 10-bit mode and at present time, if the - /// client driver sees a 10-bit to 8-bit surface blt, - /// it needs to call RequestKmdReinterpretAs10Bit() to - /// inform the KMD that the dst surface must be reinterpreted - /// as 10-bits per channel for all KMD-initiated BLTs. - PackedPixelType packedPixelType; ///< Format of the packed pixels. - uint32 pixelPackRatio; ///< The number of 10-bit pixels that are packed into one 8-8-8-8 - /// format pixel. -}; - -/// Specifies primary surface stereo mode. 
-enum StereoMode : uint32 -{ - StereoModeHwAlignedViews = 0, ///< The stereo views are HW aligned on the display. - StereoModeSwPackedViews = 1, ///< The layout of the stereo views on the display are determined by the client. - StereoModeNotSupported = 2, ///< Not support stereo mode - StereoModeSideBySide = 3, ///< The two stereo views are put side by side on the display. - StereoModeTopBottom = 4 ///< One stereo view is on the top of the display, and the other is on the bottom. -}; - -/// Enumerates the supported workstation stereo modes. -enum class WorkstationStereoMode : uint32 -{ - Disabled, - ViaConnector, ///< Active Stereo for 3 Pin VESA connector. - ViaBlueLine, ///< Blue line Active Stereo for laptops. - Passive, ///< Passive Stereo (Dual head). - PassiveInvertRightHoriz, ///< Passive Stereo with Horizontal Invert (Dual Head). - PassiveInvertRightVert, ///< Passive Stereo with Vertical Invert (Dual Head). - Auto, ///< Auto Stereo Vertical Interleaved. - AutoHoriz, ///< Auto Stereo Horizontal Interleaved. - AutoCheckerboard, ///< Auto Stereo Checkerboard Interleaved. - AutoTsl, ///< Tridelity SL Auto Stereo. - Count, -}; - -/// Specifies output arguments for IDevice::GetPrimaryInfo(), returning capabilitiy information for a display in -/// a particular mode. -struct GetPrimaryInfoOutput -{ - uint32 tilingCaps; ///< Tiling caps supported by this primary surface. - StereoMode stereoMode; ///< Stereo mode supported by this primary surface. - uint32 mallCursorCacheSize; ///< Size of the mall cursor cache in bytes - union - { - struct - { -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 - /// MGPU flag: this primary surface supports DVO HW compositing mode. - uint32 dvoHwMode : 1; - /// MGPU flag: this primary surface supports XDMA HW compositing mode. - uint32 xdmaHwMode : 1; - /// MGPU flag: this primary surface supports client doing SW compositing mode. 
- uint32 swMode : 1; -#else - uint32 reserved1 : 3; -#endif - /// MGPU flag: this primary surface supports freesync. - uint32 isFreeSyncEnabled : 1; - /// Single-GPU flag: gives hint to the client that they should use rotated tiling mode. - uint32 hwRotationPortraitMode : 1; - /// Single-GPU flag: this primary surface supports non local heap. - uint32 displaySupportsNonLocalHeap : 1; - /// Reserved for future use. - uint32 reserved : 26; - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< get primary surface support info output flags. -}; - -/// Specifies different clock modes that the device can be set to. -enum class DeviceClockMode : uint32 -{ - Default = 0, ///< Device clocks and other power settings are restored to default. - Query = 1, ///< Queries the current device clock ratios. Leaves the clock mode of the device unchanged. - Profiling = 2, ///< Scale down from peak ratio. Clocks are set to a constant amount which is - /// known to be power and thermal sustainable. The engine/memory clock ratio - /// will be kept the same as much as possible. - MinimumMemory = 3, ///< Memory clock is set to the lowest available level. Engine clock is set to - /// thermal and power sustainable level. - MinimumEngine = 4, ///< Engine clock is set to the lowest available level. Memory clock is set to - /// thermal and power sustainable level. - Peak = 5, ///< Clocks set to maximum when possible. Fan set to maximum. Note: Under power - /// and thermal constraints device will clock down. - QueryProfiling = 6, ///< Queries the profiling device clock ratios. Leaves the clock mode of the device unchanged. - QueryPeak = 7, ///< Queries the peak device clock ratios. Leaves the clock mode of the device unchanged. - Count -}; - -/// Specifies input argument to IDeive::SetClockMode. The caller can read the clock ratios the device is currently -/// running by querying using the mode DeviceClockMode::DeviceClockModeQuery. 
-struct SetClockModeOutput -{ - uint32 memoryClockFrequency; /// Current mem clock (absolute) value in Mhz - uint32 engineClockFrequency; /// Current gpu core clock (absolute) value in Mhz -}; - -/// Specifies input argument to IDeive::SetClockMode. The caller must specify the mode in which to set the device. -struct SetClockModeInput -{ - DeviceClockMode clockMode; ///< Used to specify the clock mode for the device. -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 -/// Specifies primary surface MGPU compositing mode. -enum MgpuMode : uint32 -{ - MgpuModeOff = 0, ///< MGPU compositing mode off, the client does not do SW compositing at all, e.g. AFR disabled. - MgpuModeSw = 1, ///< MGPU SW compositing mode, the client handle the SW compositing. - MgpuModeDvo = 2, ///< MGPU DVO HW compositing mode - MgpuModeXdma = 3, ///< MGPU XDMA HW compositing mode - MgpuModeCount -}; -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 943 -/// Specifies input arguments for IDevice::SetMgpuMode(). A client set a particular MGPU compositing mode and whether -/// frame pacing is enabled for a display. -struct SetMgpuModeInput -{ - uint32 vidPnSrcId; ///< Video present source id. - MgpuMode mgpuMode; ///< Primary surface MGPU compositing mode. - bool isFramePacingEnabled; ///< True if frame pacing enabled. If so, the client creates a timer queue - /// to delay the present, and the delay value is calculated by KMD. -}; -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 -constexpr uint32 XdmaMaxDevices = 8; ///< Maximum number of Devices for XDMA compositing. - -/// Specifies XDMA cache buffer info for each gpu. -struct XdmaBufferInfo -{ - uint32 bufferSize; ///< XDMA cache buffer size of each device - uint32 startAlignment; ///< XDMA cache buffer start alignment of each device -}; - -/// Specifies output arguments for IDevice::GetXdmaInfo(), returning the XDMA cache buffer information of each GPU for -/// a display. 
-struct GetXdmaInfoOutput -{ - XdmaBufferInfo xdmaBufferInfo[XdmaMaxDevices]; ///< Output XDMA cache buffer info -}; -#endif - -/// Specifies flipping status flags on a specific VidPnSource. It's Windows specific. -union FlipStatusFlags -{ - struct - { - uint32 immediate : 1; ///< Is immediate flip - uint32 dwmFlip : 1; ///< Is DWM conducted flip - uint32 iFlip : 1; ///< Is independent exclusive flip - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies the VSync mode of virtual display. -enum class VirtualDisplayVSyncMode : uint32 -{ - Default = 0, ///< Using the default VSync mode based on refresh rate - Immediate = 1, ///< The presentation should be executed immediately without waiting for vsync to display - HMD = 2, ///< Using HMD VSync, the HMD is specified by pPrivateScreen - Count -}; - -/// The VirtualDisplayInfo is provided by application and KMD uses it to create a virtual display. -/// @see IDevice::CreateVirtualDisplay. -struct VirtualDisplayInfo -{ - uint32 width; ///< Horizontal dimension in pixels - uint32 height; ///< Vertical dimension in pixels - Rational refreshRate; ///< Refresh rate of virtual display - VirtualDisplayVSyncMode vsyncMode; ///< VSync mode - uint32 vsyncOffset; ///< VSync front porch location in pixels or lines. - /// It's needed when VSyncMode is HMD - Pal::IPrivateScreen* pPrivateScreen; ///< A pointer to IPrivateScreen. - /// It's needed when VSyncMode is HMD -}; - -/// Function pointer type definition for creating a buffer view SRD. -/// -/// @see IDevice::CreateTypedBufferViewSrds()/CreateUntypedBufferViewSrds(). -/// -/// @param [in] pDevice Pointer to the device this function is called on. -/// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. -/// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. 
-/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. -/// -/// @ingroup ResourceBinding -typedef void (PAL_STDCALL *CreateBufferViewSrdsFunc)( - const IDevice* pDevice, - uint32 count, - const BufferViewInfo* pBufferViewInfo, - void* pOut); - -/// Function pointer type definition for creating an image view SRD. -/// -/// @see IDevice::CreateImageViewSrds(). -/// -/// @param [in] pDevice Pointer to the device this function is called on. -/// @param [in] count Number of buffer view SRDs to create; size of the pImageViewInfo array. -/// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. -/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. -/// -/// @ingroup ResourceBinding -typedef void (PAL_STDCALL *CreateImageViewSrdsFunc)( - const IDevice* pDevice, - uint32 count, - const ImageViewInfo* pImgViewInfo, - void* pOut); - -/// Function pointer type definition for creating a fmask view SRD. -/// -/// @see IDevice::CreateFmaskViewSrds(). -/// -/// @param [in] pDevice Pointer to the device this function is called on. -/// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. -/// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. -/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. -/// -/// @ingroup ResourceBinding -typedef void (PAL_STDCALL *CreateFmaskViewSrdsFunc)( - const IDevice* pDevice, - uint32 count, - const FmaskViewInfo* pFmaskViewInfo, - void* pOut); - -/// Function pointer type definition for creating a sampler SRD. -/// -/// @see IDevice::CreateSamplerSrds(). -/// -/// @param [in] pDevice Pointer to the device this function is called on. -/// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. -/// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. 
-/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. -/// -/// @ingroup ResourceBinding -typedef void (PAL_STDCALL *CreateSamplerSrdsFunc)( - const IDevice* pDevice, - uint32 count, - const SamplerInfo* pSamplerInfo, - void* pOut); - -/// Function pointer type definition for creating a ray tracing SRD. -/// -/// @see IDevice::CreateBvhSrds(). -/// -/// @param [in] pDevice Pointer to the device this function is called on. -/// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. -/// @param [in] pBvhInfo Array of BVH descriptions directing SRD construction. -/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. -/// -/// @ingroup ResourceBinding -typedef void (PAL_STDCALL *CreateBvhSrdsFunc)( - const IDevice* pDevice, - uint32 count, - const BvhInfo* pBvhInfo, - void* pOut); - -/// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. -/// -/// @param [in] pDevice -/// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. -/// @param [out] pViewInfo Pointer to memory to be filled with decoded info. -typedef void (PAL_STDCALL *DecodeBufferViewSrdFunc)( - const IDevice* pDevice, - const void* pBufferViewSrd, - BufferViewInfo* pViewInfo); - -/// Structure filled in by DecodeImageViewSrd(). -struct DecodedImageSrd -{ - SwizzledFormat swizzledFormat; ///< SRD's swizzled format. - SubresRange subresRange; ///< SRD's subresource range; the array-range is always { 0, 1 } for 3D images. - Range zRange; ///< z-range of the SRD's subresRange.startSubres. -}; - -/// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). -/// -/// @param [in] pDevice -/// @param [in] pImage The same image the SRD was created on. -/// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. 
-/// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info. -typedef void (PAL_STDCALL *DecodeImageViewSrdFunc)( - const IDevice* pDevice, - const IImage* pImage, - const void* pImageViewSrd, - DecodedImageSrd* pDecodedInfo); - -/// Function pointer table for SRD methods. -struct DeviceInterfacePfnTable -{ - CreateBufferViewSrdsFunc pfnCreateTypedBufViewSrds; ///< Typed Buffer view SRD creation function pointer. - CreateBufferViewSrdsFunc pfnCreateUntypedBufViewSrds; ///< Untyped Buffer view SRD creation function ptr. - CreateImageViewSrdsFunc pfnCreateImageViewSrds; ///< Image view SRD creation function pointer. - CreateFmaskViewSrdsFunc pfnCreateFmaskViewSrds; ///< Fmask View SRD creation function pointer. - CreateSamplerSrdsFunc pfnCreateSamplerSrds; ///< Sampler SRD creation function pointer. - CreateBvhSrdsFunc pfnCreateBvhSrds; ///< BVH SRD creation function pointer. - DecodeBufferViewSrdFunc pfnDecodeBufferViewSrd; ///< Buffer SRD decode function pointer. - DecodeImageViewSrdFunc pfnDecodeImageViewSrd; ///< Image SRD decode function pointer. -}; - -/// Specifies output arguments for IDevice::QueryWorkstationCaps(), returning worksation feature information -/// on this device workstation board. -union WorkStationCaps -{ - struct - { - uint32 workStationBoard : 1; ///< Running a workstation driver on a workstation board. - /// On workstation boards that support CWG (Creator Who Game), - /// the user can switch to a Gaming/consumer driver on the - /// workstation board, and then this will be false. - uint32 supportWorkstationAppPerfOpt : 1; ///< Workstation boards have optimizations for kinds of workstation - /// applications. These optimization is enabled if it is set. - uint32 supportWorkstationEdgeFlag : 1; ///< Workstation boards have a DX9 feature that edge flag can be - /// exported via point size output in VS. The feature is enabled - /// if it is set. - uint32 reserved : 29; ///< Reserved for future use. 
- }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// FrameLock/GenLock support state enum -enum class FlglSupport : uint32 -{ - NotAvailable = 0, ///< FL/GL not supported by the GPU - NotConnected = 1, ///< FL/GL support available in the GPU, but is not connected to a GLSync board - Available = 2, ///< FL/GL support available and connected - Count -}; - -/// Container structure for FrameLock/GenLock state. -struct FlglState -{ - union - { - struct - { - uint32 genLockEnabled : 1; ///< True if genlock is currently enabled. Genlock is a system-wide setting - ///< in CCC. Genlock provides a singal source (which is used in framelock) - uint32 frameLockEnabled : 1; ///< True if (KMD) framelock is currently enabled. - ///< Framelock is the mechanism to sync all presents in multiple adapters. - uint32 isTimingMaster : 1; ///< True if the display being driven by the current adapter is the timing - ///< master in a genlock configuration - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Packed 32-bit uint value. 
- }; - FlglSupport support; ///< The state of the FLGL support in current adapter - uint32 firmwareVersion; ///< Firmware version number of the GLSync hardware (S400 board), if available -}; - -/// GlSync setting mask definition, used with GlSyncConfig -enum GlSyncConfigMask : uint32 -{ - GlSyncConfigMaskSignalSource = 0x00000001, - GlSyncConfigMaskSyncField = 0x00000002, - GlSyncConfigMaskSampleRate = 0x00000004, - GlSyncConfigMaskSyncDelay = 0x00000008, - GlSyncConfigMaskTriggerEdge = 0x00000010, - GlSyncConfigMaskScanRateCoeff = 0x00000020, - GlSyncConfigMaskFrameLockCntl = 0x00000040, - GlSyncConfigMaskSigGenFrequency = 0x00000080 -}; - -/// specify GLSYNC framelock control state -enum GlSyncFrameLockCtrl : uint32 -{ - GlSyncFrameLockCntlNone = 0x00000000, - GlSyncFrameLockCntlEnable = 0x00000001, - GlSyncFrameLockCntlDisable = 0x00000002, - GlSyncFrameLockCntlResetSwapCounter = 0x00000004, - GlSyncFrameLockCntlAckSwapCounter = 0x00000008, - GlSyncFrameLockCntlVersionKmd = 0x00000010 -}; - -/// Specifies GlSync Signal Source -enum GlSyncSignalSource : uint32 -{ - GlSyncSignalSourceGpuMask = 0x0FF, - GlSyncSignalSourceUndefined = 0x100, - GlSyncSignalSourceFreerun = 0x101, - GlSyncSignalSourceBncPort = 0x102, - GlSyncSignalSourceRj45Port1 = 0x103, - GlSyncSignalSourceRj45Port2 = 0x104 -}; - -/// Specifies GlSync Sync Field -enum GlSyncSyncField : uint8 -{ - GlSyncSyncFieldUndefined = 0, - GlSyncSyncFieldBoth = 1, - GlSyncSyncField1 = 2 -}; - -/// Specifies GlSync Sync Trigger Edge -enum GlSyncTriggerEdge : uint8 -{ - GlSyncTriggerEdgeUndefined = 0, - GlSyncTriggerEdgeRising = 1, - GlSyncTriggerEdgeFalling = 2, - GlSyncTriggerEdgeBoth = 3 -}; - -/// Specifies GlSync scan rate coefficient/multiplier options -enum GlSyncScanRateCoeff : uint8 -{ - GlSyncScanRateCoeffUndefined = 0, - GlSyncScanRateCoeffx5 = 1, - GlSyncScanRateCoeffx4 = 2, - GlSyncScanRateCoeffx3 = 3, - GlSyncScanRateCoeffx5Div2 = 4, - GlSyncScanRateCoeffx2 = 5, - GlSyncScanRateCoeffx3Div2 = 6, - 
GlSyncScanRateCoeffx5Div4 = 7 -}; - -/// Container structure for FrameLock/GenLock config. -struct GlSyncConfig -{ - uint32 validMask; ///< Mask that specifies which settings are actually referred in the structure. - /// GlSyncConfigMask* - uint32 syncDelay; ///< Delay of sync signal in microseconds - uint32 framelockCntlVector; ///< Vector of Framelock control bits. GlSyncFrameLockCntl* - uint32 signalSource; ///< Source of sync signal. Can be House Sync, RJ45 Port or GPUPort. - /// GlSyncSignalSource* or GPUPort Index - uint8 sampleRate; ///< Number of VSyncs per sample. 0 - no sampling, syncronized by singal VSync. - uint8 syncField; ///< Sync to Field 1 or to both Fields when input signal is interlaced. - /// GlSyncSyncField* - uint8 triggerEdge; ///< Which edge should be used as trigger. GlSyncTriggerEdge* - uint8 scanRateCoeff; ///< Scan Rate Multiplier applied to original sync signal. GlSyncScanRateCoeff* - uint32 sigGenFrequency; ///< Frequency in mHz of internal signal generator -}; - -#if PAL_KMT_BUILD -/// The type of NT object, it decides which KMT function should be called to retrieve the NT handle. -enum class ExternalObjectType : uint32 -{ - Allocation = 0, ///< Specifies the NT object is an allocation, such as GPUMemory etc. - SyncObject = 1, ///< Specifies the NT object is a sync object, such as semaphore and fence etc. -}; - -/// Specifies the NT handle attributes when opening from name. -struct ExternalHandleInfo -{ - ExternalObjectType objectType; ///< The type of NT object, it decides which KMT function should - /// be called to retrieve the NT handle. - const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable - /// attribute. - const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT handle - /// with a name, and then the handle can be acquired via this - /// name. - uint32 accessFlags; ///< Desried access rights of the object the handle refers to. 
-}; -#endif - -#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) -/// Gpu heap group enumeration. One heap group contains several pal GpuHeap. -enum GpuHeapGroup : uint32 -{ - GpuHeapGroupLocal = 0x0, /// Local heap group includes GpuHeapLocal and GpuHeapInvisible on Windows. - /// But GpuHeapInvisible is not included on Linux. - GpuHeapGroupNonLocal = 0x1, /// NonLocal heap group includes GpuHeapGartUswc and GpuHeapGartCacheable. -#if PAL_AMDGPU_BUILD - GpuHeapGroupInvisible = 0x2, /// This is used on Linux as GpuHeapLocal and GpuHeapInvisible are not combined. -#endif - GpuHeapGroupCount, -}; - -/// Struct for querying current gpu memory usage info and budget info. -struct GpuMemoryBudgetInfo -{ -#if PAL_AMDGPU_BUILD - gpusize systemUsage[GpuHeapGroupCount]; /// Current total memory usage of specified heap group of whole system. -#else - gpusize usage[GpuHeapGroupCount]; /// Current total memory usage of specified heap group of current process. - gpusize budget[GpuHeapGroupCount]; /// Current total memory budget of specified heap group of the device which - /// implies how much memory the device can allocate from that heap group - /// before allocations may fail or cause performance degradation, including - /// all allocated memory. Budget might be affected by OS status and other - /// processes. -#endif -}; -#endif - -/// Reclaim allocation result enumeration. -enum class ReclaimResult : uint8 -{ - Ok = 0, ///< Reclaim result is OK. - Discarded = 1, ///< Reclaim result is discarded. - NotCommitted = 2, ///< Reclaim result is not committed. - Count -}; - -#if defined(_WIN32) -/// KMD has an interface in place to support UMD registration of Events to be signaled based on various -/// system-level conditions. Each enum here represents a certain condition which will cause KMD to signal -/// the event Clients previously registered through RegisterEvent. 
-enum class EventTrackingType : uint32 -{ - ShaderInterrupt = 0, - EarlyPresent = 1, - Count -}; - -/// Clients specify the Util::Event and EventTrackingType it wants to register with PAL in this struct. -/// Under the condition specified by the trackingType, pEvent will be signaled by KMD. -struct RegisterEventInfo -{ - Util::Event* pEvent; - EventTrackingType trackingType; -}; - -/// As an input of UnregisterEvent to specify the event and trackingType Clients want to unregister from KMD. -struct UnregisterEventInfo -{ - Util::Event* pEvent; - EventTrackingType trackingType; -}; - -/// Output parameter of RegisterEvent. There's one struct for each type of EventTrackingType. -union RegisterEventOutputInfo -{ - struct - { - uint32 eventId; - gpusize eventMailboxGpuVa; - } shaderInterrupt; - - struct - { - uint32 eventId; - } earlyPresent; -}; -#endif - -/// Contains the page fault status of the GPU. -struct PageFaultStatus -{ - union - { - struct - { - uint32 pageFault : 1; ///< Set if there was a GPU page fault. - uint32 readFault : 1; ///< Set if the page fault was during a read operation. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; - } flags; - - gpusize faultAddress; ///< GPU virtual address where page fault occurred. Ignored if @ref pageFault is not set. -}; - -/// Input to the RegisterRuntimeState call, which allows the HIP runtime to pass information to KMD which can be shared -/// with the HIP debugger. -struct HipRuntimeSetup -{ - const void* pRdebug; ///< Address of the r_debug structure in the runtime - uint32 runtimeState; ///< Runtime-specific enum indicating runtime state - uint32 ttmpSetupHint; ///< Hint indicating that ttmp values should be initialized -}; - -/** - *********************************************************************************************************************** - * @interface IDevice - * @brief Interface representing a client-configurable context for a particular GPU. 
- * - * This object becomes the root of all client/PAL interaction to get work done on that GPU. The main functionality - * provided by the device object: - * - * + Creation of all other PAL objects. - * + GPU memory management. - *********************************************************************************************************************** - */ -class IDevice -{ -public: - /// Get the maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images' - /// elements are no larger than pAlignments->maxElementSize. - /// - /// @param [out] pAlignments Its maxElementSize will be used to fill the rest of its members with valid alignments. - /// - /// @returns Success if pAlignments was filled with data. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pAlignments is null. - /// + ErrorInvalidValue if pAlignments->maxElementSize is zero. - virtual Result GetLinearImageAlignments( - LinearImageAlignments* pAlignments) const = 0; - - /// Fills out a structure with details on the properties of this device. This includes capability flags, - /// supported engines/queues, performance characteristics, etc. This should only be called after a client has - /// called @ref CommitSettingsAndInit(). - /// - /// @see DeviceProperties - /// - /// @param [out] pInfo Properties structure to be filled out by PAL based on properties of this device. - /// - /// @returns Success if the device properties were successfully returned in pInfo. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pInfo is null. - virtual Result GetProperties( - DeviceProperties* pInfo) const = 0; - - /// Checks and returns execution state of the device. Currently unsupported for DX clients and - /// will return Unavailable if called by those clients. - /// - /// @param [out] pPageFaultStatus This struct is filled out when this function returns ErrorGpuPageFaultDetected. 
- /// - /// @returns Success if device is operational and running. Otherwise, one of the following errors may be - /// + ErrorDeviceLost if device is lost, reset or not responding, - /// + ErrorInvalidValue if failed to get device reset state, - /// + ErrorOutOfGpuMemory if ran out of GPU memory, - /// + ErrorGpuPageFaultDetected if page fault was detected, - /// + ErrorUnknown if device is in unknown state. - virtual Result CheckExecutionState( - PageFaultStatus* pPageFaultStatus) = 0; - - /// Returns this devices client-visible settings structure initialized with appropriate defaults. Clients can - /// modify parameters in this structure as they wish in order to modify PAL's behavior for this device. After - /// modifying settings, the client must call CommitSettingsAndInit() before creating finalizing the device. - /// - /// @warning The returned value points to an internal PAL structure. Modifying data using this pointer after - /// calling CommitSettingsAndInit() will result in undefined behavior. - /// - /// @returns Pointer to this devices public settings for examination and/or modification by the client. - virtual PalPublicSettings* GetPublicSettings() = 0; - - /// Reads a specific setting from the operating system specific source (e.g. registry or config file). - /// - /// @param [in] pSettingName Name of the setting. Must be null-terminated. - /// @param [in] settingScope The scope of settings accessible. - /// @param [in] valueType The type of the setting to return (e.g. bool or int). - /// @param [out] pValue Buffer to write data that was read. Must be non-null. - /// @param [out] bufferSz Size of string buffer (pValue). Only necessary for ValueType::Str. - /// @param [out] pInRegistry If not null, this will be set to true if the setting was read from the registry. - /// - /// @returns True if the read of specified setting is successful. False indicates failure. 
- virtual bool ReadSetting( - const char* pSettingName, - SettingScope settingScope, - Util::ValueType valueType, - void* pValue, - size_t bufferSz = 0, - bool* pInRegistry = nullptr) const = 0; - -#if defined(_WIN32) - /// Clients call this function to register a Util::Event() object to be signaled based on specified conditions. - /// For now, we will only add one condition: shader interrupt. - /// - /// @param [in] input RegisterEventInfo which contains Util::Event client wants to be signaled and - /// EventTrackingType. - /// @param [out] pOutput Depending on the trackingType in input, the pOutput may refers different info - /// which clients may need. Refer to the definition of RegisterEventOutputInfo - /// for details. - /// @returns Success if no errors occurred. - virtual Result RegisterEvent( - const RegisterEventInfo& input, - RegisterEventOutputInfo* pOutput) = 0; - - /// Clients call this function to unregister an event previously registered with RegisterEvent(). - /// - /// @param [in] input UnregisterEventInfo which contains Util::Event and tracing type the client - /// wants to unregister. - /// @returns Success if no errors occurred. - virtual Result UnregisterEvent( - const UnregisterEventInfo& input) = 0; - -#endif - - /// Indicates that the client has finished overriding public settings so the settings struct can be finalized and - /// any late-stage initialization can be done. This method must be called before @ref IDevice::Finalize() can be - /// called. - /// - /// @note The only functions in IDevice that are able to be called before CommitSettingsAndInit(): - /// + GetLinearImageAlignments() - /// + GetPublicSettings() - /// + ReadSetting() - /// - /// @note Finalizing the settings may override values set by the client. This can occur if: - /// + Invalid settings, either because they are not supported by hardware or are somehow self-conflicting, - /// will be overridden. 
- /// + Settings specified in the private settings will override client-specified settings. - /// - /// @returns Success if settings have been committed successfully and any late-stage initialization is completed - /// successfully as well. - virtual Result CommitSettingsAndInit() = 0; - - /// Returns the largest possible GPU memory alignment requirement for any IGpuMemoryBindable object created on this - /// device. - /// - /// This is useful for clients that may want to allocate generic GPU memory rafts up front to support many objects - /// without creating all of those objects to query their alignment requirements ahead of time. In practice, most - /// objects have a much smaller alignment requirement than the allocation granularity, but images may require more - /// than the allocation granularity on some devices. - /// - /// @returns Largest possible GPU memory byte alignment for an IGpuMemoryBindable object on this device. - virtual gpusize GetMaxGpuMemoryAlignment() const = 0; - - /// Indicates that the client is able to finalize the initialization of this device with the requisite information. - /// This method must be called before any of the factory creation methods may be called. - /// - /// @note The only functions in IDevice that are able to be called before Finalize(): - /// + The functions listed in IDevice::CommitSettingsAndInit(). - /// + GetMaxGpuMemoryAlignment() - /// + GetProperties() - /// - /// @param [in] finalizeInfo Device finalization properties. - /// - /// @returns Success if final initialization is successful. - virtual Result Finalize( - const DeviceFinalizeInfo& finalizeInfo) = 0; - - /// Cleans up all internal state, undoing any work done by CommitSettingsAndInit() and Finalize(). Following a call - /// to this function, the device will be in its initial state as if it was re-enumerated; the client may requery - /// settings and build up the device for further use. 
If the client doesn't call this function, it will be called - /// automatically when IPlatform::Destroy() is called or when devices are re-enumerated. - /// - /// This function provides clients with a way to return devices to a trival state, one in which they have no - /// lingering OS or kernel driver dependencies. If a client pairs external state (e.g., an OS handle) with their - /// devices they may be required to call this function when they destroy their API device objects. - /// - /// It is expected that all PAL objects created by the device have already been destroyed (e.g. GPU memory, queues), - /// if not, the device may fall into an illegal state and the client will experience undefined behavior. - /// - /// @returns Success if no errors occurred. - virtual Result Cleanup() = 0; - - /// Returns if dual-source blending can be enabled. It checks the ColorBlendStateCreateInfo for any src1 blending - /// options. Then it checks if we are going to override those src1 options because the blend func is - /// min or max. - /// - /// @param [in] createInfo The ColorBlendStateCreateInfo that is checked for conditions that call for dual-source - /// blending. - /// - /// @returns true if the blend state calls for dual-source blending to be enabled. - virtual bool CanEnableDualSourceBlend( - const ColorBlendStateCreateInfo& createInfo) const = 0; - - /// Specifies how many frames can be placed in the presentation queue. This limits how many frames the CPU can get - /// in front of the device. - /// - /// @param [in] maxFrames Maximum number of frames that can be batched. Specifying a value of 0 resets the limit to - /// a default system value (3 frames on Windows). - /// - /// @returns Success if the limit was successfully adjusted. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not available on this OS. 
- virtual Result SetMaxQueuedFrames( - uint32 maxFrames) = 0; - - /// Compares this device against another device object to determine how compatible they are for multi-GPU - /// operations. - /// - /// @param [in] otherDevice Device to determine MGPU compatibility with. - /// @param [out] pInfo Result compatibility info. - /// - /// @returns Success if the compatibility info was successfully returned in pInfo. Otherwise, one of the following - /// errors may be returned: - /// + ErrorInvalidPointer if pInfo is null. - virtual Result GetMultiGpuCompatibility( - const IDevice& otherDevice, - GpuCompatibilityInfo* pInfo) const = 0; - - /// Reports properties of all GPU memory heaps available to this device (e.g., size, whether it is CPU visible or - /// not, performance characteristics, etc.). - /// - /// @param [out] info Properties of each GPU heap available to this device, indexed by the GPU ID defined in - /// @ref GpuHeap. If a particular heap is unavailable, its entry will report a size of 0. - /// - /// @returns Success if the heap properties were successfully queried and returned in info[]. Otherwise, one of the - /// following errors may be returned: - /// + ErrorUnknown if an unexpected internal error occured. - virtual Result GetGpuMemoryHeapProperties( - GpuMemoryHeapProperties info[GpuHeapCount]) const = 0; - - /// Reports all format and tiling mode related properties for this device. - /// - /// @param [out] pInfo Output properties. - /// - /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pInfo is null. - virtual Result GetFormatProperties( - MergedFormatPropertiesTable* pInfo) const = 0; - - /// Reports performance experiment related properties for this device. - /// - /// Enumerates the GPU family, blocks, capabilities, etc.. - /// - /// @param [out] pProperties Output properties. 
- /// - /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pProperties is null. - virtual Result GetPerfExperimentProperties( - PerfExperimentProperties* pProperties) const = 0; - - /// Fills out the default MSAA quad sample pattern for the given sample count. - /// - /// @param [in] samples The number of valid samples in the sample pattern. Must be a power of two. - /// @param [out] pQuadSamplePattern Fill this with the default pattern. - /// - /// @returns Success if @ref pQuadSamplePattern was filled with the default sample pattern. - /// Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if @ref pQuadSamplePattern is null. - /// + ErrorInvalidValue if @ref samples is not a supported power of two. - /// + ErrorUnavailable if this device lacks GfxIp support. - virtual Result GetDefaultSamplePattern( - uint32 samples, - MsaaQuadSamplePattern* pQuadSamplePattern) const = 0; - - /// Adds a list of per-device memory object references that persist across command buffer submissions. It is the - /// responsibility of the client to make sure that all required memory references have been added before submitting - /// the command buffer that uses on them. References can be added at the device, queue or specified at submit time. - /// gpuMemRefCount and ppGpuMemory cannot be 0/null. PAL will assert and crash if these values are invalid. If - /// multiple references are provided for the same memory, PAL will retain the safest set of GpuMemoryRef flags. - /// - /// see @ref IQueue::Submit() - /// - /// @param [in] gpuMemRefCount Number of memory references in the memory reference list, must be non-zero. - /// @param [in] pGpuMemoryRefs Array of gpuMemRefCount GPU memory references. 
- /// @param [in] pQueue Optional IQueue that the memory references will be used on, used to optimize - /// residency operations, can be null. Note, if a queue is specified here the same queue - /// should be specified in RemoveGpuMemoryReferences. - /// @param [in] flags Flags from GpuMemoryRefFlags that will apply to all memory object references. - /// - /// @returns Success if the memory references were successfully added. Can also return NotReady if the client - /// passes in a valid pPagingFence pointer and the operation doesn't complete before the function returns. - /// Otherwise, one of the following errors may be returned: - /// + ErrorTooManyMemoryReferences if gpuMemRefCount will cause the total reference count to exceed the - /// limit of this device. - /// + ErrorOutOfMemory if GPU memory objects will not fit in available GPU memory space (i.e. GPU Memory - /// is overcommitted). - virtual Result AddGpuMemoryReferences( - uint32 gpuMemRefCount, - const GpuMemoryRef* pGpuMemoryRefs, - IQueue* pQueue, - uint32 flags - ) = 0; - - /// Removes a list of per-device memory object references that have previously been added via - /// IDevice::AddGpuMemoryReferences(). PAL is responsible for ensuring that timestamps have been retired prior to - /// actually performing any residency operations related to removal of a memory reference, so clients are free to - /// call this function without regard for command buffer use. Memory references are reference counted, so an - /// individual memory reference will only be removed when the total internal reference count reaches zero. - /// gpuMemoryCount and ppGpuMemory cannot be 0/null, PAL will assert and crash if these values are invalid. - /// - /// @param [in] gpuMemoryCount Number of memory objects in the memory reference list (size of ppGpuMemory array). - /// This count must be greater than zero. - /// @param [in] ppGpuMemory Array of GPU memory references. 
- /// @param [in] pQueue Optional IQueue that the memory references were used on, used to optimize residency - /// operations, can be null. Note, if a queue was specified in AddGpuMemoryReferences, - /// pQueue must match. - /// - /// @returns Success if the memory references were successfully updated. - virtual Result RemoveGpuMemoryReferences( - uint32 gpuMemoryCount, - IGpuMemory*const* ppGpuMemory, - IQueue* pQueue - ) = 0; - - /// Queries the Device for the total amount of referenced GPU memory for each heap type. These totals include all - /// memory added to the Device or any Queue using @ref AddGpuMemoryReferences and not yet removed using @ref - /// RemoveGpuMemoryReferences. Internal PAL allocations are included in these totals, but memory referenced using - /// the per-submit list in @ref IQueue::Submit is not included in these amounts. - /// - /// The intended use for this interface is for clients to be able to manage budgeting of resident GPU memory. - /// - /// @param [out] referencedGpuMemTotal Array containing the total amount of referenced GPU memory for each GPU - /// memory heap. - virtual void GetReferencedMemoryTotals( - gpusize referencedGpuMemTotal[GpuHeapCount]) const = 0; - - /// Get primary surface MGPU support information based upon primary surface create info and input flags provided - /// by client. - /// - /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your - /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. - /// - /// @param [in] primaryInfoInput Primary surface info input arguments. - /// @param [in,out] pPrimaryInfoOutput Primary surface info output arguments. - /// - /// @returns Success if the primary surface MGPU support information were successfully queried. 
- virtual Result GetPrimaryInfo( - const GetPrimaryInfoInput& primaryInfoInput, - GetPrimaryInfoOutput* pPrimaryInfoOutput) const = 0; - - /// Returns the supported stereo modes list. - /// - /// @param [in,out] pStereoModeCount Input value specifies the maximum number of stereo modes to enumerate, and the - /// output value specifies the total number of stereo modes that were enumerated - /// in pStereoModeList. The input value is ignored if pStereoModeList is null. - /// This pointer must not be null. - /// @param [out] pStereoModeList Output list of stereo modes. Can be null, in which case the total number of - /// available modes will be written to pStereoModeCount. - /// - /// @returns Success if the display modes were successfully queried and the results were reported in - /// pStereoModeCount/pStereoModeList. Otherwise, one of the following errors may be returned: - /// + Unsupported if stereo mode is not supported, or the stereo modes can't be queried. - /// + ErrorOutOfMemory if temp memeory allocation failed. - virtual Result GetStereoDisplayModes( - uint32* pStereoModeCount, - StereoDisplayModeOutput* pStereoModeList) const = 0; - - /// Returns the currently selected Workstation stereo mode on Windows OS. - /// - /// @param [out] pWsStereoMode Output currently selected Workstation Stereo mode. - /// - /// @returns Success if the currently selected Workstation stereo mode were successfully queried - /// and the results were reported in pWsStereoMode. - virtual Result GetWsStereoMode(WorkstationStereoMode* pWsStereoMode) const = 0; - - /// Return information about active workstation support for 10-bit (potentially packed pixel) displays. - /// - /// @param [out] pMode Output reports if the workstation 10-bit display feature is enabled, and if so, - /// details on any required pixel packing. - /// - /// @returns Success if the 10-bits and packed-pixel format were successfully queried and the result were - /// reported in pMode. 
- virtual Result GetActive10BitPackedPixelMode( - Active10BitPackedPixelModeOutput* pMode) const = 0; - - /// Inform the KMD that this allocation must be reinterpreted as 10-bits per channel for the all - /// KMD-initiated BLTs. - /// - /// When in 10-bit mode and at present time, if the dx9p driver sees a 10-bit to 8-bit surface blt, - /// it will use this interface to inform the KMD that the blt dst surface must be reinterpreted as - /// 10-bits per channel for the all KMD-initiated BLTs. - /// - /// @param [in] pGpuMemory The dst GPU memory reference which will be marked as 10 bits format. - /// - /// @returns Success if the KMD has been sucessfully notified. - virtual Result RequestKmdReinterpretAs10Bit( - const IGpuMemory* pGpuMemory) const = 0; - - /// Set or query device clock mode. - /// - /// This function can be called by clients to set the device engine and memory clocks to certain pre-defined ratios. - /// If a call to restore the device clocks to default does not occur, the device stays in the previously set mode. - /// - /// @param [in] setClockModeInput Specify the clock mode to set the device to. - /// @param [out] pSetClockModeOutput @b Optional - Output device clock mode. If not nullptr, it is used - /// to query the current clock mode the device is running in. - /// - /// @returns Success if the device clock mode query/set request was successful. - virtual Result SetClockMode( - const SetClockModeInput& setClockModeInput, - SetClockModeOutput* pSetClockModeOutput) = 0; - - /// Request to enable/disable static VMID for the device. - /// - /// The function must be called with enable = true before a profiling session starts and enable = false after a - /// profiling session ends. It may be called any time, though it is illegal to disable without a prior corresponding - // enable/acquire. Only after this returns success (when enabling) can the driver make submissions targeting the - /// static VMID. 
- /// - /// @param [in] enable Specifies whether acquiring or releasing the static VMID - /// - /// @returns Success if the static VMID acquire/release request was successful. - virtual Result SetStaticVmidMode( - bool enable) = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 943 - /// Set up MGPU compositing mode of a display provided by client. - /// - /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your - /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. - /// - /// @param [in] setMgpuModeInput Set MGPU compositing mode input arguments. - /// - /// @returns Success if the MGPU compositing mode were successfully set. - inline Result SetMgpuMode( - const SetMgpuModeInput& setMgpuModeInput) const { return Result::Success; } -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 - /// Get XDMA cache buffer information of each GPU based upon video present source ID provided by client. - /// - /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your - /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. - /// - /// @param [in] vidPnSrcId Video present source id. - /// @param [in] gpuMemory Primary surface GPU memory. - /// @param [in,out] pGetXdmaInfoOutput Set XDMA cache buffer info output arguments. - /// - /// @returns Success if the XDMA cache buffer information were successfully queried. - inline Result GetXdmaInfo( - uint32 vidPnSrcId, - const IGpuMemory& gpuMemory, - GetXdmaInfoOutput* pGetXdmaInfoOutput) const { return Result::ErrorUnavailable; } -#endif - - /// Polls current fullscreen frame metadata controls on given vidPnSourceId, including extended data. - /// - /// The function is used by clients that support frame metadata through KMD-UMD shared memory. - /// It polls the frame metadata shared memory for the given VidPnSource. 
Indicating which types of metadata - /// the UMD should send to KMD. - /// Clients should only call this function on the master device in an LDA chain. - /// - /// @param [in] vidPnSrcId Video present source id - /// @param [out] pFrameMetadataControl @b Optional - Output frame metadata controls. Clients can pass null to - /// check if the buffer is initialized successfully and if the - /// vidPnSrcId is valid. - /// - /// @returns Success if the metadata controls on the given vidPnSrcId was successfully polled. - /// Otherwise, one of the following erros may be returned: - /// + ErrorInvalidValue if vidPnSrcId is invalid (out of range) - /// + ErrorUnavailable if no implementation on current platform or if metadata shared buffer is null. - virtual Result PollFullScreenFrameMetadataControl( - uint32 vidPnSrcId, - PerSourceFrameMetadataControl* pFrameMetadataControl) const = 0; - - /// Get flip status flags and a flag indicating if current device owns the flags. (DX only) - /// - /// The function is used by clients that need flip status polling through KMD-UMD shared memory. - /// It provides caller the FlipStatusFlags on the given VidPnSource. Note that the flag returned is only a hint, - /// and can have a one frame delay during flip status transition. - /// - /// @param [in] vidPnSrcId Video present source id. - /// @param [out] pFlipFlags Output flip flags on given vidPnSrcId. Must not be null. - /// @param [out] pIsFlipOwner Output indicating if the current device owns the flip flags. - /// - /// @returns Success if flipping flags on given vidPnSrcId was successfully polled. - /// Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if vidPnSrcId is invalid. - /// + ErrorUnavailable if no implementation on current platform. - /// + ErrorInitializationFailed if flip status shared buffer was failed to initialize. 
- virtual Result GetFlipStatus( - uint32 vidPnSrcId, - FlipStatusFlags* pFlipFlags, - bool* pIsFlipOwner) const = 0; - - /// Resets the specified set of fences. - /// - /// All fences must be reset before passing them to a submission command. - /// - /// @param [in] fenceCount Number of fences to reset. - /// @param [in] ppFences Array of fences to reset. - /// - /// @returns Success if the specified fences have been successfully reset. - /// - /// @note The function assumes that neither ppFences is null nor that any of the elements of the array pointed by - /// ppFences are null. - virtual Result ResetFences( - uint32 fenceCount, - IFence*const* ppFences) const = 0; - - /// Stalls the current thread until one or all of the specified fences have been reached by the device. - /// - /// If waitAll is true all fences must have been submitted at least once before this is called; - /// otherwise at least one fence must have been submitted. Using a zero timeout value returns - /// immediately and can be used to determine the status of a set of fences without stalling. - /// - /// @param [in] fenceCount Number of fences to wait for (i.e., size of the ppFences array). - /// @param [in] ppFences Array of fences to be waited on. - /// @param [in] waitAll If true, wait for completion of all fences in the array before returning; if false, - /// return after any single fence in the array has completed. - /// @param [in] timeout This method will return after this many nanoseconds even if the fences do not complete. - /// - /// @returns Success if the specified fences have been reached, or Timeout if the fences have not been reached but - /// the specified timeout time has elapsed. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if: - /// - ppFences is null. - /// - Any member of the ppFences array is null. - /// + ErrorInvalidValue if: - /// - fenceCount is zero. 
- /// + ErrorFenceNeverSubmitted if: - /// - Any of the specified fences haven't been submitted. - virtual Result WaitForFences( - uint32 fenceCount, - const IFence*const* ppFences, - bool waitAll, - std::chrono::nanoseconds timeout) const = 0; - - /// Stalls the current thread until one or all of the specified Semaphores have been reached by the device. - /// - /// Using a zero timeout value returns immediately and can be used to determine the status of a set of semaphores - /// without stalling. - /// - /// @param [in] semaphoreCount Number of semaphores to wait for (i.e., size of the ppFences array). - /// @param [in] ppSemaphores Array of semaphores to be waited on. - /// @param [in] pValues Array of semaphores's value to be waited on. - /// @param [in] flags Combination of zero or more @ref HostWaitFlags values describing the behavior of this - /// wait operation. See @ref HostWaitFlags for more details. - /// @param [in] timeout This method will return after this many nanoseconds even if the semaphores do not - /// complete. - /// - /// @returns Success if the specified semaphores have been reached, or Timeout if the semaphores have not been - /// reached but the specified timeout time has elapsed. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if: - /// - ppSemaphores is null. - /// - Any member of the ppSemaphores array is null. - /// + ErrorInvalidValue if: - /// - semaphoreCount is zero. - virtual Result WaitForSemaphores( - uint32 semaphoreCount, - const IQueueSemaphore*const* ppSemaphores, - const uint64* pValues, - uint32 flags, - std::chrono::nanoseconds timeout) const = 0; - - /// Correlates a GPU timestamp with the corresponding CPU timestamps, for tighter CPU/GPU timeline synchronization - /// - /// @param [out] pCalibratedTimestamps Reports a current GPU timestamp along with the CPU timestamps at the time - /// that GPU timestamp was written. The CPU timestamps are OS-specific. 
Also - /// reports a maximum deviation between the captured timestamps in nanoseconds. - /// - /// @returns Success if the request was successful. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if: - /// - pCalibratedTimestamps is null. - /// + ErrorUnavailable if: - /// - unable to capture timestamps for all requested time domains. - virtual Result GetCalibratedTimestamps( - CalibratedTimestamps* pCalibratedTimestamps) const = 0; - - /// Binds the specified GPU memory as a trap handler for the specified pipeline type. This GPU memory must hold - /// shader machine code (i.e., the client must generate HW-specific shader binaries through some external means, - /// probably the SP3 assembler). - /// - /// The same trap handler will be installed for all shader stages that are part of the pipeline. A trap handler - /// will only ever be executed for shaders that set the trapPresent bit in @ref PipelineShaderInfo. - /// - /// @param [in] pipelineType Select compute or graphics pipeline. If graphics, this trap handler will be installed - /// for _all_ hardware shader stages. - /// @param [in] pGpuMemory GPU memory allocation holding the trap handler. - /// @param [in] offset Offset in bytes into pGpuMemory where the trap handler shader code begins. Must be - /// 256 byte aligned. - virtual void BindTrapHandler( - PipelineBindPoint pipelineType, - IGpuMemory* pGpuMemory, - gpusize offset) = 0; - - /// Binds the specified GPU memory location as a trap buffer for the specified pipeline type. This GPU memory will - /// be available to the trap handler as scratch memory to use as it chooses. The same trap buffer will be installed - /// for all shader stages that are part of the pipeline. - /// - /// There is no size parameter for the trap buffer. The client is responsible for ensuring that the trap handler - /// only reads/writes data within the bounds designated for trap buffer usage. 
- /// - /// @param [in] pipelineType Select compute or graphics pipeline. If graphics, this trap buffer will be installed - /// for _all_ hardware shader stages. - /// @param [in] pGpuMemory GPU memory allocation holding the trap buffer range. - /// @param [in] offset Offset in bytes into pGpuMemory where the trap buffer range starts. Must be 256 byte - /// aligned. - virtual void BindTrapBuffer( - PipelineBindPoint pipelineType, - IGpuMemory* pGpuMemory, - gpusize offset) = 0; - - /// Get the swap chain information for creating a swap chain and presenting an image. - /// - /// @param [in] hDisplay Display handle of the local window system. - /// @param [in] hWindow Window handle of the local window system. - /// @param [in] wsiPlatform WSI Platform the swapchain supposed to work on - /// @param [in,out] pSwapChainProperties Contains swap chain information. - /// - /// @returns Success if get swap chain information successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result GetSwapChainInfo( - OsDisplayHandle hDisplay, - OsWindowHandle hWindow, - WsiPlatform wsiPlatform, - SwapChainProperties* pSwapChainProperties) = 0; - - /// Determines if the given window system requirement is supported by the underlying wsiPlatform. - /// - /// @param [in] hDisplay Display handle of the local window system. - /// @param [in] wsiPlatform WSI Platform the request supposed to send to - /// @param [in] visualId Requested visual information which may not needed for some wsiPlatforms - /// - /// @returns Success if the request is supported. Otherwise, one of the following erros may be returned: - /// + Unsupported - virtual Result DeterminePresentationSupported( - OsDisplayHandle hDisplay, - WsiPlatform wsiPlatform, - int64 visualId) = 0; - - /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each - /// WsiPlatform. 
- /// - /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. - /// @param [in] mode The swap chain will use this present mode. - /// - /// @returns Returns a mask of SwapChainModeSupport. - virtual uint32 GetSupportedSwapChainModes( - WsiPlatform wsiPlatform, - PresentMode mode) const = 0; - - /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each - /// WsiPlatform. - /// - /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. - /// @param [in] mode The swap chain will use this present mode. - /// - /// @returns Returns a mask of SwapChainModeSupport. - virtual uint32 GetCompatiblePresentModes( - WsiPlatform wsiPlatform, - SwapChainMode mode) const = 0; - - /// Determines if the given information corresponds to an external shared image. - /// - /// Some clients may not know if a given external shared resource is a simple GPU memory allocation or an image; it - /// is expected they will call this function to determine which set of open functions they must call. - /// - /// @param [in] openInfo The open info describing the external shared resource. - /// @param [out] pIsImage Its contents will be set to true if the external shared resource is an image. - /// - /// @returns Success if PAL was able to determine whether or not the resource is an image. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pIsImage is null. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result DetermineExternalSharedResourceType( - const ExternalResourceOpenInfo& openInfo, - bool* pIsImage) const = 0; - - /// @name FactoryMethods Device Factory Methods - /// - /// The following set of IDevice methods is the interface through which almost all PAL objects are created. - /// - /// PAL does not allocate its own system memory for these objects. 
Instead, the client must query the amount of - /// system memory required for the object then provide a pointer where PAL will construct the object. - /// - /// This approach allows the client to roll the PAL object into its own allocations without unnecessary heap - /// allocations and cache misses. It can also allocate many objects in a single memory space without PAL - /// involvement. - /// - /// @{ - - /// Determines the amount of system memory required for a queue object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateQueue(). - /// - /// @param [in] createInfo Properties of the new queue such as engine type and engine index. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IQueue object with the specified properties. - /// A return value of 0 indicates the createInfo was invalid. - virtual size_t GetQueueSize( - const QueueCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a queue object. - /// - /// @param [in] createInfo Properties of the new queue such as engine type and engine index. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetQueueSize() with the same - /// create info. - /// @param [out] ppQueue Constructed queue object. When successful, the returned address will be the same - /// as specified in pPlacementAddr. - /// - /// @returns Success if the queue was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppQueue is null. - /// + ErrorInvalidValue if the create info's engineType is invalid or if the engineIndex is invalid. 
- virtual Result CreateQueue( - const QueueCreateInfo& createInfo, - void* pPlacementAddr, - IQueue** ppQueue) = 0; - - /// Determines the amount of system memory required for a multi-queue object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateMultiQueue(). - /// - /// @param [in] queueCount Number of queues in the gang; matches number of entries in pCreateInfo. - /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). The first - /// entry in this array describes the master queue which will be used to execute all - /// IQueue interfaces except for MultiSubmit(). - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an multi-queue IQueue object with the specified - /// properties. A return value of 0 indicates the createInfo was invalid. - virtual size_t GetMultiQueueSize( - uint32 queueCount, - const QueueCreateInfo* pCreateInfo, - Result* pResult) const = 0; - - /// Creates a multi-queue (i.e., gang submission queue) object. The resulting version of the IQueue interface - /// is composed of multiple hardware queues which can be atomically submitted to as a group. When this is done, - /// it is safe to use IGpuEvent objects to tightly synchronize work done across queues in a single call to Submit(). - /// This can allow the client to tightly schedule asynchronous workloads for maximum efficiency that isn't possible - /// across queues using IQueueSemaphore objects. - /// - /// @param [in] queueCount Number of queues in the gang; matches number of entries in the pCreateInfo array. - /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). 
The first - /// entry in this array describes the master queue which will be used to execute all - /// IQueue interfaces except for the ganged-portion of a Submit() (e.g., Present()). - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetMultiQueueSize() with the same - /// arguments. - /// @param [out] ppQueue Constructed multi queue object. - /// - /// @returns Success if the multi queue was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidValue if queueCount is less than 2. - /// + ErrorInvalidQueueType if any of the created sub-queues are not multi-queue compatible. This is - /// indicated by the supportsMultiQueue engineProperties flag in @ref DeviceProperties. - /// + ErrorInvalidPointer if pCreateInfo, pPlacementAddr or ppQueue is null. - /// + ErrorInvalidValue if any create info's configuration is invalid. - virtual Result CreateMultiQueue( - uint32 queueCount, - const QueueCreateInfo* pCreateInfo, - void* pPlacementAddr, - IQueue** ppQueue) = 0; - - /// Determines the amount of system memory required for a GPU memory object. - /// - /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of CreateGpuMemory(). - /// - /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed heaps. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetGpuMemorySize( - const GpuMemoryCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates an @ref IGpuMemory object with the requested properties. 
- /// - /// This method can create either _real_ or _virtual_ GPU memory allocations. - /// - /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed - /// heaps. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetGpuMemorySize() with the same - /// createInfo param. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the GPU memory was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + TooManyFlippableAllocations if the GPU memory was successfully created, but the client has reached - /// the limit of flippable allocations for this Device. This is a warning that future flippable GPU - /// memory may fail to be created due to internal OS limitations. - /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. - /// + ErrorInvalidMemorySize if createInfo.size is invalid. - /// + ErrorInvalidAlignment if createInfo.alignment is invalid. - /// + ErrorInvalidValue if createInfo.heapCount is 0 for real allocations or non-0 for virtual allocations. - /// + ErrorOutOfGpuMemory if the allocation failed due to a lack of GPU memory. - /// + ErrorUnavailable if the Reserve Gpu Virtual Address failed. - virtual Result CreateGpuMemory( - const GpuMemoryCreateInfo& createInfo, - void* pPlacementAddr, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a pinned GPU memory object. - /// - /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of - /// CreatePinnedGpuMemory(). - /// - /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. 
- /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IGpuMemory object pinned with the specified - /// properties. - virtual size_t GetPinnedGpuMemorySize( - const PinnedGpuMemoryCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Pins a segment of system memory in place and create an @ref IGpuMemory object allowing access by the GPU. - /// - /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetPinnedGpuMemorySize() with the - /// same params. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the system memory was successfully pinned and a corresponding GPU memory object was created. - /// Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if createInfo.pSysMem, pPlacementAddr, or ppGpuMemory is null, or if - /// createInfo.pSysMem is not allocation granularity aligned. - /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. - /// + ErrorOutOfMemory if the creation failed because the system memory could not be pinned. - virtual Result CreatePinnedGpuMemory( - const PinnedGpuMemoryCreateInfo& createInfo, - void* pPlacementAddr, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a SVM memory object - /// - /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of - /// CreateSvmGpuMemory(). 
- /// - /// @param [in] createInfo Data controlling the SVM memory properties, such as size and type of SVM buffer. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties. - virtual size_t GetSvmGpuMemorySize( - const SvmGpuMemoryCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates an SVM (Shared Virtual Memory) IGpuMemory object. - /// The basic idea of SVM is to create system memory that has the same CPU and GPU virtual address - /// (i.e., "pointer is a pointer"). This can work in two modes: fine-grain, or coarse-grain. - /// - /// Fine-grain (Single-GPU): The client should just call this function with pReservedGpuVaOwner set to null. - /// PAL will allocate GPU-accessible system memory that will have the same CPU virtual address - /// (as returned by IGpuMemory::Map()) as GPU virtual address - /// (as returned in the gpuVirtAddr value returned by IGpuMemory::Desc()). - /// - /// Fine-grain (MGPU): The client can call this function with pReservedGpuVaOwner set to IGpuMemory object - /// allocated on the first device and receive mapping to the same GPU VA location on another device. - /// - /// Coarse-grain: In this mode, there are actually two separate IGpuMemory objects. - /// The client should first create the "local" GPU memory object by calling IDevice::CreateGpuMemory() to create - /// a standard GPU memory object in the VaRange::Svm VA space. Next, the client should create the "staging" GPU - /// memory by calling this function (CreateSvmGpuMemory) with pReserveGpuVaOwner pointing to the "local" GPU - /// memory object. PAL will create system memory for the "staging" GPU memory with a CPU virtual address matching - /// the "local" GPU memory's GPU virtual address. 
The GPU virtual address of the "staging" GPU memory is - /// arbitrarily assigned. The client is responsible for managing the contents of the two related allocations - /// per their API rules. - /// - /// @param [in] createInfo Data controlling the SVM memory properties, such as size and location of SVM buffer. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetSvmGpuMemorySize() with the - /// same params. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the SVM buffer was successfully created and a corresponding GPU memory object was created. - /// Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr, or ppGpuMemory is null. - /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. - /// + ErrorOutOfMemory if the creation failed because there is not enough GPU memory - /// or the system memory could not be pinned. - virtual Result CreateSvmGpuMemory( - const SvmGpuMemoryCreateInfo& createInfo, - void* pPlacementAddr, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a GPU memory object created by opening an allocation from a - /// different GPU. - /// - /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of - /// OpenSharedGpuMemory(). - /// - /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified - /// properties. 
A return value of 0 indicates the openInfo was invalid. - virtual size_t GetSharedGpuMemorySize( - const GpuMemoryOpenInfo& openInfo, - Result* pResult) const = 0; - - /// Opens a shareable GPU memory object created on another device for use on this device. - /// - /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetSharedGpuMemorySize() with the - /// same params. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. - /// + ErrorNotShareable if the specified memory object was not marked as shareable on creation. - virtual Result OpenSharedGpuMemory( - const GpuMemoryOpenInfo& openInfo, - void* pPlacementAddr, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a external GPU memory object created by opening - /// an allocation from a compatible device, such as D3D device. - /// - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified - /// properties. A return value of 0 indicates the openInfo was invalid. - virtual size_t GetExternalSharedGpuMemorySize( - Result* pResult) const = 0; - - /// Opens an external shared memory object which is created by a compatible device, such as D3D device. 
- /// There could be more than one underlying allocations in the shared memory object, only one allocation - /// created on the device's GPU will be opened, other allocations will be ignored. - /// - /// @param [in] openInfo Open info. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetExternalSharedGpuMemorySize() - /// with the same params. - /// @param [out] pMemCreateInfo Return CreateInfo of the external shared GPU memory. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr, ppGpuMemory or ppGpuMemory is null. - /// + ErrorNotShareable if none of allocations in the shared memory object is created on the device's GPU. - virtual Result OpenExternalSharedGpuMemory( - const ExternalGpuMemoryOpenInfo& openInfo, - void* pPlacementAddr, - GpuMemoryCreateInfo* pMemCreateInfo, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a proxy GPU memory object to a GPU memory object on a - /// different GPU. An allocation of this amount of memory must be provided in the pPlacementAddr parameter of - /// OpenPeerGpuMemory(). - /// - /// @param [in] openInfo Specifies a handle to a GPU memory object to open for peer-to-peer transfer access. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for opening a peer IGpuMemory object with the specified - /// properties. A return value of 0 indicates the openInfo was invalid. 
- virtual size_t GetPeerGpuMemorySize( - const PeerGpuMemoryOpenInfo& openInfo, - Result* pResult) const = 0; - - /// Opens previously created GPU memory object for peer access on another device. - /// - /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetPeerGpuMemorySize() with the same - /// params. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the memory was successfully opened for peer access on this device. Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr, ppGpuMemory, or openInfo.pOriginalMem is null. - virtual Result OpenPeerGpuMemory( - const PeerGpuMemoryOpenInfo& openInfo, - void* pPlacementAddr, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for an image object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateImage(). - /// - /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an @ref IImage object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetImageSize( - const ImageCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Determines if the image will have faster clone copy (than normal CS or graphics copy) in @ref CmdCopyImage(). 
- /// If this function returns true, clients can set @ref ImageCreateInfo::flags::cloneable = 1 to enable clone - /// copy support in @ref CmdCopyImage(); however whether @ref CmdCopyImage() goes through clone copy or not depends - /// on multiple factors, please see doxygen in @ref CmdCopyImage() for more details. - /// - /// Note that clients are free to set this flag to 1 regardless; it still enables clone copy potentially but can't - /// guarantee better performance. If clients find more faster clone copy cases, feel free to improve this function - /// to cover these kinds of images. - /// - /// Due to rely on @ref ImageCreateInfo values to make decision, ideally clients should set all other members - /// for @ref ImageCreateInfo first, and then call this function to setup cloneable flag last before creating image. - /// - /// @param [in] createInfo Properties of the new image. - /// - /// @returns true if image prefers clone copy in CmdCopyImage(); otherwise goes through normal copy. - virtual bool ImagePrefersCloneCopy( - const ImageCreateInfo& createInfo) const = 0; - - /// Creates an @ref IImage object with the requested properties. - /// - /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetImageSize() with the same - /// createInfo param. - /// @param [out] ppImage Constructed image object. When successful, the returned address will be the same as - /// specified in pPlacementAddr. - /// - /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if: - /// - The image dimensions are invalid based on the image type. - /// - The image dimensions are not properly aligned for compressed formats. - /// - The number of samples is invalid for the image type and format. 
- /// - MSAA is enabled for an image that doesn't support color or depth usage. - /// - MSAA images have more than one mip level. - /// - The array size is zero, non-1 for 3D images, or beyond the max number of slices for 1D or 2D - /// images. - /// - The number of mipmaps is invalid for the image dimensions. - /// + ErrorInvalidPointer if pPlacementAddr or ppImage is null. - /// + ErrorInvalidFormat if: - /// - The format doesn't support the usage flags. - /// - A 1D image specifies a compressed format. - /// + ErrorInvalidFlags if: - /// - The color target and depth/stencil usages are specified simultaneously. - /// - The color target flag is set for a 1D image. - /// - The depth/stencil flag is set for a non-2D image. - virtual Result CreateImage( - const ImageCreateInfo& createInfo, - void* pPlacementAddr, - IImage** ppImage) = 0; - - /// Determines the amount of system memory required for a presentable image object (and an associated memory - /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and - /// pGpuMemoryPlacementAddr parameters of CreatePresentableImage(). - /// - /// Only images created through this interface are valid sources for IQueue::Present(). - /// - /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. - /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. - /// Should be specified to the pImagePlacementAddr argument of CreatePresentableImage(). - /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for a IGpuMemory object attached to the - /// presentable IImage. Should be specified to the pGpuMemoryPlacementAddr argument - /// of CreatePresentableImage(). - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. 
- virtual void GetPresentableImageSizes( - const PresentableImageCreateInfo& createInfo, - size_t* pImageSize, - size_t* pGpuMemorySize, - Result* pResult) const = 0; - - /// Creates a presentable image. Presentable image must have internally bound GPU memory allocated as OS needs the - /// information of image/memory via OS callbacks. - /// - /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. - /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must - /// be as much size available here as reported by calling - /// GetPresentableImageSizes(). - /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct a IGpuMemory associated - /// with this presentable image. There must be as much size available here as - /// reported by calling GetPresentableImageSizes(). - /// @param [out] ppImage Constructed image object. - /// @param [out] ppGpuMemory Constructed memory object. This object is only valid for specifying in a - /// memory reference list. It must be destroyed when the image is destroyed. - /// - /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: - /// + TooManyFlippableAllocations if the image was successfully created, but the client has reached the - /// limit of flippable allocations for this Device. This is a warning that future presentable Images - /// may fail to be created due to internal OS limitations. - /// + ErrorTooManyPresentableImages if the swap chain cannot be associated with more presentable images. - /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, or ppGpuMemory is null. - /// + ErrorInvalidValue if: - /// - The image dimensions are invalid. - /// - The refresh rate is invalid for a fullscreen image. - /// + ErrorInvalidFormat if the format doesn't support presentation. 
- virtual Result CreatePresentableImage( - const PresentableImageCreateInfo& createInfo, - void* pImagePlacementAddr, - void* pGpuMemoryPlacementAddr, - IImage** ppImage, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for an image object (and an associated memory object) opened for - /// peer access to an image created on another GPU. Allocations of these amounts of memory must be provided in the - /// pImagePlacementAddr and pGpuMemoryPlacementAddr parameters of OpenPeerImage(). - /// - /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. - /// @param [out] pPeerImageSize Size, in bytes, of system memory required for a peer IImage. Should be - /// specified to the pImagePlacementAddr argument to OpenPeerImage(). - /// @param [out] pPeerGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached - /// to a peer IImage. Should be specified to the pGpuMemoryPlacementAddr argument - /// to OpenPeerImage(). - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - virtual void GetPeerImageSizes( - const PeerImageOpenInfo& openInfo, - size_t* pPeerImageSize, - size_t* pPeerGpuMemorySize, - Result* pResult) const = 0; - - /// Creates an @ref IImage object as a proxy to an IImage on another GPU to be used for peer-to-peer transfers. - /// - /// @note The @ref IImage object provided in the @ref PeerImageOpenInfo must be bound to an existing - /// @ref IGpuMemory object prior to opening a peer version of it. - /// If the new image needs to bind to an existing peer memory allocation, as when images are suballocated, - /// pGpuMemoryPlacementAddr must equal nullptr and ppGpuMemory must point to an existing IGpuMemory object - /// that was previously opened to reference the same memory from the current device. 
- /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. - /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must - /// be as much size available here as reported by calling GetPeerImageSizes(). - /// @param [in] pGpuMemoryPlacementAddr If nonzero, this is a pointer to the location where PAL should construct a - /// IGpuMemory to be associated with this peer image. There must be as much - /// size available here as reported by calling GetPeerImageSizes(). - /// @param [out] ppImage Constructed image object. - /// @param [in] ppGpuMemory Constructed dummy memory object. This object is only valid for specifying - /// in a memory reference list. - /// - /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, ppGpuMemory, or - /// openInfo.pOriginalImage is null. - virtual Result OpenPeerImage( - const PeerImageOpenInfo& openInfo, - void* pImagePlacementAddr, - void* pGpuMemoryPlacementAddr, - IImage** ppImage, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for an external shared image object (and an associated memory - /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and - /// pGpuMemoryPlacementAddr parameters of OpenExternalSharedImage(). - /// - /// @param [in] openInfo Specifies the external image to be opened. - /// @param [out] pImageSize Size, in bytes, of system memory required for pImagePlacementAddr. - /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for pGpuMemoryPlacementAddr. - /// @param [out] pImgCreateInfo If non-null, it will be filled out with information describing the shared image. - /// @returns Success if the shared image was successfully opened for access on this device. 
Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pImageSize or pGpuMemorySize is null. - /// + ErrorNotShareable if none of allocations in the shared image is created on the device's GPU. - virtual Result GetExternalSharedImageSizes( - const ExternalImageOpenInfo& openInfo, - size_t* pImageSize, - size_t* pGpuMemorySize, - ImageCreateInfo* pImgCreateInfo) const = 0; - - /// Opens an external shared image object which was created by a compatible device, such as D3D device. - /// There could be more than one underlying allocations in the shared image object, only one allocation - /// created on the device's GPU will be opened, other allocations will be ignored. - /// - /// @param [in] openInfo Specifies the external image to be opened. - /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct the image object. - /// There must be as much space available here as reported by calling - /// GetExternalSharedImageSizes() with the same params. - /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct the GPU memory object. - /// There must be as much space available here as reported by calling - /// GetExternalSharedImageSizes() with the same params. - /// @param [out] pMemCreateInfo If non-null, it is filled with information describing the external GPU - /// memory that backs this external image. - /// @param [out] ppImage Constructed image object. When successful, the returned address will - /// be the same as specified in pImagePlacementAddr. - /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will - /// be the same as specified in pGpuMemoryPlacementAddr. - /// @returns Success if the shared image was successfully opened for access on this device. 
Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage or ppGpuMemory is null. - /// + ErrorNotShareable if none of allocations in the shared image object is created on the device's GPU. - virtual Result OpenExternalSharedImage( - const ExternalImageOpenInfo& openInfo, - void* pImagePlacementAddr, - void* pGpuMemoryPlacementAddr, - GpuMemoryCreateInfo* pMemCreateInfo, - IImage** ppImage, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for a color target view object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateColorTargetView(). - /// - /// Unlike most creation methods in this class, GetColorTargetViewSize() does not require a ColorTargetCreateInfo - /// parameter. PAL must guarantee that all color target view objects are the same size in order to support DX12, - /// where these views are treated similarly to SRDs. - /// - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IColorTargetView object. - virtual size_t GetColorTargetViewSize( - Result* pResult) const = 0; - - /// Creates an @ref IColorTargetView object with the requested properties. - /// - /// @param [in] createInfo Properties of the color target view to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetColorTargetViewSize(). - /// @param [out] ppColorTargetView Constructed color target view object. When successful, the returned address will - /// be the same as specified in pPlacementAddr. - /// - /// @returns Success if the color target view was successfully created. 
Otherwise, one of the following - /// errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr, ppColorTargetView, or createInfo.pImage is null. - /// + ErrorInvalidValue if: - /// - The base slice is invalid for the given image object and view type. - /// - The number of array slices is zero or the range of slices is too large for the specified image. - /// - The mip level is invalid for the given image object. - /// + ErrorInvalidImage if the image object doesn't have the color target access flag set. - virtual Result CreateColorTargetView( - const ColorTargetViewCreateInfo& createInfo, - void* pPlacementAddr, - IColorTargetView** ppColorTargetView) const = 0; - - /// Determines the amount of system memory required for a depth/stencil view object. An allocation of this amount - /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilView(). - /// - /// Unlike most creation methods in this class, GetDepthStencilViewSize() does not require a - /// DepthStencilViewCreateInfo parameter. PAL must guarantee that all color target view objects are the same size - /// in order to support DX12, where these views are treated similarly to SRDs. - /// - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IDepthStencilView object. - virtual size_t GetDepthStencilViewSize( - Result* pResult) const = 0; - - /// Creates an @ref IDepthStencilView object with the requested properties. - /// - /// @param [in] createInfo Properties of the depth/stencil view to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be - /// as much size available here as reported by calling GetDepthStencilViewSize(). - /// @param [out] ppDepthStencilView Constructed depth/stencil view object. 
When successful, the returned address - /// will be the same as specified in pPlacementAddr. - /// - /// @returns Success if the depth/stencil view was successfully created. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidPointer if pPlacementAddr, ppDepthStencilView, or createInfo.pImage is null. - /// + ErrorInvalidValue if: - /// - The base slice is invalid for the given image object and view type. - /// - The number of array slices is zero or the range of slices is too large for the specified image. - /// - The mip level is invalid for the given image object. - /// + ErrorInvalidImage if the image object doesn't have the depth/stencil target access flag set. - virtual Result CreateDepthStencilView( - const DepthStencilViewCreateInfo& createInfo, - void* pPlacementAddr, - IDepthStencilView** ppDepthStencilView) const = 0; - - /// Creates one or more typed buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. - /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.typedBufferView - /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should - /// be setup based on shader usage as described in @ref BufferViewInfo. The client should put the resulting SRD - /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly - /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// For performance reasons, this method returns void and does minimal error-checking. 
However, in debug builds, - /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: - /// + If pBufferViewInfo or pOut, is null. - /// + If count is 0. - /// + If pBufferViewInfo[].format is Undefined. - /// + If pBufferViewInfo[].stride does not match the size of an element of that format. - /// + If pBufferViewInfo[].gpuAddr is 0. - /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). - /// - /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. - /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. - /// - /// @ingroup ResourceBinding - void CreateTypedBufferViewSrds( - uint32 count, - const BufferViewInfo* pBufferViewInfo, - void* pOut) const - { m_pfnTable.pfnCreateTypedBufViewSrds(this, count, pBufferViewInfo, pOut); } - - /// Creates one or more untyped buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. - /// These SRDs can be accessed in a shader as either _raw_ or _structured_ views. - /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.untypedBufferView - /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should - /// be setup based on shader usage as described in @ref BufferViewInfo. 
The client should put the resulting SRD - /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly - /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// For performance reasons, this method returns void and does minimal error-checking. However, in debug builds, - /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: - /// + If pBufferViewInfo or pOut, is null. - /// + If count is 0. - /// + If pBufferViewInfo[].format is not Undefined. - /// + If pBufferViewInfo[].gpuAddr is 0. - /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). - /// - /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. - /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. - /// - /// @ingroup ResourceBinding - void CreateUntypedBufferViewSrds( - uint32 count, - const BufferViewInfo* pBufferViewInfo, - void* pOut) const - { m_pfnTable.pfnCreateUntypedBufViewSrds(this, count, pBufferViewInfo, pOut); } - - /// Validates image view SRD input parameters. Error checking for image view SRDs is handled by a separate - /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when - /// rebuilding SRDs. - /// - /// @param [in] viewInfo Input image view SRD parameter info. - /// - /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: - /// + ErrorImagePlaneUnavailable if the requested image plane specified in the view is not available on - /// the image. - /// + ErrorImageNotShaderAccessible if the image does not have a shader-readable or shader-writable usage. 
- /// + ErrorInvalidFormatSwizzle if the view's channel swizzle specifies components not available in the view - /// format. - /// + ErrorInvalidBaseMipLevel if the view's start subresource has a mip level larger than the number of - /// of available mip levels. - /// + ErrorFormatIncompatibleWithImageFormat if the view's format is not compatible with the image's format. - /// This can happen if: - /// - For color plane views, the bit-depths of the two formats are not equal. - /// + ErrorFormatIncompatibleWithImagePlane if the view's format is not compatible with the image's plane. - /// This can happen if: - /// - For depth plane views, the bit-depths of the view format and the depth component of the image - /// are not equal. - /// - For stencil plane views, the bit-depths of the view format and the stencil component of the image - /// are not equal. - /// + ErrorInvalidViewArraySize if: - /// - The view array size is 0. - /// - The image type is 3D and the view array size is not 1. - /// + ErrorViewTypeIncompatibleWithImageType if: - /// - The image type is 1D and the view type is not 1D - /// - The image type is 2D and the view type is not 2D or cubemap - /// - The image type is 3D and the view type is not 3D - /// + ErrorInsufficientImageArraySize if the number of viewed array slices is more than available on - /// the image. - /// + ErrorCubemapIncompatibleWithMsaa if the view type is a cubemap view and the image has - /// multiple samples. - /// + ErrorCubemapNonSquareFaceSize if the view type is a cubemap view and the image 2D extents are not - /// square. - /// + ErrorInvalidViewBaseSlice - /// - If the image type is 3D and the view base slice is not 0. - /// - /// @ingroup ResourceBinding - virtual Result ValidateImageViewInfo(const ImageViewInfo& viewInfo) const = 0; - - /// Creates one or more image view _shader resource descriptors (SRDs)_ in memory provided by the client. 
- /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.imageView - /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated image view SRD allows a set of subresources in an image to be accessed by a shader, and should - /// be setup as described in @ref ImageViewInfo. The client should put the resulting SRD in an appropriate - /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data - /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// @warning SRDs for Planar YUV images will include padding if pImageViewInfo->subresRange.numSlices > 1 - /// - /// @param [in] count Number of buffer view SRDs to create; size of the pImageViewInfo array. - /// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. - /// - /// @ingroup ResourceBinding - void CreateImageViewSrds( - uint32 count, - const ImageViewInfo* pImgViewInfo, - void* pOut) const - { m_pfnTable.pfnCreateImageViewSrds(this, count, pImgViewInfo, pOut); } - - /// Validates an fmask view SRD input parameters. Error checking for fmask view SRDs is handled by a separate - /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when - /// rebuilding SRDs. - /// - /// @param [in] viewInfo Input image view SRD parameter info. - /// - /// @returns Success if the parameters pass validation. 
Otherwise, one of the following errors may be returned: - /// + ErrorImageFmaskUnavailable if the image does not have an FMask. - /// + ErrorInvalidViewArraySize if the view array size is 0. - /// + ErrorViewTypeIncompatibleWithImageType if the image type is not 2D. - /// + ErrorInsufficientImageArraySize if the view base array slice and size define an out of bounds array range. - /// - /// @ingroup ResourceBinding - virtual Result ValidateFmaskViewInfo(const FmaskViewInfo& viewInfo) const = 0; - - /// Creates one or more fmask view _shader resource descriptors (SRDs)_ in memory provided by the client. - /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.fmaskView - /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated fmask view SRD allows a range of image slices to be accessed bo the load_fptr IL instruction, - /// which allows a shader to read compressed MSAA data at the expense of a texture indirection. This SRD should be - /// setup as described in @ref FmaskViewInfo. The client should put the resulting SRD in an appropriate - /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data - /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. - /// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. 
- /// - /// @ingroup ResourceBinding - void CreateFmaskViewSrds( - uint32 count, - const FmaskViewInfo* pFmaskViewInfo, - void* pOut) const - { m_pfnTable.pfnCreateFmaskViewSrds(this, count, pFmaskViewInfo, pOut); } - - /// Validates a sampler SRD input parameters. Error checking for sampler SRDs is handled by a separate function for - /// performance reasons and to avoid rechecking parameters that the client knows are correct when rebuilding SRDs. - /// - /// @param [in] samplerInfo Input sampler SRD parameter info. - /// - /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if: - /// - The max anisotropy or LOD bias value is outside of the legal range. - /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. - /// - The border color palette index is out of the legal range. - /// - /// @ingroup ResourceBinding - virtual Result ValidateSamplerInfo(const SamplerInfo& samplerInfo) const = 0; - - /// Creates one or more sampler _shader resource descriptors (SRDs)_ in memory provided by the client. - /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.sampler in - /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated sampler SRD controlls execution of sample instructions in a shader, and should be setup as - /// described in @ref SamplerInfo. 
The client should put the resulting SRD in an appropriate location based on the - /// shader resource mapping specified by the bound pipeline, either directly in user data - /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. - /// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. - /// - /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following - /// errors may be returned: - /// + ErrorInvalidPointer if pSamplerInfo or pOut is null. - /// + ErrorInvalidValue if: - /// - The max anisotropy or LOD bias value is outside of the legal range. - /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. - /// - The border color palette index is out of the legal range. - /// - /// @ingroup ResourceBinding - void CreateSamplerSrds( - uint32 count, - const SamplerInfo* pSamplerInfo, - void* pOut) const - { m_pfnTable.pfnCreateSamplerSrds(this, count, pSamplerInfo, pOut); } - - /// Creates one or more _BVH resource descriptors (SRDs)_ in memory provided by the client. - /// - /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.bvhInfo in - /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. - /// - /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client - /// must ensure there are no GPU accesses of this memory in flight before calling this method. - /// - /// The generated BVH SRD controls execution of ray trace instructions in a shader, and should be setup as - /// described in @ref BvhInfo. 
The client should put the resulting SRD in an appropriate location based on - /// the shader resource mapping specified by the bound pipeline, either directly in user data - /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. - /// - /// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. - /// @param [in] pBvhInfo Array of BVH (bounding volume hierarchy) descriptions directing SRD construction. - /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. - /// - /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following - /// errors may be returned: - /// + ErrorInvalidPointer if pBvhInfo or pOut is null. - /// - /// @ingroup ResourceBinding - void CreateBvhSrds( - uint32 count, - const BvhInfo* pBvhInfo, - void* pOut) const - { - m_pfnTable.pfnCreateBvhSrds(this, count, pBvhInfo, pOut); - } - - /// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. - /// - /// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. - /// @param [out] pViewInfo Pointer to memory to be filled with decoded info. - void DecodeBufferViewSrd( - const void* pBufferViewSrd, - BufferViewInfo* pViewInfo) const - { - m_pfnTable.pfnDecodeBufferViewSrd(this, pBufferViewSrd, pViewInfo); - } - - /// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). - /// - /// @param [in] image The same image the SRD was created on. - /// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. - /// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info. 
- void DecodeImageViewSrd( - const IImage& image, - const void* pImageViewSrd, - DecodedImageSrd* pDecodedInfo) const - { - m_pfnTable.pfnDecodeImageViewSrd(this, &image, pImageViewSrd, pDecodedInfo); - } - - /// The MSAA sample pattern palette is a client-managed table of sample patterns that might be in use by the app. - /// - /// The only purpose of this palette is to implement the samplepos shader instruction. This instruction returns the - /// position of a particular sample based on the sample pattern of the current rasterizer state or a particular - /// specified resource. When this instruction is executed, the shader will determine the correct palette index - /// as specified in the pipeline (see samplePatternIdx in the rsState structure inside GraphicsPipelineCreateInfo) - /// or in the image view SRD (see samplePatternIdx in ImageViewInfo). The shader will then return the position - /// for the specified sample in the specified entry of the currently bound sample pattern palette as set with this - /// function. - /// - /// The initial bound sample pattern palette values are undefined. A palette entry must be specified before it is - /// referenced by a samplepos instruction. - /// - /// @warning The samplepos instruction and sample pattern palette both assume every pixel has the same pattern. - /// This may not be accurate if the application uses custom sample patterns that aren't identical for - /// every pixel in the quad. - /// - /// @param [in] palette Small set of sample patterns. Each sample pattern consists of an x,y tuple per sample - /// describing where it is located in a pixel. The coordinate system is described in - /// MsaaStateCreateInfo. - /// - /// @returns Success if the palette was successfully updated. An error is only possible due to an internal error, - /// such as a failure to allocate GPU memory for a new table. 
- virtual Result SetSamplePatternPalette( - const SamplePatternPalette& palette) = 0; - - /// Determines the amount of system memory required for a border color palette object. An allocation of this amount - /// of memory must be provided in the pPlacementAddr parameter of CreateBorderColorPalette(). - /// - /// @param [in] createInfo Border color palette creation info (specifies number of entries). - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IBorderColorPalette object with the specified - /// properties. A return value of 0 indicates the createInfo was invalid. - virtual size_t GetBorderColorPaletteSize( - const BorderColorPaletteCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a border color palette object. - /// - /// @param [in] createInfo Border color palette creation info (number of entries). - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetBorderColorPaletteSize() with the - /// same createInfo param. - /// @param [out] ppPalette Constructed border color palette object. When successful, the returned address will - /// be the same as specified in pPlacementAddr. - /// - /// @returns Success if the border color palette was successfully created. Otherwise, one of the following errors - /// may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppPalette is null. - /// + ErrorInvalidValue if the palette size is too large to be used on any queue on this device. - virtual Result CreateBorderColorPalette( - const BorderColorPaletteCreateInfo& createInfo, - void* pPlacementAddr, - IBorderColorPalette** ppPalette) const = 0; - - /// Determines the amount of system memory required for a compute pipeline object. 
An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateComputePipeline(). - /// - /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetComputePipelineSize( - const ComputePipelineCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a compute @ref IPipeline object with the requested properties. - /// - /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetComputePipelineSize() with the - /// same createInfo param. - /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if: - /// - pPlacementAddr or ppPipeline is null. - /// - A required shader pointer is null. - /// - The link time constant data pointer is null. - /// + ErrorInvalidValue if: - /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. - /// - The dynamic memory view mapping slot object type is not unused, resource, of UAV. - /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. 
- virtual Result CreateComputePipeline( - const ComputePipelineCreateInfo& createInfo, - void* pPlacementAddr, - IPipeline** ppPipeline) = 0; - - /// Determines the amount of system memory required for a shader library object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateShaderLibrary(). - /// - /// @param [in] createInfo Library creation parameters including ELF code object and other items. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additonal validation. - /// - /// @returns Size, in bytes, of system memory required for an IShaderLibrary object with the specified properties. - /// A return value of zero indicates the createInfo was invalid. - virtual size_t GetShaderLibrarySize( - const ShaderLibraryCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a @ref IShaderLibrary object with the requested properties. - /// - /// @param [in] createInfo Library creation parameters including ELF code object and other items. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetShaderLibrarySize() with the - /// same createInfo parameter. - /// @param [out] ppLibrary Constructed library object. When successful, the returned address will be the same - /// as specified in pPlacementAddr. - /// - /// @returns Success if the library was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if: - /// - pPlacementAddr or ppLibrary is null. - /// - Required code object pointer is null. - virtual Result CreateShaderLibrary( - const ShaderLibraryCreateInfo& createInfo, - void* pPlacementAddr, - IShaderLibrary** ppLibrary) = 0; - - /// Determines the amount of system memory required for a graphics pipeline object. 
An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateGraphicsPipeline(). - /// - /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetGraphicsPipelineSize( - const GraphicsPipelineCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a graphics @ref IPipeline object with the requested properties. - /// If this pipeline is created from a set of @IShaderLibrary objects, it is the caller's responsibility to ensure - /// that the lifetime of the pipeline does not exceed the lifetime of the libraries themselves. - /// - /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetGraphicsPipelineSize() with the - /// same createInfo param. - /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the same - /// as specified in pPlacementAddr. - /// - /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if: - /// - pPlacementAddr or ppPipeline is null. - /// - A required shader pointer is null. - /// - The link time constant data pointer is null. - /// + ErrorInvalidValue if: - /// - The number of control points is invalid for a tessellation pipeline. - /// - Logic operations are enabled while some of the color targets enable blending. 
- /// - The dual source blend enable doesn't match expectations for color target and blend enable setup. - /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. - /// - The dynamic memory view mapping slot object type is not unused, resource, of UAV. - /// + ErrorInvalidFormat if: - /// - Blending is enabled by the color target format doesn't support blending. - /// - Logic operations are enabled by an incompatible format is used. - /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. - virtual Result CreateGraphicsPipeline( - const GraphicsPipelineCreateInfo& createInfo, - void* pPlacementAddr, - IPipeline** ppPipeline) = 0; - - /// Determines the amount of system memory required for a MSAA state object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateMsaaState(). - /// - /// @returns Size, in bytes, of system memory required for an @ref IMsaaState object. - /// This value will always be non-zero if the device has GfxIp support. - virtual size_t GetMsaaStateSize() const = 0; - - /// Creates an @ref IMsaaState object with the requested properties. - /// - /// @param [in] createInfo Properties of the MSAA state object to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetMsaaStateSize() with the same - /// createInfo param. - /// @param [out] ppMsaaState Constructed MSAA state object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the MSAA state was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppMsaaState is null. - /// + ErrorInvalidValue if: - /// - The number of samples is unsupported. 
- virtual Result CreateMsaaState( - const MsaaStateCreateInfo& createInfo, - void* pPlacementAddr, - IMsaaState** ppMsaaState) const = 0; - - /// Determines the amount of system memory required for a color blend state object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateColorBlendState(). - /// - /// @returns Size, in bytes, of system memory required for an @ref IColorBlendState object. - /// This value will always be non-zero if the device has GfxIp support. - virtual size_t GetColorBlendStateSize() const = 0; - - /// Creates an @ref IColorBlendState object with the requested properties. - /// - /// @param [in] createInfo Properties of the color blend state object to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetColorBlendStateSize() with the - /// same createInfo param. - /// @param [out] ppColorBlendState Constructed color blend state object. When successful, the returned address will - /// be the same as specified in pPlacementAddr. - /// - /// @returns Success if the color blend state was successfully created. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppColorBlendState is null. - /// + ErrorInvalidValue if: - /// - An unsupported blend function is used with dual source blending. - virtual Result CreateColorBlendState( - const ColorBlendStateCreateInfo& createInfo, - void* pPlacementAddr, - IColorBlendState** ppColorBlendState) const = 0; - - /// Determines the amount of system memory required for a depth/stencil state object. An allocation of this amount - /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilState(). - /// - /// @returns Size, in bytes, of system memory required for an @ref IDepthStencilState object. 
- /// This value will always be non-zero if the device has GfxIp support. - virtual size_t GetDepthStencilStateSize() const = 0; - - /// Creates an @ref IDepthStencilState object with the requested properties. - /// - /// @param [in] createInfo Properties of the depth/stencil state object to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be - /// as much size available here as reported by calling GetDepthStencilStateSize() - /// with the same createInfo param. - /// @param [out] ppDepthStencilState Constructed depth/stencil state object. When successful, the returned address - /// will be the same as specified in pPlacementAddr. - /// - /// @returns Success if the depth/stencil state was successfully created. Otherwise, one of the following errors - /// may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppDepthStencilState is null. - /// + ErrorInvalidValue if: - /// - Depth bounds is enabled and the depth range is invalid. - virtual Result CreateDepthStencilState( - const DepthStencilStateCreateInfo& createInfo, - void* pPlacementAddr, - IDepthStencilState** ppDepthStencilState) const = 0; - - /// Determines the amount of system memory required for a queue semaphore object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateQueueSemaphore(). - /// - /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore count. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IQueueSemaphore object with the specified properties. - /// A return value of 0 indicates the createInfo was invalid. 
- virtual size_t GetQueueSemaphoreSize( - const QueueSemaphoreCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates an @ref IQueueSemaphore object with the requested properties. - /// - /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore - /// count. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetQueueSemaphoreSize() with the - /// same createInfo param. - /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the queue semaphore was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. - /// + ErrorInvalidValue if createInfo.initialCount is outside of the [0..31] range. - virtual Result CreateQueueSemaphore( - const QueueSemaphoreCreateInfo& createInfo, - void* pPlacementAddr, - IQueueSemaphore** ppQueueSemaphore) = 0; - - /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore - /// from a different device. An allocation of this amount of memory must be provided in the pPlacementAddr - /// parameter of OpenSharedQueueSemaphore(). - /// - /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the - /// specified properties. A return value of 0 indicates the openInfo was invalid. 
- virtual size_t GetSharedQueueSemaphoreSize( - const QueueSemaphoreOpenInfo& openInfo, - Result* pResult) const = 0; - - /// Opens a shareable queue semaphore object created on another device for use on this device. - /// - /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetSharedQueueSemaphoreSize() with - /// the same params. - /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of - /// the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. - /// + ErrorNotShareable if the specified queue semaphore object was not marked as shareable on creation. - virtual Result OpenSharedQueueSemaphore( - const QueueSemaphoreOpenInfo& openInfo, - void* pPlacementAddr, - IQueueSemaphore** ppQueueSemaphore) = 0; - - /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore from - /// a different API which isn't a PAL client. An allocation of this amount of memory must be provided in the - /// pPlacementAddr parameter of OpenExternalSharedQueueSemaphore(). - /// - /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the - /// specified properties. A return value of 0 indicates the openInfo was invalid. 
- virtual size_t GetExternalSharedQueueSemaphoreSize( - const ExternalQueueSemaphoreOpenInfo& openInfo, - Result* pResult) const = 0; - - /// Opens a shareable queue semaphore object created on another API which isn't a PAL client for use on this device. - /// - /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open and flags. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetSharedQueueSemaphoreSize() with - /// the same params. - /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of - /// the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. - virtual Result OpenExternalSharedQueueSemaphore( - const ExternalQueueSemaphoreOpenInfo& openInfo, - void* pPlacementAddr, - IQueueSemaphore** ppQueueSemaphore) = 0; - -#if PAL_KMT_BUILD - /// Opens an external NT handle from name. - /// - /// @param [in] handleInfo Specifies the NT handle attributes when opening from name. - /// @param [out] pHandle A NT handle returned by runtime. - /// - /// @returns Success if the NT handle was successfully opened. Otherwise, one of - /// the following errors may be returned: - /// + ErrorInvalidValue if the name or attributes is invaild. - virtual Result OpenExternalHandleFromName( - const ExternalHandleInfo& handleInfo, - OsExternalHandle* pHandle) = 0; -#endif - -#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) - /// Query current gpu memory usage info and budget info of specified heap group of the device. - /// - /// @param [out] pInfo Heap usage and budget info reported from Os. 
- virtual Result QueryGpuMemoryBudgetInfo( - GpuMemoryBudgetInfo* pInfo) = 0; -#endif - - /// Determines the amount of system memory required for an IFence object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateFence(). - /// - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IFence object. - virtual size_t GetFenceSize( - Result* pResult) const = 0; - - /// Creates a GPU fence object. - /// - /// @param [in] createInfo Data controlling the fence properties - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetFenceSize(). - /// @param [out] ppFence Constructed fence object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the fence was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppFence is null. - virtual Result CreateFence( - const FenceCreateInfo& createInfo, - void* pPlacementAddr, - IFence** ppFence) const = 0; - - /// Opens a fence wihich was shared by another Device. - /// - /// @param [in] openInfo A reference to FenceOpenInfo, the handle is used if it's not null, or the - /// event is opened via name. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetFenceSize(). - /// @param [out] ppFence Constructed fence object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// @returns Success if the event was successfully reconstructed, otherwise an appropriate error code. 
- virtual Result OpenFence( - const FenceOpenInfo& openInfo, - void* pPlacementAddr, - IFence** ppFence) const = 0; - - /// Determines the amount of system memory required for an IGpuEvent object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateGpuEvent(). - /// - /// @param [in] createInfo Properties of the GPU event object to create. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IGpuEvent object. - virtual size_t GetGpuEventSize( - const GpuEventCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a GPU event object. - /// - /// @param [in] createInfo Properties of the GPU event object to create. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetEventSize(). - /// @param [out] ppGpuEvent Constructed event object. When successful, the returned address will be the same as - /// specified in pPlacementAddr. - /// - /// @returns Success if the event was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppEvent is null. - virtual Result CreateGpuEvent( - const GpuEventCreateInfo& createInfo, - void* pPlacementAddr, - IGpuEvent** ppGpuEvent) = 0; - - /// Determines the amount of system memory required for a query pool object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateQueryPool(). - /// - /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are in - /// the pool. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. 
- /// - /// @returns Size, in bytes, of system memory required for an IQueryPool object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetQueryPoolSize( - const QueryPoolCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates an @ref IQueryPool object with the requested properties. - /// - /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are - /// in the pool. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetQueryPoolSize() with the same - /// createInfo param. - /// @param [out] ppQueryPool Constructed query pool object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if the query pool was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppQueryPool is null. - /// + ErrorInvalidValue if createInfo.numSlots is zero. - virtual Result CreateQueryPool( - const QueryPoolCreateInfo& createInfo, - void* pPlacementAddr, - IQueryPool** ppQueryPool) const = 0; - - /// Determines the amount of system memory required for a command allocator object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateCmdAllocator(). - /// - /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation. - /// - /// @returns Size, in bytes, of system memory required for an ICmdAllocator object with the specified properties. - /// A return value of 0 indicates the createInfo was invalid. 
- virtual size_t GetCmdAllocatorSize( - const CmdAllocatorCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a command allocator object that can allocate GPU memory with the specified properties for use by command - /// buffer objects. - /// - /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetCmdAllocatorSize() with the - /// same createInfo param. - /// @param [out] ppCmdAllocator Constructed command allocator object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the command allocator was successfully created. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppCmdAllocator is null. - virtual Result CreateCmdAllocator( - const CmdAllocatorCreateInfo& createInfo, - void* pPlacementAddr, - ICmdAllocator** ppCmdAllocator) = 0; - - /// Determines the amount of system memory required for a command buffer object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreateCmdBuffer(). - /// - /// @param [in] createInfo Command buffer properties including the target queue type. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an ICmdBuffer object with the specified properties. A - /// return value of 0 indicates the createInfo was invalid. - virtual size_t GetCmdBufferSize( - const CmdBufferCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a command buffer object that can build work intended for a particular queue type. 
- /// - /// @param [in] createInfo Command buffer properties including the target queue type. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetCmdBufferSize() with the same - /// createInfo param. - /// @param [out] ppCmdBuffer Constructed command buffer object. When successful, the returned address will be - /// the same as specified in pPlacementAddr. - /// - /// @returns Success if the command buffer was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppCmdBuffer is null. - virtual Result CreateCmdBuffer( - const CmdBufferCreateInfo& createInfo, - void* pPlacementAddr, - ICmdBuffer** ppCmdBuffer) = 0; - - /// Determines the amount of system memory required for an indirect command generator object. An allocation of this - /// amount must be provided in the pPlacementAddr parameter of CreateIndirectCmdGenerator(). - /// - /// @param [in] createInfo Indirect command generator properties. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the - /// additional validation steps. - /// - /// @returns Size, in bytes, of system memory required for an IIndirectCmdGenerator object with the specified - /// properties. A return value of zero indicates the createInfo was invalid. - virtual size_t GetIndirectCmdGeneratorSize( - const IndirectCmdGeneratorCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates an indirect command generator object which can translate an application-specified command buffer into a - /// format understandable by the GPU. - /// - /// @param [in] createInfo - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. 
There must be as - /// much size available here as reported by calling GetIndirectCmdGeneratorSize() with - /// the same createInfo param. - /// @param [out] ppGenerator Constructed indirect command generator object. When successful, the returned - /// address will be the same as specified in pPlacementAddr. - /// - /// @returns Success if the command generator was successfully created. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppGenerator is null. - virtual Result CreateIndirectCmdGenerator( - const IndirectCmdGeneratorCreateInfo& createInfo, - void* pPlacementAddr, - IIndirectCmdGenerator** ppGenerator) const = 0; - - /// Determines the amount of system memory required for a perf experiment object. An allocation of this amount of - /// memory must be provided in the pPlacementAddr parameter of CreatePerfExperiment(). - /// - /// @param [in] createInfo Properties of the performance experiment to be created. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an IPerfExperiment object with the specified properties. - /// A return value of 0 indicates the createInfo was invalid. - virtual size_t GetPerfExperimentSize( - const PerfExperimentCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Creates a performance experiment object that can gather performance counter and trace data for a specific span - /// of a command buffer. - /// - /// @param [in] createInfo Properties of the performance experiment to be created. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetPerfExperimentSize() with the - /// same createInfo param. - /// @param [out] ppPerfExperiment Constructed performance experiment object. 
When successful, the returned address - /// will be the same as specified in pPlacementAddr. - /// - /// @returns Success if the perf experiment was successfully created. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if pPlacementAddr or ppCmdBuffer is null. - virtual Result CreatePerfExperiment( - const PerfExperimentCreateInfo& createInfo, - void* pPlacementAddr, - IPerfExperiment** ppPerfExperiment) const = 0; - - /// Gets @ref IPrivateScreen objects owned by this device. Private screens are screens not exposed through standard - /// OS mechanisms. This function should be called again when any of the private screens are plugged or unplugged. - /// The first call to this function enumerates all private screens and stores in device object as well. The next - /// call triggered by hot-plug event enumerates private screens again but only destroys removed ones and creates - /// new private screen objects for newly-added ones. The hash code generated at enumeration time is used as id of - /// private screens. If the id of an enumerated private screen already exists, it is treated as unchanged. The EDID - /// array and display index are used to generate MD5 hash code. - /// - /// @param [out] pNumScreens Pointer to the number of private sceens, note that this number does not mean first - /// *pNumScreens elements in ppScreens are valid but just a hint that total *pNumScreens - /// out of MaxPrivateScreens are valid. - /// @param [out] ppScreens Pointer to the array of private screens. The client must pass in the pointer to an - /// array of at least MaxPrivateScreens pointers to IPrivateScreen. - /// - /// @returns Success if the private screens are correctly retrieved. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if the device does not support private screen functionalities. 
- virtual Result GetPrivateScreens( - uint32* pNumScreens, - IPrivateScreen** ppScreens) = 0; - - /// Registers an emulated @ref IPrivateScreen objects owned by this device. An emulated private screen doesn't have - /// a physical display hardware connected to the GPU. This could be useful for debugging unusual configurations or - /// using in automation systems when no real HMDs are available. - /// - /// @param [in] createInfo Properties of to create an emulated private screen object. - /// @param [out] pTargetId Pointer to returned emulated private screen target id. - /// - /// @returns Success if the emulated private screen is correctly created. Otherwise, one of the following errors may - /// be returned: - /// + ErrorTooManyPrivateScreens if the device cannot create an emulated private screen. - virtual Result AddEmulatedPrivateScreen( - const PrivateScreenCreateInfo& createInfo, - uint32* pTargetId) = 0; - - /// Removes an emulated @ref IPrivateScreen objects owned by this device. - /// - /// @param [in] targetId Target id of emulated private screen to be removed. - /// - /// @returns Success if the emulated private screen is correctly removed. Otherwise, one of the following errors may - /// be returned: - /// + ErrorUnknown if any unknown error occurs. - virtual Result RemoveEmulatedPrivateScreen( - uint32 targetId) = 0; - - /// Determines the amount of system memory required for a private screen image object (and an associated memory - /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and - /// pGpuMemoryPlacementAddr parameters of CreatePrivateScreenImage(). - /// - /// Only images created through this interface are valid sources for IPrivateScreen::Present(). - /// - /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. - /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. 
- /// Should be specified to the pImagePlacementAddr argument of CreatePresentableImage(). - /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached to - /// the private screen IImage. Should be specified to the pGpuMemoryPlacementAddr - /// argument of CreatePrivateScreenImage(). - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - virtual void GetPrivateScreenImageSizes( - const PrivateScreenImageCreateInfo& createInfo, - size_t* pImageSize, - size_t* pGpuMemorySize, - Result* pResult) const = 0; - - /// Creates private screen presentable image. A private screen presentable image is similar to a regular presentable - /// image but can only be presented on the private screens. It has some implicit properties relative to standard - /// images, such as mipLevels=1, arraySize=1, numSamples=1 and etc. It also requires its bound GPU memory to be - /// pinned before presenting. - /// - /// @param [in] createInfo Create info. - /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must - /// be as much size available here as reported by calling - /// GetPrivateScreenImageSizes(). - /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct a IGpuMemory associated - /// with this peer image. There must be as much size available here as - /// reported by calling GetPrivateScreenImageSizes(). - /// @param [out] ppImage Constructed image object. - /// @param [out] ppGpuMemory Constructed dummy memory object. This object is only valid for specifying - /// in a memory reference list. - /// - /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: - /// + ErrorPrivateScreenInvalidFormat if the format isn't supported on the private screen. 
- /// + ErrorPrivateScreenRemoved if the private screen was removed. - virtual Result CreatePrivateScreenImage( - const PrivateScreenImageCreateInfo& createInfo, - void* pImagePlacementAddr, - void* pGpuMemoryPlacementAddr, - IImage** ppImage, - IGpuMemory** ppGpuMemory) = 0; - - /// Determines the amount of system memory required for an ISwapChain object. An allocation of this amount of memory - /// must be provided in the pPlacementAddr parameter of CreateSwapChain(). - /// - /// @param [in] createInfo All the information related with this swap chain. - /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid - /// the additional validation. - /// - /// @returns Size, in bytes, of system memory required for an ISwapChain object. - virtual size_t GetSwapChainSize( - const SwapChainCreateInfo& createInfo, - Result* pResult) const = 0; - - /// Create swap chain object based on the local window system. It doesn't include creation of presentable images. - /// The presentable images should be associated with SwapChain object when presentable image is created. - /// - /// @param [in] createInfo All the information related with this swap chain. - /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - /// much size available here as reported by calling GetSwapChainSize(). - /// @param [out] ppSwapChain Constructed swapchain object. When successful, the returned address will be the - /// same as specified in pPlacementAddr. - /// - /// @returns Success if create swap chain instance successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result CreateSwapChain( - const SwapChainCreateInfo& createInfo, - void* pPlacementAddr, - ISwapChain** ppSwapChain) = 0; - - /// Sets a power profile for this device. 
- /// - /// @param [in] profile A profile is a pre-defined configuration indicates how KMD/PPLib is notified to work, - /// e.g. raise or lower the GPU clock etc. - /// @param [in,out] pInfo Custom power profile info needed for VrCustom mode, can be null for other modes. Note the - /// actualSwitchInfo[] field is output part of @ref CustomPowerProfile. - /// - /// @returns Success if the profile is set successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not available on this OS. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result SetPowerProfile( - PowerProfile profile, - CustomPowerProfile* pInfo) = 0; - - /// Sends an escape call to the KMD to enable power optimizations for DirectML/ROCm workloads. The client must call - /// this function when the context for any ROCm or DirectML workload is created or destroyed. - /// - /// @param [in] enableOptimization Set to true if called during context creation (enable power optimizations), - /// false if context destroyed (restore defaults). - /// - /// @returns Success if the power optimization is set successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not available on this OS. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result SetMlPowerOptimization( - bool enableOptimization) const = 0; - - /// Queries workstation caps on this device. - /// - /// @param [out] pCaps Pointer to location where pal should write back workstation caps. - /// - /// @returns Success if wokstation caps is got from KMD successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorOutOfMemory if out of system memory. 
- virtual Result QueryWorkStationCaps( - WorkStationCaps* pCaps) const = 0; - - /// Queries display connectors installed on the GPU - /// - /// @param [in,out] pConnectorCount Input value specifies the maximum number of connectors to enumerate, and the - /// output value specifies the total number of display modes that were enumerated - /// in pConnectors. The input value is ignored if pConnectors is null. - /// This pointer must not be null. - /// @param [out] pConnectors Output list of connectors. Can be null, in which case the total number of - /// available connectors will be written to pConnectorCount. - /// - /// - /// @returns Success if the profile is set successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not available on this OS. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result QueryDisplayConnectors( - uint32* pConnectorCount, - DisplayConnectorProperties* pConnectors) = 0; - - /// Queries the dispatch kernel source code - /// - /// @returns Returns the source code for the kernel dispatch function used in - /// the kernel scheduler for device enqueue (CL C code) or nullptr if it's not available. - /// - /// @note This function is to support OpenCL AQL submissions. - virtual const char* GetDispatchKernelSource() const = 0; - - /// @} - - /// Query the Flgl state from the device. Device will query the connectivity of GLSync card and return - /// the state. Pal internal state of Flgl will be updated. - /// - /// @param [out] pState Pointer to the location that PAL should write the internal flgl states back. - /// - /// @returns Success if query returns with success. Otherwise, one of the following errors may returned: - /// + ErrorOutOfMemory if out of system memory. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result FlglQueryState( - FlglState* pState) = 0; - - /// Set the Flgl config of the device. 
- /// - /// @param [in] glSyncConfig const reference to the config struct. - /// - /// @returns Success if setting returns with success. Otherwise, one of the following errors may returned: - /// + ErrorUnknown if an unexpected internal error occurs. - /// + ErrorUnsuppported if the this GenLock function is not available. - virtual Result FlglSetSyncConfiguration( - const GlSyncConfig& glSyncConfig) = 0; - - /// Get the Flgl config of the device. - /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. - /// - /// @param [out] pGlSyncConfig Pointer to the location that PAL should write the config back. - /// - /// @returns Success if query returns with success. Otherwise, one of the following errors may returned: - /// + ErrorUnknown if an unexpected internal error occurs. - /// + ErrorInvalidPointer if pGlSyncConfig is null poiter. - virtual Result FlglGetSyncConfiguration( - GlSyncConfig* pGlSyncConfig) const = 0; - - /// Set the Framelock to disable or enable. Client should call this interface first to enable/disable Flgl. - /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. - /// - /// @param [in] enable If true enables KMD framelock, otherwise disables framelock. - /// - /// @returns Success if framelock enable/disable successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not supported on this Asic. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result FlglSetFrameLock( - bool enable) = 0; - - /// Set the Genlock to disable or enable. - /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. - /// - /// @param [in] enable If true enables the genlock, otherwise disables genlock. - /// - /// @returns Success if genlock enable/disable successfully. 
Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not supported on this Asic. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result FlglSetGenLock( - bool enable) = 0; - - /// Reset the framelock HW counter. The following counter operations are directly submit to hardware via I2C - /// interface Pal doesn't store the counter internally. Client should manage the counter - /// - /// @returns Success if the HW counter is reset successfully. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this function is not available on this Asic. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result FlglResetFrameCounter() const = 0; - - /// Check if a reset/discontinuity of HW framecounter occurs. If returns false, there is no need to query HW frame - /// counter, client should update its software counter instead. If returns true, client is required to query HW - /// counter and adjusts its software counter accordingly. - /// - /// @param [out] pReset Pointer to the location that PAL should write the reset status back. - /// - /// @returns Success if the reset status is returned. Otherwise, one of the following errors may be returned: - /// + ErrorUnavailable if this function is not available on this Asic. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result FlglGetFrameCounterResetStatus( - bool* pReset) const = 0; - - /// Get the framelock HW counter. - /// - /// @param [out] pValue Pointer to the location that PAL should write the frame counter value back. - /// @param [out] pReset Pointer to the location that PAL should write the frame counter reset state. - /// - /// @returns Success if the frame counter is returned. Otherwise, one of the following errors may be returned: - /// + ErrorUnavailable if this function is not available on this Asic. - /// + ErrorUnknown if an unexpected internal error occurs. 
- virtual Result FlglGetFrameCounter( - uint64* pValue, - bool* pReset) const = 0; - - /// Checks if the specified externally-controlled feature settings have changed since the last time the function was - /// called. - /// - /// This is intended to be a lightweight function that can be called per frame per feature. If the function - /// returns Result::Success and (*pRsFeaturesChanged & RsFeatureTypeXX) != 0, then the user changed some related - /// settings in the UI. - /// - /// If TurboSync has updated, the client should first try to re-read the application profile settings by calling - /// IPlatform::QueryRawApplicationProfile() with client = User3D. If that returns Unsupported, then fall back - /// to device-wide TurboSync settings read via GetRsFeatureGlobalSettings(). - /// - /// If Chill has updated, call IPlatform::QueryRawApplicationProfile() with client = Chill to re-read the - /// system app profiles and then with client = User3D for any per-user Chill overrides, and additionally - /// call GetRsFeatureGlobalSettings() to get the Chill enabled state. - /// - /// If Delag has updated, call IPlatform::QueryRawApplicationProfile() with client = User3D to get the enabled - /// state, and additionally call GetRsFeatureGlobalSettings() to get the Delag hotkey. - /// - /// - /// @param [in] rsFeatures Bitmask of RsFeatureType value(s) to query. Use UINT_MAX to poll all. - /// @param [out] pRsFeaturesChanged Bitmask of queried RsFeatureTypes that have changed since last polling. - /// - /// @returns Success if the call succeeded. - virtual Result DidRsFeatureSettingsChange( - uint32 rsFeatures, - uint32* pRsFeaturesChanged) = 0; - - /// Gets externally-controlled per-device settings for the requested RsFeatureType. - /// - /// @param [in] rsFeature Feature type to request information for (singular, not a mask). - /// @param [out] pRsFeatureInfo Settings related to the specified RsFeatureType. - /// - /// @returns Success if the call succeeded. 
- virtual Result GetRsFeatureGlobalSettings( - RsFeatureType rsFeature, - RsFeatureInfo* pRsFeatureInfo) = 0; - - /// Update Chill Status (last active time stamp). After every frame, UMD needs to generate a time stamp and inform - /// KMD through the shared memory, if the time stamp changes between 2 frames, it means Chill is active and KMD - /// needs to adjust power through PSM. - /// - /// @param [in] lastChillActiveTimeStampUs the last Chill active time stamp in microseconds to set - /// - /// @returns Success if the call succeeded. - virtual Result UpdateChillStatus( - uint64 lastChillActiveTimeStampUs) = 0; - - /// Make the Bus Addressable allocations available to be accessed by remote device. - /// Exposes the surface and marker bus addresses for each allocation. These bus addresses can be accessed by - /// calling @ref IGpuMemory::Desc() on the appropriate object. - /// Client drivers must call @ref AddGpuMemoryReferences() for all relevant allocations before calling this. - /// - /// @param [in] pQueue Queue used by PAL for performing this operation. - /// @param [in] gpuMemCount Number of GPU memory allocations to expose to remote devices. - /// @param [in] ppGpuMemList Array of gpuMemCount IGpuMemory objects. - /// - /// @returns Success if bus addresses are available by calling @ref IGpuMemory::Desc() on all IGpuMemory objects - virtual Result InitBusAddressableGpuMemory( - IQueue* pQueue, - uint32 gpuMemCount, - IGpuMemory*const* ppGpuMemList) = 0; - - /// Create virtual display. Virtual display is similar to the regular display (IScreen), the difference is the - /// virtual display doesn't have a physical monitor connected. When CreateVirtualDisplay is called, KMD will - /// generate a hot-plug-in event to notify application a new display is added. Then the app/client will call PAL to - /// re-querythe attached screens and they will find a new one in the list that is pretend, but they can use it just - /// like a normal display. 
- /// - /// @param [in] virtualDisplayInfo Virtual display creation infomation. - /// @param [out] pScreenTargetId The screen target ID returned by KMD - /// - /// @returns Success if the call succeeded. - virtual Result CreateVirtualDisplay( - const VirtualDisplayInfo& virtualDisplayInfo, - uint32* pScreenTargetId) = 0; - - /// Destroy virtual display. When DestroyVirtualDisplay is called KMD will generate a hot-plug-out event to notify - /// application a virtual display is removed, it also will be removed from the display list, and app/client can't - /// uses it anymore. - /// - /// @param [in] screenTargetId Screen target ID. - /// - /// @returns Success if the call succeeded. - virtual Result DestroyVirtualDisplay( - uint32 screenTargetId) = 0; - - /// Query virtual display Properties from screen target Id. - /// - /// @param [in] screenTargetId Screen target ID. - /// @param [out] pVirtualDisplayProperties A pointer to VirtualDisplayProperties - /// - /// @returns Success if the call succeeded. - virtual Result GetVirtualDisplayProperties( - uint32 screenTargetId, - VirtualDisplayProperties* pProperties) = 0; - - /// Determines if hardware accelerated stereo rendering can be enabled for given graphic pipeline. - /// If hardware accelerate stereo rendering can be enabled, client doesn't need to do shader patching - /// which includes translating view id intrinsic to user data slot, outputing render target - /// array index and viewport array index in shader closest to scan converter. - /// - /// @param [in] viewInstancingInfo Graphic pipeline view instancing information. - /// - /// @returns True if hardware accelerated stereo rendering can be enabled, False otherwise. - virtual bool DetermineHwStereoRenderingSupported( - const GraphicPipelineViewInstancingInfo& viewInstancingInfo) const = 0; - - /// Get the UTF-8 file path used to put all files for cache purpose. 
- /// On Windows, the returned path depends on Util::GetProcessIntegrityLevel and Util::IsProcessInAppContainer. - /// - /// @returns Pointer to cache file path. - virtual const char* GetCacheFilePath() const = 0; - - /// Get the UTF-8 file path used to put all files for debug purpose (such as logs, dumps, replace shader). - /// - /// @returns Pointer to debug file path. - virtual const char* GetDebugFilePath() const = 0; - - /// Queries the base driver Radeon Software Version string (as shown in Radeon Settings). - /// - /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written. - /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). - /// - /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors - /// may be returned: - /// + Unsupported if this function is not available on this environment. - /// + NotFound if the Radeon Software Version string is not present. - /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. - virtual Result QueryRadeonSoftwareVersion( - char* pBuffer, - size_t bufferLength) const = 0; - - /// Queries the base Driver Release Version string. - /// - /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written. - /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). - /// - /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors - /// may be returned: - /// + Unsupported if this function is not available on this environment. - /// + NotFound if the Release Version string is not present. - /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. - virtual Result QueryReleaseVersion( - char* pBuffer, - size_t bufferLength) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. 
- /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -#if defined(__unix__) - /// Gets the modifier list and modifier count according to image format. - /// - /// @param [in] format Image format. - /// @param [out] pModifierCount Modifier count. - /// @param [out] pModifiersList A pointer to the modifier list. - virtual void GetModifiersList( - ChNumFormat format, - uint32* pModifierCount, - uint64* pModifiersList) const = 0; -#endif - - /// Passes the HIP runtime state setup to KMD on behalf of the HIP runtime. - /// - /// @param [in] runtimeState A structure containing runtime state information to pass to KMD - /// - /// @returns Result for error handling. - virtual Result RegisterHipRuntimeState(const HipRuntimeSetup& runtimeState) const = 0; - - /// Sets the second-level trap handler for HIP - /// - /// @param [in] pTrapHandlerCode A pointer to the piece of memory containing the trap handler code - /// This may be nullptr, which indicates that there is no secondary trap handler. - /// @param [in] codeOffset An offset, in bytes, into the pTrapHandlerCode's memory region - /// @param [in] pTrapHandlerMemory A pointer to the piece of memory containing the trap handler's memory - /// This may be nullptr, which indicates that there is no valid trap handler - /// memory. - /// @param [in] memoryOffset An offset, in bytes, into the pTrapHandlerMemory's memory region - /// - /// @returns Result for error handling. 
- virtual Result SetHipTrapHandler( - const IGpuMemory* pTrapHandlerCode, - gpusize codeOffset, - const IGpuMemory* pTrapHandlerMemory, - gpusize memoryOffset) const = 0; - - /// Sets the CmdDisassembly::ICmdBufferReporting associated with this device - /// Currently only one can be registered at a time - but this could be changed in the future - /// - /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting being - /// registered - /// @return Result::Success if successful - /// Result::Unsupported if feature is not supported - /// Result::AlreadyExists if there is already a CmdDisassembly::ICmdBufferReporting - /// registered with this devide - /// Result::ErrorInvalidValue if pInterface == nullptr - /// - virtual Result RegisterCmdReportingInterface( - CmdDisassembly::ICmdBufferReporting* pInterface) - { - return Result::Unsupported; - } - - /// Resets the CmdDisassembly::ICmdBufferReporting associated with this device to nullptr - /// Currently only one can be registered at a time - but this could be changed in the future - /// - /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting that - /// was registered - /// @return Result::Success if successful - /// Result::Unsupported if feature is not supported - /// Result::ErrorInvalidValue if pInterface == nullptr or pInterface was not the last - /// CmdDisassembly::ICmdBufferReporting registered - /// - virtual Result UnregisterCmdReportingInterface( - CmdDisassembly::ICmdBufferReporting* pInterface) - { - return Result::Unsupported; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IDevice() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Device objects will be destroyed when - /// @ref IPlatform::Destroy() is called. 
- virtual ~IDevice() { } - - DeviceInterfacePfnTable m_pfnTable; ///< SRD function pointer table. - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -/** - *********************************************************************************************************************** - * @defgroup ResourceBinding Resource Binding Model - * - * _Resource binding_ refers to the process of binding resources (textures, UAVs, samplers, etc.) for access by shaders - * in a pipeline. This is an area where 3D APIs diverge significantly. PAL's model is designed to minimally abstract - * the underlying hardware in a way that allows performant implementations by each client driver. - * - * ### Hardware User Data - * GCN hardware has 16 _user data_ registers that act as a generic interface for passing values from a command buffer to - * a shader. User data registers are set to their desired value via packets in a command buffer, then the specified - * values are loaded from the user data registers into shader GPRs when a wave is launched. Since the user data is just - * arbitrary generic data, this method can be used to pass any type of data a client may want to specify directly from - * a command buffer, for example: - * - * - __Constant value__ - a 32-bit floating point or integer constant could be written into user data then be used - * directly by the shader. - * - __Shader resource descriptor (SRD)__ - 4 or 8 dwords of consecutive data could be an SRD which will be used as a - * t#, s#, etc. by the shader. - * - __Pointer__ - The user data could be an arbitrary GPU virtual address where a table of constants, SRDs, etc. are - * stored. 
- * - * The 3D driver and shader compiler are responsible for working together to define how resources referenced in a shader - * should be mapped to user data bound in a command buffer. - * - * ### PAL User Data - * PAL only lightly abstracts the hardware user data concept. DeviceProperties reports the number of user data entries - * supported on the device in maxUserDataEntries. Note that some clients may require more user data entries than there - * are physical user data registers - PAL will manage "spilling" of user data entries to GPU memory if necessary. - * - * User data entries are set in a command buffer by calling ICmdBuffer::CmdSetUserData(). - * - * ### Shader User Data Mapping - * When creating a pipeline, the client must specify how the user data entries set in a command buffer map to resources - * referenced by each shader in the pipeline. This is done in the pUserDataNodes array of PipelineShaderInfo. - * - * The resource mapping is built as a graph of _resource mapping nodes_ where the root nodes in the graph correspond - * to the user data entries. Each node fits in one of the following categories: - * - * - __SRD__: A 4 or 8 dword descriptor describing a shader resource. The mapping specifies the type and slot the SRD - * corresponds to (e.g., UAV 3 or sampler 7). - * - __Descriptor table pointer__: A GPU virtual address pointing at an array of other nodes. Typically this will be a - * pointer to GPU memory containing just SRDs, but tables are free to be built hierarchically such that tables have - * pointers to other tables in them. - * - __Inline constants__: 32-bit constants loaded directly byu the shader. The mapping specified the CB slot that - * should load the constant (e.g., cb3[1]). - * - __Unused__: A particular shader may not use all entries in a user data layout, and those should be marked unused. 
- * - * The following image illustrates a simple user data mapping: - * - * @image html userDataMapping.png - * - * ### Building Descriptor Tables - * The client is responsible for building specifying SRDs and pointers to GPU memory in order to execute the shader - * resource mapping specified during pipeline creation. SRDs can be created with several methods provided by IDevice: - * - * - CreateTypedBufferViewSrds() - * - CreateUntypedBufferViewSrds() - * - CreateImageViewSrds() - * - CreateFmaskViewSrds() - * - CreateSamplerSrds() - * - * The size required for each of these SRD types is returned in the srdSizes structure in DeviceProperties. - * - * When building descriptor tables in GPU memory, the client will need to retrieve a virtual address of the GPU memory - * where the tables exist in order to reference them from user data or from other descriptor tables. IGpuMemory - * provides the GetVirtAddr() method for this purpose. - *********************************************************************************************************************** - */ - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDevice.h + * @brief Defines the Platform Abstraction Library (PAL) IDevice interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palCmdAllocator.h" +#include "palDestroyable.h" +#include "palFence.h" +#include "palFile.h" +#include "palGpuMemory.h" +#include "palImage.h" +#include "palInlineFuncs.h" +#include "palLib.h" +#include "palPerfExperiment.h" +#include "palPipeline.h" +#include "palQueue.h" +#include + +#if PAL_KMT_BUILD +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Util +{ + class Event; +} + +namespace Pal +{ + +// Forward declarations. 
+class IBorderColorPalette; +class ICmdAllocator; +class ICmdBuffer; +class IColorBlendState; +class IColorTargetView; +class IDepthStencilState; +class IDepthStencilView; +class IDevice; +class IFence; +class IGpuEvent; +class IGpuMemory; +class IImage; +class IIndirectCmdGenerator; +class IMsaaState; +class IPerfExperiment; +class IPipeline; +class IPrivateScreen; +class IQueryPool; +class IQueue; +class IQueueSemaphore; +class IShaderLibrary; +class ISwapChain; +struct BorderColorPaletteCreateInfo; +struct CmdAllocatorCreateInfo; +struct CmdBufferCreateInfo; +struct ColorBlendStateCreateInfo; +struct ColorTargetViewCreateInfo; +struct ComputePipelineCreateInfo; +struct DepthStencilStateCreateInfo; +struct DepthStencilViewCreateInfo; +struct ExternalImageOpenInfo; +struct ExternalGpuMemoryOpenInfo; +struct ExternalQueueSemaphoreOpenInfo; +struct ExternalResourceOpenInfo; +struct GpuEventCreateInfo; +struct GpuMemoryCreateInfo; +struct GpuMemoryOpenInfo; +struct GpuMemoryRef; +struct GraphicsPipelineCreateInfo; +struct ImageCreateInfo; +struct IndirectCmdGeneratorCreateInfo; +struct MsaaStateCreateInfo; +struct MsaaQuadSamplePattern; +struct PeerGpuMemoryOpenInfo; +struct PeerImageOpenInfo; +struct PerfExperimentCreateInfo; +struct PinnedGpuMemoryCreateInfo; +struct PresentableImageCreateInfo; +struct PrivateScreenCreateInfo; +struct PrivateScreenNotifyInfo; +struct QueryPoolCreateInfo; +struct QueueCreateInfo; +struct QueueSemaphoreCreateInfo; +struct QueueSemaphoreOpenInfo; +struct ShaderLibraryCreateInfo; +struct SwapChainCreateInfo; +struct SwapChainProperties; +struct SvmGpuMemoryCreateInfo; +struct GraphicPipelineViewInstancingInfo; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 881 +enum class WsiPlatform : uint32; +#else +enum WsiPlatform : uint32; +#endif +enum class PipelineBindPoint : uint32; +enum class VaRange : uint32; +enum class SwapChainMode : uint32; +namespace CmdDisassembly +{ + class ICmdBufferReporting; +} + +/// Maximum string length for GPU 
names. @see DeviceProperties. +constexpr uint32 MaxDeviceName = 256; + +/// Maximum number of indirect user-data tables managed by PAL's command buffer objects. @see DeviceFinalizeInfo. +constexpr uint32 MaxIndirectUserDataTables = 1; + +/// Maximum number of supported entries in the MSAA sample pattern palette. See IDevice::SetSamplePatternPalette(). +constexpr uint32 MaxSamplePatternPaletteEntries = 16; + +/// Maximum number of supported units in the gpu. These can be much larger than the actual values, but useful for arrays. +constexpr uint32 MaxShaderEngines = 32; +/// Maximum number of supported subunits each Shader Engine splits into (SH or SA, depending on generation) +constexpr uint32 MaxShaderArraysPerSe = 2; + +/// Size of the Active Pixel Packer Mask in DWORDs +constexpr uint32 ActivePixelPackerMaskDwords = 4; + +/// Maximum number of pixel packers per SE expected by PAL +constexpr uint32 MaxPixelPackerPerSe = 4; + +/// Defines host flags for Semaphore/Fence Array wait +enum HostWaitFlags : uint32 +{ + HostWaitAny = 0x1, ///< if this bit is set, return after any single semaphore/fence in the array has + /// completed. if not set, wait for completion of all semaphores/fences in the + /// array before returning. +}; + +/// Specifies what type of GPU a particular IDevice is (i.e., discrete vs. integrated). +enum class GpuType : uint32 +{ + Unknown = 0x0, ///< The GPU type can't be determined and is unknown. + Integrated = 0x1, ///< Integrated GPU (i.e., APU). + Discrete = 0x2, ///< Discrete GPU. + Virtual = 0x3, ///< Virtualized GPU. + Count +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Specifies which operating-system-support IP level (OSSIP) this device has. +enum class OssIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an OSSIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro.
Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have an OSSIP block, or its level cannot be determined +#endif + + OssIp2_4 = 0x3, + OssIp4 = 0x4, +}; + +/// Specifies which VCE IP level this device has. +enum class VceIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an VCEIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have an VCEIP block, or its level cannot be determined +#endif + + VceIp1 = 0x1, + VceIp2 = 0x2, + VceIp3 = 0x3, + VceIp3_1 = 0x4, + VceIp3_4 = 0x5, + VceIp4 = 0x6, +}; + +/// Specifies which UVD IP level this device has. +enum class UvdIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an UVDIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have an UVDIP block, or its level cannot be determined +#endif + + UvdIp3_2 = 0x1, + UvdIp4 = 0x2, + UvdIp4_2 = 0x2, + UvdIp5 = 0x3, + UvdIp6 = 0x4, + UvdIp6_2 = 0x5, + UvdIp6_3 = 0x6, + UvdIp7 = 0x7, + UvdIp7_2 = 0x8, +}; +#endif + +/// Specifies which VCN IP level this device has. +enum class VcnIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an VCNIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. 
+#ifndef None + None = _None, ///< The device does not have an VCNIP block, or its level cannot be determined +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + VcnIp1 = 0x1, +#endif +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 +/// Specifies which SPU IP level this device has. +enum class SpuIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have an SPUIP block, or its level cannot be determined +#ifndef None + None = _None, ///< The device does not have an SPUIP block, or its level cannot be determined +#endif + SpuIp = 0x1, +}; +#endif + +/// Specifies which PSP IP level this device has. +enum class PspIpLevel : uint32 +{ + _None = 0x0, ///< @internal The device does not have a PSPIP block, or its level cannot be determined +#ifndef None + None = _None, ///< The device does not have a PSPIP block, or its level cannot be determined +#endif + PspIp10 = 0x1, +}; + +/// Specifies the video decode type +enum class VideoDecodeType : uint32 +{ + H264 = 0x0, ///< H264 VLD + Vc1 = 0x1, ///< VC1 VLD + Mpeg2Idct = 0x2, ///< Partial MPEG2 decode (IT+MP) + Mpeg2Vld = 0x3, ///< Full MPEG2 decode (RE+IT+MP+DB) + Mpeg4 = 0x4, ///< MPEG4 + Wmv9 = 0x5, ///< WMV9 IDCT + Mjpeg = 0x6, ///< Motion JPEG + Hevc = 0x7, ///< HEVC + Vp9 = 0x8, ///< VP9 + Hevc10Bit = 0x9, ///< HEVC 10bit + Vp910Bit = 0xa, ///< VP9 10bit + Av1 = 0xb, ///< AV1 8/10bit + Av112Bit = 0xc, ///< AV1 12bit + Count, +}; + +/// Video CODEC to use for encoding +enum class VideoEncodeCodec : uint32 +{ + H264 = 0x0, ///< H.264 + H265 = 0x1, ///< H.265 + Av1 = 0x2, ///< AV1 + Count +}; + +/// Specifies a virtual address range memory should be allocated in. +enum class VaRange : uint32 +{ + Default, ///< Default VA range. Choose this for most allocations. + DescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for descriptor tables.
Knowing + /// an allocation is allocated in this range, only one user data entry is required to + /// specify a descriptor table. @see ResourceMappingNodeType. + ShadowDescriptorTable, ///< Place the allocation in a 4GB VA range reserved by PAL for "shadow" descriptor tables. + /// A shadow descriptor table is an additional table with the same layout as its parent + /// descriptor table that can hold infrequently needed data like fmask SRDs or UAV counter + /// data. This scheme allows the client and SC to work out a known location for + /// infrequently needed data without wasting a user data entry or wasting half of every + /// descriptor cache line. + /// Only supported if DeviceProperties::gpuMemoryProperties::flags::shadowDescVaSupport is + /// set. + Svm, ///< Place the allocation in a VA range reserved by PAL for shared virtual memory(SVM). + /// This is a GPU VA range that is reserved also on the CPU-side. + /// The size of reserved VA is set by PAL client by calling CreatePlatform. + CaptureReplay, ///< Place the allocation in a VA range reserved for capture and playback. + Count, +}; + +/// Enumerates tmz(trusted memory zone) support level. +enum class TmzSupportLevel : uint32 +{ + None = 0, ///< TMZ not supported. + PerQueue = 1, ///< Enable TMZ mode per queue. + PerSubmission = 2, ///< Enable TMZ mode per submission. + PerCommandOp = 3 ///< Enable TMZ mode per command operation. +}; + +/// How to interpret a single bit in a swizzle equation. +union SwizzleEquationBit +{ + struct + { + uint8 valid : 1; ///< Indicates whether this channel setting is valid. + uint8 channel : 2; ///< 0 for x channel, 1 for y channel, 2 for z channel. + uint8 index : 5; ///< The channel index. + }; + uint8 u8All; ///< The above values packed in an 8-bit uint. +}; + +constexpr uint32 SwizzleEquationMaxBits = 20; ///< Swizzle equations will consider no more than this many bits. 
+constexpr uint8 InvalidSwizzleEqIndex = 0xFF; ///< Indicates an invalid swizzle equation index in the equation table. +constexpr uint8 LinearSwizzleEqIndex = 0xFE; ///< An invalid eq. index indicating a row-major, linear memory layout. + +/// Texture fetch meta-data capabilities bitfield definition, used with tcCompatibleMetaData setting +enum TexFetchMetaDataCaps : uint32 +{ + TexFetchMetaDataCapsNoAaColor = 0x00000001, + TexFetchMetaDataCapsMsaaColor = 0x00000002, + TexFetchMetaDataCapsFmask = 0x00000004, + TexFetchMetaDataCapsNoAaDepth = 0x00000008, + TexFetchMetaDataCapsMsaaDepth = 0x00000010, + TexFetchMetaDataCapsAllowStencil = 0x00000020, + TexFetchMetaDataCapsAllowZ16 = 0x00000040, +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 +/// Catalyst AI setting enums +enum CatalystAiSettings : uint32 +{ + CatalystAiDisable = 0, + CatalystAiEnable = 1, + CatalystAiMaximum = 2, +}; +#endif + +/// Texture Filter optimization enum values +enum TextureFilterOptimizationSettings : uint32 +{ + TextureFilterOptimizationsDisabled = 0, + TextureFilterOptimizationsEnabled = 1, + TextureFilterOptimizationsAggressive = 2, +}; + +/// Distribution Tess Mode enum values +enum DistributionTessMode : uint32 +{ + DistributionTessOff = 0, + DistributionTessDefault = 1, + DistributionTessPatch = 2, + DistributionTessDonut = 3, + DistributionTessTrapezoid = 4, + DistributionTessTrapezoidOnly = 5, +}; + +/// Defines the context roll optimization flags +enum ContextRollOptimizationFlags : uint32 +{ + OptFlagNone = 0x00000000, + PadParamCacheSpace = 0x00000001, +}; + +/// Defines the initial value to use for DCC metadata +enum class DccInitialClearKind { + Uncompressed = 0x0, + OpaqueBlack = 0x1, + OpaqueWhite = 0x2, + ForceBit = 0x10, + ForceOpaqueBlack = (ForceBit | OpaqueBlack), + ForceOpaqueWhite = (ForceBit | OpaqueWhite), +}; + +/// Enum defining the different scopes (i.e. 
registry locations) where settings values are stored +enum InternalSettingScope : uint32 +{ + PrivateDriverKey = 0x0, + PublicPalKey = 0x1, + PrivatePalKey = 0x2, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + PublicCatalystKey = 0x3, + PrivatePalGfx9Key = 0x4, + PrivatePalGfx12Key = 0x5, +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + PrivatePalGfx6Key = 0x3, + PrivatePalGfx9Key = 0x4, + PublicCatalystKey = 0x5, + PrivatePalGfx12Key = 0x7, +#endif + PublicPalFile = 0x9, + RootDriverKey = 0xa, +}; + +/// Enum defining override states for feature settings. +enum class FeatureOverride : uint32 +{ + Default = 0, ///< Default setting state. + Enabled = 1, ///< (Force) enabled state. Default may change itself to this state. + Disabled = 2 ///< (Force) disabled state. Default may change itself to this state. +}; + +/// Enum bitmask defining externally-controlled (e.g. by Radeon Settings/KMD) driver feature settings. +enum RsFeatureType : uint32 +{ + RsFeatureTypeTurboSync = (1u << 0), + RsFeatureTypeChill = (1u << 1), + RsFeatureTypeDelag = (1u << 2), + RsFeatureTypeBoost = (1u << 4), + RsFeatureTypeProVsr = (1u << 5), +}; + +/// Output structure containing information about the requested RsFeatureType (singular). +union RsFeatureInfo +{ + /// Global TurboSync settings. + struct + { + bool enabled; ///< Specifies whether TurboSync is enabled globally. + } turboSync; + + /// Global Chill settings. + struct + { + bool enabled; ///< Specifies whether Chill is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Chill. + uint32 minFps; ///< Specifies the global Chill minimum FPS limit. + uint32 maxFps; ///< Specifies the global Chill maximum FPS limit. + } chill; + + /// Global Delag settings. + struct + { + bool enabled; ///< Specifies whether Delag is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Delag. 
+ uint32 hotkeyInd;///< If nonzero, specifies the virtual key code assigned to Delag's indicator. + uint32 limitFps; ///< Specifies the global Delag FPS limit. + uint32 level; ///< Specifies the global Delag level. + } delag; + + /// Global Boost settings. + struct + { + bool enabled; ///< Specifies whether Boost is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to Boost. + uint32 hotkeyInd; ///< If nonzero, specifies the virtual key code assigned to Boost's indicator. + uint32 minRes; ///< Specifies the global Boost minimum resolution. + bool adaptiveVrsEnabled; ///< Specifies whether BoostAdaptiveVrs is enabled globally. + } boost; + + /// Global ProVsr settings. + struct + { + bool enabled; ///< Specifies whether ProVsr is enabled globally. + uint32 hotkey; ///< If nonzero, specifies the virtual key code assigned to ProVsr. + } proVsr; + +}; + +/// High-dynamic range (HDR) surface display modes. Used to indicate the HDR display standard for a particular swap +/// chain texture format and screen colorspace/transfer function combination. +enum class HdrDisplayMode : uint32 +{ + Sdr = 0, ///< Standard dynamic range; non-HDR compatible (default). + Hdr10 = 1, ///< HDR10 PQ. Requires 10:10:10:2 swap chain. + ScRgb = 2, ///< scRGB HDR (Microsoft and FreeSync2 linear mode). 1.0 = 80 nits, 125.0 = 10000 nits. + /// Requires FP16 swapchain. + FreeSync2 = 3, ///< FreeSync2 HDR10 Gamma 2.2. Requires 10:10:10:2 swap chain. +}; + +static constexpr uint32 MaxMiscStrLen = 61; + +/// Whether to use graphics or compute for performing fast clears on depth stencil views. 
+enum class FastDepthStencilClearMode : uint8 +{ + Default, ///< Compute or graphics will be chosen at the driver's discretion + Graphics, ///< Graphics will always be used + Compute ///< Compute will always be used +}; + +enum DeferredBatchBinMode : uint32 +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 875 + DeferredBatchBinCustom = 0, + DeferredBatchBinAccurate = 1 +#else + DeferredBatchBinDisabled = 0, + DeferredBatchBinCustom = 1, + DeferredBatchBinAccurate = 2 +#endif +}; + +/// PWS enable mode: e.g. disabled, fully enabled or partially enabled. +enum class PwsMode : uint32 +{ + Disabled = 0, ///< PWS feature is disabled + Enabled = 1, ///< PWS feature is fully enabled if HW supports it. + NoLateAcquirePoint = 2 ///< PWS feature is enabled with PWS counter only if HW supports it; no late acquire points. +}; + +#if defined(__unix__) +/// Defines the supported VM always valid modes. +enum VmAlwaysValidEnable : uint32 +{ + VmAlwaysValidDefaultEnable = 0, ///< Only enable the optimization when the kernel driver can support it + VmAlwaysValidForceDisable = 1, ///< Force the optimization always disabled + VmAlwaysValidForceEnable = 2 ///< Force the optimization always enabled +}; +#endif + +/// Alignment mode for accessing graphics/compute buffers. +/// This doesn't change the alignment of the whole buffer, just the alignment at which accesses must be made. +enum class BufferAlignmentMode : uint8 +{ + Default, ///< KMD (and therefore PAL) picks alignment requirement. Client should be prepared for anything. + Dword, ///< Hardware will automatically align requests to the smaller of: element-size or DWORD. + Unaligned ///< Any request alignment is allowed. +}; + +enum TemporalHintsMrtBehavior : uint8 +{ + TemporalHintsDynamicRt = 0x0, ///< Enable Dynamic RT Temporal hints. PAL chooses NT vs RT based on heuristics. + TemporalHintsStaticRt = 0x1, ///< Regular temporal for both near and far read/write caches.
+ TemporalHintsStaticNt = 0x2, ///< Non-temporal (re-use not expected) for both near and far read/write caches. +}; + +/// Client-controllable behavior for the Gfx12-specific software workaround to the HiSZ hardware bug. +/// Allows the client to override PAL defaults for performance or profiled reasons. +enum class HiSZWorkaroundBehavior : uint8 +{ + Default = 0x0, ///< Let PAL decide what the default is. + ForceDisableAllWar, ///< Force disable all workarounds. + /// Note: This should rarely be chosen and carries an extremely high risk of issues. + /// Should only be used when the application has been profiled to guarantee no + /// risk of issues. + ForceHiSZDisableBasedWar, ///< Force the disable HiZ/S based workaround behavior. + ForceHiSZEventBasedWar, ///< Force the event-after-draw workaround behavior. + /// Note: This carries a risk that the hang may still be seen. + ForceHiSZDisableBaseWarWithReZ ///< Force the disable HiZ/S based workaround behavior, but with an added + /// optimization to force ZOrder mode to EarlyZThenReZ to reclaim some performance. + /// This will apply to all graphics pipelines that trigger the workaround + /// condition, except those pipelines that have set + /// GraphicsPipelineCreateInfo::noForceReZ. This ZOrder change will occur only when + /// HiZ/S is forcibly disabled by the workaround; otherwise the ZOrder remains what + /// was chosen by the compiler. +}; + +/// PAL settings that are client visible and editable. +struct PalPublicSettings +{ + /// Maximum border color palette size supported by any queue. + uint32 borderColorPaletteSizeLimit; + + /// Whether to use graphics or compute for performing fast clears on depth stencil views. + FastDepthStencilClearMode fastDepthStencilClearMode; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Forces all serialized loads (LoadPipeline or LoadCompoundState) to fail.
+ bool forceLoadObjectFailure; +#endif + + /// Controls the distribution mode for tessellation, which affects how patches are processed by different VGT + /// units. 0: None - No distribution across VGTs (legacy mode). 1: Default - Optimal settings are chosen depending + /// on the gfxip. 2: Patch - Individual patches are distributed to different VGTs. 3: Donut - Patches are split + /// into donuts and distributed to different VGTs. 4: Trapezoid - Patches from donuts are split into trapezoids and + /// distributed to different VGTs. Falls back to donut mode if HW does not support this mode. 5: Trapezoid only - + /// Distribution turned off if HW does not support this mode. + uint32 distributionTessMode; + + /// Flags that control PAL optimizations to reduce context rolls. 0: Optimization disabled. 1: Pad parameter cache + /// space. Sets VS export count and PS interpolant number to per-command buffer maximum value. Reduces context rolls + /// at the expense of parameter cache space. + uint32 contextRollOptimizationFlags; + + /// The number of unbound descriptor debug srds to allocate. To detect reads of unbound descriptors within arrays, + /// multiple debug srds can be allocated. + uint32 unboundDescriptorDebugSrdCount; + + /// Disables compilation of internal PAL shaders. It can be enabled only if a PAL client won't use any of PAL's blit + /// functionality on gfx/compute engines. + bool disableResourceProcessingManager; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Controls app detect and image quality altering optimizations exposed by CCC. + uint32 catalystAI; +#endif + + /// Controls texture filtering optimizations exposed by CCC. + uint32 textureOptLevel; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Disables SC initialization. It can be enabled only if a PAL client won't use SC for shader compilation and + /// provides direct ISA binaries (usually the AQL path).
+ bool disableScManager; +#endif + + /// Information about the client performing the rendering. For example: Rendered By PAL (0.0.1) + char renderedByString[MaxMiscStrLen]; + + /// Debug information that the client or tester might want reported. + char miscellaneousDebugString[MaxMiscStrLen]; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + /// Allows SC to make optimizations at the expense of IEEE compliance. + bool allowNonIeeeOperations; + + /// Controls whether shaders should execute one atomic instruction per wave for UAV append/consume operations. + /// If false, one atomic will be executed per thread. + bool appendBufPerWaveAtomic; +#endif + + /// Bitmask of cases where texture compatible meta data will be used. Single-sample color surface: 0x00000001, MSAA + /// color surface: 0x00000002, FMask data: 0x00000004, Single-sample depth surface: 0x00000008, MSAA depth surface: + /// 0x00000010, Allow stencil: 0x00000020, Allow Z-16 surfs: 0x00000040. + uint32 tcCompatibleMetaData; + + /// Specifies the threshold below which CmdCopyMemory() is executed via a CpDma BLT, in bytes. CPDMA copies have + /// lower overhead than CS/Gfx copies, but less throughput for large copies. + uint32 cpDmaCmdCopyMemoryMaxBytes; + + /// Forces high performance state for allocated queues. Note: currently supported in Windows only. + bool forceHighClocks; + + /// When submitting multiple command buffers in a single grQueueSubmit call, the ICD will patch the command streams + /// so that the command buffers are chained together instead of submitting through KMD multiple times. This setting + /// limits the number of command buffers that will be chained together; reduce to prevent problems due to long + /// running submits. + uint32 cmdBufBatchedSubmitChainLimit; + + /// Flags that control PAL's command allocator residency optimizations. If a command allocation isn't optimized PAL + /// will wait for it to become resident at creation.
0x1 - Wait for command data to become resident at Submit-time. + /// 0x2 - Wait for embedded data to become resident at Submit-time. 0x4 - Wait for marker data to become resident at + /// Submit-time. + uint32 cmdAllocResidency; + + /// Overrides max queued frames allowed + uint32 maxQueuedFrames; + + /// Maximum number of presentable images per adapter (including LDA chain) which is recommended. If the app exceeds the + /// presentable image number threshold, a warning may be reported. + uint32 presentableImageNumberThreshold; + + /// Provides a hint to PAL that the client knows that each individual depth stencil surface is always cleared with + /// the same values. If TRUE, per-tile tracking of exp/clear will be enabled (requires HTile). + bool hintInvariantDepthStencilClearValues; + + /// Provides a hint to PAL that PAL should disable color compression on surfaces that are smaller than or equal to + /// this setting (setting * setting) in size. + uint32 hintDisableSmallSurfColorCompressionSize; + + /// Disables the Escape call to KMD. This is a temporary setting for experimentation that is expected to break features + /// that currently need the Escape call. + bool disableEscapeCall; + + /// A hint to the Windows OS that this application has submissions that are expected to run for a long time. This + /// tells the OS that checking elapsed execution time is not a good way to judge if the GPU is hung. + /// + /// If this flag is set, Windows shouldn't TDR long submissions on uncontested hardware queues. Submissions on + /// contested queues must preempt within the time limit to avoid a TDR. Setting this may also extend the TDR timer + /// on compute queues. + bool longRunningSubmissions; + + /// Disables MCBP on demand. This is a temporary setting until the ATOMIC_MEM packet issue with MCBP is resolved. + bool disableCommandBufferPreemption; + + /// Disable the fast clear eliminate skipping optimization.
This optimization will conservatively track the usage + /// of clear values to allow the vast majority of images that never clear to a value that isn't TC-compatible to + /// skip the CPU and front-end GPU overhead of issuing a predicated fast clear eliminate BLT. + bool disableSkipFceOptimization; + + /// Sets the minimum BPP of surfaces which will have DCC enabled + uint32 dccBitsPerPixelThreshold; + + /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have + /// PAL automatically pad allocation starting virtual address alignments to enable this optimization. By + /// default, PAL will use the KMD-reported limit. + gpusize largePageMinSizeForVaAlignmentInBytes; + + /// See largePageSizeInBytes in DeviceProperties. This limit defines how large an allocation must be to have + /// PAL automatically pad allocation sizes to fill an integral number of large pages. By default, PAL will + /// use the KMD-reported limit. + gpusize largePageMinSizeForSizeAlignmentInBytes; + + /// Makes the unbound descriptor debug srd 0 so the hardware drops the load and ignores it instead of pagefaulting. + /// Used to work around incorrect app behavior. + bool zeroUnboundDescDebugSrd; + + /// Preferred heap for uploading client pipelines. Default is set to @ref GpuHeap::GpuHeapInvisible. Setting is + /// ignored for internal pipelines, which are uploaded to @ref GpuHeap::GpuHeapLocal. + GpuHeap pipelinePreferredHeap; + + bool depthClampBasedOnZExport; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 + /// Force the PreColorTarget to an earlier PreRasterization point if used as a wait point. This is to prevent a + /// write-after-read hazard for a corner case: shader exports from distinct packers are not ordered. Advancing + /// wait point from PreColorTarget to PostPrefetch could cause over-sync due to extra VS/PS_PARTIAL_FLUSH + /// inserted. It defaults to false, but client drivers may choose to app-detect and enable it if they see corruption.
+ bool forceWaitPointPreColorToPostPrefetch; +#endif + + /// Allows the client to disable debug overlay visual confirm after DebugOverlay::Platform is created when the + /// panel setting DebugOverlayEnabled is globally set but a certain application might need to turn off visual + /// confirm to make the screen not too noisy. + bool disableDebugOverlayVisualConfirm; + + bool enableExecuteIndirectPacket; + + /// Offers flexibility to the client to choose Graphics vs Compute engine for Indirect Command Generation + /// (Shader path) based on performance and other factors. The default is false since we have seen perf gains using + /// the ACE. + bool disableExecuteIndirectAceOffload; + + /// Value to initialize metadata for DCC surfaces to, if they are compressible. This has no effect on non-DCC + /// images. Images whose initial layout is not compressible are only affected if this is "forced". + /// 0x00 - Uncompressed (default) + /// 0x01 - Opaque Black + /// 0x02 - Opaque White + /// 0x11 - Forced Opaque Black + /// 0x12 - Forced Opaque White + uint32 dccInitialClearKind; + + /// Allows the client to not create internal VrsImage. Pal internal will create a 16M image as vrsImageSize. + bool disableInternalVrsImage; + + /// Allows the client to control binning persistent and context states per bin. + /// A value of 0 tells PAL to pick the number of states per bin. + uint32 binningPersistentStatesPerBin; + uint32 binningContextStatesPerBin; + + /// This key controls whether binning will be disabled when the PS may kill pixels. + OverrideMode disableBinningPsKill; + + /// Controls GS LateAlloc val (for pos/prim allocations NOT param cache) on NGG pipelines. Can be no more than 127.
+ uint32 nggLateAllocGs; + + /// Bitmask of cases where RPM view memory accesses will bypass the MALL + /// RpmViewsBypassMallOff (0x0): Disable MALL bypass + /// RpmViewsBypassMallOnRead (0x1): Skip MALL for read access of views created in RPM + /// RpmViewsBypassMallOnWrite (0x2): Skip MALL for write access of views created in RPM + /// RpmViewsBypassMallOnCbDbWrite (0x4): Control the RPM CB/DB behavior + RpmViewsBypassMall rpmViewsBypassMall; + + /// Optimize color export format for depth only rendering. Only applicable for RB+ parts + bool optDepthOnlyExportRate; + + /// Controls whether or not we should expand Hi-Z to full range rather than doing fine-grain resummarize + /// operations. Expanding Hi-Z leaves the Hi-Z data in a less optimal state but is a much faster operation + /// than the fine-grain resummarize. + bool expandHiZRangeForResummarize; + + /// Controls whether command buffers emit SQTT marker events. Useful for a client driver to perform an SQTT + /// dump without the involvement of dev driver. + bool enableSqttMarkerEvent; + + /// Controls the value of CB_COLOR0_ATTRIB.LIMIT_COLOR_FETCH_TO_256B_MAX. This bit limits CB fetch to 256B on cache + /// miss, regardless of sector size. + bool limitCbFetch256B; + + /// Controls the deferred batch binning mode. Interface versions < 875: 0: Batch binning always disabled, 1: Use custom bin + /// sizes, 2: Optimal. Interface versions >= 875: 0: DeferredBatchBinCustom, 1: DeferredBatchBinAccurate. + DeferredBatchBinMode binningMode; + + /// Controls the custom batch bin size. Only used when binningMode is DeferredBatchBinCustom. High word is for x, low word is for + /// y. Default is 128x128. Values must be powers of two between 16 and 512. + uint32 customBatchBinSize; + + /// Maximum number of primitives per batch. The maximum value is 1024. + uint32 binningMaxPrimPerBatch; + + /// Controls PWS enable mode: disabled, fully enabled or partially enabled. Only takes effect if HW supports PWS.
+ PwsMode pwsMode; + + /// Controls the MaxScratchRingSizeBaseline, which is really just the maximum size of the scratch ring + gpusize maxScratchRingSizeBaseline; + + /// Controls the maximum size of the scratch ring allocation + uint32 maxScratchRingSizeScalePct; + +#if defined(__unix__) + /// Whether to enable the vm-always-valid feature on Linux when allocating a BO + VmAlwaysValidEnable enableVmAlwaysValid; +#endif + + /// Alignment mode for graphics and compute buffers. + /// Clients must check Pal::DeviceProperties::osProperties::flags::forceAlignmentSupported + /// to see if anything other than default will work. + BufferAlignmentMode hardwareBufferAlignmentMode; + + /// Disallows putting the shader ring in system memory, for performance purposes. + /// This is done by un-listing GpuHeapGartUswc as a possible heap for the shader rings, + /// allowing only GpuHeapInvisible and GpuHeapLocal instead. + bool forceShaderRingToVMem; + + /// If the client sets this to true they promise they've done exhaustive testing on every ASIC to prove that this + /// application can use AC01 fast clears safely. This should never be forced to true unconditionally. + bool ac01WaNotNeeded; + + /// Toggles whether or not image copies will prefer using the graphics pipeline. This setting does not force all + /// copies to use graphics or compute, it changes what method will be selected in cases where either could be used. + bool preferGraphicsImageCopy; + + /// Bitmask to control adding Waits around Flush events + /// This is a public setting that allows AppDetect to override the 'WaitOnFlush' setting. + /// Setting 'waitOnFlush' for actual workarounds is strongly discouraged because: + /// 1) It has a negative performance impact. + /// 2) Waits affect the timing and pipeline execution which can hide underlying hw/fw/sw bugs. + /// Issues resolved by added waits should be root caused.
+ uint32 waitOnFlush; + + /// Provides the ability for MALL to be alloc-ed/noalloc-ed using the dynamic or static behavior. + TemporalHintsMrtBehavior temporalHintsMrtBehavior; + + /// Allows the client to change the workaround behavior for the Gfx12 HW bug that impacts HiS and HiZ. + /// This should never be forced away from default unconditionally. + HiSZWorkaroundBehavior hiSZWorkaroundBehavior; + + /// For event-based HiSZ workarounds (Gfx12), specifies the timeouts supported by the Scan Converter and Depth Block + /// for their tile summarizer controller. + /// Ignored unless @ref hiSZWorkaroundBehavior is set to ForceHiSZEventBasedWar. + /// If set to 0, PAL picks a default value. + /// Note: Choosing any value other than 0 carries with it **significant risk** of hangs, as the timeout value + /// determined by PAL is the most optimal to avoid the most hangs. Any deviation from the default must be + /// thoroughly tested and is not guaranteed to be safe! + uint32 tileSummarizerTimeout; + +#if PAL_KMT_BUILD + /// If the client sets this to false, native fence will be disabled OS-wide regardless of whether the OS or hardware + /// supports native fence. + bool enableNativeFence; +#endif + + /// Whether HiZ/HiS is valid after depth/stencil image is copied as destination. + /// - If false, this is not guaranteed, PAL will issue necessary HiZ/HiS resummarization at later client barrier + /// call time when transition to another HiZ/HiS valid layout like DepthStencilTarget. + /// - If true, PAL does necessary HiZ/HiS fixup after copy if needed and HiZ/HiS will be always valid after copy. + /// At later client barrier call time, no HiZ/HiS resummarization will happen. + bool makeLayoutCopyDstHiSZCompatible; +}; + +/// Defines the modes that the GPU Profiling layer can use when its buffer fills.
+enum GpuProfilerStallMode : uint32 +{ + GpuProfilerStallAlways = 0, ///< Always stall to get accurate trace data + GpuProfilerStallLoseDetail = 1, ///< Lose register-level detail if under pressure to avoid stalls + GpuProfilerStallNever = 2, ///< Never stall, miss trace packets +}; + +/// Describes the equations needed to interpret the raw memory of a tiled texture. +struct SwizzleEquation +{ + SwizzleEquationBit addr[SwizzleEquationMaxBits]; ///< Address setting: each bit is the result of addr ^ xor1 ^ xor2. + SwizzleEquationBit xor1[SwizzleEquationMaxBits]; ///< xor1 setting. + SwizzleEquationBit xor2[SwizzleEquationMaxBits]; ///< xor2 setting. + uint32 numBits; ///< The number of bits in the equation. + bool stackedDepthSlices; ///< True if depth slices are treated as being stacked vertically + /// prior to swizzling. +}; + +/// Specifies the hardware features supported for PRT (sparse images). +enum PrtFeatureFlags : uint32 +{ + PrtFeatureBuffer = 0x00000001, ///< Indicates support for sparse buffers + PrtFeatureImage2D = 0x00000002, ///< Indicates support for sparse 2D images + PrtFeatureImage3D = 0x00000004, ///< Indicates support for sparse 3D images + PrtFeatureImageMultisampled = 0x00000008, ///< Indicates support for sparse multisampled images + PrtFeatureImageDepthStencil = 0x00000010, ///< Indicates support for sparse depth/stencil images + PrtFeatureShaderStatus = 0x00000020, ///< Indicates support for residency status in shader instructions + PrtFeatureShaderLodClamp = 0x00000040, ///< Indicates support for LOD clamping in shader instructions + PrtFeatureUnalignedMipSize = 0x00000080, ///< Indicates support for non-miptail levels with dimensions that + /// aren't integer multiples of the tile size as long as they are + /// at least as large as a single tile + PrtFeaturePerSliceMipTail = 0x00000100, ///< Indicates support for per-slice miptail (slice-major order) + + PrtFeatureTileAliasing = 0x00000200, ///< Indicates support for aliasing tiles (without
metadata) + PrtFeatureStrictNull = 0x00000400, ///< Indicates whether reads of unmapped tiles always return zero +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 936 + PrtFeatureNonStandardImage3D = 0x00000800, ///< Indicates support for sparse 3D images restricted to + /// non-standard tile shapes that match the tile mode block depth + PrtFeaturePrtPlus = 0x00001000, ///< Indicates that this image supports use of residency maps. +#else + PrtFeaturePrtPlus = 0x00000800, ///< Indicates that this image supports use of residency maps. +#endif +}; + +/// Describes the setting scopes accessible by clients. +enum class SettingScope +{ + Driver, ///< For settings specific to a UMD + Global, ///< For global settings controlled by CCC + File, ///< For settings that are only read from a file + Root, ///< For settings under HKR +}; + +/// Big Software (BigSW) Release information structure +/// Software release management uses this version # to control a rollout of big SW features together. +struct BigSoftwareReleaseInfo +{ + uint32 majorVersion; ///< BigSW Release Major version + uint32 minorVersion; ///< BigSW Release Minor version. + uint32 miscControl; ///< BigSW Release miscellaneous control. +}; + +/// Virtual display capabilities as determined by the OS. The reported values bound the valid ranges of values supported +/// by the @ref VirtualDisplayInfo structure passed in to @ref IDevice::CreateVirtualDisplay. +struct VirtualDisplayCapabilities +{ + uint32 maxVirtualDisplays; ///< The maximum number of virtual displays supported + Rational minRefreshRate; ///< The minimum refresh rate + Rational maxRefreshRate; ///< The maximum refresh rate +}; + +/// The properties of a specific virtual display +struct VirtualDisplayProperties +{ + bool isVirtualDisplay; ///< True, if it's a virtual display +}; + +/// Enumerates all of the types of local video memory which could be associated with a GPU.
+enum class LocalMemoryType : uint32 +{ + Unknown = 0, + Ddr2, + Ddr3, + Ddr4, + Gddr5, + Gddr6, + Hbm, + Hbm2, + Hbm3, + Lpddr4, + Lpddr5, + Ddr5, + Count +}; + +/// Bitmask of all MSAA/EQAA types supported, in terms of samples (S) and shaded fragments (F) +enum MsaaFlags : uint16 +{ + MsaaS1F1 = 0x0001, + MsaaS2F1 = 0x0002, + MsaaS4F1 = 0x0004, + MsaaS8F1 = 0x0008, + MsaaS16F1 = 0x0010, + MsaaAllF1 = 0x001F, + + MsaaS2F2 = 0x0020, + MsaaS4F2 = 0x0040, + MsaaS8F2 = 0x0080, + MsaaS16F2 = 0x0100, + MsaaAllF2 = 0x01E0, + + MsaaS4F4 = 0x0200, + MsaaS8F4 = 0x0400, + MsaaS16F4 = 0x0800, + MsaaAllF4 = 0x0E00, + + MsaaS8F8 = 0x1000, + MsaaS16F8 = 0x2000, + MsaaAllF8 = 0x3000, + + MsaaAll = 0x3FFF, +}; + +/// Supported RTIP version enumeration +enum class RayTracingIpLevel : uint32 +{ + _None = 0x0, ///< The device does not have a RayTracing IP level +#ifndef None + None = _None, ///< The device does not have a RayTracing IP level +#endif + RtIp1_0 = 0x1, ///< First Implementation of HW RT + RtIp1_1 = 0x2, ///< Added computation of triangle barycentrics into HW + RtIp2_0 = 0x3, ///< Added more Hardware RayTracing features, such as BoxSort, PointerFlag, etc +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, + /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 + RtIp3_1 = 0x5, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, + /// 128 Byte Quantized box node, obb support, wide sort) +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + RtIp3_0 = 0x4, ///< Added high precision box node, HW instance node, dual intersect ray, BVH8 intersect ray, + /// LDS stack push 8 pop 1, and LDS stack push 8 pop 2 + RtIp3_1 = 0x6, ///< Added improved bvh footprints (change to node pointer, 128 Byte primitive structure format, + /// 128 Byte Quantized box node, obb support, wide sort) +#endif +}; + +/// Which ip version a component
has reported +struct IpTriple +{ + uint32 major : 8; /// Major revision value + uint32 minor : 8; /// Minor revision value + uint32 stepping : 16; /// Stepping value + + constexpr operator uint32() const + { + return ((stepping ) | + (minor << 16) | + (major << 24)); + } +}; + +/// Reports various properties of a particular IDevice to the client. @see IDevice::GetProperties. +struct DeviceProperties +{ + uint32 vendorId; ///< Vendor ID (should always be 0x1002 for AMD). + uint32 deviceId; ///< GPU device ID (e.g., Hawaii XT = 0x67B0). + uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different + /// SKUs or revisions. Corresponds to one of the PRID_* revision IDs. + uint32 eRevId; ///< GPU emulation/internal revision ID. + AsicRevision revision; ///< ASIC revision. + GpuType gpuType; ///< Type of GPU (discrete vs. integrated) + uint16 gpuPerformanceCapacity; ///< Portion of GPU assigned in virtualized system (SRIOV) + ///< 0-65535, 0 invalid (not virtualized), 1 min, 65535 max + GfxIpLevel gfxLevel; ///< IP level of this GPU's GFX block +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + OssIpLevel ossLevel; ///< IP level of this GPU's OSS block + VceIpLevel vceLevel; ///< IP level of this GPU's VCE block + UvdIpLevel uvdLevel; ///< IP level of this GPU's UVD block +#endif + VcnIpLevel vcnLevel; ///< IP level of this GPU's VCN block +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + SpuIpLevel spuLevel; ///< IP level of this GPU's SPU block +#endif + PspIpLevel pspLevel; ///< IP level of this GPU's PSP block + uint32 gfxStepping; ///< Stepping level of this GPU's GFX block + IpTriple gfxTriple; ///< Full GFX IP level (major.minor.step) of this GPU + char gpuName[MaxDeviceName]; ///< Null terminated string identifying the GPU. + uint32 gpuIndex; ///< Device's index in a linked adapter chain. + uint32 maxGpuMemoryRefsResident; ///< Maximum number of GPU memory references that can be resident + /// at any time. 
Memory references set both via IQueue and IDevice + /// (via AddGpuMemoryReferences() or Submit()) count against this limit. + uint64 timestampFrequency; ///< Frequency of the device's timestamp counter in Hz. + /// @see ICmdBuffer::CmdWriteTimestamp. + uint32 attachedScreenCount; ///< Number of screen attached to the device. + uint32 maxSemaphoreCount; ///< Queue semaphores cannot have a signal count higher than this value. + /// For example, one indicates that queue semaphores are binary. + PalPublicSettings settings; ///< Public settings that the client has the option of overriding +#if PAL_CLIENT_EXAMPLE + AddrHandle hAddrlib; ///< Handle to addrlib for directed image tests. +#endif + + struct + { + union + { + struct + { + /// This engine supports timestamps (ICmdBuffer::CmdWriteTimestamp()). + uint32 supportsTimestamps : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on Streamout/Occlusion query + uint32 supportsQueryPredication : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on a 32-bit GPU memory allocation + uint32 supports32bitMemoryPredication : 1; + + /// This engine supports ICmdBuffer::CmdSetPredication() based on a 64-bit GPU memory allocation + uint32 supports64bitMemoryPredication : 1; + + /// This engine supports ICmdBuffer::If(), Else() and EndIf() calls. + uint32 supportsConditionalExecution : 1; + + /// This engine supports ICmdBuffer::While() and EndWhile() calls. + uint32 supportsLoopExecution : 1; + + /// This engine supports ICmdBuffer::CmdWaitRegisterValue(), WaitMemoryValue() and + /// CopyRegisterToMemory() calls. + uint32 supportsRegMemAccess : 1; + + /// This engine supports ICmdBuffer::CmdCopyImage() between optimally tiled images with + /// mismatched tiling tokens. + uint32 supportsMismatchedTileTokenCopy : 1; + + /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref + /// LayoutUninitializedTarget layout. 
+ uint32 supportsImageInitBarrier : 1; + + /// This engine supports ICmdBuffer::Barrier() calls that transition out of the @ref + /// LayoutUninitializedTarget layout for individual subresources. If this is not set and + /// supportsImageInitBarrier is set, the subresource range must span the entire image. + uint32 supportsImageInitPerSubresource : 1; + + /// This engine does not support any virtual memory features. IQueue::RemapVirtualMemoryPages and + /// IQueue::CopyVirtualPageMappings are not supported on Queues using this engine. + uint32 runsInPhysicalMode : 1; + + /// Indicates whether this engine can do virtual memory remap or not. + uint32 supportVirtualMemoryRemap : 1; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Indicates whether this Queues using this engine can maintain the contents of CE RAM across + /// consecutive submissions. If this is not set, the client must not specify a nonzero value for + /// either @ref QueueCreateInfo::persistentCeRamSize or @ref QueueCreateInfo::persistentCeRamOffset. + uint32 supportPersistentCeRam : 1; +#else + uint32 reserved914 : 1; +#endif + + /// Indicates whether the engine supports the command allocator tracks which chunk is idle. + uint32 supportsTrackBusyChunks : 1; + + /// Indicates whether the engine can safely access non-resident ranges of resources. + uint32 supportsUnmappedPrtPageAccess : 1; + + /// This engine supports clear or copy with MSAA depth-stencil destination + uint32 supportsClearCopyMsaaDsDst : 1; + + /// Reserved for future use. + uint32 reserved : 16; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Engines property flags. + + struct + { + union + { + struct + { + uint32 exclusive : 1; ///< Engine is exclusively owned by one client at a time. + uint32 mustUseDispatchTunneling : 1; ///< Queues created on this engine must use dispatch + /// tunneling. + /// Indicates whether this engine instance can be used for gang submission workloads via + /// a multi-queue. 
+ /// @see IDevice::CreateMultiQueue. + uint32 supportsMultiQueue : 1; + uint32 hwsEnabled : 1; + uint32 isHighPriority : 1; + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Capabilities property flags. + + uint32 queuePrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue + /// priority levels are supported by this engine. + uint32 dispatchTunnelingPrioritySupport; ///< Mask of QueuePrioritySupport flags indicating which queue + /// priority levels support dispatch tunneling on this engine. + uint32 maxFrontEndPipes; ///< Up to this number of IQueue objects can be consumed in + /// parallel by the front-end of this engine instance. It will + /// only be greater than 1 on hardware scheduled engine backed + /// by multiple hardware pipes/threads. + } capabilities[MaxAvailableEngines]; ///< Lists each engine of this type (up to engineCount) and their + /// properties. + + uint32 engineCount; ///< Number available engines of this type. + uint32 queueSupport; ///< Mask of QueueTypeSupport flags indicating which queues are + /// supported by this engine. + uint32 maxBorderColorPaletteSize; ///< Maximum size of a border color palette on this engine. + uint32 controlFlowNestingLimit; ///< Maximum depth of command-buffer control flow nesting on this + /// engine. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 ceRamSizeAvailable; ///< Size, in bytes, of constant engine RAM available on this engine. +#endif + Extent3d minTiledImageCopyAlignment; ///< Minimum alignments (pixels) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyImage() between optimally tiled images. + Extent3d minTiledImageMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyImage() with an optimally tiled image and a + /// linearly tiled image. 
Also applies to + /// ICmdBuffer::CmdCopyImageToMemory() or + /// ICmdBuffer::CmdCopyMemoryToImage() with an optimally tiled image. + Extent3d minLinearMemCopyAlignment; ///< Minimum alignments (bytes) for X/Y/Z/Width/Height/Depth for + /// ICmdBuffer::CmdCopyTypedBuffer(). + uint32 minTimestampAlignment; ///< If supportsTimestamps is set, this is the minimum address alignment + /// in bytes of the dstOffset in ICmdBuffer::CmdWriteTimestamp(). + uint32 maxNumDedicatedCu; ///< The maximum number of dedicated CUs for the real time audio queue + uint32 maxNumDedicatedCuPerQueue; ///< The maximum number of dedicated CUs per queue + uint32 dedicatedCuGranularity; ///< The granularity at which compute units can be dedicated to a queue + /// Specifies the suggested heap preference clients should use when creating an @ref ICmdAllocator that will + /// allocate command space for this engine type. These heap preferences should be specified in the allocHeap + /// parameter of @ref CmdAllocatorCreateInfo. Clients are free to ignore these defaults and use their own + /// heap preferences, but may suffer a performance penalty. + GpuHeap preferredCmdAllocHeaps[CmdAllocatorTypeCount]; + + /// Indicate which queue supports per-command, per-submit, or per-queue TMZ based on the queue type. + TmzSupportLevel tmzSupportLevel; + } engineProperties[EngineTypeCount]; ///< Lists available engines on this device and their properties. + + struct + { + union + { + struct + { + /// This queue supports IQueue::PresentSwapChain() calls. Note that a queue may support swap chain + /// presents even if the supportedDirectPresentModes flags below indicate no support for direct + /// presents; instead swap chain PresentMode support is queried via GetSwapChainInfo. + uint32 supportsSwapChainPresents : 1; + uint32 reserved744 : 1; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 927 + /// Set if the queue supports additional split barrier feature on top of basic acquire/release + /// interface support. 
This provides CmdAcquire() and CmdRelease() to implement split barriers. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 + /// Note: supportReleaseAcquireInterface is a prerequisite to supportSplitReleaseAcquire. +#endif + uint32 supportSplitReleaseAcquire : 1; +#else + uint32 reserved927 : 1; +#endif + + /// Reserved for future use. + uint32 reserved : 29; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Queue property flags. + + uint32 supportedDirectPresentModes; ///< A mask of PresentModeSupport flags indicating support for various + /// PresentModes when calling IQueue::PresentDirect(). + } queueProperties[QueueTypeCount]; ///< Lists the properties of all queues supported by PAL. + + struct + { + union + { + struct + { + /// Indicates support for virtual GPU memory allocations. @see IQueue::RemapVirtualMemoryPages. + uint32 virtualRemappingSupport : 1; + + /// Indicates support for pinning system memory for access as GPU memory. + /// @see IDevice::PinSystemMemory. + uint32 pinningSupport : 1; + + /// Indicates support for pinned memory which is host-mapped from foreign device. + uint32 supportHostMappedForeignMemory : 1; + + /// Indicates whether specifying memory references at Submit time is supported. If not supported + /// all memory references must be managed via IDevice or IQueue AddGpuMemoryReferences() + uint32 supportPerSubmitMemRefs : 1; + + /// Indicates support for GPU virtual addresses that are visible to all devices. + uint32 globalGpuVaSupport : 1; + + /// Indicates support for Shared Virtual Memory VA range. + uint32 svmSupport : 1; + + /// Indicates support for shadow desc VA range. + uint32 shadowDescVaSupport : 1; + + /// Indicates support for IOMMUv2. Fine grain SVM is not supported without IOMMU. + /// PAL client needs to check this flag before using fine grain SVM. + /// IOMMU is a memory management unit (MMU) that connects a direct-memory-access-capable + /// (DMA-capable) I/O bus to the main memory.
+ uint32 iommuv2Support : 1; + + /// Indicates that the platform supports automatic GPU memory priority management. + uint32 autoPrioritySupport : 1; + + /// Indicates KMD has enabled HBCC(High Bandwidth Cache Controller) page migration support. This means + /// shaders must be compiled such that all memory clauses can be replayed in response to an XNACK. + uint32 pageMigrationEnabled : 1; + /// Indicates TMZ (or HSFB) protected memory allocations are supported. + uint32 supportsTmz : 1; + + /// Memory allocations on this device support MALL (memory access last level); essentially + /// the lowest level cache possible. + uint32 supportsMall : 1; + + /// Support for querying page fault information + uint32 supportPageFaultInfo : 1; + + /// Indicates if this device supports GFX12-style distributed compression. Client can control + /// whether distributed compression is enabled or not per IGpuMemory object using the + /// distributedCompression field in @ref GpuMemoryCreateInfo. + uint32 supportDistributedCompression : 1; + + /// All GPU memory allocations in all heaps are always initialized to zero on creation. + /// The @ref initializeToZero flag is not needed when this is true. + uint32 alwaysInitializedToZero : 1; + + /// Reserved for future use. + uint32 reserved : 17; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory property flags. + + gpusize realMemAllocGranularity; ///< The addresses and sizes of "real" GPU memory objects must be aligned + /// to at least this many bytes. + gpusize virtualMemAllocGranularity; ///< The addresses and sizes of virtual GPU memory objects must be aligned + /// to at least this many bytes. + gpusize virtualMemPageSize; ///< Size in bytes of a virtual GPU memory page. + /// @see IQueue::RemapVirtualMemoryPages. + gpusize fragmentSize; ///< Size in bytes of a video memory fragment.
If GPU memory object + /// addresses and sizes are aligned to at least this value, VA translation + /// will be a bit faster. It is aligned to the allocation granularities. + gpusize largePageSizeInBytes; ///< The large page optimization will allow compatible allocations to + /// potentially be upgraded to a page size larger than 64KiB to reduce TLB + /// pressure. PAL will automatically pad the size and alignment of some + /// allocations to enable this optimization; + /// see largePageMinSizeForAlignmentInBytes in PalPublicSettings. + gpusize maxVirtualMemSize; ///< Total virtual GPU memory available (total VA space size). + gpusize maxPhysicalMemSize; ///< Total VRAM available (Local + Invisible + non-Local heap sizes). + gpusize vaStart; ///< Starting address of the GPU's virtual address space. + gpusize vaEnd; ///< Ending address of the GPU's virtual address space. + gpusize descTableVaStart; ///< Starting address of the descriptor table's virtual address space + gpusize shadowDescTableVaStart; ///< Starting address of the shadow descriptor table's virtual address space + gpusize privateApertureBase; ///< Private memory base address for generic address space (Windows only). + gpusize sharedApertureBase; ///< Shared memory base address for generic address space (Windows only). + + gpusize busAddressableMemSize; ///< SDI/DirectGMA GPU aperture size set in CCC + gpusize maxLocalMemSize; ///< Total VRAM available on the GPU (Local + Invisible heap sizes). + LocalMemoryType localMemoryType; ///< Type of local memory used by the GPU. + gpusize maxCaptureReplaySize; ///< Total virtual GPU available for Capture/Replay + gpusize barSize; ///< Total VRAM which can be accessed by the CPU. + + struct + { + float maxMemClock; ///< Maximum GPU memory clock in MHz. For DX builds this value is valid only after + /// the device has been finalized. + uint32 memPerfRating; ///< Precomputed performance rating of memory operations. + uint32 vramBusBitWidth; ///< Memory bus width. 
+ uint32 memOpsPerClock; ///< Memory operations per clock. + } performance; ///< Performance-related memory properties. + + } gpuMemoryProperties; ///< Memory properties for this device. + + struct + { + union + { + struct + { + /// Images created on this device supports AQBS stereo mode, this AQBS stereo mode doesn't apply to the + /// array-based stereo feature supported by Presentable images. + uint32 supportsAqbsStereoMode : 1; + + /// Set if images created on this device support being created with corner sampling. + uint32 supportsCornerSampling : 1; + + /// Load Bearing placeholder, do not touch. + uint32 placeholder0 : 1; + + /// Reserved for future use. + uint32 reserved : 29; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory property flags. + + Extent3d maxDimensions; ///< Maximum supported width/height/depth for an image. + uint32 maxArraySlices; ///< Maximum supported number of array slices for a 1D or 2D image. + PrtFeatureFlags prtFeatures; ///< PRT features supported by the hardware. + gpusize prtTileSize; ///< Size, in bytes, of a PRT tile. + MsaaFlags msaaSupport; ///< Bitflags for MSAA sample/fragment count support. + uint8 maxMsaaFragments; ///< Max number of MSAA fragments per pixel (may have more samples). + uint8 numSwizzleEqs; ///< How many swizzle equations are in pSwizzleEqs. + Extent2d vrsTileSize; ///< Pixel dimensions of a VRS tile. 0x0 indicates image-based shading rate + /// is not supported. + const SwizzleEquation* pSwizzleEqs; ///< These describe how to interpret device-dependent tiling modes. + + bool tilingSupported[static_cast(ImageTiling::Count)]; ///< If each image tiling is supported. + } imageProperties; ///< Image properties for this device. + + struct + { + /// Maximum number of available shader-accessible user data entries. @see PipelineShaderInfo. + uint32 maxUserDataEntries; + uint32 maxThreadGroupSize; ///< Per-device limit on threads per threadgroup for compute shaders. 
+ /// Some hardware supported by PAL has a bug which can cause a GPU hang if async compute engines are used while + /// compute shaders with > maxAsyncComputeThreadGroupSize are in flight on any queue. This reports the + /// maximum "safe" limit on threads per threadgroup for compute shaders for this device if the client wishes to + /// use async compute engines. Note that the bug can occur if the following conditions are met: + /// (a) Async compute workloads are running *somewhere* on the GPU, in any process; + /// (b) Some compute workloads on either the async compute engine or on the universal engine have a threads per + /// threadgroup amount which exceeds maxAsyncComputeThreadGroupSize. + /// + /// It is up to the client to choose how to work around this bug. They are free to either limit applications to + /// only creating compute shaders with <= maxAsyncComputeThreadGroupSize threads per group, or to avoid using + /// the async compute engines at all. + /// + /// If this value equals maxThreadGroupSize, then the device does not have this bug and the client can use + /// any compute shader on any queue. + uint32 maxAsyncComputeThreadGroupSize; + + uint32 maxComputeThreadGroupCountX; ///< Maximum number of thread groups supported + uint32 maxComputeThreadGroupCountY; ///< Maximum number of thread groups supported + uint32 maxComputeThreadGroupCountZ; ///< Maximum number of thread groups supported + + uint32 maxBufferViewStride; ///< Maximum stride, in bytes, that can be specified in a buffer view. + + uint32 hardwareContexts; ///< Number of distinct state contexts available for graphics workloads. Mostly + /// irrelevant to clients, but may be useful to tools. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 ceRamSize; ///< Maximum on-chip CE RAM size in bytes. +#endif + uint32 maxPrimgroupSize; ///< Maximum primitive group size. + uint32 supportedVrsRates; ///< Bitmask of VrsShadingRate enumerations indicating which modes are supported.
+ + uint32 mallSizeInBytes; ///< Size of total MALL (Memory Attached Last Level - L3) cache in bytes. + + uint32 gl2UncachedCpuCoherency; ///< If supportGl2Uncached is set, then this is a bitmask of all + /// CacheCoherencyUsageFlags that will be coherent with CPU reads/writes. + /// Note that reporting CoherShader only means that GLC accesses will be + /// CPU coherent. + /// Note: Only valid if @ref supportGl2Uncached is true. + + uint32 maxGsOutputVert; ///< Maximum number of GS output vertices. + uint32 maxGsTotalOutputComponents; ///< Maximum number of GS output components totally. + uint32 maxGsInvocations; ///< Maximum number of GS prim instances, corresponding to geometry shader + /// invocation in glsl. + + RayTracingIpLevel rayTracingIp; ///< HW RayTracing IP version + + uint32 cpUcodeVersion; ///< Command processor feature version. + uint32 pfpUcodeVersion; ///< Command processor, graphics prefetch firmware version. + + union + { + struct + { + uint64 support8bitIndices : 1; ///< Hardware natively supports 8bit indices + uint64 support16BitInstructions : 1; ///< Hardware supports FP16 and INT16 instructions + uint64 supportBorderColorSwizzle : 1; ///< Hardware supports border color swizzle + uint64 supportDoubleRate16BitInstructions : 1; ///< Hardware supports double rate packed math + uint64 supportFp16Fetch : 1; ///< Hardware supports FP16 texture fetches + uint64 supportFp16Dot2 : 1; ///< Hardware supports a paired FP16 dot product. + uint64 supportConservativeRasterization : 1; ///< Hardware supports conservative rasterization + uint64 supportImplicitPrimitiveShader : 1; ///< Device supports implicit compiling of the + /// hardware vertex shader as a primitive shader to + /// perform culling and compaction optimizations in + /// the shader. + uint64 supportMeshShader : 1; ///< Indicates support for mesh shaders. + uint64 supportTaskShader : 1; ///< Indicates support for task shaders. 
+ uint64 supportMsFullRangeRtai : 1; ///< HW supports full range render target array + /// index for Mesh Shaders. + uint64 supportPrtBlendZeroMode : 1; ///< Blend zero mode support. + uint64 supports2BitSignedValues : 1; ///< Hardware natively supports 2-bit signed values. + uint64 supportPrimitiveOrderedPs : 1; ///< Hardware supports primitive ordered UAV + /// accesses in the PS. + uint64 supportPatchTessDistribution : 1; ///< Hardware supports patch level tessellation + /// distribution among VGTs. + uint64 supportDonutTessDistribution : 1; ///< Hardware supports donut granularity of + /// tessellation distribution among VGTs. + uint64 supportTrapezoidTessDistribution : 1; ///< Hardware supports trapezoid granularity of + /// tessellation distribution among VGTs. + uint64 supportSingleChannelMinMaxFilter : 1; ///< Hardware supports min/max filtering that can + /// return one channel at a time. + uint64 supportPerChannelMinMaxFilter : 1; ///< Hardware returns min/max value on a per-channel + /// basis. + uint64 supportRgpTraces : 1; ///< Hardware supports RGP traces. + uint64 supportMsaaCoverageOut : 1; ///< Set if HW supports MSAA coverage feature + uint64 supportPostDepthCoverage : 1; ///< Set if HW supports post depth coverage feature + uint64 supportSpiPrefPriority : 1; ///< Set if HW supports preference priority. + uint64 supportWaveBreakSize : 1; ///< The HW supports specifying the wavebreak size + /// in the pixel shader pipeline. + uint64 supportsPerShaderStageWaveSize : 1; ///< If set, the "waveSize" setting in the + /// @ref PipelineShaderInfo structure is meaningful. + uint64 placeholder2 : 1; ///< Placeholder for backward compatibility, no use it. + uint64 supportSpp : 1; ///< Hardware supports Shader Profiling for Power. + uint64 timestampResetOnIdle : 1; ///< GFX timestamp resets after idle between + /// submissions. The client cannot assume that + /// timestamps will increase monotonically across + /// command buffer submissions. 
+ uint64 support1xMsaaSampleLocations : 1; ///< HW supports 1xMSAA custom quad sample patterns +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 893 + uint64 supportReleaseAcquireInterface : 1; ///< Set if HW supports the basic functionalities of + /// acquire/release-based barrier interface. This + /// provides CmdReleaseThenAcquire() as a convenient + /// way to replace the legacy barrier interface's + /// CmdBarrier() to handle single point barriers. +#else + uint64 placeholder4 : 1; ///< Placeholder for backward compatibility, no use it. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 883 + uint64 supportSplitReleaseAcquire : 1; ///< Set if HW supports additional split barrier feature + /// on top of basic acquire/release interface support. + /// This provides CmdAcquire() and CmdRelease() to + /// implement split barriers. + /// Note: supportReleaseAcquireInterface is a + /// prerequisite to supportSplitReleaseAcquire. +#else + uint64 placeholder3 : 1; ///< Placeholder for backward compatibility, no use it. +#endif + uint64 supportGl2Uncached : 1; ///< Indicates support for the allocation of GPU L2 + /// un-cached memory. @see gl2UncachedCpuCoherency + uint64 supportOutOfOrderPrimitives : 1; ///< HW supports higher throughput for out of order + uint64 supportIntersectRayBarycentrics : 1; ///< HW supports the ray intersection mode which + /// returns triangle barycentrics. 
+ uint64 supportFloat32BufferAtomics : 1; ///< Hardware supports float32 buffer atomics + uint64 supportFloat32ImageAtomics : 1; ///< Hardware supports float32 image atomics + uint64 supportFloat32BufferAtomicAdd : 1; ///< Hardware supports float32 buffer atomic add + uint64 supportFloat32ImageAtomicAdd : 1; ///< Hardware supports float32 image atomic add + uint64 supportFloat64Atomics : 1; ///< Hardware supports float64 atomics + uint64 supportFloat32ImageAtomicMinMax : 1; ///< Hardware supports float32 image atomic min and max + uint64 supportFloat64BufferAtomicMinMax : 1; ///< Hardware supports float64 buffer atomic min and max + uint64 supportFloat64SharedAtomicMinMax : 1; ///< Hardware supports float64 shared atomic min and max + uint64 support64BitInstructions : 1; ///< Hardware supports 64b instructions + uint64 supportShaderSubgroupClock : 1; ///< HW supports clock functions across subgroup. + uint64 supportShaderDeviceClock : 1; ///< HW supports clock functions across device. + uint64 supportAlphaToOne : 1; ///< HW supports forcing PS output alpha channel to 1 + uint64 supportCaptureReplay : 1; ///< HW supports captureReplay + uint64 supportSortAgnosticBarycentrics : 1; ///< HW supports sort-agnostic Barycentrics for PS + uint64 supportVrsWithDsExports : 1; ///< If true, asic support coarse VRS rates + /// when z or stencil exports are enabled + uint64 supportRayTraversalStack : 1; ///< HW assisted ray tracing traversal stack support + uint64 supportPointerFlags : 1; ///< Ray tracing HW supports flags embedded in the node + /// pointer bits + uint64 supportTextureGatherBiasLod : 1; ///< HW supports SQ_IMAGE_GATHER4_L_O + uint64 supportInt8Dot : 1; ///< Hardware supports a dot product 8bit. + uint64 supportInt4Dot : 1; ///< Hardware supports a dot product 4bit. + uint64 supportMixedSignIntDot : 1; ///< Hardware supports a integer dot product with mixed + /// sign inputs. + uint64 support2DRectList : 1; ///< HW supports PrimitiveTopology::TwoDRectList. 
+ uint64 supportHsaAbi : 1; ///< PAL supports HSA ABI compute pipelines. + uint64 supportImageViewMinLod : 1; ///< Indicates image srd supports min_lod. + uint64 supportStaticVmid : 1; ///< Indicates support for static-VMID + uint64 support3dUavZRange : 1; ///< HW supports read-write ImageViewSrds of 3D images + /// with zRange specified. + uint64 supportCooperativeMatrix : 1; ///< HW supports cooperative matrix + uint64 support1dDispatchInterleave : 1; ///< Indicates support for 1D Dispatch Interleave. + uint64 support2dDispatchInterleave : 1; ///< Indicates support for 2D Dispatch Interleave. + uint64 supportBFloat16 : 1; ///< HW supports bf16 instructions. + uint64 supportFloat8 : 1; ///< HW supports float 8-bit instructions. + uint64 supportInt4 : 1; ///< HW supports integer 4-bit instructions. + uint64 supportCooperativeMatrix2 : 1; ///< HW supports Gfx12 extension cooperative matrix. + uint64 placeholder14 : 2; + uint64 reserved : 59; ///< Reserved for future use. + }; + uint64 u64All[2]; ///< Flags packed as two 64-bit uints. + } flags; ///< Device IP property flags. + + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 + union + { + uint32 bufferView; +#endif + uint32 typedBufferView; ///< Size in bytes (and required alignment) of a typed buffer view SRD. + /// @see IDevice::CreateTypedBufferViewSrds(). +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 892 + }; +#endif + uint32 untypedBufferView; ///< Size in bytes (and required alignment) of an untyped buffer view SRD. + /// @see IDevice::CreateUntypedBufferViewSrds(). + uint32 imageView; ///< Size in bytes (and required alignment) of an image view SRD. + /// @see IDevice::CreateImageViewSrds(). + uint32 fmaskView; ///< Size in bytes (and required alignment) of an fmask view SRD. + /// @see IDevice::CreateFmaskViewSrds(). This value can be zero to denote + /// a lack of fMask support. + uint32 sampler; ///< Size in bytes (and required alignment) of a sampler SRD. + /// @see IDevice::CreateSamplerSrds().
+ uint32 bvh; ///< Size in bytes (and required alignment) of a BVH SRD + /// Will be zero if HW doesn't support ray-tracing capabilities. + /// @see IDevice::CreateBvhSrds(). + } srdSizes; ///< Sizes for various types of _shader resource descriptor_ (SRD). + + struct + { + const void* pNullBufferView; ///< Pointer to null buffer view srd + const void* pNullImageView; ///< Pointer to null image view srd + const void* pNullFmaskView; ///< Pointer to null fmask view srd. This pointer can be nullptr to + /// indicate a lack of fMask support. + const void* pNullSampler; ///< Pointer to null sampler srd + } nullSrds; ///< Null SRDs are used to drop shader writes or read 0 + + struct + { + float maxGpuClock; ///< Maximum GPU engine clock in MHz. For DX builds this value is valid only after + /// the device has been finalized. + float aluPerClock; ///< Maximum shader ALU operations per clock. + float texPerClock; ///< Maximum texture fetches per clock. + float primsPerClock; ///< Maximum primitives processed per clock. + float pixelsPerClock; ///< Maximum pixels processed per clock. + uint32 gfxipPerfRating; ///< Precomputed performance rating of the GfxIp block. + } performance; ///< Performance-related device properties. + + struct + { + union + { + struct + { + uint32 eccProtectedGprs : 1; ///< Whether or not the GPU has ECC protection + ///< on its VGPR's + uint32 placeholder0 : 1; + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as a 32-bit unsigned integer. + } flags; + + uint32 numShaderEngines; ///< Number of non-harvested shader engines. + uint32 numShaderArrays; ///< Number of shader arrays. + uint32 numCusPerShaderArray; ///< Number of CUs per shader array that are actually usable. + uint32 maxCusPerShaderArray; ///< Maximum number of CUs per shader array. Count of physical CUs prior to + ///< harvesting CUs for yield in certain variants of ASICs (ex: Fiji PRO). 
+ uint32 numSimdsPerCu; ///< Number of SIMDs per compute unit. + uint32 numWavefrontsPerSimd; ///< Number of wavefront slots in each SIMD. + uint32 numActiveRbs; ///< Number of active Renderbackends + uint32 nativeWavefrontSize; ///< The native wavefront size. + uint32 minWavefrontSize; ///< The smallest supported wavefront size. + uint32 maxWavefrontSize; ///< All powers of two between the min size and max size are supported. + uint32 numAvailableSgprs; ///< Number of available SGPRs. + uint32 sgprsPerSimd; ///< Number of physical SGPRs per SIMD. + uint32 minSgprAlloc; ///< Minimum number of SGPRs that can be allocated by a wave. + uint32 sgprAllocGranularity; ///< SGPRs are allocated in groups of this size. Meaning, if your shader + /// only uses 1 SGPR, you will still end up reserving this number of + /// SGPRs. + uint32 numAvailableVgprs; ///< Number of available VGPRs. + uint32 vgprsPerSimd; ///< Number of physical VGPRs per SIMD. + uint32 minVgprAlloc; ///< Minimum number of VGPRs that can be allocated by a wave. + uint32 vgprAllocGranularity; ///< VGPRs are allocated in groups of this size. Meaning, if your shader + /// only uses 1 VGPR, you will still end up reserving this number of + /// VGPRs. On hardware where wave32 is available, the granularity for a + /// wave64 shader is half of this value, but the VGPR allocation is + /// double. The same number of total physical registers is allocated for + /// each unit of allocation with either wave size. + uint32 ldsSizePerCu; ///< Local Data Store size available in bytes per CU. + uint32 ldsSizePerThreadGroup; ///< Local Data Store size available in bytes per thread-group. + uint32 ldsGranularity; ///< Local Data Store allocation granularity expressed in bytes. + uint32 gsPrimBufferDepth; ///< Hardware configuration for the GS prim buffer depth. + uint32 gsVgtTableDepth; ///< Hardware configuration for the GS VGT table depth. 
+ uint32 numOffchipTessBuffers; ///< Number of offchip buffers that are used for offchip tessellation to + /// pass data between shader stages. + uint32 offchipTessBufferSize; ///< Size of each buffer used for passing data between shader stages when + /// tessellation passes data using off-chip memory. + uint32 tessFactorBufSizePerSe; ///< Size of the GPU's tessellation-factor buffer, per shader engine. + uint32 tccSizeInBytes; ///< Size of total L2 TCC cache in bytes. + uint32 tcpSizeInBytes; ///< Size of one L1 TCP cache in bytes. There is one TCP per CU. + uint32 maxLateAllocVsLimit; ///< Maximum number of VS waves that can be in flight without + /// having param cache and position buffer space. + uint32 shaderPrefetchBytes; ///< Number of bytes the SQ will prefetch, if any. + uint32 gl1cSizePerSa; ///< Size in bytes of GL1 cache per SA. + uint32 instCacheSizePerCu; ///< Size in bytes of instruction cache per CU/WGP. + uint32 scalarCacheSizePerCu; ///< Size in bytes of scalar cache per CU/WGP. + uint32 numAvailableCus; ///< Total number of CUs that are actually usable. + uint32 numPhysicalCus; ///< Count of physical CUs prior to harvesting. + /// Mask of active pixel packers. The mask is 128 bits wide, assuming a max of 32 SEs and a max of 4 pixel + /// packers (indicated by a single bit each) per SE. + uint32 activePixelPackerMask[ActivePixelPackerMaskDwords]; + /// Mask of present, non-harvested CUs (Virtual Layout) + uint32 activeCuMask[MaxShaderEngines][MaxShaderArraysPerSe]; + } shaderCore; ///< Properties of computational power of the shader engine.
+ + } gfxipProperties; + + struct + { + union + { + struct + { + uint32 supportTurboSync : 1; ///< Whether TurboSync is supported by KMD + uint32 enableUmdFpsCap : 1; ///< Whether UMD FPS CAP enabled + uint32 isCwgSupported : 1; ///< KMD supports Creator Who Game (CWG) feature + uint32 isGamingDriver : 1; ///< KMD works in gaming mode +#if defined(_WIN32) + uint32 canShareSemaphoreKmtHandle : 1; ///< OS supports sharing queue semaphores by KMT handle. +#else + uint32 placeholder0 : 1; +#endif + uint32 ifhModeEnabled : 1; ///< Whether the IFH mode is enabled + uint32 requireFrameEnd : 1; ///< If the client must tag the last command buffer + /// submission in each frame with a @ref CmdBufInfo with + /// the frameEnd flag set. + uint32 supportNativeHdrWindowing : 1; ///< Support HDR presentation that does not require FSE. + uint32 flipQueueSupportsDecodeDst : 1; ///< If set, Decode destination images are supported + /// in the OS flip-queue. + uint32 supportFreeMux : 1; ///< Whether FreeMux is supported by KMD + uint32 isDataCenterBoard : 1; ///< Whether the current board in use is a Data Center board. + /// This is meant to support a unified VDI/CG driver package. +#if defined(__unix__) + uint32 hasPrimaryDrmNode : 1; ///< Set if the device has a primary DRM node. + uint32 hasRenderDrmNode : 1; ///< Set if the device has a render DRM node. +#else + uint32 placeholder1 : 2; +#endif + uint32 forceAlignmentSupported : 1; ///< If PalPublicSettings::hardwareBufferAlignmentMode + /// has any effect. + uint32 reserved : 18; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< OS-specific property flags. + +#if (PAL_KMT_BUILD) + int32 luidHighPart; ///< High 32 bits of the LUID (locally unique identifier) for this device. + uint32 luidLowPart; ///< Low 32 bits of the LUID (locally unique identifier) for this device. + bool supportNtHandle; ///< Support export/import resource with NT handle. 
+ bool supportNativefence; ///< Expose the capability of native fence support to client in DeviceProperties. +#endif + + union + { + struct + { + uint32 support : 1; ///< Support Timeline type semaphore. + uint32 supportHostQuery : 1; ///< Support Timeline type semaphore host query. + uint32 supportHostWait : 1; ///< Support Timeline type semaphore host wait. + uint32 supportHostSignal : 1; ///< Support Timeline type semaphore host signal. + uint32 supportWaitBeforeSignal : 1; ///< Support Timeline type semaphore wait before signal. + + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; + } timelineSemaphore; + +#if defined(__unix__) + bool supportOpaqueFdSemaphore; ///< Support export/import semaphore as opaque fd in linux KMD. + bool supportSyncFileSemaphore; ///< Support export/import semaphore as sync file in linux KMD. + bool supportSyncFileFence; ///< Support export/import fence as sync file in linux KMD. +#endif + + bool supportQueuePriority; ///< Support create queue with priority + bool supportDynamicQueuePriority; ///< Support set the queue priority through IQueue::SetExecutionPriority + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) + bool supportMemoryBudgetQuery; ///< Support memory budget query through IDevice::QueryGpuMemoryBudgetInfo +#endif + +#if defined(_WIN32) + bool supportArbitaryPrtMapUnmap; ///< Support arbitary prt map unmap operation. +#endif + + uint32 umdFpsCapFrameRate; ///< The frame rate of the UMD FPS CAP + VirtualDisplayCapabilities virtualDisplayCaps; ///< Capabilities of virtual display, it's provided by KMD + + union + { + struct + { + uint32 supportDevice : 1; ///< GPU time domain + uint32 supportClockMonotonic : 1; ///< POSIX CLOCK_MONOTONIC time domain + uint32 supportClockMonotonicRaw : 1; ///< POSIX CLOCK_MONOTONIC_RAW time domain + uint32 supportQueryPerformanceCounter : 1; ///< Windows Query Performance Counter time domain + + uint32 reserved : 28; ///< Reserved for future use. 
+ }; + uint32 u32All; + } timeDomains; + +#if defined(_WIN32) + char driverStorePath[Util::MaxPathStrLen]; ///< Driver store path +#endif + +#if defined(__unix__) + int64 primaryDrmNodeMajor; ///< DRM primary node major number. + int64 primaryDrmNodeMinor; ///< DRM primary node minor number. + int64 renderDrmNodeMajor; ///< DRM render node major number. + int64 renderDrmNodeMinor; ///< DRM render node minor number. +#endif + union + { + struct + { + uint32 supportPostflip : 1; ///< KMD support DirectCapture post-flip access + uint32 supportPreflip : 1; ///< KMD support DirectCapture pre-flip access + uint32 supportRSync : 1; ///< KMD support RSync + uint32 maxFrameGenRatio : 4; ///< Maximum frame generation ratio or zero if not supported + uint32 supportNonPrimary : 1; ///< KMD support non-primary DirectCapture auxiliary data + uint32 reserved : 24; ///< Reserved for future use. + }; + uint32 u32All; + } directCapture; + } osProperties; ///< OS-specific properties of this device. + + struct + { + uint32 domainNumber; ///< PCI domain number. + uint32 busNumber; ///< PCI bus number. + uint32 deviceNumber; ///< PCI device number. + uint32 functionNumber; ///< PCI function number. + + union + { + struct + { + uint32 gpuConnectedViaThunderbolt : 1; ///< Device is an externally housed GPU connected to the system + /// via Thunderbolt. This will drastically impact CPU read and + /// write performance of memory in the @ref GpuHeapLocal heap. + uint32 gpuEmulatedInSoftware : 1; ///< Device is really a software package which emulates the + /// GPU. This is meant for pre-silicon development. + uint32 gpuEmulatedInHardware : 1; ///< Device is a hardware emulated GPU. This is meant for + /// pre-silicon development. + uint32 gpuVirtualization : 1; ///< Set if running under VM. + uint32 atomicOpsSupported : 1; ///< Set if pcie atomic is supported. + uint32 reserved : 27; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint.
+ } flags; ///< PCI bus property flags. + } pciProperties; ///< PCI bus properties of this device. + + BigSoftwareReleaseInfo bigSoftwareReleaseInfo; ///< Big Software (BigSW) Release Version information +}; + +/// Defines callback function to notify client of private screen changes. +typedef void (PAL_STDCALL *TopologyChangeNotificationFunc)(void* pClient); + +/// Defines callback function to notify client of the private screen removal. +typedef void (PAL_STDCALL *DestroyNotificationFunc)(void* pOwner); + +/// Specifies the private screen topology change notification data. +struct PrivateScreenNotifyInfo +{ + void* pClient; ///< Pointer to client, PAL uses this pointer as parameter when PAL + /// calls callback pfnOnTopology. + TopologyChangeNotificationFunc pfnOnTopology; ///< Pointer to client provided function. PAL should call this when + /// the topology change happens and let the client handle the change. + DestroyNotificationFunc pfnOnDestroy; ///< Pointer to client provided function. PAL should call this when + /// a private screen object is to be destroyed. The pOwner data is + /// passed at @ref IPrivateScreen::BindOwner() time. +}; + +/// Specifies fullscreen frame metadata control flags. Used for the KMD to notify clients about which types of frame +/// metadata it needs to send to KMD. The meaning depends on the context: +/// - During device finalization, client can set the flags indicating the specified metadata 'is supported' by client. +/// - During present, client can query these flags that indicate which metadata 'is enabled' currently so that the +/// client should send them to the KMD. +union FullScreenFrameMetadataControlFlags +{ + struct + { + uint32 timerNodeSubmission : 1; ///< Timer node submission, used for cases such as FRTC/FP/PFPA. + uint32 frameBeginFlag : 1; ///< FrameBegin flag on CmdBufInfo, see CmdBufInfo for details. + uint32 frameEndFlag : 1; ///< FrameEnd flag on CmdBufInfo, see CmdBufInfo for details.
+ uint32 primaryHandle : 1; ///< Pending primary handle for pre-flip primary access (PFPA) + uint32 p2pCmdFlag : 1; ///< P2P copy command. See CmdBufInfo comments for details. + uint32 forceSwCfMode : 1; ///< Force software crossfire mode. + uint32 postFrameTimerSubmission : 1; ///< It indicates whether the timer node submission at frame N is to + /// synchronize the flip of frame N (postFrameTimerSubmission == TRUE) + /// or N+1 (postFrameTimerSubmission == FALSE). + /// It's only valid when timerNodeSubmission is also set. + uint32 useHp3dForDwm : 1; ///< KMD Informs (DX11) UMD to use HP3D for DWM or not (Output only). + uint32 expandDcc : 1; ///< KMD notifies UMD to expand DCC (Output only). + uint32 enableTurboSyncForDwm : 1; ///< Indicates DWM should turn on TurboSync(Output only). + uint32 enableDwmFrameMetadata : 1; ///< When cleared, no frame metadata should be sent for DWM(Output only). + uint32 flipIntervalOverride : 3; ///< KMD-UMD interface FLIP_INTERVAL_OVERRIDE, for KMD to request flip + /// interval override from UMD. + uint32 disableFreeMux : 1; ///< KMD notifies UMD to disable FreeMux. + uint32 maxFrameLatency : 2; ///< KMD can notify UMD to override the frame latency of an app. + uint32 sendMotionVectors : 1; ///< Send the motion vector in CmdBufInfo once per frame + uint32 sendDepth : 1; ///< Send the depth buffer in CmdBufInfo once per frame + uint32 sendCameraMatrix : 1; ///< Send the camera matrix in CmdBufInfo once per frame + uint32 sendHudLessImage : 1; ///< Send the HUD less image in CmdBufInfo once per frame + uint32 reserved : 11; ///< Reserved for future use. + + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Indicates the desired UMD behavior with timer node submission. +/// This is used to distinguish FP Vsync On + FreeSync Off case from HSync or FreeSync cases, the former case doesn't +/// hold flip while the later cases do. 
+enum class TimerNodeMode : uint32 +{ + Unspecified, ///< Unspecified, client can decide what to do with the timer submission. + ForceFlipHold, ///< Client must hold flip with the timer submission. +}; + +/// Specifies fullscreen frame metadata control data. Including FullScreenFrameMetadataControlFlags plus extended data. +/// According to KMD's design, the difference is that 'flags' can be used to indicate 'client caps' during device +/// initialization, while the 'data' is only passed from KMD to UMD. +struct PerSourceFrameMetadataControl +{ + FullScreenFrameMetadataControlFlags flags; ///< The frame metadata control flags + TimerNodeMode timerNodeSubmissionMode; ///< Desired UMD behavior with timer node submission +}; + +/// Specifies the texture optimization level to use for an image. +/// +/// @ingroup ResourceBinding +enum class ImageTexOptLevel : uint32 +{ + Default = 0, ///< Use device default setting + Disabled, ///< Disable texture filter optimization + Enabled, ///< Enable texture filter optimization + Maximum, ///< Maximum texture filter optimization + Count +}; + +/// Specifies properties for @ref IDevice finalization. Input structure to IDevice::Finalize(). +struct DeviceFinalizeInfo +{ + union + { + struct + { + uint32 supportPrivateScreens : 1; ///< Initializes private screen support. + uint32 requireFlipStatus : 1; ///< Requires initialization of flip status shared memory. + uint32 requireFrameMetadata : 1; ///< Requires initialization of frame metadata flags shared memory. + /// Clients should only set this flag on the master device in an + /// LDA chain. + uint32 internalGpuMemAutoPriority : 1; ///< Forces internal GPU memory allocation priorities to be + /// determined automatically. It is an error to set this flag + /// if the device does not report that it supports this feature. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Device finalization flags.
+ + /// Specifies which engines of each type should be created for the device. + struct + { + uint32 engines; ///< A mask of which engines are requested. + } requestedEngineCounts[EngineTypeCount]; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + /// Bytes of CE RAM to be used by the client for each engine type. This value must be <= ceRamSizeAvailable reported + /// for that engine type. In the case where more than one engine of a given type is requested it is assumed each + /// engine of that type will use this amount of CE RAM so the total size of (ceRamSizeUsed * queueCounts) must be <= + /// ceRamSizeAvailable for that engine type. Each entry must be either zero or a multiple of 32 bytes. + size_t ceRamSizeUsed[EngineTypeCount]; +#endif + + /// @see PrivateScreenNotifyInfo + /// Private screen notify info, must be filled when supportPrivateScreens=1. The client pointer and callback are to + /// be saved in device. PAL should call the callback when there is any topology (hotplug) change with the client + /// pointer as parameter. + PrivateScreenNotifyInfo privateScreenNotifyInfo; + + /// Fullscreen frame metadata control flags indicating the types of metadata that the client supports. + /// During adapter initialization, capable KMD notifies clients that it supports frame metadata, + /// clients should then set these flags on device finalization info, indicating which types of metadata the client + /// supports. + FullScreenFrameMetadataControlFlags supportedFullScreenFrameMetadata; + + /// Specifies the texture optimization level which only applies to internally-created views by PAL (e.g., for BLTs), + /// client-created views must use the texOptLevel parameter in ImageViewInfo. + ImageTexOptLevel internalTexOptLevel; +}; + +/// Reports the compatibility and available features when using two particular devices in a multi-GPU system. Output +/// structure from IDevice::GetMultiGpuCompatibility().
+struct GpuCompatibilityInfo +{ + union + { + struct + { + uint32 gpuFeatures : 1; ///< The devices have an exact feature match: same internal tiling, same + /// pipeline binary data, etc. + uint32 iqMatch : 1; ///< Devices produce images with same precision. + uint32 peerTransferWrite : 1; ///< Peer-to-peer transfer writes are supported. See + /// IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). + uint32 peerTransferRead : 1; ///< Peer-to-peer transfers based on xGMI are supported. + /// See IDevice::OpenPeerMemory() and IDevice::OpenPeerImage(). + uint32 sharedMemory : 1; ///< Devices can share memory objects with IDevice::OpenSharedMemory(). + uint32 sharedSync : 1; ///< Devices can share queue semaphores with + /// IDevice::OpenSharedQueueSemaphore(). + uint32 shareThisGpuScreen : 1; ///< Either device can present to this device. Means that the device + /// indicated by the otherDevice param in + /// IDevice::GetMultiGpuCompatibility() can present to the device the + /// method was called on. + uint32 shareOtherGpuScreen : 1; ///< Either device can present to the other device. Means that the + /// device IDevice::GetMultiGpuCompatibility() was called on can present + /// to the GPU indicated by the otherGpu param. + uint32 peerEncode : 1; ///< whether encoding HW can access FB memory of remote GPU in chain + uint32 peerDecode : 1; ///< whether decoding HW can access FB memory of remote GPU in chain + uint32 peerTransferProtected : 1; ///< whether protected content can be transferred over P2P + uint32 crossGpuCoherency : 1; ///< whether remote FB memory can be accessed without need for cache flush + uint32 reserved : 20; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU compatibility flags. +}; + +/// Reports properties of a GPU memory heap.
+/// +/// @note The performance ratings represent an approximate memory throughput for a particular access scenario, but +/// should not be taken as an absolute performance metric. +struct GpuMemoryHeapProperties +{ + union + { + struct + { + uint32 cpuVisible : 1; ///< Accessible with IGpuMemory::Map() + uint32 cpuGpuCoherent : 1; ///< Cache coherent between the CPU and GPU. + uint32 cpuUncached : 1; ///< Not cached by CPU, but could still be GPU cached. + uint32 cpuWriteCombined : 1; ///< CPU write-combined memory. + uint32 holdsPinned : 1; ///< GPU memory objects created by IDevice::CreatePinnedGpuMemory() are in + /// this heap. + uint32 shareable : 1; ///< GPU memory objects in this heap can be shared between multiple devices. + uint32 supportsTmz : 1; ///< This heap supports TMZ allocations. + uint32 reserved : 25; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory heap property flags. + + gpusize logicalSize; ///< Size of the heap in bytes. If HBCC is enabled, certain heaps may be + /// virtualized and the logical size will exceed the physical size. + gpusize physicalSize; ///< Physical size of the heap in bytes +}; + +/// Reports properties of a specific GPU block required for interpreting performance experiment data from that block. +/// See @ref PerfExperimentProperties. +struct GpuBlockPerfProperties +{ + bool available; ///< If performance data is available for this block. + uint32 instanceCount; ///< How many instances of this block are in the device. + uint32 maxEventId; ///< Maximum event ID for this block. + uint32 maxGlobalOnlyCounters; ///< Number of counters available only for global counts. + uint32 maxGlobalSharedCounters; ///< Total counters available including state shared between global and SPM. + uint32 maxSpmCounters; ///< Counters available for streaming only. + + /// If the instance group size is equal to one, every block instance has its own independent counter hardware.
+ /// PAL guarantees this is true for all non-DF blocks. + /// + /// Otherwise the instance group size will be a value greater than one which indicates how many sequential + /// instances share the same counter hardware. The client must take care to not enable too many counters within + /// each of these groups. + /// + /// For example, the DfMall block may expose 16 instances with 8 global counters but define a group size of 16. + /// In that case all instances are part of one massive group which uses one pool of counter state such that no + /// combination of DfMall counter configurations can exceed 8 global counters. + uint32 instanceGroupSize; +}; + +/// Reports performance experiment capabilities of a device. Returned by IDevice::GetPerfExperimentProperties(). +struct PerfExperimentProperties +{ + PerfExperimentDeviceFeatureFlags features; ///< Performance experiment device features. + + size_t maxSqttSeBufferSize; ///< SQTT buffer size per shader engine. + size_t sqttSeBufferAlignment; ///< SQTT buffer size and base address alignment. + uint32 shaderEngineCount; ///< Number of shader engines. + + /// Reports availability and properties of each device block. + GpuBlockPerfProperties blocks[static_cast<size_t>(GpuBlock::Count)]; +}; + +/// Reports maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images' +/// elements are no larger than maxElementSize. +struct LinearImageAlignments +{ + uint16 maxElementSize; ///< Maximum element size in bytes. + uint16 baseAddress; ///< Minimum required base address alignment in bytes. + uint16 rowPitch; ///< Minimum required row pitch alignment in bytes. + uint16 depthPitch; ///< Minimum required depth pitch alignment in bytes. +}; + +/// Specifies image view type (i.e., 1D, 2D, 3D, or cubemap).
+/// +/// @ingroup ResourceBinding +enum class ImageViewType : uint32 +{ + Tex1d = 0x0, ///< 1D image view. + Tex2d = 0x1, ///< 2D image view. + Tex3d = 0x2, ///< 3D image view. + TexCube = 0x3, ///< Cubemap image view. + + Count +}; + +/// Enumeration which defines the mode for magnification and minification sampling +/// +/// @ingroup ResourceBinding +enum XyFilter : uint32 +{ + XyFilterPoint = 0, ///< Use single point sampling + XyFilterLinear, ///< Use linear sampling + XyFilterAnisotropicPoint, ///< Use anisotropic with single point sampling + XyFilterAnisotropicLinear, ///< Use anisotropic with linear sampling + XyFilterCount +}; + +/// Enumeration which defines the mode for volume texture sampling +/// +/// @ingroup ResourceBinding +enum ZFilter : uint32 +{ + ZFilterNone = 0, ///< Disable Z filtering + ZFilterPoint, ///< Use single point sampling + ZFilterLinear, ///< Use linear sampling + ZFilterCount +}; + +/// Enumeration which defines the mode for mip-map texture sampling +/// +/// @ingroup ResourceBinding +enum MipFilter : uint32 +{ + MipFilterNone = 0, ///< Disable Mip filtering + MipFilterPoint, ///< Use single point sampling + MipFilterLinear, ///< Use linear sampling + MipFilterCount +}; + +/// Specifies parameters for an image view descriptor controlling how a given texture is sampled +/// +/// @ingroup ResourceBinding +struct TexFilter +{ + union + { + struct + { + uint32 magnification : 2; ///< Used with enum XyFilter for Plane magnification filtering + uint32 minification : 2; ///< Used with enum XyFilter for Plane minification filtering + uint32 zFilter : 2; ///< Used with enum ZFilter for volume texture filtering + uint32 mipFilter : 2; ///< Used with enum MipFilter for mip-map filtering + uint32 reserved : 24; ///< Reserved for future use + }; + uint32 u32All; ///< Value of flags bitfield + }; +}; + +/// Determines if "TexFilter" should be ignored or not.
+enum class TexFilterMode : uint32 +{ + Blend = 0x0, ///< Use the filter method specified by the TexFilter structure + Min = 0x1, ///< Use the minimum value returned by the sampler, no blending op occurs + Max = 0x2, ///< Use the maximum value returned by the sampler, no blending op occurs + Count +}; + +/// Specifies how texture coordinates outside of texture boundaries are interpreted. +/// +/// @ingroup ResourceBinding +enum class TexAddressMode : uint32 +{ + Wrap = 0x0, ///< Repeat the texture. + Mirror = 0x1, ///< Mirror the texture by flipping it at every other coordinate interval. + Clamp = 0x2, ///< Clamp the texture to the texture's edge pixel. + MirrorOnce = 0x3, ///< Mirror the texture once then clamp. + ClampBorder = 0x4, ///< Clamp the texture to the border color specified in the sampler. + MirrorClampHalfBorder = 0x5, ///< Mirror the texture once then clamp the texture to half of the edge color. + ClampHalfBorder = 0x6, ///< Clamp the texture to half of the edge color. + MirrorClampBorder = 0x7, ///< Mirror the texture once then clamp the texture to the sampler's border color. + Count +}; + +/// Specifies how a border color should be chosen when the TexAddressMode::ClampBorder texture addressing is used by a +/// sampler. +/// +/// @ingroup ResourceBinding +enum class BorderColorType : uint32 +{ + White = 0x0, ///< White border color (1.0, 1.0, 1.0, 1.0). + TransparentBlack = 0x1, ///< Transparent black border color (0.0, 0.0, 0.0, 0.0). + OpaqueBlack = 0x2, ///< Opaque black border color (0.0, 0.0, 0.0, 1.0). + PaletteIndex = 0x3, ///< Fetch border color from the border color palette. + Count +}; + +/// Residency maps are helper surfaces used in conjunction with PRT+. They reflect the resident mip levels +/// associated with a given UV region of the parent image. +enum class PrtMapAccessType : uint32 +{ + Raw = 0x0, ///< Read / write the map image as a normal image.
+ Read = 0x1, ///< Read the residency map as floating point data + WriteMin = 0x2, ///< Write the residency map with min(existing,new) + WriteMax = 0x3, ///< Write the residency map with max(existing,new) + WriteSamplingStatus = 0x4, ///< Write to the sampling status map. + Count +}; + +/// Specifies compression behavior for an IImage or image/buffer view. +enum class CompressionMode : uint32 +{ + Default = 0, ///< Let PAL choose behavior. + ReadEnableWriteEnable = 1, ///< Override default to force read and write compression on. + ReadEnableWriteDisable = 2, ///< Support reading compressed data, but force any writes to be uncompressed (keeping + /// physical metadata consistent). + ReadBypassWriteDisable = 3, ///< Bypass physical metadata on reads (assume decompressed), all writes will be + /// uncompressed and will write physical metadata marking updated blocks as being + /// uncompressed. This mode is intended to handle placed resources that do not + /// want compression in memory allocations that have distributed compression enabled. + /// WARNING: Using this mode to read compressed data will result in corruption. + Count, +}; + +/// Specifies client compression behavior for an IImage. +enum class ClientCompressionMode : uint32 +{ + Default = 0, ///< Let implementation decide whether to enable or disable + Enable = 1, ///< Force enable + Disable = 2, ///< Force disable + DisableClearOnly = 3, ///< Force enable for all image views except for image clears + Count, +}; + +/// Specifies parameters for a buffer view descriptor that control how a range of GPU memory is viewed by a shader. +/// +/// Input to either CreateTypedBufferViewSrds() or CreateUntypedBufferViewSrds(). Used for any buffer descriptor, +/// including read-only shader resources, UAVs, vertex buffers, etc. The usage of stride and format depends on the +/// expected shader instruction access: +/// +/// + _Typed buffer_ access must set a valid format and channel mapping.
+/// + _Raw buffer_ access is indicated by setting an invalid format and setting stride to 1. +/// + _Structured buffer_ access is indicated by setting an invalid format and setting stride to any value except 1. A +/// stride of 0 maps all view accesses to the first structure stored in memory. +/// +/// _Typed buffer_ SRD's must be created using @ref IDevice::CreateTypedBufferViewSrds(). +/// _Raw buffer_ and _structured buffer_ SRD's must be created using @ref IDevice::CreateUntypedBufferViewSrds(). +/// +/// If necessary, PAL will adjust the out of bounds read/write behavior to match the client's API requirements based on +/// the client defines - PAL_CLIENT_VULKAN, etc. +/// +/// @ingroup ResourceBinding +struct BufferViewInfo +{ + gpusize gpuAddr; ///< GPU memory virtual address where the buffer view starts, in bytes. + /// Must be aligned to bytes-per-element for typed access. + gpusize range; ///< Restrict the buffer view to this many bytes. Will be rounded down to a + /// multiple of the stride. + gpusize stride; ///< Stride in bytes. Must be aligned to bytes-per-element for typed access. + SwizzledFormat swizzledFormat; ///< Format and channel swizzle for typed access. Must be Undefined for structured + /// or raw access. + CompressionMode compressionMode; ///< Specify GFX12-style distributed compression mode override for this view. + /// Only relevant if the backing memory pages enable compression. + + union + { + struct + { + /// Set to have this surface independently bypass the MALL for read and / or write operations. + /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful + /// only on GPUs that have supportsMall set in DeviceProperties. + uint32 bypassMallRead : 1; + uint32 bypassMallWrite : 1; + uint32 reserved : 30; ///< Reserved for future use + }; + uint32 u32All; ///< Value of flags bitfield + } flags; +}; + +/// Counterpart of the D3D12_VERTEX_BUFFER_VIEW structure, used for vertex offset mode. See the
+/// @ref ICmdBuffer::CmdSetVertexBuffers function. +struct VertexBufferView +{ + gpusize gpuva; ///< GPU virtual address to vertex buffer. + uint32 sizeInBytes; ///< The size in bytes of the buffer. + uint32 strideInBytes; ///< The size in bytes of each vertex entry. +}; + +/// Specifies an array of vertex buffer view descriptors. PAL supports multiple vertex buffer modes which change how +/// the GPU memory is viewed by a shader. This struct wraps up the choice between these modes. +/// +/// If offsetMode is true PAL will create "vertex offset mode" SRDs using the @ref VertexBufferView array. This new SRD +/// has stronger robustness. It avoids dropping the last vertex when the buffer size is not a multiple of the stride. +/// +/// @see ICmdBuffer::CmdSetVertexBuffers +struct VertexBufferViews +{ + uint32 firstBuffer; ///< First vertex buffer slot to change. Must be less than @ref MaxVertexBuffers. + uint32 bufferCount; ///< The length of the buffer view array. + bool offsetMode; ///< If true, pVertexBufferViews must be used, otherwise pBufferViewInfos is used. + + union + { + const VertexBufferView* pVertexBufferViews; ///< Offset mode vertex buffer views. Valid if offsetMode is true. + const BufferViewInfo* pBufferViewInfos; ///< Index mode vertex buffer views. Valid if offsetMode is false. + }; +}; + +/// Specifies parameters for an image view descriptor controlling how a shader will view the specified image. +/// +/// Input to CreateImageViewSrd(). Used for any image view descriptor, including read-only shader resources and UAVs. +/// +/// @ingroup ResourceBinding +struct ImageViewInfo +{ + const IImage* pImage; ///< Image associated with the view. + ImageViewType viewType; ///< 1D, 2D, 3D, or Cubemap. Typically this should match the image type, but a + /// Cubemap view can be imposed on a 2D array image. + SwizzledFormat swizzledFormat; ///< Specifies the image view format and channel swizzle.
Must be compatible (same + /// bit-widths per channel) with the image's base format. + /// @note: YUV formats are invalid for an ImageView. A format should be chosen to be + /// compatible with either the luma or chroma plane(s) of the YUV format. + SubresRange subresRange; ///< Specifies a subset of subresources to include in the view. If the base Image + /// has a YUV planar format, the number of array slices in the range must be 1. + /// If zRange feature is used, the number of mips in the range must be 1. + float minLod; ///< Minimum mip level of detail to use for this view. + + uint32 samplePatternIdx; ///< Index into the currently bound MSAA sample pattern palette to be + /// read/evaluated when samplepos shader instructions are executed on this + /// view. Can be ignored if the samplepos shader instruction will not be used. + /// Must be less than MaxSamplePatternPaletteEntries. See + /// IDevice::SetSamplePatternPalette(). + Range zRange; ///< Specifies the z offset and z range. + + ImageTexOptLevel texOptLevel; ///< Specifies the texture optimization level. + + const IImage* pPrtParentImg; ///< Meaningful only if "mapAccess" is not "raw". + PrtMapAccessType mapAccess; ///< Type of access to be done if "pImage" is a PRT+ meta-data image. + /// See @ref ImageCreateInfo + + ImageLayout possibleLayouts; ///< Union of all possible layouts this view can be in while accessed by this view. + /// (ie. what can be done with this SRD without having a layout transition?) + /// In DX, for example, it's possible that a texture SRV could be accessed in a state + /// with all other read-only usages allowed, but a UAV must exclusively be accessed + /// in the UNORDERED_ACCESS state. + /// The primary purpose of this flag is to avoid compressed shader writes if a + /// different usage does not support compression and PAL won't get an opportunity to + /// decompress it (ie.
a transition in a barrier) + + CompressionMode compressionMode; ///< Specify GFX12-style distributed compression mode override for this view. + /// Only relevant if the backing IImage resource and its bound memory pages + /// enable compression. ReadBypassWriteDisable is only valid if compressionMode + /// in ImageCreateInfo disables compressed write. + + union + { + struct + { + /// Set to have this surface independently bypass the MALL for read and / or write operations. + /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful + /// only on GPUs that have supportsMall set in DeviceProperties. + uint32 bypassMallRead : 1; + uint32 bypassMallWrite : 1; + + uint32 zRangeValid : 1; ///< whether z offset/ range value is valid. + uint32 includePadding : 1; ///< Whether internal padding should be included in the view range. + + uint32 reserved : 28; ///< Reserved for future use + }; + uint32 u32All; ///< Value of flags bitfield + } flags; ///< Image view flags. +}; + +/// Specifies parameters controlling execution of sample instructions in a shader. Input to CreateSamplerSrd(). +/// +/// @ingroup ResourceBinding +struct SamplerInfo +{ + TexFilterMode filterMode; ///< Min/max filtering modes + TexFilter filter; ///< Filtering to apply to texture fetches. + TexAddressMode addressU; ///< Addressing mode for U texture coords outside of the [0..1] range. + TexAddressMode addressV; ///< Addressing mode for V texture coords outside of the [0..1] range. + TexAddressMode addressW; ///< Addressing mode for W texture coords outside of the [0..1] range. + float mipLodBias; ///< Bias for mipmap level of detail selection. + uint32 maxAnisotropy; ///< Anisotropy value clamp when an anisotropic XyFilter mode is used. + CompareFunc compareFunc; ///< Comparison function to apply to fetched data. + float minLod; ///< High-resolution mipmap LOD clamp. + float maxLod; ///< Low-resolution mipmap LOD clamp.
+ BorderColorType borderColorType; ///< Selects border color when an address mode is TexAddressMode::ClampBorder. + uint32 borderColorPaletteIndex; ///< Choose color from the border color palette when borderColorType is + /// BorderColorType::PaletteIndex. + float anisoThreshold; ///< Opt-in, flags.useAnisoThreshold == 1 and flags.preciseAniso == 0. + /// The value should be computed taking account the maxAnisotropy + /// setting. This is a high resolution value which is quantized and + /// clamped down to 3 bits to the domain [0.0, 0.875] for current Hw. + /// We can interpret the functioning of the threshold value as follows. + /// maxAnisotropy per-pixel can be 1, 2, 4, 8 or 16 (N). + /// During sampling, the initial count (or S) is computed in Hw for each + /// quad and the domain for current Hw is [0-16]. + /// Final sample count = min(pow(2, ceil(log2(S - anisoThreshold))), N) + /// Note: when flags.useAnisoThreshold == 0, Pal will ignore this value + /// and instead use a maximum of 0.25 at the highest anisotropic setting. + /// It is important to be aware that this feature tunes quality vs + /// performance, so care should be taken to not degrade image quality + /// 'noticeably' when enabling using this feature + uint32 perfMip; ///< Controls the value of the PERF_MIP field in Sampler SRD's. + /// This field basically controls the Fractional part of the LOD + /// calculation. if LOD is fractional so let us say 1.23, in this case + /// you must average your samples from both MIP 1 and 2. But if PERF_MIP + /// is set to nonzero the HW will perform an optimization and may fetch + /// from only 1 MIP. + + // These values are used to define a filtering line used when sampling a residency map. The defined + // slopes in both the X (U) and Y (V) directions are to avoid visible disconnects when sampling between + // different samples. + Offset2d uvOffset; ///< u/v offset value selectors. Values specified are in + /// log2 of fractions of pixel. i.e., 1 / (1 << x).
Not all values + /// are supported by all HW. + Offset2d uvSlope; ///< u/v slope value selectors. Supported slope values are + /// specified in degrees. In the case of a 3D image, the supplied + /// uvSlope.y is interpreted as wSlope. + /// 0 2.5 + /// 1 3 + /// 2 4 + /// 3 5 + /// 4 8 + /// 5 16 + /// 6 32 + /// 7 64 + /// other values: unsupported + + union + { + struct + { + uint32 mgpuIqMatch : 1; ///< Enables image compatibility for MGPU scenarios where paired devices + /// come from different hardware families. + uint32 preciseAniso : 1; ///< Anisotropic filtering should prefer precision over speed. + uint32 unnormalizedCoords : 1; ///< If set then always use unnormalized texture coordinates instead of + /// zero to one. Only works under certain conditions (no mip filtering, + /// no computed LOD, no offsets, only edge or border clamp address modes) + uint32 truncateCoords : 1; ///< If set then hardware will truncate mantissa instead of + /// rounding to nearest even in float point to fixed point + /// texture coordinate conversion + uint32 seamlessCubeMapFiltering : 1; ///< If set then there's filtering across the edges of the cube map. + uint32 prtBlendZeroMode : 1; ///< Allow unmapped PRT texels to be treated as zero and blended with + /// mapped texels. If set to 0, the destination of the sample instruction + /// is written with all 0s when TFE == 0; if set to 1, Treat unmapped + /// texels as zeros and blend them with other mapped texels, write the + /// result of this sample instruction to the destination GPRs. + uint32 useAnisoThreshold : 1; ///< If set, Hw will use the value assigned in anisoThreshold, but + /// only if preciseAniso is set to 0, also. + + /// This allows the sampler to turn off overriding anisotropic filtering when the resource view contains a + /// single mipmap level. Not all graphics IP supports overriding anisotropic filtering, and this flag will + /// be ignored for such GPUs.
+ uint32 disableSingleMipAnisoOverride : 1; + + uint32 forResidencyMap : 1; ///< Set if the surface being sampled is a residency map used in PRTs. + /// Only meaningful if the corresponding ImageView's mapAccess is set to + /// "read". Only valid for devices that report the "PrtFeaturePrtPlus" + /// flag. + uint32 reserved : 23; ///< Reserved for future use + }; + uint32 u32All; ///< Value of flags bitfield + } flags; +}; + +/// Specifies which heuristic should be utilized for sorting children when box sorting is enabled +enum class BoxSortHeuristic : uint32 +{ + ClosestFirst = 0x0, ///< Traversal is ordered to enter the children that + ///< intersect the ray closer to the ray origin first. + ///< This is a good baseline option. Default option for RT IP 1.x. + LargestFirst = 0x1, ///< Traversal is ordered to enter the children that have the largest + ///< interval where the box intersects the ray first. + ///< Good for shadow rays with terminate on first hit. + ClosestMidPoint = 0x2, ///< Traversal is ordered to enter the children that have a midpoint in the interval + ///< where the box intersects that has the lowest intersection time before clamping. + ///< Good for reflection rays. + Disabled = 0x3, ///< Box sort and heuristic are disabled. + Count +}; + +/// Specifies parameter for creating a BvH (bounding volume hierarchy, used by ray-trace) descriptor +struct BvhInfo +{ + const IGpuMemory* pMemory; ///< Memory object holding the BVH nodes + gpusize offset; ///< Offset from memory address specified by pMemory. Combination of + /// pMemory address and the offset must be 256 byte aligned. + gpusize numNodes; ///< Number of nodes in the view + uint32 boxGrowValue; ///< Number of ULPs (unit in last place) to be added during ray-box test.
+ + BoxSortHeuristic boxSortHeuristic; ///< Specifies which heuristic should be utilized for + ///< sorting children when box sorting is enabled + union + { + struct + { + uint32 useZeroOffset : 1; ///< If set, SRD address is programmed to zero + uint32 returnBarycentrics : 1; ///< When enabled, ray intersection will return triangle barycentrics. + /// Note: Only valid if @see supportIntersectRayBarycentrics is true. + + /// Set to have this surface independently bypass the MALL for read and / or write operations. + /// If set, this overrides the GpuMemMallPolicy specified at memory allocation time. Meaningful + /// only on GPUs that have supportsMall set in DeviceProperties. + uint32 bypassMallRead : 1; + uint32 bypassMallWrite : 1; + uint32 pointerFlags : 1; ///< If set, flags are encoded in the node pointer bits + uint32 highPrecisionBoxNode : 1; ///< If set, enable 64-byte high precision box node + uint32 wideSort : 1; ///< If set, enable wide sort + uint32 hwInstanceNode : 1; ///< If set, enable hardware instance node + uint32 sortTrianglesFirst : 1; ///< If set, triangle nodes are treated specially during child sorting + uint32 compressedFormatEn : 1; ///< If set, enable compressed format support. This includes enabling + /// support for compressed primitive packets, BVH8-128B box nodes, + /// and changes to triangle intersection test return data. + uint32 reserved : 22; ///< Reserved for future HW + }; + + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< BVH creation flags. +}; + +/// Specifies parameters for an fmask view descriptor. +/// +/// Input to CreateFmaskViewSrd(). Allows the client to access fmask from a shader using the load_fptr IL instruction. +/// +/// @ingroup ResourceBinding +struct FmaskViewInfo +{ + const IImage* pImage; ///< Image associated with the fmask view. + uint32 baseArraySlice; ///< First slice in the view. + uint32 arraySize; ///< Number of slices in the view.
+ + union + { + struct + { + uint32 shaderWritable : 1; ///< True if used with an image that has been transitioned to a shader- + /// writable image state (e.g. [Graphics|Compute][WriteOnly|ReadWrite]) + uint32 reserved : 31; ///< Reserved for future use + }; + uint32 u32All; ///< Value of flags bitfield + } flags; ///< Fmask view flags +}; + +/// Element of the multisample pattern representing a sample position (X, Y), type of SamplePatternPalette, which +/// matches the layout defined by SC. +struct SamplePos +{ + float x; ///< x coordinate of sample position. + float y; ///< y coordinate of sample position. + uint32 reserved1; ///< reserved for future use + uint32 reserved2; ///< reserved for future use +}; + +/// Specifies a palette of MSAA sample patterns used by the client. Input to SetSamplePatternPalette, which is used +/// to implement samplepos shader instruction support. +typedef SamplePos SamplePatternPalette[MaxSamplePatternPaletteEntries][MaxMsaaRasterizerSamples]; + +/// Provides a GPU timestamp along with the corresponding CPU timestamps, for use in calibrating CPU and GPU timelines. +struct CalibratedTimestamps +{ + uint64 gpuTimestamp; ///< GPU timestamp value compatible with ICmdBuffer::CmdWriteTimestamp().
+ uint64 cpuClockMonotonicTimestamp; ///< POSIX CLOCK_MONOTONIC timestamp + uint64 cpuClockMonotonicRawTimestamp; ///< POSIX CLOCK_MONOTONIC_RAW timestamp + uint64 cpuQueryPerfCounterTimestamp; ///< Windows QueryPerformanceCounter timestamp + uint64 maxDeviation; ///< Maximum deviation in nanoseconds between the GPU and CPU timestamps +}; + +/// Specifies connector types +enum class DisplayConnectorType : uint32 +{ + Unknown = 0, ///< Unknown connector type + Vga, ///< VGA + DviD, ///< DVI_D + DviI, ///< DVI_I + Hdmi, ///< HDMI + Dp, ///< DP + Edp, ///< EDP + Minidp, ///< MINI_DP + Count +}; + +/// Specifies properties for display connectors connected to GPU +struct DisplayConnectorProperties +{ + DisplayConnectorType type; ///< Connector type - VGA, DVI, HDMI, DP etc +}; + +/// Specifies pre-defined power profile which is used to communicate with KMD/PPLib and set the corresponding power states. +enum class PowerProfile : uint32 +{ + Default = 0, ///< Default power profile. + VrCustom = 1, ///< Power profile used by custom VR scenario. + VrDefault = 2, ///< Power profile used by default VR scenario. + Idle = 3, ///< Power profile used for forced DPM0, in case HMD is taken off but the game is still running. + Count +}; + +/// Fine-grain power switch info. +struct PowerSwitchInfo +{ + uint32 time; ///< Time in microseconds, relative to the frame start at V-sync. Clients should consider the + /// powerSwitchLatency value reported in @ref PrivateScreenProperties when specifying + /// switch times. + + uint32 performance; ///< Performance to be set (between 0-100), which is mapped to a certain DPM level by KMD. +}; + +/// Maximum number of power switch info allowed in one custom power profile. +static constexpr uint32 MaxNumPowerSwitchInfo = 5; + +/// Fine-grain power management for dynamic power mode. This structure specifies multiple DPM states to be cycled +/// through each frame.
+struct CustomPowerProfile +{ + IPrivateScreen* pScreen; ///< Dynamic power mode needs V-sync so a private screen object is needed. + uint32 numSwitchInfo; ///< Number of discrete DPM states to cycle through per frame. Number entries + /// in switchInfo[] and actualSwitchInfo[]. + + PowerSwitchInfo switchInfo[MaxNumPowerSwitchInfo]; ///< Specifies the set of power states to cycle through each + /// frame. Each entry specifies an offset into the frame where + /// the DPM state should be switched, and a rough performance + /// requirement value which will be translated into an + /// appropriate DPM state by KMD. + PowerSwitchInfo actualSwitchInfo[MaxNumPowerSwitchInfo]; ///< The actual set of power states that KMD/PPLib sets. +}; + +/// Flags for IDevice::AddGpuMemoryReferences(). Depending on their residency model, a client may set these flags as +/// directed by the application or hard-code them to a single value. Driver-internal memory references should be marked +/// as CantTrim unless the client explicitly handles trim support. +/// +/// Note that the CantTrim and MustSucceed flags are based on the same WDDM flags; it is expected that PAL will ignore +/// them on non-WDDM platforms. +enum GpuMemoryRefFlags : uint32 +{ + GpuMemoryRefCantTrim = 0x1, ///< The caller can't or won't free this allocation on OS request. + GpuMemoryRefMustSucceed = 0x2, ///< Hint to the OS that we can't process a failure here, this may result in a TDR. + GpuMemoryRefAllFlags = 0x3 ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/// Specifies input arguments for IDevice::GetPrimaryInfo(). Client must specify a display ID and properties of the +/// primary surface that will drive that display in order to query capabilities. +struct GetPrimaryInfoInput +{ + uint32 vidPnSrcId; ///< Video present source id. + uint32 width; ///< Primary surface width. + uint32 height; ///< Primary surface height. 
+ SwizzledFormat swizzledFormat; ///< Format and swizzle of the primary surface. + Rational refreshRate; ///< Video refresh rate, this is only valid if refreshRateValid is set. + union + { + struct + { + uint32 qbStereoRequest : 1; ///< Going to set a stereo mode. + uint32 refreshRateValid : 1; ///< Refresh rate is valid. + uint32 freeSyncInCrossFireSupport : 1; ///< True if client supports FreeSync in CrossFire. + uint32 useKmdCalcFramePacing : 1; ///< True if client uses KMD frame pacing. If so, the client + /// creates a timer queue to delay the present, and the delay + /// value is calculated by KMD. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< get primary surface info input flags. +}; + +/// Specifies output arguments for IDevice::GetStereoDisplayModes(), returning supported stereo mode +struct StereoDisplayModeOutput +{ + Extent2d extent; ///< Dimensions in pixels WxH. + Rational refreshRate; ///< Refresh rate. + SwizzledFormat format; ///< Format and swizzle of the primary surface. +}; + +/// Specifies output arguments for IDevice::GetActive10BitPackedPixelMode(), returning which, if any, 10-bit +/// display mode is active. +struct Active10BitPackedPixelModeOutput +{ + bool isInWs10BitMode; ///< Whether the workstation 10-bit feature is enabled. + bool notifyKmd10bitsPresent; ///< When in 10-bit mode and at present time, if the + /// client driver sees a 10-bit to 8-bit surface blt, + /// it needs to call RequestKmdReinterpretAs10Bit() to + /// inform the KMD that the dst surface must be reinterpreted + /// as 10-bits per channel for all KMD-initiated BLTs. + PackedPixelType packedPixelType; ///< Format of the packed pixels. + uint32 pixelPackRatio; ///< The number of 10-bit pixels that are packed into one 8-8-8-8 + /// format pixel. +}; + +/// Specifies primary surface stereo mode. 
+enum StereoMode : uint32 +{ + StereoModeHwAlignedViews = 0, ///< The stereo views are HW aligned on the display. + StereoModeSwPackedViews = 1, ///< The layout of the stereo views on the display is determined by the client. + StereoModeNotSupported = 2, ///< Stereo mode is not supported. + StereoModeSideBySide = 3, ///< The two stereo views are put side by side on the display. + StereoModeTopBottom = 4 ///< One stereo view is on the top of the display, and the other is on the bottom. +}; + +/// Enumerates the supported workstation stereo modes. +enum class WorkstationStereoMode : uint32 +{ + Disabled, + ViaConnector, ///< Active Stereo for 3 Pin VESA connector. + ViaBlueLine, ///< Blue line Active Stereo for laptops. + Passive, ///< Passive Stereo (Dual head). + PassiveInvertRightHoriz, ///< Passive Stereo with Horizontal Invert (Dual Head). + PassiveInvertRightVert, ///< Passive Stereo with Vertical Invert (Dual Head). + Auto, ///< Auto Stereo Vertical Interleaved. + AutoHoriz, ///< Auto Stereo Horizontal Interleaved. + AutoCheckerboard, ///< Auto Stereo Checkerboard Interleaved. + AutoTsl, ///< Tridelity SL Auto Stereo. + Count, +}; + +/// Specifies output arguments for IDevice::GetPrimaryInfo(), returning capability information for a display in +/// a particular mode. +struct GetPrimaryInfoOutput +{ + uint32 tilingCaps; ///< Tiling caps supported by this primary surface. + StereoMode stereoMode; ///< Stereo mode supported by this primary surface. + uint32 mallCursorCacheSize; ///< Size of the mall cursor cache in bytes + union + { + struct + { + /// MGPU flag: this primary surface supports DVO HW compositing mode. + uint32 dvoHwMode : 1; + /// MGPU flag: this primary surface supports XDMA HW compositing mode. + uint32 xdmaHwMode : 1; + /// MGPU flag: this primary surface supports client doing SW compositing mode. + uint32 swMode : 1; + /// MGPU flag: this primary surface supports freesync.
+ uint32 isFreeSyncEnabled : 1; + /// Single-GPU flag: gives hint to the client that they should use rotated tiling mode. + uint32 hwRotationPortraitMode : 1; + /// Single-GPU flag: this primary surface supports non local heap. + uint32 displaySupportsNonLocalHeap : 1; + /// Reserved for future use. + uint32 reserved : 26; + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< get primary surface support info output flags. +}; + +/// Specifies different clock modes that the device can be set to. +enum class DeviceClockMode : uint32 +{ + Default = 0, ///< Device clocks and other power settings are restored to default. + Query = 1, ///< Queries the current device clock ratios. Leaves the clock mode of the device unchanged. + Profiling = 2, ///< Scale down from peak ratio. Clocks are set to a constant amount which is + /// known to be power and thermal sustainable. The engine/memory clock ratio + /// will be kept the same as much as possible. + MinimumMemory = 3, ///< Memory clock is set to the lowest available level. Engine clock is set to + /// thermal and power sustainable level. + MinimumEngine = 4, ///< Engine clock is set to the lowest available level. Memory clock is set to + /// thermal and power sustainable level. + Peak = 5, ///< Clocks set to maximum when possible. Fan set to maximum. Note: Under power + /// and thermal constraints device will clock down. + QueryProfiling = 6, ///< Queries the profiling device clock ratios. Leaves the clock mode of the device unchanged. + QueryPeak = 7, ///< Queries the peak device clock ratios. Leaves the clock mode of the device unchanged. + Count +}; + +/// Specifies output argument to IDevice::SetClockMode. The caller can read the clock ratios the device is currently +/// running by querying using the mode DeviceClockMode::Query.
+struct SetClockModeOutput +{ + uint32 memoryClockFrequency; ///< Current mem clock (absolute) value in MHz + uint32 engineClockFrequency; ///< Current gpu core clock (absolute) value in MHz +}; + +/// Specifies input argument to IDevice::SetClockMode. The caller must specify the mode in which to set the device. +struct SetClockModeInput +{ + DeviceClockMode clockMode; ///< Used to specify the clock mode for the device. +}; + +/// Specifies primary surface MGPU compositing mode. +enum MgpuMode : uint32 +{ + MgpuModeOff = 0, ///< MGPU compositing mode off, the client does not do SW compositing at all, e.g. AFR disabled. + MgpuModeSw = 1, ///< MGPU SW compositing mode, the client handles the SW compositing. + MgpuModeDvo = 2, ///< MGPU DVO HW compositing mode + MgpuModeXdma = 3, ///< MGPU XDMA HW compositing mode + MgpuModeCount +}; + +/// Specifies input arguments for IDevice::SetMgpuMode(). A client sets a particular MGPU compositing mode and whether +/// frame pacing is enabled for a display. +struct SetMgpuModeInput +{ + uint32 vidPnSrcId; ///< Video present source id. + MgpuMode mgpuMode; ///< Primary surface MGPU compositing mode. + bool isFramePacingEnabled; ///< True if frame pacing enabled. If so, the client creates a timer queue + /// to delay the present, and the delay value is calculated by KMD. +}; + +constexpr uint32 XdmaMaxDevices = 8; ///< Maximum number of Devices for XDMA compositing. + +/// Specifies XDMA cache buffer info for each gpu. +struct XdmaBufferInfo +{ + uint32 bufferSize; ///< XDMA cache buffer size of each device + uint32 startAlignment; ///< XDMA cache buffer start alignment of each device +}; + +/// Specifies output arguments for IDevice::GetXdmaInfo(), returning the XDMA cache buffer information of each GPU for +/// a display. +struct GetXdmaInfoOutput +{ + XdmaBufferInfo xdmaBufferInfo[XdmaMaxDevices]; ///< Output XDMA cache buffer info +}; + +/// Specifies flipping status flags on a specific VidPnSource. It's Windows specific.
+union FlipStatusFlags +{ + struct + { + uint32 immediate : 1; ///< Is immediate flip + uint32 dwmFlip : 1; ///< Is DWM conducted flip + uint32 iFlip : 1; ///< Is independent exclusive flip + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies the VSync mode of virtual display. +enum class VirtualDisplayVSyncMode : uint32 +{ + Default = 0, ///< Using the default VSync mode based on refresh rate + Immediate = 1, ///< The presentation should be executed immediately without waiting for vsync to display + HMD = 2, ///< Using HMD VSync, the HMD is specified by pPrivateScreen + Count +}; + +/// The VirtualDisplayInfo is provided by application and KMD uses it to create a virtual display. +/// @see IDevice::CreateVirtualDisplay. +struct VirtualDisplayInfo +{ + uint32 width; ///< Horizontal dimension in pixels + uint32 height; ///< Vertical dimension in pixels + Rational refreshRate; ///< Refresh rate of virtual display + VirtualDisplayVSyncMode vsyncMode; ///< VSync mode + uint32 vsyncOffset; ///< VSync front porch location in pixels or lines. + /// It's needed when VSyncMode is HMD + Pal::IPrivateScreen* pPrivateScreen; ///< A pointer to IPrivateScreen. + /// It's needed when VSyncMode is HMD +}; + +/// Function pointer type definition for creating a buffer view SRD. +/// +/// @see IDevice::CreateTypedBufferViewSrds()/CreateUntypedBufferViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. +/// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. 
+/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateBufferViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut); + +/// Function pointer type definition for creating an image view SRD. +/// +/// @see IDevice::CreateImageViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of image view SRDs to create; size of the pImgViewInfo array. +/// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateImageViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const ImageViewInfo* pImgViewInfo, + void* pOut); + +/// Function pointer type definition for creating a fmask view SRD. +/// +/// @see IDevice::CreateFmaskViewSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. +/// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateFmaskViewSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const FmaskViewInfo* pFmaskViewInfo, + void* pOut); + +/// Function pointer type definition for creating a sampler SRD. +/// +/// @see IDevice::CreateSamplerSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. +/// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written.
+/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateSamplerSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const SamplerInfo* pSamplerInfo, + void* pOut); + +/// Function pointer type definition for creating a ray tracing SRD. +/// +/// @see IDevice::CreateBvhSrds(). +/// +/// @param [in] pDevice Pointer to the device this function is called on. +/// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. +/// @param [in] pBvhInfo Array of BVH descriptions directing SRD construction. +/// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. +/// +/// @ingroup ResourceBinding +typedef void (PAL_STDCALL *CreateBvhSrdsFunc)( + const IDevice* pDevice, + uint32 count, + const BvhInfo* pBvhInfo, + void* pOut); + +/// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. +/// +/// @param [in] pDevice +/// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. +/// @param [out] pViewInfo Pointer to memory to be filled with decoded info. +typedef void (PAL_STDCALL *DecodeBufferViewSrdFunc)( + const IDevice* pDevice, + const void* pBufferViewSrd, + BufferViewInfo* pViewInfo); + +/// Structure filled in by DecodeImageViewSrd(). +struct DecodedImageSrd +{ + SwizzledFormat swizzledFormat; ///< SRD's swizzled format. + SubresRange subresRange; ///< SRD's subresource range; the array-range is always { 0, 1 } for 3D images. + Range zRange; ///< z-range of the SRD's subresRange.startSubres. +}; + +/// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). +/// +/// @param [in] pDevice +/// @param [in] pImage The same image the SRD was created on. +/// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. +/// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info. 
+typedef void (PAL_STDCALL *DecodeImageViewSrdFunc)( + const IDevice* pDevice, + const IImage* pImage, + const void* pImageViewSrd, + DecodedImageSrd* pDecodedInfo); + +/// Function pointer table for SRD methods. +struct DeviceInterfacePfnTable +{ + CreateBufferViewSrdsFunc pfnCreateTypedBufViewSrds; ///< Typed Buffer view SRD creation function pointer. + CreateBufferViewSrdsFunc pfnCreateUntypedBufViewSrds; ///< Untyped Buffer view SRD creation function ptr. + CreateImageViewSrdsFunc pfnCreateImageViewSrds; ///< Image view SRD creation function pointer. + CreateFmaskViewSrdsFunc pfnCreateFmaskViewSrds; ///< Fmask View SRD creation function pointer. + CreateSamplerSrdsFunc pfnCreateSamplerSrds; ///< Sampler SRD creation function pointer. + CreateBvhSrdsFunc pfnCreateBvhSrds; ///< BVH SRD creation function pointer. + DecodeBufferViewSrdFunc pfnDecodeBufferViewSrd; ///< Buffer SRD decode function pointer. + DecodeImageViewSrdFunc pfnDecodeImageViewSrd; ///< Image SRD decode function pointer. +}; + +/// Specifies output arguments for IDevice::QueryWorkstationCaps(), returning workstation feature information +/// on this device workstation board. +union WorkStationCaps +{ + struct + { + uint32 workStationBoard : 1; ///< Running a workstation driver on a workstation board. + /// On workstation boards that support CWG (Creator Who Game), + /// the user can switch to a Gaming/consumer driver on the + /// workstation board, and then this will be false. + uint32 supportWorkstationAppPerfOpt : 1; ///< Workstation boards have optimizations for kinds of workstation + /// applications. These optimizations are enabled if it is set. + uint32 supportWorkstationEdgeFlag : 1; ///< Workstation boards have a DX9 feature that edge flag can be + /// exported via point size output in VS. The feature is enabled + /// if it is set. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint.
+}; + +/// FrameLock/GenLock support state enum +enum class FlglSupport : uint32 +{ + NotAvailable = 0, ///< FL/GL not supported by the GPU + NotConnected = 1, ///< FL/GL support available in the GPU, but is not connected to a GLSync board + Available = 2, ///< FL/GL support available and connected + Count +}; + +/// Container structure for FrameLock/GenLock state. +struct FlglState +{ + union + { + struct + { + uint32 genLockEnabled : 1; ///< True if genlock is currently enabled. Genlock is a system-wide setting + ///< in CCC. Genlock provides a signal source (which is used in framelock) + uint32 frameLockEnabled : 1; ///< True if (KMD) framelock is currently enabled. + ///< Framelock is the mechanism to sync all presents in multiple adapters. + uint32 isTimingMaster : 1; ///< True if the display being driven by the current adapter is the timing + ///< master in a genlock configuration + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Packed 32-bit uint value.
+ }; + FlglSupport support; ///< The state of the FLGL support in current adapter + uint32 firmwareVersion; ///< Firmware version number of the GLSync hardware (S400 board), if available +}; + +/// GlSync setting mask definition, used with GlSyncConfig +enum GlSyncConfigMask : uint32 +{ + GlSyncConfigMaskSignalSource = 0x00000001, + GlSyncConfigMaskSyncField = 0x00000002, + GlSyncConfigMaskSampleRate = 0x00000004, + GlSyncConfigMaskSyncDelay = 0x00000008, + GlSyncConfigMaskTriggerEdge = 0x00000010, + GlSyncConfigMaskScanRateCoeff = 0x00000020, + GlSyncConfigMaskFrameLockCntl = 0x00000040, + GlSyncConfigMaskSigGenFrequency = 0x00000080 +}; + +/// specify GLSYNC framelock control state +enum GlSyncFrameLockCtrl : uint32 +{ + GlSyncFrameLockCntlNone = 0x00000000, + GlSyncFrameLockCntlEnable = 0x00000001, + GlSyncFrameLockCntlDisable = 0x00000002, + GlSyncFrameLockCntlResetSwapCounter = 0x00000004, + GlSyncFrameLockCntlAckSwapCounter = 0x00000008, + GlSyncFrameLockCntlVersionKmd = 0x00000010 +}; + +/// Specifies GlSync Signal Source +enum GlSyncSignalSource : uint32 +{ + GlSyncSignalSourceGpuMask = 0x0FF, + GlSyncSignalSourceUndefined = 0x100, + GlSyncSignalSourceFreerun = 0x101, + GlSyncSignalSourceBncPort = 0x102, + GlSyncSignalSourceRj45Port1 = 0x103, + GlSyncSignalSourceRj45Port2 = 0x104 +}; + +/// Specifies GlSync Sync Field +enum GlSyncSyncField : uint8 +{ + GlSyncSyncFieldUndefined = 0, + GlSyncSyncFieldBoth = 1, + GlSyncSyncField1 = 2 +}; + +/// Specifies GlSync Sync Trigger Edge +enum GlSyncTriggerEdge : uint8 +{ + GlSyncTriggerEdgeUndefined = 0, + GlSyncTriggerEdgeRising = 1, + GlSyncTriggerEdgeFalling = 2, + GlSyncTriggerEdgeBoth = 3 +}; + +/// Specifies GlSync scan rate coefficient/multiplier options +enum GlSyncScanRateCoeff : uint8 +{ + GlSyncScanRateCoeffUndefined = 0, + GlSyncScanRateCoeffx5 = 1, + GlSyncScanRateCoeffx4 = 2, + GlSyncScanRateCoeffx3 = 3, + GlSyncScanRateCoeffx5Div2 = 4, + GlSyncScanRateCoeffx2 = 5, + GlSyncScanRateCoeffx3Div2 = 6, + 
GlSyncScanRateCoeffx5Div4 = 7 +}; + +/// Container structure for FrameLock/GenLock config. +struct GlSyncConfig +{ + uint32 validMask; ///< Mask that specifies which settings are actually referred in the structure. + /// GlSyncConfigMask* + uint32 syncDelay; ///< Delay of sync signal in microseconds + uint32 framelockCntlVector; ///< Vector of Framelock control bits. GlSyncFrameLockCntl* + uint32 signalSource; ///< Source of sync signal. Can be House Sync, RJ45 Port or GPUPort. + /// GlSyncSignalSource* or GPUPort Index + uint8 sampleRate; ///< Number of VSyncs per sample. 0 - no sampling, synchronized by signal VSync. + uint8 syncField; ///< Sync to Field 1 or to both Fields when input signal is interlaced. + /// GlSyncSyncField* + uint8 triggerEdge; ///< Which edge should be used as trigger. GlSyncTriggerEdge* + uint8 scanRateCoeff; ///< Scan Rate Multiplier applied to original sync signal. GlSyncScanRateCoeff* + uint32 sigGenFrequency; ///< Frequency in mHz of internal signal generator +}; + +#if PAL_KMT_BUILD +/// The type of NT object, it decides which KMT function should be called to retrieve the NT handle. +enum class ExternalObjectType : uint32 +{ + Allocation = 0, ///< Specifies the NT object is an allocation, such as GPUMemory etc. + SyncObject = 1, ///< Specifies the NT object is a sync object, such as semaphore and fence etc. +}; + +/// Specifies the NT handle attributes when opening from name. +struct ExternalHandleInfo +{ + ExternalObjectType objectType; ///< The type of NT object, it decides which KMT function should + /// be called to retrieve the NT handle. + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT handle + /// with a name, and then the handle can be acquired via this + /// name. + uint32 accessFlags; ///< Desired access rights of the object the handle refers to.
+}; +#endif + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) +/// Gpu heap group enumeration. One heap group contains several pal GpuHeap. +enum GpuHeapGroup : uint32 +{ + GpuHeapGroupLocal = 0x0, ///< Local heap group includes GpuHeapLocal and GpuHeapInvisible on Windows. + /// But GpuHeapInvisible is not included on Linux. + GpuHeapGroupNonLocal = 0x1, ///< NonLocal heap group includes GpuHeapGartUswc and GpuHeapGartCacheable. +#if PAL_AMDGPU_BUILD + GpuHeapGroupInvisible = 0x2, ///< This is used on Linux as GpuHeapLocal and GpuHeapInvisible are not combined. +#endif + GpuHeapGroupCount, +}; + +/// Struct for querying current gpu memory usage info and budget info. +struct GpuMemoryBudgetInfo +{ +#if PAL_AMDGPU_BUILD + gpusize systemUsage[GpuHeapGroupCount]; ///< Current total memory usage of specified heap group of whole system. +#else + gpusize usage[GpuHeapGroupCount]; ///< Current total memory usage of specified heap group of current process. + gpusize budget[GpuHeapGroupCount]; ///< Current total memory budget of specified heap group of the device which + /// implies how much memory the device can allocate from that heap group + /// before allocations may fail or cause performance degradation, including + /// all allocated memory. Budget might be affected by OS status and other + /// processes. +#endif +}; +#endif + +/// Reclaim allocation result enumeration. +enum class ReclaimResult : uint8 +{ + Ok = 0, ///< Reclaim result is OK. + Discarded = 1, ///< Reclaim result is discarded. + NotCommitted = 2, ///< Reclaim result is not committed. + Count +}; + +#if defined(_WIN32) +/// KMD has an interface in place to support UMD registration of Events to be signaled based on various +/// system-level conditions. Each enum here represents a certain condition which will cause KMD to signal +/// the event Clients previously registered through RegisterEvent.
+enum class EventTrackingType : uint32 +{ + ShaderInterrupt = 0, + EarlyPresent = 1, + Count +}; + +/// Clients specify the Util::Event and EventTrackingType it wants to register with PAL in this struct. +/// Under the condition specified by the trackingType, pEvent will be signaled by KMD. +struct RegisterEventInfo +{ + Util::Event* pEvent; + EventTrackingType trackingType; +}; + +/// As an input of UnregisterEvent to specify the event and trackingType Clients want to unregister from KMD. +struct UnregisterEventInfo +{ + Util::Event* pEvent; + EventTrackingType trackingType; +}; + +/// Output parameter of RegisterEvent. There's one struct for each type of EventTrackingType. +union RegisterEventOutputInfo +{ + struct + { + uint32 eventId; + gpusize eventMailboxGpuVa; + } shaderInterrupt; + + struct + { + uint32 eventId; + } earlyPresent; +}; +#endif + +/// Contains the page fault status of the GPU. +struct PageFaultStatus +{ + union + { + struct + { + uint32 pageFault : 1; ///< Set if there was a GPU page fault. + uint32 readFault : 1; ///< Set if the page fault was during a read operation. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; + } flags; + + gpusize faultAddress; ///< GPU virtual address where page fault occurred. Ignored if @ref pageFault is not set. +}; + +/// Input to the RegisterRuntimeState call, which allows the HIP runtime to pass information to KMD which can be shared +/// with the HIP debugger. +struct HipRuntimeSetup +{ + const void* pRdebug; ///< Address of the r_debug structure in the runtime + uint32 runtimeState; ///< Runtime-specific enum indicating runtime state + uint32 ttmpSetupHint; ///< Hint indicating that ttmp values should be initialized +}; + +/** + *********************************************************************************************************************** + * @interface IDevice + * @brief Interface representing a client-configurable context for a particular GPU. 
+ * + * This object becomes the root of all client/PAL interaction to get work done on that GPU. The main functionality + * provided by the device object: + * + * + Creation of all other PAL objects. + * + GPU memory management. + *********************************************************************************************************************** + */ +class IDevice +{ +public: + /// Get the maximum alignments for images created with a @ref ImageTiling::Linear tiling mode assuming the images' + /// elements are no larger than pAlignments->maxElementSize. + /// + /// @param [out] pAlignments Its maxElementSize will be used to fill the rest of its members with valid alignments. + /// + /// @returns Success if pAlignments was filled with data. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pAlignments is null. + /// + ErrorInvalidValue if pAlignments->maxElementSize is zero. + virtual Result GetLinearImageAlignments( + LinearImageAlignments* pAlignments) const = 0; + + /// Fills out a structure with details on the properties of this device. This includes capability flags, + /// supported engines/queues, performance characteristics, etc. This should only be called after a client has + /// called @ref CommitSettingsAndInit(). + /// + /// @see DeviceProperties + /// + /// @param [out] pInfo Properties structure to be filled out by PAL based on properties of this device. + /// + /// @returns Success if the device properties were successfully returned in pInfo. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetProperties( + DeviceProperties* pInfo) const = 0; + + /// Checks and returns execution state of the device. Currently unsupported for DX clients and + /// will return Unavailable if called by those clients. + /// + /// @param [out] pPageFaultStatus This struct is filled out when this function returns ErrorGpuPageFaultDetected. 
+ /// + /// @returns Success if device is operational and running. Otherwise, one of the following errors may be returned: + /// + ErrorDeviceLost if device is lost, reset or not responding, + /// + ErrorInvalidValue if failed to get device reset state, + /// + ErrorOutOfGpuMemory if ran out of GPU memory, + /// + ErrorGpuPageFaultDetected if page fault was detected, + /// + ErrorUnknown if device is in unknown state. + virtual Result CheckExecutionState( + PageFaultStatus* pPageFaultStatus) = 0; + + /// Returns this device's client-visible settings structure initialized with appropriate defaults. Clients can + /// modify parameters in this structure as they wish in order to modify PAL's behavior for this device. After + /// modifying settings, the client must call CommitSettingsAndInit() before finalizing the device. + /// + /// @warning The returned value points to an internal PAL structure. Modifying data using this pointer after + /// calling CommitSettingsAndInit() will result in undefined behavior. + /// + /// @returns Pointer to this device's public settings for examination and/or modification by the client. + virtual PalPublicSettings* GetPublicSettings() = 0; + + /// Reads a specific setting from the operating system specific source (e.g. registry or config file). + /// + /// @param [in] pSettingName Name of the setting. Must be null-terminated. + /// @param [in] settingScope The scope of settings accessible. + /// @param [in] valueType The type of the setting to return (e.g. bool or int). + /// @param [out] pValue Buffer to write data that was read. Must be non-null. + /// @param [in] bufferSz Size of string buffer (pValue). Only necessary for ValueType::Str. + /// @param [out] pInRegistry If not null, this will be set to true if the setting was read from the registry. + /// + /// @returns True if the read of specified setting is successful. False indicates failure.
+ virtual bool ReadSetting( + const char* pSettingName, + SettingScope settingScope, + Util::ValueType valueType, + void* pValue, + size_t bufferSz = 0, + bool* pInRegistry = nullptr) const = 0; + +#if defined(_WIN32) + /// Clients call this function to register a Util::Event() object to be signaled based on specified conditions. + /// For now, we will only add one condition: shader interrupt. + /// + /// @param [in] input RegisterEventInfo which contains Util::Event client wants to be signaled and + /// EventTrackingType. + /// @param [out] pOutput Depending on the trackingType in input, pOutput may refer to different info + /// which clients may need. Refer to the definition of RegisterEventOutputInfo + /// for details. + /// @returns Success if no errors occurred. + virtual Result RegisterEvent( + const RegisterEventInfo& input, + RegisterEventOutputInfo* pOutput) = 0; + + /// Clients call this function to unregister an event previously registered with RegisterEvent(). + /// + /// @param [in] input UnregisterEventInfo which contains Util::Event and tracking type the client + /// wants to unregister. + /// @returns Success if no errors occurred. + virtual Result UnregisterEvent( + const UnregisterEventInfo& input) = 0; + +#endif + + /// Indicates that the client has finished overriding public settings so the settings struct can be finalized and + /// any late-stage initialization can be done. This method must be called before @ref IDevice::Finalize() can be + /// called. + /// + /// @note The only functions in IDevice that are able to be called before CommitSettingsAndInit(): + /// + GetLinearImageAlignments() + /// + GetPublicSettings() + /// + ReadSetting() + /// + /// @note Finalizing the settings may override values set by the client. This can occur if: + /// + Invalid settings, either because they are not supported by hardware or are somehow self-conflicting, + /// will be overridden.
+ /// + Settings specified in the private settings will override client-specified settings. + /// + /// @returns Success if settings have been committed successfully and any late-stage initialization is completed + /// successfully as well. + virtual Result CommitSettingsAndInit() = 0; + + /// Returns the largest possible GPU memory alignment requirement for any IGpuMemoryBindable object created on this + /// device. + /// + /// This is useful for clients that may want to allocate generic GPU memory rafts up front to support many objects + /// without creating all of those objects to query their alignment requirements ahead of time. In practice, most + /// objects have a much smaller alignment requirement than the allocation granularity, but images may require more + /// than the allocation granularity on some devices. + /// + /// @returns Largest possible GPU memory byte alignment for an IGpuMemoryBindable object on this device. + virtual gpusize GetMaxGpuMemoryAlignment() const = 0; + + /// Indicates that the client is able to finalize the initialization of this device with the requisite information. + /// This method must be called before any of the factory creation methods may be called. + /// + /// @note The only functions in IDevice that are able to be called before Finalize(): + /// + The functions listed in IDevice::CommitSettingsAndInit(). + /// + GetMaxGpuMemoryAlignment() + /// + GetProperties() + /// + /// @param [in] finalizeInfo Device finalization properties. + /// + /// @returns Success if final initialization is successful. + virtual Result Finalize( + const DeviceFinalizeInfo& finalizeInfo) = 0; + + /// Cleans up all internal state, undoing any work done by CommitSettingsAndInit() and Finalize(). Following a call + /// to this function, the device will be in its initial state as if it was re-enumerated; the client may requery + /// settings and build up the device for further use. 
If the client doesn't call this function, it will be called + /// automatically when IPlatform::Destroy() is called or when devices are re-enumerated. + /// + /// This function provides clients with a way to return devices to a trivial state, one in which they have no + /// lingering OS or kernel driver dependencies. If a client pairs external state (e.g., an OS handle) with their + /// devices they may be required to call this function when they destroy their API device objects. + /// + /// It is expected that all PAL objects created by the device have already been destroyed (e.g. GPU memory, queues), + /// if not, the device may fall into an illegal state and the client will experience undefined behavior. + /// + /// @returns Success if no errors occurred. + virtual Result Cleanup() = 0; + + /// Returns if dual-source blending can be enabled. It checks the ColorBlendStateCreateInfo for any src1 blending + /// options. Then it checks if we are going to override those src1 options because the blend func is + /// min or max. + /// + /// @param [in] createInfo The ColorBlendStateCreateInfo that is checked for conditions that call for dual-source + /// blending. + /// + /// @returns true if the blend state calls for dual-source blending to be enabled. + virtual bool CanEnableDualSourceBlend( + const ColorBlendStateCreateInfo& createInfo) const = 0; + + /// Specifies how many frames can be placed in the presentation queue. This limits how many frames the CPU can get + /// in front of the device. + /// + /// @param [in] maxFrames Maximum number of frames that can be batched. Specifying a value of 0 resets the limit to + /// a default system value (3 frames on Windows). + /// + /// @returns Success if the limit was successfully adjusted. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS.
+ virtual Result SetMaxQueuedFrames( + uint32 maxFrames) = 0; + + /// Compares this device against another device object to determine how compatible they are for multi-GPU + /// operations. + /// + /// @param [in] otherDevice Device to determine MGPU compatibility with. + /// @param [out] pInfo Result compatibility info. + /// + /// @returns Success if the compatibility info was successfully returned in pInfo. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetMultiGpuCompatibility( + const IDevice& otherDevice, + GpuCompatibilityInfo* pInfo) const = 0; + + /// Reports properties of all GPU memory heaps available to this device (e.g., size, whether it is CPU visible or + /// not, performance characteristics, etc.). + /// + /// @param [out] info Properties of each GPU heap available to this device, indexed by the GPU ID defined in + /// @ref GpuHeap. If a particular heap is unavailable, its entry will report a size of 0. + /// + /// @returns Success if the heap properties were successfully queried and returned in info[]. Otherwise, one of the + /// following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occurred. + virtual Result GetGpuMemoryHeapProperties( + GpuMemoryHeapProperties info[GpuHeapCount]) const = 0; + + /// Reports all format and tiling mode related properties for this device. + /// + /// @param [out] pInfo Output properties. + /// + /// @returns Success if the properties were successfully queried and returned in pInfo. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pInfo is null. + virtual Result GetFormatProperties( + MergedFormatPropertiesTable* pInfo) const = 0; + + /// Reports performance experiment related properties for this device. + /// + /// Enumerates the GPU family, blocks, capabilities, etc. + /// + /// @param [out] pProperties Output properties.
+ /// + /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pProperties is null. + virtual Result GetPerfExperimentProperties( + PerfExperimentProperties* pProperties) const = 0; + + /// Fills out the default MSAA quad sample pattern for the given sample count. + /// + /// @param [in] samples The number of valid samples in the sample pattern. Must be a power of two. + /// @param [out] pQuadSamplePattern Fill this with the default pattern. + /// + /// @returns Success if @ref pQuadSamplePattern was filled with the default sample pattern. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if @ref pQuadSamplePattern is null. + /// + ErrorInvalidValue if @ref samples is not a supported power of two. + /// + ErrorUnavailable if this device lacks GfxIp support. + virtual Result GetDefaultSamplePattern( + uint32 samples, + MsaaQuadSamplePattern* pQuadSamplePattern) const = 0; + + /// Adds a list of per-device memory object references that persist across command buffer submissions. It is the + /// responsibility of the client to make sure that all required memory references have been added before submitting + /// the command buffer that relies on them. References can be added at the device, queue or specified at submit time. + /// gpuMemRefCount and pGpuMemoryRefs cannot be 0/null. PAL will assert and crash if these values are invalid. If + /// multiple references are provided for the same memory, PAL will retain the safest set of GpuMemoryRef flags. + /// + /// see @ref IQueue::Submit() + /// + /// @param [in] gpuMemRefCount Number of memory references in the memory reference list, must be non-zero. + /// @param [in] pGpuMemoryRefs Array of gpuMemRefCount GPU memory references.
+ /// @param [in] pQueue Optional IQueue that the memory references will be used on, used to optimize + /// residency operations, can be null. Note, if a queue is specified here the same queue + /// should be specified in RemoveGpuMemoryReferences. + /// @param [in] flags Flags from GpuMemoryRefFlags that will apply to all memory object references. + /// + /// @returns Success if the memory references were successfully added. Can also return NotReady if the client + /// passes in a valid pPagingFence pointer and the operation doesn't complete before the function returns. + /// Otherwise, one of the following errors may be returned: + /// + ErrorTooManyMemoryReferences if gpuMemRefCount will cause the total reference count to exceed the + /// limit of this device. + /// + ErrorOutOfMemory if GPU memory objects will not fit in available GPU memory space (i.e. GPU Memory + /// is overcommitted). + virtual Result AddGpuMemoryReferences( + uint32 gpuMemRefCount, + const GpuMemoryRef* pGpuMemoryRefs, + IQueue* pQueue, + uint32 flags + ) = 0; + + /// Removes a list of per-device memory object references that have previously been added via + /// IDevice::AddGpuMemoryReferences(). PAL is responsible for ensuring that timestamps have been retired prior to + /// actually performing any residency operations related to removal of a memory reference, so clients are free to + /// call this function without regard for command buffer use. Memory references are reference counted, so an + /// individual memory reference will only be removed when the total internal reference count reaches zero. + /// gpuMemoryCount and ppGpuMemory cannot be 0/null, PAL will assert and crash if these values are invalid. + /// + /// @param [in] gpuMemoryCount Number of memory objects in the memory reference list (size of ppGpuMemory array). + /// This count must be greater than zero. + /// @param [in] ppGpuMemory Array of GPU memory references. 
+ /// @param [in] pQueue Optional IQueue that the memory references were used on, used to optimize residency + /// operations, can be null. Note, if a queue was specified in AddGpuMemoryReferences, + /// pQueue must match. + /// + /// @returns Success if the memory references were successfully updated. + virtual Result RemoveGpuMemoryReferences( + uint32 gpuMemoryCount, + IGpuMemory*const* ppGpuMemory, + IQueue* pQueue + ) = 0; + + /// Queries the Device for the total amount of referenced GPU memory for each heap type. These totals include all + /// memory added to the Device or any Queue using @ref AddGpuMemoryReferences and not yet removed using @ref + /// RemoveGpuMemoryReferences. Internal PAL allocations are included in these totals, but memory referenced using + /// the per-submit list in @ref IQueue::Submit is not included in these amounts. + /// + /// The intended use for this interface is for clients to be able to manage budgeting of resident GPU memory. + /// + /// @param [out] referencedGpuMemTotal Array containing the total amount of referenced GPU memory for each GPU + /// memory heap. + virtual void GetReferencedMemoryTotals( + gpusize referencedGpuMemTotal[GpuHeapCount]) const = 0; + + /// Get primary surface MGPU support information based upon primary surface create info and input flags provided + /// by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] primaryInfoInput Primary surface info input arguments. + /// @param [in,out] pPrimaryInfoOutput Primary surface info output arguments. + /// + /// @returns Success if the primary surface MGPU support information were successfully queried. 
+ virtual Result GetPrimaryInfo( + const GetPrimaryInfoInput& primaryInfoInput, + GetPrimaryInfoOutput* pPrimaryInfoOutput) const = 0; + + /// Returns the supported stereo modes list. + /// + /// @param [in,out] pStereoModeCount Input value specifies the maximum number of stereo modes to enumerate, and the + /// output value specifies the total number of stereo modes that were enumerated + /// in pStereoModeList. The input value is ignored if pStereoModeList is null. + /// This pointer must not be null. + /// @param [out] pStereoModeList Output list of stereo modes. Can be null, in which case the total number of + /// available modes will be written to pStereoModeCount. + /// + /// @returns Success if the display modes were successfully queried and the results were reported in + /// pStereoModeCount/pStereoModeList. Otherwise, one of the following errors may be returned: + /// + Unsupported if stereo mode is not supported, or the stereo modes can't be queried. + /// + ErrorOutOfMemory if temp memory allocation failed. + virtual Result GetStereoDisplayModes( + uint32* pStereoModeCount, + StereoDisplayModeOutput* pStereoModeList) const = 0; + + /// Returns the currently selected Workstation stereo mode on Windows OS. + /// + /// @param [out] pWsStereoMode Output currently selected Workstation Stereo mode. + /// + /// @returns Success if the currently selected Workstation stereo mode was successfully queried + /// and the result was reported in pWsStereoMode. + virtual Result GetWsStereoMode(WorkstationStereoMode* pWsStereoMode) const = 0; + + /// Return information about active workstation support for 10-bit (potentially packed pixel) displays. + /// + /// @param [out] pMode Output reports if the workstation 10-bit display feature is enabled, and if so, + /// details on any required pixel packing. + /// + /// @returns Success if the 10-bits and packed-pixel format were successfully queried and the results were + /// reported in pMode.
+ virtual Result GetActive10BitPackedPixelMode( + Active10BitPackedPixelModeOutput* pMode) const = 0; + + /// Inform the KMD that this allocation must be reinterpreted as 10-bits per channel for all + /// KMD-initiated BLTs. + /// + /// When in 10-bit mode and at present time, if the dx9p driver sees a 10-bit to 8-bit surface blt, + /// it will use this interface to inform the KMD that the blt dst surface must be reinterpreted as + /// 10-bits per channel for all KMD-initiated BLTs. + /// + /// @param [in] pGpuMemory The dst GPU memory reference which will be marked as 10 bits format. + /// + /// @returns Success if the KMD has been successfully notified. + virtual Result RequestKmdReinterpretAs10Bit( + const IGpuMemory* pGpuMemory) const = 0; + + /// Set or query device clock mode. + /// + /// This function can be called by clients to set the device engine and memory clocks to certain pre-defined ratios. + /// If a call to restore the device clocks to default does not occur, the device stays in the previously set mode. + /// + /// @param [in] setClockModeInput Specify the clock mode to set the device to. + /// @param [out] pSetClockModeOutput @b Optional - Output device clock mode. If not nullptr, it is used + /// to query the current clock mode the device is running in. + /// + /// @returns Success if the device clock mode query/set request was successful. + virtual Result SetClockMode( + const SetClockModeInput& setClockModeInput, + SetClockModeOutput* pSetClockModeOutput) = 0; + + /// Request to enable/disable static VMID for the device. + /// + /// The function must be called with enable = true before a profiling session starts and enable = false after a + /// profiling session ends. It may be called any time, though it is illegal to disable without a prior corresponding + /// enable/acquire. Only after this returns success (when enabling) can the driver make submissions targeting the + /// static VMID.
+ /// + /// @param [in] enable Specifies whether acquiring or releasing the static VMID + /// + /// @returns Success if the static VMID acquire/release request was successful. + virtual Result SetStaticVmidMode( + bool enable) = 0; + + /// Set up MGPU compositing mode of a display provided by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] setMgpuModeInput Set MGPU compositing mode input arguments. + /// + /// @returns Success if the MGPU compositing mode were successfully set. + virtual Result SetMgpuMode( + const SetMgpuModeInput& setMgpuModeInput) const = 0; + + /// Get XDMA cache buffer information of each GPU based upon video present source ID provided by client. + /// + /// This function should not be called by clients that rely on PAL for compositor management. Basically, if your + /// client uses the IScreen's interface to take full screen exclusive mode, then don't call this. + /// + /// @param [in] vidPnSrcId Video present source id. + /// @param [in] gpuMemory Primary surface GPU memory. + /// @param [in,out] pGetXdmaInfoOutput Set XDMA cache buffer info output arguments. + /// + /// @returns Success if the XDMA cache buffer information were successfully queried. + virtual Result GetXdmaInfo( + uint32 vidPnSrcId, + const IGpuMemory& gpuMemory, + GetXdmaInfoOutput* pGetXdmaInfoOutput) const = 0; + + /// Polls current fullscreen frame metadata controls on given vidPnSourceId, including extended data. + /// + /// The function is used by clients that support frame metadata through KMD-UMD shared memory. + /// It polls the frame metadata shared memory for the given VidPnSource. Indicating which types of metadata + /// the UMD should send to KMD. + /// Clients should only call this function on the master device in an LDA chain. 
+ /// + /// @param [in] vidPnSrcId Video present source id + /// @param [out] pFrameMetadataControl @b Optional - Output frame metadata controls. Clients can pass null to + /// check if the buffer is initialized successfully and if the + /// vidPnSrcId is valid. + /// + /// @returns Success if the metadata controls on the given vidPnSrcId were successfully polled. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if vidPnSrcId is invalid (out of range) + /// + ErrorUnavailable if no implementation on current platform or if metadata shared buffer is null. + virtual Result PollFullScreenFrameMetadataControl( + uint32 vidPnSrcId, + PerSourceFrameMetadataControl* pFrameMetadataControl) const = 0; + + /// Get flip status flags and a flag indicating if current device owns the flags. (DX only) + /// + /// The function is used by clients that need flip status polling through KMD-UMD shared memory. + /// It provides caller the FlipStatusFlags on the given VidPnSource. Note that the flag returned is only a hint, + /// and can have a one frame delay during flip status transition. + /// + /// @param [in] vidPnSrcId Video present source id. + /// @param [out] pFlipFlags Output flip flags on given vidPnSrcId. Must not be null. + /// @param [out] pIsFlipOwner Output indicating if the current device owns the flip flags. + /// + /// @returns Success if flipping flags on given vidPnSrcId were successfully polled. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if vidPnSrcId is invalid. + /// + ErrorUnavailable if no implementation on current platform. + /// + ErrorInitializationFailed if flip status shared buffer failed to initialize. + virtual Result GetFlipStatus( + uint32 vidPnSrcId, + FlipStatusFlags* pFlipFlags, + bool* pIsFlipOwner) const = 0; + + /// Resets the specified set of fences. + /// + /// All fences must be reset before passing them to a submission command.
+ /// + /// @param [in] fenceCount Number of fences to reset. + /// @param [in] ppFences Array of fences to reset. + /// + /// @returns Success if the specified fences have been successfully reset. + /// + /// @note The function assumes that neither ppFences is null nor that any of the elements of the array pointed by + /// ppFences are null. + virtual Result ResetFences( + uint32 fenceCount, + IFence*const* ppFences) const = 0; + + /// Stalls the current thread until one or all of the specified fences have been reached by the device. + /// + /// If waitAll is true all fences must have been submitted at least once before this is called; + /// otherwise at least one fence must have been submitted. Using a zero timeout value returns + /// immediately and can be used to determine the status of a set of fences without stalling. + /// + /// @param [in] fenceCount Number of fences to wait for (i.e., size of the ppFences array). + /// @param [in] ppFences Array of fences to be waited on. + /// @param [in] waitAll If true, wait for completion of all fences in the array before returning; if false, + /// return after any single fence in the array has completed. + /// @param [in] timeout This method will return after this many nanoseconds even if the fences do not complete. + /// + /// @returns Success if the specified fences have been reached, or Timeout if the fences have not been reached but + /// the specified timeout time has elapsed. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if: + /// - ppFences is null. + /// - Any member of the ppFences array is null. + /// + ErrorInvalidValue if: + /// - fenceCount is zero. + /// + ErrorFenceNeverSubmitted if: + /// - Any of the specified fences haven't been submitted. 
+ virtual Result WaitForFences( + uint32 fenceCount, + const IFence*const* ppFences, + bool waitAll, + std::chrono::nanoseconds timeout) const = 0; + + /// Stalls the current thread until one or all of the specified Semaphores have been reached by the device. + /// + /// Using a zero timeout value returns immediately and can be used to determine the status of a set of semaphores + /// without stalling. + /// + /// @param [in] semaphoreCount Number of semaphores to wait for (i.e., size of the ppSemaphores array). + /// @param [in] ppSemaphores Array of semaphores to be waited on. + /// @param [in] pValues Array of semaphore values to be waited on. + /// @param [in] flags Combination of zero or more @ref HostWaitFlags values describing the behavior of this + /// wait operation. See @ref HostWaitFlags for more details. + /// @param [in] timeout This method will return after this many nanoseconds even if the semaphores do not + /// complete. + /// + /// @returns Success if the specified semaphores have been reached, or Timeout if the semaphores have not been + /// reached but the specified timeout time has elapsed. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - ppSemaphores is null. + /// - Any member of the ppSemaphores array is null. + /// + ErrorInvalidValue if: + /// - semaphoreCount is zero. + virtual Result WaitForSemaphores( + uint32 semaphoreCount, + const IQueueSemaphore*const* ppSemaphores, + const uint64* pValues, + uint32 flags, + std::chrono::nanoseconds timeout) const = 0; + + /// Correlates a GPU timestamp with the corresponding CPU timestamps, for tighter CPU/GPU timeline synchronization + /// + /// @param [out] pCalibratedTimestamps Reports a current GPU timestamp along with the CPU timestamps at the time + /// that GPU timestamp was written. The CPU timestamps are OS-specific. Also + /// reports a maximum deviation between the captured timestamps in nanoseconds.
+ /// + /// @returns Success if the request was successful. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if: + /// - pCalibratedTimestamps is null. + /// + ErrorUnavailable if: + /// - unable to capture timestamps for all requested time domains. + virtual Result GetCalibratedTimestamps( + CalibratedTimestamps* pCalibratedTimestamps) const = 0; + + /// Binds the specified GPU memory as a trap handler for the specified pipeline type. This GPU memory must hold + /// shader machine code (i.e., the client must generate HW-specific shader binaries through some external means, + /// probably the SP3 assembler). + /// + /// The same trap handler will be installed for all shader stages that are part of the pipeline. A trap handler + /// will only ever be executed for shaders that set the trapPresent bit in @ref PipelineShaderInfo. + /// + /// @param [in] pipelineType Select compute or graphics pipeline. If graphics, this trap handler will be installed + /// for _all_ hardware shader stages. + /// @param [in] pGpuMemory GPU memory allocation holding the trap handler. + /// @param [in] offset Offset in bytes into pGpuMemory where the trap handler shader code begins. Must be + /// 256 byte aligned. + virtual void BindTrapHandler( + PipelineBindPoint pipelineType, + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Binds the specified GPU memory location as a trap buffer for the specified pipeline type. This GPU memory will + /// be available to the trap handler as scratch memory to use as it chooses. The same trap buffer will be installed + /// for all shader stages that are part of the pipeline. + /// + /// There is no size parameter for the trap buffer. The client is responsible for ensuring that the trap handler + /// only reads/writes data within the bounds designated for trap buffer usage. + /// + /// @param [in] pipelineType Select compute or graphics pipeline. 
If graphics, this trap buffer will be installed + /// for _all_ hardware shader stages. + /// @param [in] pGpuMemory GPU memory allocation holding the trap buffer range. + /// @param [in] offset Offset in bytes into pGpuMemory where the trap buffer range starts. Must be 256 byte + /// aligned. + virtual void BindTrapBuffer( + PipelineBindPoint pipelineType, + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Get the swap chain information for creating a swap chain and presenting an image. + /// + /// @param [in] hDisplay Display handle of the local window system. + /// @param [in] hWindow Window handle of the local window system. + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. + /// @param [in,out] pSwapChainProperties Contains swap chain information. + /// + /// @returns Success if the swap chain information was queried. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result GetSwapChainInfo( + OsDisplayHandle hDisplay, + OsWindowHandle hWindow, + WsiPlatform wsiPlatform, + SwapChainProperties* pSwapChainProperties) = 0; + + /// Determines if the given window system requirement is supported by the underlying wsiPlatform. + /// + /// @param [in] hDisplay Display handle of the local window system. + /// @param [in] wsiPlatform WSI Platform the request is supposed to be sent to. + /// @param [in] visualId Requested visual information which may not be needed for some wsiPlatforms. + /// + /// @returns Success if the request is supported. Otherwise, one of the following errors may be returned: + /// + Unsupported + virtual Result DeterminePresentationSupported( + OsDisplayHandle hDisplay, + WsiPlatform wsiPlatform, + int64 visualId) = 0; + + /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each + /// WsiPlatform. + /// + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on.
+ /// @param [in] mode The swap chain will use this present mode. + /// + /// @returns Returns a mask of SwapChainModeSupport. + virtual uint32 GetSupportedSwapChainModes( + WsiPlatform wsiPlatform, + PresentMode mode) const = 0; + + /// Returns a mask of SwapChainModeSupport flags for each present mode. The swapchain modes are different for each + /// WsiPlatform. + /// + /// @param [in] wsiPlatform WSI Platform the swapchain is supposed to work on. + /// @param [in] mode The swap chain will use this present mode. + /// + /// @returns Returns a mask of SwapChainModeSupport. + virtual uint32 GetCompatiblePresentModes( + WsiPlatform wsiPlatform, + SwapChainMode mode) const = 0; + + /// Determines if the given information corresponds to an external shared image. + /// + /// Some clients may not know if a given external shared resource is a simple GPU memory allocation or an image; it + /// is expected they will call this function to determine which set of open functions they must call. + /// + /// @param [in] openInfo The open info describing the external shared resource. + /// @param [out] pIsImage Its contents will be set to true if the external shared resource is an image. + /// + /// @returns Success if PAL was able to determine whether or not the resource is an image. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pIsImage is null. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result DetermineExternalSharedResourceType( + const ExternalResourceOpenInfo& openInfo, + bool* pIsImage) const = 0; + + /// @name FactoryMethods Device Factory Methods + /// + /// The following set of IDevice methods is the interface through which almost all PAL objects are created. + /// + /// PAL does not allocate its own system memory for these objects. Instead, the client must query the amount of + /// system memory required for the object then provide a pointer where PAL will construct the object. 
+ /// + /// This approach allows the client to roll the PAL object into its own allocations without unnecessary heap + /// allocations and cache misses. It can also allocate many objects in a single memory space without PAL + /// involvement. + /// + /// @{ + + /// Determines the amount of system memory required for a queue object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateQueue(). + /// + /// @param [in] createInfo Properties of the new queue such as engine type and engine index. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueue object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueueSize( + const QueueCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a queue object. + /// + /// @param [in] createInfo Properties of the new queue such as engine type and engine index. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueueSize() with the same + /// create info. + /// @param [out] ppQueue Constructed queue object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the queue was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueue is null. + /// + ErrorInvalidValue if the create info's engineType is invalid or if the engineIndex is invalid. + virtual Result CreateQueue( + const QueueCreateInfo& createInfo, + void* pPlacementAddr, + IQueue** ppQueue) = 0; + + /// Determines the amount of system memory required for a multi-queue object. 
An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateMultiQueue(). + /// + /// @param [in] queueCount Number of queues in the gang; matches number of entries in pCreateInfo. + /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). The first + /// entry in this array describes the master queue which will be used to execute all + /// IQueue interfaces except for MultiSubmit(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for a multi-queue IQueue object with the specified + /// properties. A return value of 0 indicates the create info was invalid. + virtual size_t GetMultiQueueSize( + uint32 queueCount, + const QueueCreateInfo* pCreateInfo, + Result* pResult) const = 0; + + /// Creates a multi-queue (i.e., gang submission queue) object. The resulting version of the IQueue interface + /// is composed of multiple hardware queues which can be atomically submitted to as a group. When this is done, + /// it is safe to use IGpuEvent objects to tightly synchronize work done across queues in a single call to Submit(). + /// This can allow the client to tightly schedule asynchronous workloads for maximum efficiency that isn't possible + /// across queues using IQueueSemaphore objects. + /// + /// @param [in] queueCount Number of queues in the gang; matches number of entries in the pCreateInfo array. + /// @param [in] pCreateInfo Properties of each queue to create for this gang (engine type, etc.). The first + /// entry in this array describes the master queue which will be used to execute all + /// IQueue interfaces except for the ganged-portion of a Submit() (e.g., Present()). + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + /// much size available here as reported by calling GetMultiQueueSize() with the same + /// arguments. + /// @param [out] ppQueue Constructed multi-queue object. + /// + /// @returns Success if the multi-queue was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidValue if queueCount is less than 2. + /// + ErrorInvalidQueueType if any of the created sub-queues are not multi-queue compatible. This is + /// indicated by the supportsMultiQueue engineProperties flag in @ref DeviceProperties. + /// + ErrorInvalidPointer if pCreateInfo, pPlacementAddr or ppQueue is null. + /// + ErrorInvalidValue if any create info's configuration is invalid. + virtual Result CreateMultiQueue( + uint32 queueCount, + const QueueCreateInfo* pCreateInfo, + void* pPlacementAddr, + IQueue** ppQueue) = 0; + + /// Determines the amount of system memory required for a GPU memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of CreateGpuMemory(). + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed heaps. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetGpuMemorySize( + const GpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IGpuMemory object with the requested properties. + /// + /// This method can create either _real_ or _virtual_ GPU memory allocations. + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size, alignment, and allowed + /// heaps.
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGpuMemorySize() with the same + /// createInfo param. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the GPU memory was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + TooManyFlippableAllocations if the GPU memory was successfully created, but the client has reached + /// the limit of flippable allocations for this Device. This is a warning that future flippable GPU + /// memory may fail to be created due to internal OS limitations. + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// + ErrorInvalidMemorySize if createInfo.size is invalid. + /// + ErrorInvalidAlignment if createInfo.alignment is invalid. + /// + ErrorInvalidValue if createInfo.heapCount is 0 for real allocations or non-0 for virtual allocations. + /// + ErrorOutOfGpuMemory if the allocation failed due to a lack of GPU memory. + /// + ErrorUnavailable if reserving the GPU virtual address failed. + virtual Result CreateGpuMemory( + const GpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a pinned GPU memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// CreatePinnedGpuMemory(). + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object pinned with the specified + /// properties.
+ virtual size_t GetPinnedGpuMemorySize( + const PinnedGpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Pins a segment of system memory in place and creates an @ref IGpuMemory object allowing access by the GPU. + /// + /// @param [in] createInfo Data controlling the GPU memory properties, such as size and the allocation to pin. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPinnedGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the system memory was successfully pinned and a corresponding GPU memory object was created. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if createInfo.pSysMem, pPlacementAddr, or ppGpuMemory is null, or if + /// createInfo.pSysMem is not allocation granularity aligned. + /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. + /// + ErrorOutOfMemory if the creation failed because the system memory could not be pinned. + virtual Result CreatePinnedGpuMemory( + const PinnedGpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an SVM memory object. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// CreateSvmGpuMemory(). + /// + /// @param [in] createInfo Data controlling the SVM memory properties, such as size and type of SVM buffer. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuMemory object with the specified properties.
+ virtual size_t GetSvmGpuMemorySize( + const SvmGpuMemoryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an SVM (Shared Virtual Memory) IGpuMemory object. + /// The basic idea of SVM is to create system memory that has the same CPU and GPU virtual address + /// (i.e., "pointer is a pointer"). This can work in two modes: fine-grain, or coarse-grain. + /// + /// Fine-grain (Single-GPU): The client should just call this function with pReservedGpuVaOwner set to null. + /// PAL will allocate GPU-accessible system memory that will have the same CPU virtual address + /// (as returned by IGpuMemory::Map()) as GPU virtual address + /// (as returned in the gpuVirtAddr value returned by IGpuMemory::Desc()). + /// + /// Fine-grain (MGPU): The client can call this function with pReservedGpuVaOwner set to an IGpuMemory object + /// allocated on the first device and receive a mapping to the same GPU VA location on another device. + /// + /// Coarse-grain: In this mode, there are actually two separate IGpuMemory objects. + /// The client should first create the "local" GPU memory object by calling IDevice::CreateGpuMemory() to create + /// a standard GPU memory object in the VaRange::Svm VA space. Next, the client should create the "staging" GPU + /// memory by calling this function (CreateSvmGpuMemory) with pReservedGpuVaOwner pointing to the "local" GPU + /// memory object. PAL will create system memory for the "staging" GPU memory with a CPU virtual address matching + /// the "local" GPU memory's GPU virtual address. The GPU virtual address of the "staging" GPU memory is + /// arbitrarily assigned. The client is responsible for managing the contents of the two related allocations + /// per their API rules. + /// + /// @param [in] createInfo Data controlling the SVM memory properties, such as size and location of SVM buffer. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + /// much size available here as reported by calling GetSvmGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the SVM buffer was successfully created and a corresponding GPU memory object was created. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// + ErrorInvalidMemorySize if createInfo.memSize is not allocation granularity aligned. + /// + ErrorOutOfMemory if the creation failed because there is not enough GPU memory + /// or the system memory could not be pinned. + virtual Result CreateSvmGpuMemory( + const SvmGpuMemoryCreateInfo& createInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a GPU memory object created by opening an allocation from a + /// different GPU. + /// + /// An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// OpenSharedGpuMemory(). + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetSharedGpuMemorySize( + const GpuMemoryOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable GPU memory object created on another device for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object.
There must be as + /// much size available here as reported by calling GetSharedGpuMemorySize() with the + /// same params. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// + ErrorNotShareable if the specified memory object was not marked as shareable on creation. + virtual Result OpenSharedGpuMemory( + const GpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an external GPU memory object created by opening + /// an allocation from a compatible device, such as a D3D device. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetExternalSharedGpuMemorySize( + Result* pResult) const = 0; + + /// Opens an external shared memory object which is created by a compatible device, such as a D3D device. + /// There could be more than one underlying allocation in the shared memory object; only the allocation + /// created on the device's GPU will be opened, and other allocations will be ignored. + /// + /// @param [in] openInfo Open info. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetExternalSharedGpuMemorySize() + /// with the same params. + /// @param [out] pMemCreateInfo Returns the create info of the external shared GPU memory.
+ /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the shared memory was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuMemory is null. + /// NOTE(review): ppGpuMemory was previously listed twice here; pMemCreateInfo may have been intended - confirm. + /// + ErrorNotShareable if none of the allocations in the shared memory object is created on the device's GPU. + virtual Result OpenExternalSharedGpuMemory( + const ExternalGpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + GpuMemoryCreateInfo* pMemCreateInfo, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a proxy GPU memory object to a GPU memory object on a + /// different GPU. An allocation of this amount of memory must be provided in the pPlacementAddr parameter of + /// OpenPeerGpuMemory(). + /// + /// @param [in] openInfo Specifies a handle to a GPU memory object to open for peer-to-peer transfer access. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a peer IGpuMemory object with the specified + /// properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetPeerGpuMemorySize( + const PeerGpuMemoryOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a previously created GPU memory object for peer access on another device. + /// + /// @param [in] openInfo Specifies a handle to a shared GPU memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPeerGpuMemorySize() with the same + /// params. + /// @param [out] ppGpuMemory Constructed GPU memory object.
When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the memory was successfully opened for peer access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppGpuMemory, or openInfo.pOriginalMem is null. + virtual Result OpenPeerGpuMemory( + const PeerGpuMemoryOpenInfo& openInfo, + void* pPlacementAddr, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an image object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateImage(). + /// + /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an @ref IImage object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetImageSize( + const ImageCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Determines if the image will have faster clone copy (than normal CS or graphics copy) in @ref CmdCopyImage(). + /// If this function returns true, clients can set @ref ImageCreateInfo::flags::cloneable = 1 to enable clone + /// copy support in @ref CmdCopyImage(); however whether @ref CmdCopyImage() goes through clone copy or not depends + /// on multiple factors; please see the doxygen in @ref CmdCopyImage() for more details. + /// + /// Note that clients are free to set this flag to 1 regardless; it still enables clone copy potentially but can't + /// guarantee better performance. If clients find more cases where clone copy is faster, feel free to improve this + /// function to cover these kinds of images.
+ /// + /// Because the decision relies on @ref ImageCreateInfo values, ideally clients should set all other members + /// of @ref ImageCreateInfo first, and then call this function to set up the cloneable flag last before creating the + /// image. + /// + /// @param [in] createInfo Properties of the new image. + /// + /// @returns true if the image prefers clone copy in CmdCopyImage(); otherwise it goes through normal copy. + virtual bool ImagePrefersCloneCopy( + const ImageCreateInfo& createInfo) const = 0; + + /// Creates an @ref IImage object with the requested properties. + /// + /// @param [in] createInfo Properties of the new image such as pixel format and dimensions. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetImageSize() with the same + /// createInfo param. + /// @param [out] ppImage Constructed image object. When successful, the returned address will be the same as + /// specified in pPlacementAddr. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if: + /// - The image dimensions are invalid based on the image type. + /// - The image dimensions are not properly aligned for compressed formats. + /// - The number of samples is invalid for the image type and format. + /// - MSAA is enabled for an image that doesn't support color or depth usage. + /// - MSAA images have more than one mip level. + /// - The array size is zero, non-1 for 3D images, or beyond the max number of slices for 1D or 2D + /// images. + /// - The number of mipmaps is invalid for the image dimensions. + /// + ErrorInvalidPointer if pPlacementAddr or ppImage is null. + /// + ErrorInvalidFormat if: + /// - The format doesn't support the usage flags. + /// - A 1D image specifies a compressed format.
+ /// + ErrorInvalidFlags if: + /// - The color target and depth/stencil usages are specified simultaneously. + /// - The color target flag is set for a 1D image. + /// - The depth/stencil flag is set for a non-2D image. + virtual Result CreateImage( + const ImageCreateInfo& createInfo, + void* pPlacementAddr, + IImage** ppImage) = 0; + + /// Determines the amount of system memory required for a presentable image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of CreatePresentableImage(). + /// + /// Only images created through this interface are valid sources for IQueue::Present(). + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. + /// Should be specified to the pImagePlacementAddr argument of CreatePresentableImage(). + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for an IGpuMemory object attached to the + /// presentable IImage. Should be specified to the pGpuMemoryPlacementAddr argument + /// of CreatePresentableImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + virtual void GetPresentableImageSizes( + const PresentableImageCreateInfo& createInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + Result* pResult) const = 0; + + /// Creates a presentable image. A presentable image must have internally bound GPU memory allocated, as the OS + /// needs the image/memory information via OS callbacks. + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object.
There must + /// be as much size available here as reported by calling + /// GetPresentableImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct an IGpuMemory associated + /// with this presentable image. There must be as much size available here as + /// reported by calling GetPresentableImageSizes(). + /// @param [out] ppImage Constructed image object. + /// @param [out] ppGpuMemory Constructed memory object. This object is only valid for specifying in a + /// memory reference list. It must be destroyed when the image is destroyed. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + TooManyFlippableAllocations if the image was successfully created, but the client has reached the + /// limit of flippable allocations for this Device. This is a warning that future presentable Images + /// may fail to be created due to internal OS limitations. + /// + ErrorTooManyPresentableImages if the swap chain cannot be associated with more presentable images. + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, or ppGpuMemory is null. + /// + ErrorInvalidValue if: + /// - The image dimensions are invalid. + /// - The refresh rate is invalid for a fullscreen image. + /// + ErrorInvalidFormat if the format doesn't support presentation. + virtual Result CreatePresentableImage( + const PresentableImageCreateInfo& createInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an image object (and an associated memory object) opened for + /// peer access to an image created on another GPU. Allocations of these amounts of memory must be provided in the + /// pImagePlacementAddr and pGpuMemoryPlacementAddr parameters of OpenPeerImage().
+ /// + /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. + /// @param [out] pPeerImageSize Size, in bytes, of system memory required for a peer IImage. Should be + /// specified to the pImagePlacementAddr argument to OpenPeerImage(). + /// @param [out] pPeerGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached + /// to a peer IImage. Should be specified to the pGpuMemoryPlacementAddr argument + /// to OpenPeerImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + virtual void GetPeerImageSizes( + const PeerImageOpenInfo& openInfo, + size_t* pPeerImageSize, + size_t* pPeerGpuMemorySize, + Result* pResult) const = 0; + + /// Creates an @ref IImage object as a proxy to an IImage on another GPU to be used for peer-to-peer transfers. + /// + /// @note The @ref IImage object provided in the @ref PeerImageOpenInfo must be bound to an existing + /// @ref IGpuMemory object prior to opening a peer version of it. + /// If the new image needs to bind to an existing peer memory allocation, as when images are suballocated, + /// pGpuMemoryPlacementAddr must equal nullptr and ppGpuMemory must point to an existing IGpuMemory object + /// that was previously opened to reference the same memory from the current device. + /// @param [in] openInfo Specifies the image to be opened for peer access from another GPU. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must + /// be as much size available here as reported by calling GetPeerImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr If non-null, this is a pointer to the location where PAL should construct an + /// IGpuMemory to be associated with this peer image. There must be as much + /// size available here as reported by calling GetPeerImageSizes().
+ /// @param [out] ppImage Constructed image object. + /// @param [in,out] ppGpuMemory Constructed dummy memory object; when pGpuMemoryPlacementAddr is nullptr, it must + /// instead point to an existing IGpuMemory object on input (see the note above). + /// This object is only valid for specifying in a memory reference list. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage, ppGpuMemory, or + /// openInfo.pOriginalImage is null. + virtual Result OpenPeerImage( + const PeerImageOpenInfo& openInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an external shared image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of OpenExternalSharedImage(). + /// + /// @param [in] openInfo Specifies the external image to be opened. + /// @param [out] pImageSize Size, in bytes, of system memory required for pImagePlacementAddr. + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for pGpuMemoryPlacementAddr. + /// @param [out] pImgCreateInfo If non-null, it will be filled out with information describing the shared image. + /// @returns Success if the shared image was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pImageSize or pGpuMemorySize is null. + /// + ErrorNotShareable if none of the allocations in the shared image is created on the device's GPU. + virtual Result GetExternalSharedImageSizes( + const ExternalImageOpenInfo& openInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + ImageCreateInfo* pImgCreateInfo) const = 0; + + /// Opens an external shared image object which was created by a compatible device, such as a D3D device.
+ /// There could be more than one underlying allocation in the shared image object; only the allocation + /// created on the device's GPU will be opened, and other allocations will be ignored. + /// + /// @param [in] openInfo Specifies the external image to be opened. + /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct the image object. + /// There must be as much space available here as reported by calling + /// GetExternalSharedImageSizes() with the same params. + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct the GPU memory object. + /// There must be as much space available here as reported by calling + /// GetExternalSharedImageSizes() with the same params. + /// @param [out] pMemCreateInfo If non-null, it is filled with information describing the external GPU + /// memory that backs this external image. + /// @param [out] ppImage Constructed image object. When successful, the returned address will + /// be the same as specified in pImagePlacementAddr. + /// @param [out] ppGpuMemory Constructed GPU memory object. When successful, the returned address will + /// be the same as specified in pGpuMemoryPlacementAddr. + /// @returns Success if the shared image was successfully opened for access on this device. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pImagePlacementAddr, pGpuMemoryPlacementAddr, ppImage or ppGpuMemory is null. + /// + ErrorNotShareable if none of the allocations in the shared image object is created on the device's GPU. + virtual Result OpenExternalSharedImage( + const ExternalImageOpenInfo& openInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + GpuMemoryCreateInfo* pMemCreateInfo, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for a color target view object.
An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateColorTargetView(). + /// + /// Unlike most creation methods in this class, GetColorTargetViewSize() does not require a + /// ColorTargetViewCreateInfo parameter. PAL must guarantee that all color target view objects are the same size in + /// order to support DX12, where these views are treated similarly to SRDs. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IColorTargetView object. + virtual size_t GetColorTargetViewSize( + Result* pResult) const = 0; + + /// Creates an @ref IColorTargetView object with the requested properties. + /// + /// @param [in] createInfo Properties of the color target view to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetColorTargetViewSize(). + /// @param [out] ppColorTargetView Constructed color target view object. When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the color target view was successfully created. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppColorTargetView, or createInfo.pImage is null. + /// + ErrorInvalidValue if: + /// - The base slice is invalid for the given image object and view type. + /// - The number of array slices is zero or the range of slices is too large for the specified image. + /// - The mip level is invalid for the given image object. + /// + ErrorInvalidImage if the image object doesn't have the color target access flag set.
+ virtual Result CreateColorTargetView( + const ColorTargetViewCreateInfo& createInfo, + void* pPlacementAddr, + IColorTargetView** ppColorTargetView) const = 0; + + /// Determines the amount of system memory required for a depth/stencil view object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilView(). + /// + /// Unlike most creation methods in this class, GetDepthStencilViewSize() does not require a + /// DepthStencilViewCreateInfo parameter. PAL must guarantee that all depth/stencil view objects are the same size + /// in order to support DX12, where these views are treated similarly to SRDs. + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IDepthStencilView object. + virtual size_t GetDepthStencilViewSize( + Result* pResult) const = 0; + + /// Creates an @ref IDepthStencilView object with the requested properties. + /// + /// @param [in] createInfo Properties of the depth/stencil view to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be + /// as much size available here as reported by calling GetDepthStencilViewSize(). + /// @param [out] ppDepthStencilView Constructed depth/stencil view object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the depth/stencil view was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr, ppDepthStencilView, or createInfo.pImage is null. + /// + ErrorInvalidValue if: + /// - The base slice is invalid for the given image object and view type. + /// - The number of array slices is zero or the range of slices is too large for the specified image.
+ /// - The mip level is invalid for the given image object. + /// + ErrorInvalidImage if the image object doesn't have the depth/stencil target access flag set. + virtual Result CreateDepthStencilView( + const DepthStencilViewCreateInfo& createInfo, + void* pPlacementAddr, + IDepthStencilView** ppDepthStencilView) const = 0; + + /// Creates one or more typed buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.typedBufferView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should + /// be setup based on shader usage as described in @ref BufferViewInfo. The client should put the resulting SRD + /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly + /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// For performance reasons, this method returns void and does minimal error-checking. However, in debug builds, + /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: + /// + If pBufferViewInfo or pOut, is null. + /// + If count is 0. + /// + If pBufferViewInfo[].format is Undefined. + /// + If pBufferViewInfo[].stride does not match the size of an element of that format. + /// + If pBufferViewInfo[].gpuAddr is 0. + /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). 
+ /// + /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. + /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateTypedBufferViewSrds( + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateTypedBufViewSrds(this, count, pBufferViewInfo, pOut); } + + /// Creates one or more untyped buffer view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// These SRDs can be accessed in a shader as either _raw_ or _structured_ views. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.untypedBufferView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated buffer view SRD allows a range of a GPU memory allocation to be accessed by a shader, and should + /// be setup based on shader usage as described in @ref BufferViewInfo. The client should put the resulting SRD + /// in an appropriate location based on the shader resource mapping specified by the bound pipeline, either directly + /// in user data (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// For performance reasons, this method returns void and does minimal error-checking. However, in debug builds, + /// to assist clients' debug efforts, the following conditions will be checked with runtime assertions: + /// + If pBufferViewInfo or pOut, is null. + /// + If count is 0. + /// + If pBufferViewInfo[].format is not Undefined. 
+ /// + If pBufferViewInfo[].gpuAddr is 0. + /// + If pBufferViewInfo[].gpuAddr is not properly aligned to Min(4, pBufferViewInfo[].stride). + /// + /// @param [in] count Number of buffer view SRDs to create; size of the pBufferViewInfo array. + /// @param [in] pBufferViewInfo Array of buffer view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateUntypedBufferViewSrds( + uint32 count, + const BufferViewInfo* pBufferViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateUntypedBufViewSrds(this, count, pBufferViewInfo, pOut); } + + /// Validates image view SRD input parameters. Error checking for image view SRDs is handled by a separate + /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when + /// rebuilding SRDs. + /// + /// @param [in] viewInfo Input image view SRD parameter info. + /// + /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: + /// + ErrorImagePlaneUnavailable if the requested image plane specified in the view is not available on + /// the image. + /// + ErrorImageNotShaderAccessible if the image does not have a shader-readable or shader-writable usage. + /// + ErrorInvalidFormatSwizzle if the view's channel swizzle specifies components not available in the view + /// format. + /// + ErrorInvalidBaseMipLevel if the view's start subresource has a mip level larger than the number of + /// available mip levels. + /// + ErrorFormatIncompatibleWithImageFormat if the view's format is not compatible with the image's format. + /// This can happen if: + /// - For color plane views, the bit-depths of the two formats are not equal. + /// + ErrorFormatIncompatibleWithImagePlane if the view's format is not compatible with the image's plane.
+ /// This can happen if: + /// - For depth plane views, the bit-depths of the view format and the depth component of the image + /// are not equal. + /// - For stencil plane views, the bit-depths of the view format and the stencil component of the image + /// are not equal. + /// + ErrorInvalidViewArraySize if: + /// - The view array size is 0. + /// - The image type is 3D and the view array size is not 1. + /// + ErrorViewTypeIncompatibleWithImageType if: + /// - The image type is 1D and the view type is not 1D + /// - The image type is 2D and the view type is not 2D or cubemap + /// - The image type is 3D and the view type is not 3D + /// + ErrorInsufficientImageArraySize if the number of viewed array slices is more than available on + /// the image. + /// + ErrorCubemapIncompatibleWithMsaa if the view type is a cubemap view and the image has + /// multiple samples. + /// + ErrorCubemapNonSquareFaceSize if the view type is a cubemap view and the image 2D extents are not + /// square. + /// + ErrorInvalidViewBaseSlice + /// - If the image type is 3D and the view base slice is not 0. + /// + /// @ingroup ResourceBinding + virtual Result ValidateImageViewInfo(const ImageViewInfo& viewInfo) const = 0; + + /// Creates one or more image view _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.imageView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated image view SRD allows a set of subresources in an image to be accessed by a shader, and should + /// be setup as described in @ref ImageViewInfo. 
The client should put the resulting SRD in an appropriate + /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @warning SRDs for Planar YUV images will include padding if pImgViewInfo->subresRange.numSlices > 1 + /// + /// @param [in] count Number of image view SRDs to create; size of the pImgViewInfo array. + /// @param [in] pImgViewInfo Array of image view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateImageViewSrds( + uint32 count, + const ImageViewInfo* pImgViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateImageViewSrds(this, count, pImgViewInfo, pOut); } + + /// Validates fmask view SRD input parameters. Error checking for fmask view SRDs is handled by a separate + /// function for performance reasons and to avoid rechecking parameters that the client knows are correct when + /// rebuilding SRDs. + /// + /// @param [in] viewInfo Input fmask view SRD parameter info. + /// + /// @returns Success if the parameters pass validation. Otherwise, one of the following errors may be returned: + /// + ErrorImageFmaskUnavailable if the image does not have an FMask. + /// + ErrorInvalidViewArraySize if the view array size is 0. + /// + ErrorViewTypeIncompatibleWithImageType if the image type is not 2D. + /// + ErrorInsufficientImageArraySize if the view base array slice and size define an out of bounds array range. + /// + /// @ingroup ResourceBinding + virtual Result ValidateFmaskViewInfo(const FmaskViewInfo& viewInfo) const = 0; + + /// Creates one or more fmask view _shader resource descriptors (SRDs)_ in memory provided by the client.
+ /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.fmaskView + /// in DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated fmask view SRD allows a range of image slices to be accessed by the load_fptr IL instruction, + /// which allows a shader to read compressed MSAA data at the expense of a texture indirection. This SRD should be + /// setup as described in @ref FmaskViewInfo. The client should put the resulting SRD in an appropriate + /// location based on the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of fmask view SRDs to create; size of the pFmaskViewInfo array. + /// @param [in] pFmaskViewInfo Array of fmask view descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @ingroup ResourceBinding + void CreateFmaskViewSrds( + uint32 count, + const FmaskViewInfo* pFmaskViewInfo, + void* pOut) const + { m_pfnTable.pfnCreateFmaskViewSrds(this, count, pFmaskViewInfo, pOut); } + + /// Validates sampler SRD input parameters. Error checking for sampler SRDs is handled by a separate function for + /// performance reasons and to avoid rechecking parameters that the client knows are correct when rebuilding SRDs. + /// + /// @param [in] samplerInfo Input sampler SRD parameter info. + /// + /// @returns Success if the parameters pass validation.
Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if: + /// - The max anisotropy or LOD bias value is outside of the legal range. + /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. + /// - The border color palette index is out of the legal range. + /// + /// @ingroup ResourceBinding + virtual Result ValidateSamplerInfo(const SamplerInfo& samplerInfo) const = 0; + + /// Creates one or more sampler _shader resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.sampler in + /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated sampler SRD controls execution of sample instructions in a shader, and should be setup as + /// described in @ref SamplerInfo. The client should put the resulting SRD in an appropriate location based on the + /// shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of sampler SRDs to create; size of the pSamplerInfo array. + /// @param [in] pSamplerInfo Array of sampler descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pSamplerInfo or pOut is null.
+ /// + ErrorInvalidValue if: + /// - The max anisotropy or LOD bias value is outside of the legal range. + /// - The min/max LOD values are outside the legal range or if the max LOD is smaller than the min LOD. + /// - The border color palette index is out of the legal range. + /// + /// @ingroup ResourceBinding + void CreateSamplerSrds( + uint32 count, + const SamplerInfo* pSamplerInfo, + void* pOut) const + { m_pfnTable.pfnCreateSamplerSrds(this, count, pSamplerInfo, pOut); } + + /// Creates one or more _BVH resource descriptors (SRDs)_ in memory provided by the client. + /// + /// The client is responsible for providing _count_ times the amount of memory reported by srdSizes.bvhInfo in + /// DeviceProperties, and must also ensure the provided memory is aligned to the size of one SRD. + /// + /// The SRD can be created in either system memory or pre-mapped GPU memory. If updating GPU memory, the client + /// must ensure there are no GPU accesses of this memory in flight before calling this method. + /// + /// The generated BVH SRD controls execution of ray trace instructions in a shader, and should be setup as + /// described in @ref BvhInfo. The client should put the resulting SRD in an appropriate location based on + /// the shader resource mapping specified by the bound pipeline, either directly in user data + /// (ICmdBuffer::CmdSetUserData()) or a table in GPU memory indirectly referenced by user data. + /// + /// @param [in] count Number of BVH SRDs to create; size of the pBvhInfo array. + /// @param [in] pBvhInfo Array of BVH (bounding volume hierarchy) descriptions directing SRD construction. + /// @param [out] pOut Client-provided space where opaque, hardware-specific SRD data is written. + /// + /// @returns Success if the sampler SRD data was successfully written to pOut. Otherwise, one of the following + /// errors may be returned: + /// + ErrorInvalidPointer if pBvhInfo or pOut is null. 
+ /// + /// @ingroup ResourceBinding + void CreateBvhSrds( + uint32 count, + const BvhInfo* pBvhInfo, + void* pOut) const + { + m_pfnTable.pfnCreateBvhSrds(this, count, pBvhInfo, pOut); + } + + /// Decode a buffer SRD back into most of the BufferViewInfo used to create it via Create{Typed,Untyped}BufferViewSrds. + /// + /// @param [in] pBufferViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. + /// @param [out] pViewInfo Pointer to memory to be filled with decoded info. + void DecodeBufferViewSrd( + const void* pBufferViewSrd, + BufferViewInfo* pViewInfo) const + { + m_pfnTable.pfnDecodeBufferViewSrd(this, pBufferViewSrd, pViewInfo); + } + + /// Decode an image SRD back into the main parameters used to create it via CreateImageViewSrds(). + /// + /// @param [in] image The same image the SRD was created on. + /// @param [in] pImageViewSrd Pointer to the SRD. Should not be in dedicated GPU memory. + /// @param [out] pDecodedInfo Pointer to memory to be filled with decoded info. + void DecodeImageViewSrd( + const IImage& image, + const void* pImageViewSrd, + DecodedImageSrd* pDecodedInfo) const + { + m_pfnTable.pfnDecodeImageViewSrd(this, &image, pImageViewSrd, pDecodedInfo); + } + + /// The MSAA sample pattern palette is a client-managed table of sample patterns that might be in use by the app. + /// + /// The only purpose of this palette is to implement the samplepos shader instruction. This instruction returns the + /// position of a particular sample based on the sample pattern of the current rasterizer state or a particular + /// specified resource. When this instruction is executed, the shader will determine the correct palette index + /// as specified in the pipeline (see samplePatternIdx in the rsState structure inside GraphicsPipelineCreateInfo) + /// or in the image view SRD (see samplePatternIdx in ImageViewInfo). 
The shader will then return the position + /// for the specified sample in the specified entry of the currently bound sample pattern palette as set with this + /// function. + /// + /// The initial bound sample pattern palette values are undefined. A palette entry must be specified before it is + /// referenced by a samplepos instruction. + /// + /// @warning The samplepos instruction and sample pattern palette both assume every pixel has the same pattern. + /// This may not be accurate if the application uses custom sample patterns that aren't identical for + /// every pixel in the quad. + /// + /// @param [in] palette Small set of sample patterns. Each sample pattern consists of an x,y tuple per sample + /// describing where it is located in a pixel. The coordinate system is described in + /// MsaaStateCreateInfo. + /// + /// @returns Success if the palette was successfully updated. An error is only possible due to an internal error, + /// such as a failure to allocate GPU memory for a new table. + virtual Result SetSamplePatternPalette( + const SamplePatternPalette& palette) = 0; + + /// Determines the amount of system memory required for a border color palette object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateBorderColorPalette(). + /// + /// @param [in] createInfo Border color palette creation info (specifies number of entries). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IBorderColorPalette object with the specified + /// properties. A return value of 0 indicates the createInfo was invalid. + virtual size_t GetBorderColorPaletteSize( + const BorderColorPaletteCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a border color palette object. 
+ /// + /// @param [in] createInfo Border color palette creation info (number of entries). + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetBorderColorPaletteSize() with the + /// same createInfo param. + /// @param [out] ppPalette Constructed border color palette object. When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the border color palette was successfully created. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppPalette is null. + /// + ErrorInvalidValue if the palette size is too large to be used on any queue on this device. + virtual Result CreateBorderColorPalette( + const BorderColorPaletteCreateInfo& createInfo, + void* pPlacementAddr, + IBorderColorPalette** ppPalette) const = 0; + + /// Determines the amount of system memory required for a compute pipeline object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateComputePipeline(). + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetComputePipelineSize( + const ComputePipelineCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a compute @ref IPipeline object with the requested properties. + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. 
There must be as + /// much size available here as reported by calling GetComputePipelineSize() with the + /// same createInfo param. + /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppPipeline is null. + /// - A required shader pointer is null. + /// - The link time constant data pointer is null. + /// + ErrorInvalidValue if: + /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. + /// - The dynamic memory view mapping slot object type is not unused, resource, or UAV. + /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. + virtual Result CreateComputePipeline( + const ComputePipelineCreateInfo& createInfo, + void* pPlacementAddr, + IPipeline** ppPipeline) = 0; + + /// Determines the amount of system memory required for a shader library object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateShaderLibrary(). + /// + /// @param [in] createInfo Library creation parameters including ELF code object and other items. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IShaderLibrary object with the specified properties. + /// A return value of zero indicates the createInfo was invalid. + virtual size_t GetShaderLibrarySize( + const ShaderLibraryCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IShaderLibrary object with the requested properties. + /// + /// @param [in] createInfo Library creation parameters including ELF code object and other items.
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetShaderLibrarySize() with the + /// same createInfo parameter. + /// @param [out] ppLibrary Constructed library object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the library was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppLibrary is null. + /// - Required code object pointer is null. + virtual Result CreateShaderLibrary( + const ShaderLibraryCreateInfo& createInfo, + void* pPlacementAddr, + IShaderLibrary** ppLibrary) = 0; + + /// Determines the amount of system memory required for a graphics pipeline object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateGraphicsPipeline(). + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPipeline object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetGraphicsPipelineSize( + const GraphicsPipelineCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a graphics @ref IPipeline object with the requested properties. + /// If this pipeline is created from a set of @ref IShaderLibrary objects, it is the caller's responsibility to + /// ensure that the lifetime of the pipeline does not exceed the lifetime of the libraries themselves. + /// + /// @param [in] createInfo Pipeline properties including shaders and descriptor set mappings.
+ /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGraphicsPipelineSize() with the + /// same createInfo param. + /// @param [out] ppPipeline Constructed pipeline object. When successful, the returned address will be the same + /// as specified in pPlacementAddr. + /// + /// @returns Success if the pipeline was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if: + /// - pPlacementAddr or ppPipeline is null. + /// - A required shader pointer is null. + /// - The link time constant data pointer is null. + /// + ErrorInvalidValue if: + /// - The number of control points is invalid for a tessellation pipeline. + /// - Logic operations are enabled while some of the color targets enable blending. + /// - The dual source blend enable doesn't match expectations for color target and blend enable setup. + /// - The link constant buffer info pointer isn't consistent with the link constant buffer count value. + /// - The dynamic memory view mapping slot object type is not unused, resource, or UAV. + /// + ErrorInvalidFormat if: + /// - Blending is enabled but the color target format doesn't support blending. + /// - Logic operations are enabled but an incompatible format is used. + /// + ErrorUnsupportedShaderIlVersion if an incorrect shader type is used in any shader stage. + virtual Result CreateGraphicsPipeline( + const GraphicsPipelineCreateInfo& createInfo, + void* pPlacementAddr, + IPipeline** ppPipeline) = 0; + + /// Determines the amount of system memory required for a MSAA state object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateMsaaState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IMsaaState object. + /// This value will always be non-zero if the device has GfxIp support.
+ virtual size_t GetMsaaStateSize() const = 0; + + /// Creates an @ref IMsaaState object with the requested properties. + /// + /// @param [in] createInfo Properties of the MSAA state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetMsaaStateSize() with the same + /// createInfo param. + /// @param [out] ppMsaaState Constructed MSAA state object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the MSAA state was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppMsaaState is null. + /// + ErrorInvalidValue if: + /// - The number of samples is unsupported. + virtual Result CreateMsaaState( + const MsaaStateCreateInfo& createInfo, + void* pPlacementAddr, + IMsaaState** ppMsaaState) const = 0; + + /// Determines the amount of system memory required for a color blend state object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateColorBlendState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IColorBlendState object. + /// This value will always be non-zero if the device has GfxIp support. + virtual size_t GetColorBlendStateSize() const = 0; + + /// Creates an @ref IColorBlendState object with the requested properties. + /// + /// @param [in] createInfo Properties of the color blend state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetColorBlendStateSize() with the + /// same createInfo param. + /// @param [out] ppColorBlendState Constructed color blend state object. 
When successful, the returned address will + /// be the same as specified in pPlacementAddr. + /// + /// @returns Success if the color blend state was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppColorBlendState is null. + /// + ErrorInvalidValue if: + /// - An unsupported blend function is used with dual source blending. + virtual Result CreateColorBlendState( + const ColorBlendStateCreateInfo& createInfo, + void* pPlacementAddr, + IColorBlendState** ppColorBlendState) const = 0; + + /// Determines the amount of system memory required for a depth/stencil state object. An allocation of this amount + /// of memory must be provided in the pPlacementAddr parameter of CreateDepthStencilState(). + /// + /// @returns Size, in bytes, of system memory required for an @ref IDepthStencilState object. + /// This value will always be non-zero if the device has GfxIp support. + virtual size_t GetDepthStencilStateSize() const = 0; + + /// Creates an @ref IDepthStencilState object with the requested properties. + /// + /// @param [in] createInfo Properties of the depth/stencil state object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be + /// as much size available here as reported by calling GetDepthStencilStateSize() + /// with the same createInfo param. + /// @param [out] ppDepthStencilState Constructed depth/stencil state object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the depth/stencil state was successfully created. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppDepthStencilState is null. + /// + ErrorInvalidValue if: + /// - Depth bounds is enabled and the depth range is invalid. 
+ virtual Result CreateDepthStencilState( + const DepthStencilStateCreateInfo& createInfo, + void* pPlacementAddr, + IDepthStencilState** ppDepthStencilState) const = 0; + + /// Determines the amount of system memory required for a queue semaphore object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateQueueSemaphore(). + /// + /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore count. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueueSemaphore object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueueSemaphoreSize( + const QueueSemaphoreCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IQueueSemaphore object with the requested properties. + /// + /// @param [in] createInfo Data controlling the queue semaphore properties, such as an initial semaphore + /// count. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueueSemaphoreSize() with the + /// same createInfo param. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the queue semaphore was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. + /// + ErrorInvalidValue if createInfo.initialCount is outside of the [0..31] range.
+ virtual Result CreateQueueSemaphore( + const QueueSemaphoreCreateInfo& createInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + + /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore + /// from a different device. An allocation of this amount of memory must be provided in the pPlacementAddr + /// parameter of OpenSharedQueueSemaphore(). + /// + /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the + /// specified properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetSharedQueueSemaphoreSize( + const QueueSemaphoreOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable queue semaphore object created on another device for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetSharedQueueSemaphoreSize() with + /// the same params. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null. + /// + ErrorNotShareable if the specified queue semaphore object was not marked as shareable on creation. 
+ virtual Result OpenSharedQueueSemaphore( + const QueueSemaphoreOpenInfo& openInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + + /// Determines the amount of system memory required for a queue semaphore object created by opening a semaphore from + /// a different API which isn't a PAL client. An allocation of this amount of memory must be provided in the + /// pPlacementAddr parameter of OpenExternalSharedQueueSemaphore(). + /// + /// @param [in] openInfo Specifies a handle to a shared queue semaphore object to open. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for opening a shared IQueueSemaphore object with the + /// specified properties. A return value of 0 indicates the openInfo was invalid. + virtual size_t GetExternalSharedQueueSemaphoreSize( + const ExternalQueueSemaphoreOpenInfo& openInfo, + Result* pResult) const = 0; + + /// Opens a shareable queue semaphore object created on another API which isn't a PAL client for use on this device. + /// + /// @param [in] openInfo Specifies a handle to a queue semaphore memory object to open and flags. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling + /// GetExternalSharedQueueSemaphoreSize() with the same params. + /// @param [out] ppQueueSemaphore Constructed queue semaphore object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the shared semaphore was successfully opened for access on this device. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueueSemaphore is null.
+ virtual Result OpenExternalSharedQueueSemaphore( + const ExternalQueueSemaphoreOpenInfo& openInfo, + void* pPlacementAddr, + IQueueSemaphore** ppQueueSemaphore) = 0; + +#if PAL_KMT_BUILD + /// Opens an external NT handle from name. + /// + /// @param [in] handleInfo Specifies the NT handle attributes when opening from name. + /// @param [out] pHandle A NT handle returned by runtime. + /// + /// @returns Success if the NT handle was successfully opened. Otherwise, one of + /// the following errors may be returned: + /// + ErrorInvalidValue if the name or attributes are invalid. + virtual Result OpenExternalHandleFromName( + const ExternalHandleInfo& handleInfo, + OsExternalHandle* pHandle) = 0; +#endif + +#if (PAL_KMT_BUILD || PAL_AMDGPU_BUILD) + /// Query current gpu memory usage info and budget info of specified heap group of the device. + /// + /// @param [out] pInfo Heap usage and budget info reported from Os. + virtual Result QueryGpuMemoryBudgetInfo( + GpuMemoryBudgetInfo* pInfo) = 0; +#endif + + /// Determines the amount of system memory required for an IFence object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateFence(). + /// + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IFence object. + virtual size_t GetFenceSize( + Result* pResult) const = 0; + + /// Creates a GPU fence object. + /// + /// @param [in] createInfo Data controlling the fence properties + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetFenceSize(). + /// @param [out] ppFence Constructed fence object. When successful, the returned address will be + /// the same as specified in pPlacementAddr.
+ /// + /// @returns Success if the fence was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppFence is null. + virtual Result CreateFence( + const FenceCreateInfo& createInfo, + void* pPlacementAddr, + IFence** ppFence) const = 0; + + /// Opens a fence which was shared by another Device. + /// + /// @param [in] openInfo A reference to FenceOpenInfo, the handle is used if it's not null, or the + /// event is opened via name. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetFenceSize(). + /// @param [out] ppFence Constructed fence object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// @returns Success if the event was successfully reconstructed, otherwise an appropriate error code. + virtual Result OpenFence( + const FenceOpenInfo& openInfo, + void* pPlacementAddr, + IFence** ppFence) const = 0; + + /// Determines the amount of system memory required for an IGpuEvent object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateGpuEvent(). + /// + /// @param [in] createInfo Properties of the GPU event object to create. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IGpuEvent object. + virtual size_t GetGpuEventSize( + const GpuEventCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a GPU event object. + /// + /// @param [in] createInfo Properties of the GPU event object to create. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetGpuEventSize().
+ /// @param [out] ppGpuEvent Constructed event object. When successful, the returned address will be the same as + /// specified in pPlacementAddr. + /// + /// @returns Success if the event was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGpuEvent is null. + virtual Result CreateGpuEvent( + const GpuEventCreateInfo& createInfo, + void* pPlacementAddr, + IGpuEvent** ppGpuEvent) = 0; + + /// Determines the amount of system memory required for a query pool object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateQueryPool(). + /// + /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are in + /// the pool. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IQueryPool object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetQueryPoolSize( + const QueryPoolCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an @ref IQueryPool object with the requested properties. + /// + /// @param [in] createInfo Data controlling the query pool, such as what type of queries and how many slots are + /// in the pool. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetQueryPoolSize() with the same + /// createInfo param. + /// @param [out] ppQueryPool Constructed query pool object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if the query pool was successfully created.
Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppQueryPool is null. + /// + ErrorInvalidValue if createInfo.numSlots is zero. + virtual Result CreateQueryPool( + const QueryPoolCreateInfo& createInfo, + void* pPlacementAddr, + IQueryPool** ppQueryPool) const = 0; + + /// Determines the amount of system memory required for a command allocator object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateCmdAllocator(). + /// + /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ICmdAllocator object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetCmdAllocatorSize( + const CmdAllocatorCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a command allocator object that can allocate GPU memory with the specified properties for use by command + /// buffer objects. + /// + /// @param [in] createInfo Command allocator properties including GPU memory allocation sizes. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetCmdAllocatorSize() with the + /// same createInfo param. + /// @param [out] ppCmdAllocator Constructed command allocator object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the command allocator was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppCmdAllocator is null. 
+ virtual Result CreateCmdAllocator( + const CmdAllocatorCreateInfo& createInfo, + void* pPlacementAddr, + ICmdAllocator** ppCmdAllocator) = 0; + + /// Determines the amount of system memory required for a command buffer object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreateCmdBuffer(). + /// + /// @param [in] createInfo Command buffer properties including the target queue type. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ICmdBuffer object with the specified properties. A + /// return value of 0 indicates the createInfo was invalid. + virtual size_t GetCmdBufferSize( + const CmdBufferCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a command buffer object that can build work intended for a particular queue type. + /// + /// @param [in] createInfo Command buffer properties including the target queue type. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetCmdBufferSize() with the same + /// createInfo param. + /// @param [out] ppCmdBuffer Constructed command buffer object. When successful, the returned address will be + /// the same as specified in pPlacementAddr. + /// + /// @returns Success if the command buffer was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppCmdBuffer is null. + virtual Result CreateCmdBuffer( + const CmdBufferCreateInfo& createInfo, + void* pPlacementAddr, + ICmdBuffer** ppCmdBuffer) = 0; + + /// Determines the amount of system memory required for an indirect command generator object. 
An allocation of this + /// amount must be provided in the pPlacementAddr parameter of CreateIndirectCmdGenerator(). + /// + /// @param [in] createInfo Indirect command generator properties. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid the + /// additional validation steps. + /// + /// @returns Size, in bytes, of system memory required for an IIndirectCmdGenerator object with the specified + /// properties. A return value of zero indicates the createInfo was invalid. + virtual size_t GetIndirectCmdGeneratorSize( + const IndirectCmdGeneratorCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates an indirect command generator object which can translate an application-specified command buffer into a + /// format understandable by the GPU. + /// + /// @param [in] createInfo + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetIndirectCmdGeneratorSize() with + /// the same createInfo param. + /// @param [out] ppGenerator Constructed indirect command generator object. When successful, the returned + /// address will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the command generator was successfully created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppGenerator is null. + virtual Result CreateIndirectCmdGenerator( + const IndirectCmdGeneratorCreateInfo& createInfo, + void* pPlacementAddr, + IIndirectCmdGenerator** ppGenerator) const = 0; + + /// Determines the amount of system memory required for a perf experiment object. An allocation of this amount of + /// memory must be provided in the pPlacementAddr parameter of CreatePerfExperiment(). + /// + /// @param [in] createInfo Properties of the performance experiment to be created. 
+ /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an IPerfExperiment object with the specified properties. + /// A return value of 0 indicates the createInfo was invalid. + virtual size_t GetPerfExperimentSize( + const PerfExperimentCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Creates a performance experiment object that can gather performance counter and trace data for a specific span + /// of a command buffer. + /// + /// @param [in] createInfo Properties of the performance experiment to be created. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetPerfExperimentSize() with the + /// same createInfo param. + /// @param [out] ppPerfExperiment Constructed performance experiment object. When successful, the returned address + /// will be the same as specified in pPlacementAddr. + /// + /// @returns Success if the perf experiment was successfully created. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if pPlacementAddr or ppPerfExperiment is null. + virtual Result CreatePerfExperiment( + const PerfExperimentCreateInfo& createInfo, + void* pPlacementAddr, + IPerfExperiment** ppPerfExperiment) const = 0; + + /// Gets @ref IPrivateScreen objects owned by this device. Private screens are screens not exposed through standard + /// OS mechanisms. This function should be called again when any of the private screens are plugged or unplugged. + /// The first call to this function enumerates all private screens and stores in device object as well. The next + /// call triggered by hot-plug event enumerates private screens again but only destroys removed ones and creates + /// new private screen objects for newly-added ones.
The hash code generated at enumeration time is used as id of + /// private screens. If the id of an enumerated private screen already exists, it is treated as unchanged. The EDID + /// array and display index are used to generate MD5 hash code. + /// + /// @param [out] pNumScreens Pointer to the number of private screens, note that this number does not mean first + /// *pNumScreens elements in ppScreens are valid but just a hint that total *pNumScreens + /// out of MaxPrivateScreens are valid. + /// @param [out] ppScreens Pointer to the array of private screens. The client must pass in the pointer to an + /// array of at least MaxPrivateScreens pointers to IPrivateScreen. + /// + /// @returns Success if the private screens are correctly retrieved. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if the device does not support private screen functionalities. + virtual Result GetPrivateScreens( + uint32* pNumScreens, + IPrivateScreen** ppScreens) = 0; + + /// Registers an emulated @ref IPrivateScreen object owned by this device. An emulated private screen doesn't have + /// a physical display hardware connected to the GPU. This could be useful for debugging unusual configurations or + /// using in automation systems when no real HMDs are available. + /// + /// @param [in] createInfo Properties used to create an emulated private screen object. + /// @param [out] pTargetId Pointer to returned emulated private screen target id. + /// + /// @returns Success if the emulated private screen is correctly created. Otherwise, one of the following errors may + /// be returned: + /// + ErrorTooManyPrivateScreens if the device cannot create an emulated private screen. + virtual Result AddEmulatedPrivateScreen( + const PrivateScreenCreateInfo& createInfo, + uint32* pTargetId) = 0; + + /// Removes an emulated @ref IPrivateScreen object owned by this device. + /// + /// @param [in] targetId Target id of emulated private screen to be removed.
+ /// + /// @returns Success if the emulated private screen is correctly removed. Otherwise, one of the following errors may + /// be returned: + /// + ErrorUnknown if any unknown error occurs. + virtual Result RemoveEmulatedPrivateScreen( + uint32 targetId) = 0; + + /// Determines the amount of system memory required for a private screen image object (and an associated memory + /// object). Allocations of these amounts of memory must be provided in the pImagePlacementAddr and + /// pGpuMemoryPlacementAddr parameters of CreatePrivateScreenImage(). + /// + /// Only images created through this interface are valid sources for IPrivateScreen::Present(). + /// + /// @param [in] createInfo Properties of the image to create such as width/height and pixel format. + /// @param [out] pImageSize Size, in bytes, of system memory required for the IImage. + /// Should be specified to the pImagePlacementAddr argument of CreatePrivateScreenImage(). + /// @param [out] pGpuMemorySize Size, in bytes, of system memory required for a dummy IGpuMemory object attached to + /// the private screen IImage. Should be specified to the pGpuMemoryPlacementAddr + /// argument of CreatePrivateScreenImage(). + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + virtual void GetPrivateScreenImageSizes( + const PrivateScreenImageCreateInfo& createInfo, + size_t* pImageSize, + size_t* pGpuMemorySize, + Result* pResult) const = 0; + + /// Creates private screen presentable image. A private screen presentable image is similar to a regular presentable + /// image but can only be presented on the private screens. It has some implicit properties relative to standard + /// images, such as mipLevels=1, arraySize=1, numSamples=1 and etc. It also requires its bound GPU memory to be + /// pinned before presenting. + /// + /// @param [in] createInfo Create info.
+ /// @param [in] pImagePlacementAddr Pointer to the location where PAL should construct this object. There must + /// be as much size available here as reported by calling + /// GetPrivateScreenImageSizes(). + /// @param [in] pGpuMemoryPlacementAddr Pointer to the location where PAL should construct a IGpuMemory associated + /// with this peer image. There must be as much size available here as + /// reported by calling GetPrivateScreenImageSizes(). + /// @param [out] ppImage Constructed image object. + /// @param [out] ppGpuMemory Constructed dummy memory object. This object is only valid for specifying + /// in a memory reference list. + /// + /// @returns Success if the image was successfully created. Otherwise, one of the following errors may be returned: + /// + ErrorPrivateScreenInvalidFormat if the format isn't supported on the private screen. + /// + ErrorPrivateScreenRemoved if the private screen was removed. + virtual Result CreatePrivateScreenImage( + const PrivateScreenImageCreateInfo& createInfo, + void* pImagePlacementAddr, + void* pGpuMemoryPlacementAddr, + IImage** ppImage, + IGpuMemory** ppGpuMemory) = 0; + + /// Determines the amount of system memory required for an ISwapChain object. An allocation of this amount of memory + /// must be provided in the pPlacementAddr parameter of CreateSwapChain(). + /// + /// @param [in] createInfo All the information related with this swap chain. + /// @param [out] pResult The validation result if pResult is non-null. This argument can be null to avoid + /// the additional validation. + /// + /// @returns Size, in bytes, of system memory required for an ISwapChain object. + virtual size_t GetSwapChainSize( + const SwapChainCreateInfo& createInfo, + Result* pResult) const = 0; + + /// Create swap chain object based on the local window system. It doesn't include creation of presentable images. + /// The presentable images should be associated with SwapChain object when presentable image is created. 
+ /// + /// @param [in] createInfo All the information related with this swap chain. + /// @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as + /// much size available here as reported by calling GetSwapChainSize(). + /// @param [out] ppSwapChain Constructed swapchain object. When successful, the returned address will be the + /// same as specified in pPlacementAddr. + /// + /// @returns Success if create swap chain instance successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result CreateSwapChain( + const SwapChainCreateInfo& createInfo, + void* pPlacementAddr, + ISwapChain** ppSwapChain) = 0; + + /// Sets a power profile for this device. + /// + /// @param [in] profile A profile is a pre-defined configuration indicates how KMD/PPLib is notified to work, + /// e.g. raise or lower the GPU clock etc. + /// @param [in,out] pInfo Custom power profile info needed for VrCustom mode, can be null for other modes. Note the + /// actualSwitchInfo[] field is output part of @ref CustomPowerProfile. + /// + /// @returns Success if the profile is set successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result SetPowerProfile( + PowerProfile profile, + CustomPowerProfile* pInfo) = 0; + + /// Sends an escape call to the KMD to enable power optimizations for DirectML/ROCm workloads. The client must call + /// this function when the context for any ROCm or DirectML workload is created or destroyed. + /// + /// @param [in] enableOptimization Set to true if called during context creation (enable power optimizations), + /// false if context destroyed (restore defaults). + /// + /// @returns Success if the power optimization is set successfully. 
Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result SetMlPowerOptimization( + bool enableOptimization) const = 0; + + /// Queries workstation caps on this device. + /// + /// @param [out] pCaps Pointer to location where pal should write back workstation caps. + /// + /// @returns Success if workstation caps are retrieved from KMD successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorOutOfMemory if out of system memory. + virtual Result QueryWorkStationCaps( + WorkStationCaps* pCaps) const = 0; + + /// Queries display connectors installed on the GPU + /// + /// @param [in,out] pConnectorCount Input value specifies the maximum number of connectors to enumerate, and the + /// output value specifies the total number of connectors that were enumerated + /// in pConnectors. The input value is ignored if pConnectors is null. + /// This pointer must not be null. + /// @param [out] pConnectors Output list of connectors. Can be null, in which case the total number of + /// available connectors will be written to pConnectorCount. + /// + /// + /// @returns Success if the connectors were queried successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this OS. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result QueryDisplayConnectors( + uint32* pConnectorCount, + DisplayConnectorProperties* pConnectors) = 0; + + /// Queries the dispatch kernel source code + /// + /// @returns Returns the source code for the kernel dispatch function used in + /// the kernel scheduler for device enqueue (CL C code) or nullptr if it's not available. + /// + /// @note This function is to support OpenCL AQL submissions.
+ virtual const char* GetDispatchKernelSource() const = 0; + + /// @} + + /// Query the Flgl state from the device. Device will query the connectivity of GLSync card and return + /// the state. Pal internal state of Flgl will be updated. + /// + /// @param [out] pState Pointer to the location that PAL should write the internal flgl states back. + /// + /// @returns Success if query returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglQueryState( + FlglState* pState) = 0; + + /// Set the Flgl config of the device. + /// + /// @param [in] glSyncConfig const reference to the config struct. + /// + /// @returns Success if setting returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occurs. + /// + ErrorUnsupported if this GenLock function is not available. + virtual Result FlglSetSyncConfiguration( + const GlSyncConfig& glSyncConfig) = 0; + + /// Get the Flgl config of the device. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [out] pGlSyncConfig Pointer to the location that PAL should write the config back. + /// + /// @returns Success if query returns with success. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an unexpected internal error occurs. + /// + ErrorInvalidPointer if pGlSyncConfig is a null pointer. + virtual Result FlglGetSyncConfiguration( + GlSyncConfig* pGlSyncConfig) const = 0; + + /// Set the Framelock to disable or enable. Client should call this interface first to enable/disable Flgl. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [in] enable If true enables KMD framelock, otherwise disables framelock.
+ /// + /// @returns Success if framelock enable/disable successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not supported on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglSetFrameLock( + bool enable) = 0; + + /// Set the Genlock to disable or enable. + /// This function cannot be called if FlglState's support value is FlglSupport::NotAvailable. + /// + /// @param [in] enable If true enables the genlock, otherwise disables genlock. + /// + /// @returns Success if genlock enable/disable successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not supported on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglSetGenLock( + bool enable) = 0; + + /// Reset the framelock HW counter. The following counter operations are directly submit to hardware via I2C + /// interface Pal doesn't store the counter internally. Client should manage the counter + /// + /// @returns Success if the HW counter is reset successfully. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this function is not available on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglResetFrameCounter() const = 0; + + /// Check if a reset/discontinuity of HW framecounter occurs. If returns false, there is no need to query HW frame + /// counter, client should update its software counter instead. If returns true, client is required to query HW + /// counter and adjusts its software counter accordingly. + /// + /// @param [out] pReset Pointer to the location that PAL should write the reset status back. + /// + /// @returns Success if the reset status is returned. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if this function is not available on this Asic. 
+ /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglGetFrameCounterResetStatus( + bool* pReset) const = 0; + + /// Get the framelock HW counter. + /// + /// @param [out] pValue Pointer to the location that PAL should write the frame counter value back. + /// @param [out] pReset Pointer to the location that PAL should write the frame counter reset state. + /// + /// @returns Success if the frame counter is returned. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if this function is not available on this Asic. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result FlglGetFrameCounter( + uint64* pValue, + bool* pReset) const = 0; + + /// Checks if the specified externally-controlled feature settings have changed since the last time the function was + /// called. + /// + /// This is intended to be a lightweight function that can be called per frame per feature. If the function + /// returns Result::Success and (*pRsFeaturesChanged & RsFeatureTypeXX) != 0, then the user changed some related + /// settings in the UI. + /// + /// If TurboSync has updated, the client should first try to re-read the application profile settings by calling + /// IPlatform::QueryRawApplicationProfile() with client = User3D. If that returns Unsupported, then fall back + /// to device-wide TurboSync settings read via GetRsFeatureGlobalSettings(). + /// + /// If Chill has updated, call IPlatform::QueryRawApplicationProfile() with client = Chill to re-read the + /// system app profiles and then with client = User3D for any per-user Chill overrides, and additionally + /// call GetRsFeatureGlobalSettings() to get the Chill enabled state. + /// + /// If Delag has updated, call IPlatform::QueryRawApplicationProfile() with client = User3D to get the enabled + /// state, and additionally call GetRsFeatureGlobalSettings() to get the Delag hotkey. 
+ /// + /// + /// @param [in] rsFeatures Bitmask of RsFeatureType value(s) to query. Use UINT_MAX to poll all. + /// @param [out] pRsFeaturesChanged Bitmask of queried RsFeatureTypes that have changed since last polling. + /// + /// @returns Success if the call succeeded. + virtual Result DidRsFeatureSettingsChange( + uint32 rsFeatures, + uint32* pRsFeaturesChanged) = 0; + + /// Gets externally-controlled per-device settings for the requested RsFeatureType. + /// + /// @param [in] rsFeature Feature type to request information for (singular, not a mask). + /// @param [out] pRsFeatureInfo Settings related to the specified RsFeatureType. + /// + /// @returns Success if the call succeeded. + virtual Result GetRsFeatureGlobalSettings( + RsFeatureType rsFeature, + RsFeatureInfo* pRsFeatureInfo) = 0; + + /// Update Chill Status (last active time stamp). After every frame, UMD needs to generate a time stamp and inform + /// KMD through the shared memory, if the time stamp changes between 2 frames, it means Chill is active and KMD + /// needs to adjust power through PSM. + /// + /// @param [in] lastChillActiveTimeStampUs the last Chill active time stamp in microseconds to set + /// + /// @returns Success if the call succeeded. + virtual Result UpdateChillStatus( + uint64 lastChillActiveTimeStampUs) = 0; + + /// Make the Bus Addressable allocations available to be accessed by remote device. + /// Exposes the surface and marker bus addresses for each allocation. These bus addresses can be accessed by + /// calling @ref IGpuMemory::Desc() on the appropriate object. + /// Client drivers must call @ref AddGpuMemoryReferences() for all relevant allocations before calling this. + /// + /// @param [in] pQueue Queue used by PAL for performing this operation. + /// @param [in] gpuMemCount Number of GPU memory allocations to expose to remote devices. + /// @param [in] ppGpuMemList Array of gpuMemCount IGpuMemory objects. 
+ /// + /// @returns Success if bus addresses are available by calling @ref IGpuMemory::Desc() on all IGpuMemory objects + virtual Result InitBusAddressableGpuMemory( + IQueue* pQueue, + uint32 gpuMemCount, + IGpuMemory*const* ppGpuMemList) = 0; + + /// Create virtual display. Virtual display is similar to the regular display (IScreen), the difference is the + /// virtual display doesn't have a physical monitor connected. When CreateVirtualDisplay is called, KMD will + /// generate a hot-plug-in event to notify application a new display is added. Then the app/client will call PAL to + /// re-querythe attached screens and they will find a new one in the list that is pretend, but they can use it just + /// like a normal display. + /// + /// @param [in] virtualDisplayInfo Virtual display creation infomation. + /// @param [out] pScreenTargetId The screen target ID returned by KMD + /// + /// @returns Success if the call succeeded. + virtual Result CreateVirtualDisplay( + const VirtualDisplayInfo& virtualDisplayInfo, + uint32* pScreenTargetId) = 0; + + /// Destroy virtual display. When DestroyVirtualDisplay is called KMD will generate a hot-plug-out event to notify + /// application a virtual display is removed, it also will be removed from the display list, and app/client can't + /// uses it anymore. + /// + /// @param [in] screenTargetId Screen target ID. + /// + /// @returns Success if the call succeeded. + virtual Result DestroyVirtualDisplay( + uint32 screenTargetId) = 0; + + /// Query virtual display Properties from screen target Id. + /// + /// @param [in] screenTargetId Screen target ID. + /// @param [out] pVirtualDisplayProperties A pointer to VirtualDisplayProperties + /// + /// @returns Success if the call succeeded. + virtual Result GetVirtualDisplayProperties( + uint32 screenTargetId, + VirtualDisplayProperties* pProperties) = 0; + + /// Determines if hardware accelerated stereo rendering can be enabled for given graphic pipeline. 
+ /// If hardware accelerate stereo rendering can be enabled, client doesn't need to do shader patching + /// which includes translating view id intrinsic to user data slot, outputing render target + /// array index and viewport array index in shader closest to scan converter. + /// + /// @param [in] viewInstancingInfo Graphic pipeline view instancing information. + /// + /// @returns True if hardware accelerated stereo rendering can be enabled, False otherwise. + virtual bool DetermineHwStereoRenderingSupported( + const GraphicPipelineViewInstancingInfo& viewInstancingInfo) const = 0; + + /// Get the UTF-8 file path used to put all files for cache purpose. + /// On Windows, the returned path depends on Util::GetProcessIntegrityLevel and Util::IsProcessInAppContainer. + /// + /// @returns Pointer to cache file path. + virtual const char* GetCacheFilePath() const = 0; + + /// Get the UTF-8 file path used to put all files for debug purpose (such as logs, dumps, replace shader). + /// + /// @returns Pointer to debug file path. + virtual const char* GetDebugFilePath() const = 0; + + /// Queries the base driver Radeon Software Version string (as shown in Radeon Settings). + /// + /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written. + /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). + /// + /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors + /// may be returned: + /// + Unsupported if this function is not available on this environment. + /// + NotFound if the Radeon Software Version string is not present. + /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. + virtual Result QueryRadeonSoftwareVersion( + char* pBuffer, + size_t bufferLength) const = 0; + + /// Queries the base Driver Release Version string. + /// + /// @param [out] pBuffer A non-null pointer to the buffer where the string will be written. 
+ /// @param [in] bufferLength The byte size of the string buffer (must be non-zero). + /// + /// @returns Success if the string was successfully retrieved. Otherwise, one of the following errors + /// may be returned: + /// + Unsupported if this function is not available on this environment. + /// + NotFound if the Release Version string is not present. + /// + ErrorInvalidValue if nullptr was passed for pBuffer or 0 for bufferLength. + virtual Result QueryReleaseVersion( + char* pBuffer, + size_t bufferLength) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +#if defined(__unix__) + /// Gets the modifier list and modifier count according to image format. + /// + /// @param [in] format Image format. + /// @param [out] pModifierCount Modifier count. + /// @param [out] pModifiersList A pointer to the modifier list. + virtual void GetModifiersList( + ChNumFormat format, + uint32* pModifierCount, + uint64* pModifiersList) const = 0; +#endif + + /// Passes the HIP runtime state setup to KMD on behalf of the HIP runtime. + /// + /// @param [in] runtimeState A structure containing runtime state information to pass to KMD + /// + /// @returns Result for error handling. 
+ virtual Result RegisterHipRuntimeState(const HipRuntimeSetup& runtimeState) const = 0; + + /// Sets the second-level trap handler for HIP + /// + /// @param [in] pTrapHandlerCode A pointer to the piece of memory containing the trap handler code + /// This may be nullptr, which indicates that there is no secondary trap handler. + /// @param [in] codeOffset An offset, in bytes, into the pTrapHandlerCode's memory region + /// @param [in] pTrapHandlerMemory A pointer to the piece of memory containing the trap handler's memory + /// This may be nullptr, which indicates that there is no valid trap handler + /// memory. + /// @param [in] memoryOffset An offset, in bytes, into the pTrapHandlerMemory's memory region + /// + /// @returns Result for error handling. + virtual Result SetHipTrapHandler( + const IGpuMemory* pTrapHandlerCode, + gpusize codeOffset, + const IGpuMemory* pTrapHandlerMemory, + gpusize memoryOffset) const = 0; + + /// Sets the CmdDisassembly::ICmdBufferReporting associated with this device + /// Currently only one can be registered at a time - but this could be changed in the future + /// + /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting being + /// registered + /// @return Result::Success if successful + /// Result::Unsupported if feature is not supported + /// Result::AlreadyExists if there is already a CmdDisassembly::ICmdBufferReporting + /// registered with this devide + /// Result::ErrorInvalidValue if pInterface == nullptr + /// + virtual Result RegisterCmdReportingInterface( + CmdDisassembly::ICmdBufferReporting* pInterface) + { + return Result::Unsupported; + } + + /// Resets the CmdDisassembly::ICmdBufferReporting associated with this device to nullptr + /// Currently only one can be registered at a time - but this could be changed in the future + /// + /// @param [in] pInterface A pointer to the CmdDisassembly::ICmdBufferReporting that + /// was registered + /// @return Result::Success if successful + /// 
Result::Unsupported if feature is not supported + /// Result::ErrorInvalidValue if pInterface == nullptr or pInterface was not the last + /// CmdDisassembly::ICmdBufferReporting registered + /// + virtual Result UnregisterCmdReportingInterface( + CmdDisassembly::ICmdBufferReporting* pInterface) + { + return Result::Unsupported; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IDevice() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Device objects will be destroyed when + /// @ref IPlatform::Destroy() is called. + virtual ~IDevice() { } + + DeviceInterfacePfnTable m_pfnTable; ///< SRD function pointer table. + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +/** + *********************************************************************************************************************** + * @defgroup ResourceBinding Resource Binding Model + * + * _Resource binding_ refers to the process of binding resources (textures, UAVs, samplers, etc.) for access by shaders + * in a pipeline. This is an area where 3D APIs diverge significantly. PAL's model is designed to minimally abstract + * the underlying hardware in a way that allows performant implementations by each client driver. + * + * ### Hardware User Data + * GCN hardware has 16 _user data_ registers that act as a generic interface for passing values from a command buffer to + * a shader. User data registers are set to their desired value via packets in a command buffer, then the specified + * values are loaded from the user data registers into shader GPRs when a wave is launched. 
Since the user data is just + * arbitrary generic data, this method can be used to pass any type of data a client may want to specify directly from + * a command buffer, for example: + * + * - __Constant value__ - a 32-bit floating point or integer constant could be written into user data then be used + * directly by the shader. + * - __Shader resource descriptor (SRD)__ - 4 or 8 dwords of consecutive data could be an SRD which will be used as a + * t#, s#, etc. by the shader. + * - __Pointer__ - The user data could be an arbitrary GPU virtual address where a table of constants, SRDs, etc. are + * stored. + * + * The 3D driver and shader compiler are responsible for working together to define how resources referenced in a shader + * should be mapped to user data bound in a command buffer. + * + * ### PAL User Data + * PAL only lightly abstracts the hardware user data concept. DeviceProperties reports the number of user data entries + * supported on the device in maxUserDataEntries. Note that some clients may require more user data entries than there + * are physical user data registers - PAL will manage "spilling" of user data entries to GPU memory if necessary. + * + * User data entries are set in a command buffer by calling ICmdBuffer::CmdSetUserData(). + * + * ### Shader User Data Mapping + * When creating a pipeline, the client must specify how the user data entries set in a command buffer map to resources + * referenced by each shader in the pipeline. This is done in the pUserDataNodes array of PipelineShaderInfo. + * + * The resource mapping is built as a graph of _resource mapping nodes_ where the root nodes in the graph correspond + * to the user data entries. Each node fits in one of the following categories: + * + * - __SRD__: A 4 or 8 dword descriptor describing a shader resource. The mapping specifies the type and slot the SRD + * corresponds to (e.g., UAV 3 or sampler 7). 
+ * - __Descriptor table pointer__: A GPU virtual address pointing at an array of other nodes. Typically this will be a + * pointer to GPU memory containing just SRDs, but tables are free to be built hierarchically such that tables have + * pointers to other tables in them. + * - __Inline constants__: 32-bit constants loaded directly byu the shader. The mapping specified the CB slot that + * should load the constant (e.g., cb3[1]). + * - __Unused__: A particular shader may not use all entries in a user data layout, and those should be marked unused. + * + * The following image illustrates a simple user data mapping: + * + * @image html userDataMapping.png + * + * ### Building Descriptor Tables + * The client is responsible for building specifying SRDs and pointers to GPU memory in order to execute the shader + * resource mapping specified during pipeline creation. SRDs can be created with several methods provided by IDevice: + * + * - CreateTypedBufferViewSrds() + * - CreateUntypedBufferViewSrds() + * - CreateImageViewSrds() + * - CreateFmaskViewSrds() + * - CreateSamplerSrds() + * + * The size required for each of these SRD types is returned in the srdSizes structure in DeviceProperties. + * + * When building descriptor tables in GPU memory, the client will need to retrieve a virtual address of the GPU memory + * where the tables exist in order to reference them from user data or from other descriptor tables. IGpuMemory + * provides the GetVirtAddr() method for this purpose. 
+ *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFence.h b/shared/amdgpu-windows-interop/pal/inc/core/palFence.h index bea43aa41e..0d2e3012f9 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palFence.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFence.h @@ -1,171 +1,171 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palFence.h - * @brief Defines the Platform Abstraction Library (PAL) IFence interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" - -namespace Pal -{ - -/// Specifies properties for fence @ref IFence fence creation. Input structure to IDevice::CreateFence(). -struct FenceCreateInfo -{ - union - { - struct - { - uint32 signaled : 1; ///< Specify whether the initial status of the fence is signaled or not. - uint32 eventCanBeInherited : 1; ///< The event handle can be inherited by child process. - uint32 shareable : 1; ///< This fence may be opened for use by a different device. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Fence creation flags. -#if defined(_WIN32) - const wchar_t* pName; /// The name of the event object, Windows uses this name to uniquely identify fence objects - /// across processes. -#endif -}; - -/// Specifies properties for fence opening. Input structure to IDevice::OpenFence(). -struct FenceOpenInfo -{ - union - { - struct - { - uint32 isReference : 1; ///< If set, then the opened fence will reference the same sync object - ///< in the kernel. Otherwise, the object is copied to the new Fence. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; - - OsExternalHandle externalFence; ///< External shared fence handle. -#if defined(_WIN32) - const wchar_t* pName; /// The name of the event object,Windows uses this name to uniquely identify - /// fence objects across processes. 
-#endif -}; - -/// Specifies properties for fence exporting. Input structure to IFence::ExportExternalHandle(). -struct FenceExportInfo -{ - union - { - struct - { - uint32 isReference : 1; ///< If set, then the fence exporting a handle that reference the same sync - ///< object in the kernel. Otherwise, the object is copied to the new Fence. - uint32 implicitReset : 1; ///< If set, a fence reset will be done for the sync fd exported. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; -}; - -/** - *********************************************************************************************************************** - * @interface IFence - * @brief Represents a command buffer fence the client can use for coarse-level synchronization between the GPU and - * CPU. - * - * Fences can be specified when calling IQueue::Submit() and will be signaled when certain prior queue operations have - * completed. The status of the fence can be queried by the client to determine when the GPU work of interest has - * completed. - * - * Fences are guaranteed to wait for: - * + Prior command buffer submissions. - * + Prior queue semaphore signals and waits. - * + Prior direct presents. - * - * @see IDevice::CreateFence() - *********************************************************************************************************************** - */ -class IFence : public IDestroyable -{ -public: - /// Gets the status (completed or not) of the fence. - /// - /// @returns Success if the fence has been reached, or NotReady if the fence hasn't been reached. Other return - /// codes indicate an error: - /// + ErrorFenceNeverSubmitted if the fence hasn't been submitted yet and the fence is not created with - /// initialSignaled set to true. - virtual Result GetStatus() const = 0; - - /// Export the event handle or sync object handle of the fence for external usage. 
- /// If @ref FenceExportInfo::isReference is not set, then this also performs an implicit reset operation on - /// the Fence. - /// - /// @param [in] exportInfo Information describing how the Fence handle should be exported. - /// @returns the handle in the type OsExternalHandle - virtual OsExternalHandle ExportExternalHandle( - const FenceExportInfo& exportInfo) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IFence() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IFence() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFence.h + * @brief Defines the Platform Abstraction Library (PAL) IFence interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +/// Specifies properties for fence @ref IFence fence creation. Input structure to IDevice::CreateFence(). +struct FenceCreateInfo +{ + union + { + struct + { + uint32 signaled : 1; ///< Specify whether the initial status of the fence is signaled or not. 
+ uint32 eventCanBeInherited : 1; ///< The event handle can be inherited by child process. + uint32 shareable : 1; ///< This fence may be opened for use by a different device. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Fence creation flags. +#if defined(_WIN32) + const wchar_t* pName; /// The name of the event object, Windows uses this name to uniquely identify fence objects + /// across processes. +#endif +}; + +/// Specifies properties for fence opening. Input structure to IDevice::OpenFence(). +struct FenceOpenInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the opened fence will reference the same sync object + ///< in the kernel. Otherwise, the object is copied to the new Fence. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; + + OsExternalHandle externalFence; ///< External shared fence handle. +#if defined(_WIN32) + const wchar_t* pName; /// The name of the event object,Windows uses this name to uniquely identify + /// fence objects across processes. +#endif +}; + +/// Specifies properties for fence exporting. Input structure to IFence::ExportExternalHandle(). +struct FenceExportInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the fence exporting a handle that reference the same sync + ///< object in the kernel. Otherwise, the object is copied to the new Fence. + uint32 implicitReset : 1; ///< If set, a fence reset will be done for the sync fd exported. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; +}; + +/** + *********************************************************************************************************************** + * @interface IFence + * @brief Represents a command buffer fence the client can use for coarse-level synchronization between the GPU and + * CPU. 
+ * + * Fences can be specified when calling IQueue::Submit() and will be signaled when certain prior queue operations have + * completed. The status of the fence can be queried by the client to determine when the GPU work of interest has + * completed. + * + * Fences are guaranteed to wait for: + * + Prior command buffer submissions. + * + Prior queue semaphore signals and waits. + * + Prior direct presents. + * + * @see IDevice::CreateFence() + *********************************************************************************************************************** + */ +class IFence : public IDestroyable +{ +public: + /// Gets the status (completed or not) of the fence. + /// + /// @returns Success if the fence has been reached, or NotReady if the fence hasn't been reached. Other return + /// codes indicate an error: + /// + ErrorFenceNeverSubmitted if the fence hasn't been submitted yet and the fence is not created with + /// initialSignaled set to true. + virtual Result GetStatus() const = 0; + + /// Export the event handle or sync object handle of the fence for external usage. + /// If @ref FenceExportInfo::isReference is not set, then this also performs an implicit reset operation on + /// the Fence. + /// + /// @param [in] exportInfo Information describing how the Fence handle should be exported. + /// @returns the handle in the type OsExternalHandle + virtual OsExternalHandle ExportExternalHandle( + const FenceExportInfo& exportInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. 
+ void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IFence() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IFence() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h b/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h index 60802421a7..a36ce67d64 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFormat.h @@ -1,506 +1,506 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palFormat.h - * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines format types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palUtil.h" - -/// Library-wide namespace encapsulating all PAL entities. -namespace Pal -{ - -/// Specifies the format for an image or GPU memory view. -/// -/// This defines the bit layout of the channels and how the value in each channel is interpreted. -/// -/// Channels are listed in DX10+-style order, where the least significant channels are listed first. For example, for -/// a uint32 val with an X8Y8Z8W8 value: X = val & 0xFF, Y = (val >> 8) & 0xFF, Z = (val >> 16) & 0xFF, -/// W = (val >> 24) & 0xFF. Enums without a more detailed explanation can be decoded in this manner. Multimedia, -/// or "YUV" formats are all exceptions to this rule. Each of those formats explicitly describes how their channels are -/// organized. -/// -/// Many of the multimedia (i.e., "YUV") formats are encoded such that the chrominance (chroma, CbCr, UV) samples are -/// stored at a lower resolution than the luminance (luma, Y) samples as a form of compression. 
The ratio of the -/// subsampling is often referred to using an A:B:C notation, where the trio of numbers A,B,C are used to describe the -/// number of luma and chroma samples in a hypothetical region which is A pixels wide and 2 pixels high. The three -/// numbers usually refer to the following quantities by convention: -/// A --> Width of the conceptual region of pixels, and is usually 4. -/// B --> Number of chroma samples in the first of two rows of A pixels. -/// C --> Number of changes of chroma samples between the first and second rows of A pixels. -/// -/// Common examples of this notation are described below: -/// 4:4:4 --> No chroma subsampling because luma and chroma both have 4 samples per row, and the number of chroma and -/// luma rows is the same. -/// 4:1:1 --> Each row of 4 pixels has 1 chroma sample, and the number of chroma and luma rows is the same. -/// 4:2:0 --> Each row of 4 pixels has 2 chroma samples, and there is only 1 chroma row for every 2 luma rows. -/// 4:2:2 --> Each row of 4 pixels has 2 chroma samples, and the number of chroma and luma rows is the same. -/// -/// Because of the subsampling ratios for multimedia formats, there are some restrictions on what dimensions can be used -/// when creating Images of these formats. 4:1:1 formats must have widths specified as a multiple of 4. 4:2:0 formats -/// must have widths and heights specified as multiples of 2. 4:2:2 formats must have widths specified as a multiple of -/// 2. 4:4:4 formats have no dimensional restrictions. -/// -/// Additionally, the YUV formats are broadly grouped into two categories: packed and planar formats. Packed formats -/// interleave the luma and chroma samples in each row of pixels. Planar formats are organized so that all of the luma -/// samples are together, followed by all of the chroma samples. Some planar formats interleave the U and V chroma -/// data, while some choose to have separate U and V planes. 
Both packed and planar formats can have any subsampling -/// ratio between the luma and chroma data. -enum class ChNumFormat : Util::uint32 -{ - Undefined = 0x0, ///< Used in situations where no format is needed, like raw memory views, or to - /// indicate no color/depth target will be attached when creating a graphics - /// pipeline. - X1_Unorm = 0x1, ///< _Untested._ - X1_Uscaled = 0x2, ///< _Untested._ - X4Y4_Unorm = 0x3, - X4Y4_Uscaled = 0x4, - L4A4_Unorm = 0x5, - X4Y4Z4W4_Unorm = 0x6, - X4Y4Z4W4_Uscaled = 0x7, - X5Y6Z5_Unorm = 0x8, - X5Y6Z5_Uscaled = 0x9, - X5Y5Z5W1_Unorm = 0xA, - X5Y5Z5W1_Uscaled = 0xB, - X1Y5Z5W5_Unorm = 0xC, - X1Y5Z5W5_Uscaled = 0xD, - X8_Unorm = 0xE, - X8_Snorm = 0xF, - X8_Uscaled = 0x10, - X8_Sscaled = 0x11, - X8_Uint = 0x12, - X8_Sint = 0x13, - X8_Srgb = 0x14, - A8_Unorm = 0x15, - L8_Unorm = 0x16, - P8_Unorm = 0x17, - X8Y8_Unorm = 0x18, - X8Y8_Snorm = 0x19, - X8Y8_Uscaled = 0x1A, - X8Y8_Sscaled = 0x1B, - X8Y8_Uint = 0x1C, - X8Y8_Sint = 0x1D, - X8Y8_Srgb = 0x1E, - L8A8_Unorm = 0x1F, - X8Y8Z8W8_Unorm = 0x20, - X8Y8Z8W8_Snorm = 0x21, - X8Y8Z8W8_Uscaled = 0x22, - X8Y8Z8W8_Sscaled = 0x23, - X8Y8Z8W8_Uint = 0x24, - X8Y8Z8W8_Sint = 0x25, - X8Y8Z8W8_Srgb = 0x26, - U8V8_Snorm_L8W8_Unorm = 0x27, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats - /// are X8Y8Z8W8_Snorm (to target U8V8_Snorm) and X8Y8Z8W8_Unorm (to target - /// L8W8_Unorm). - X10Y11Z11_Float = 0x28, - X11Y11Z10_Float = 0x29, - X10Y10Z10W2_Unorm = 0x2A, - X10Y10Z10W2_Snorm = 0x2B, - X10Y10Z10W2_Uscaled = 0x2C, - X10Y10Z10W2_Sscaled = 0x2D, - X10Y10Z10W2_Uint = 0x2E, - X10Y10Z10W2_Sint = 0x2F, - X10Y10Z10W2Bias_Unorm = 0x30, ///< A four-component, 32-bit 2.8-biased fixed-point format that supports 10 - /// bits for each color channel and 2-bit alpha. A shader must be aware of - /// *Bias* and must perform its own bias and scale on any data that is read - /// from or written. - U10V10W10_Snorm_A2_Unorm = 0X31, ///< Mixed signed/unsigned format. 
Valid Image and Color-Target View formats - /// are X10Y10Z10W2_Snorm (to target U10V10W10_Snorm) and X10Y10Z10W2_Unorm - /// (to target A2_Unorm). - X16_Unorm = 0x32, - X16_Snorm = 0x33, - X16_Uscaled = 0x34, - X16_Sscaled = 0x35, - X16_Uint = 0x36, - X16_Sint = 0x37, - X16_Float = 0x38, - L16_Unorm = 0x39, - X16Y16_Unorm = 0x3A, - X16Y16_Snorm = 0x3B, - X16Y16_Uscaled = 0x3C, - X16Y16_Sscaled = 0x3D, - X16Y16_Uint = 0x3E, - X16Y16_Sint = 0x3F, - X16Y16_Float = 0x40, - X16Y16Z16W16_Unorm = 0x41, - X16Y16Z16W16_Snorm = 0x42, - X16Y16Z16W16_Uscaled = 0x43, - X16Y16Z16W16_Sscaled = 0x44, - X16Y16Z16W16_Uint = 0x45, - X16Y16Z16W16_Sint = 0x46, - X16Y16Z16W16_Float = 0x47, - X32_Uint = 0x48, - X32_Sint = 0x49, - X32_Float = 0x4A, - X32Y32_Uint = 0x4B, - X32Y32_Sint = 0x4C, - X32Y32_Float = 0x4D, - X32Y32Z32_Uint = 0x4E, - X32Y32Z32_Sint = 0x4F, - X32Y32Z32_Float = 0x50, - X32Y32Z32W32_Uint = 0x51, - X32Y32Z32W32_Sint = 0x52, - X32Y32Z32W32_Float = 0x53, - D16_Unorm_S8_Uint = 0x54, - D32_Float_S8_Uint = 0x55, - X9Y9Z9E5_Float = 0x56, ///< Three partial-precision floating-point numbers encoded into a single 32-bit - /// value all sharing the same 5-bit exponent (variant of s10e5, which is sign - /// bit, 10-bit mantissa, and 5-bit biased (15) exponent). There is no sign - /// bit, and there is a shared 5-bit biased (15) exponent and a 9-bit mantissa - /// for each channelShared exponent format. - Bc1_Unorm = 0x57, ///< BC1 compressed texture format. - Bc1_Srgb = 0x58, ///< BC1 compressed texture format. - Bc2_Unorm = 0x59, ///< BC2 compressed texture format. - Bc2_Srgb = 0x5A, ///< BC2 compressed texture format. - Bc3_Unorm = 0x5B, ///< BC3 compressed texture format. - Bc3_Srgb = 0x5C, ///< BC3 compressed texture format. - Bc4_Unorm = 0x5D, ///< BC4 compressed texture format. - Bc4_Snorm = 0x5E, ///< BC4 compressed texture format. - Bc5_Unorm = 0x5F, ///< BC5 compressed texture format. - Bc5_Snorm = 0x60, ///< BC5 compressed texture format. 
- Bc6_Ufloat = 0x61, ///< BC6 unsigned compressed texture format. - Bc6_Sfloat = 0x62, ///< BC6 signed compressed texture format. - Bc7_Unorm = 0x63, ///< BC7 compressed texture format. - Bc7_Srgb = 0x64, ///< BC7 compressed texture format. - Etc2X8Y8Z8_Unorm = 0x65, - Etc2X8Y8Z8_Srgb = 0x66, - Etc2X8Y8Z8W1_Unorm = 0x67, - Etc2X8Y8Z8W1_Srgb = 0x68, - Etc2X8Y8Z8W8_Unorm = 0x69, - Etc2X8Y8Z8W8_Srgb = 0x6A, - Etc2X11_Unorm = 0x6B, - Etc2X11_Snorm = 0x6C, - Etc2X11Y11_Unorm = 0x6D, - Etc2X11Y11_Snorm = 0x6E, - AstcLdr4x4_Unorm = 0x6F, - AstcLdr4x4_Srgb = 0x70, - AstcLdr5x4_Unorm = 0x71, - AstcLdr5x4_Srgb = 0x72, - AstcLdr5x5_Unorm = 0x73, - AstcLdr5x5_Srgb = 0x74, - AstcLdr6x5_Unorm = 0x75, - AstcLdr6x5_Srgb = 0x76, - AstcLdr6x6_Unorm = 0x77, - AstcLdr6x6_Srgb = 0x78, - AstcLdr8x5_Unorm = 0x79, - AstcLdr8x5_Srgb = 0x7A, - AstcLdr8x6_Unorm = 0x7B, - AstcLdr8x6_Srgb = 0x7C, - AstcLdr8x8_Unorm = 0x7D, - AstcLdr8x8_Srgb = 0x7E, - AstcLdr10x5_Unorm = 0x7F, - AstcLdr10x5_Srgb = 0x80, - AstcLdr10x6_Unorm = 0x81, - AstcLdr10x6_Srgb = 0x82, - AstcLdr10x8_Unorm = 0x83, - AstcLdr10x8_Srgb = 0x84, - AstcLdr10x10_Unorm = 0x85, - AstcLdr10x10_Srgb = 0x86, - AstcLdr12x10_Unorm = 0x87, - AstcLdr12x10_Srgb = 0x88, - AstcLdr12x12_Unorm = 0x89, - AstcLdr12x12_Srgb = 0x8A, - AstcHdr4x4_Float = 0x8B, - AstcHdr5x4_Float = 0x8C, - AstcHdr5x5_Float = 0x8D, - AstcHdr6x5_Float = 0x8E, - AstcHdr6x6_Float = 0x8F, - AstcHdr8x5_Float = 0x90, - AstcHdr8x6_Float = 0x91, - AstcHdr8x8_Float = 0x92, - AstcHdr10x5_Float = 0x93, - AstcHdr10x6_Float = 0x94, - AstcHdr10x8_Float = 0x95, - AstcHdr10x10_Float = 0x96, - AstcHdr12x10_Float = 0x97, - AstcHdr12x12_Float = 0x98, - X8Y8_Z8Y8_Unorm = 0x99, ///< _Untested._ - X8Y8_Z8Y8_Uscaled = 0x9A, ///< _Untested._ - Y8X8_Y8Z8_Unorm = 0x9B, ///< _Untested._ - Y8X8_Y8Z8_Uscaled = 0x9C, ///< _Untested._ - AYUV = 0x9D, ///< YUV 4:4:4 packed format. Valid Image and Color-Target view formats are - /// { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. 
Each view fully maps the - /// entire YUV subresource, with the V,U,Y,A channels mapped to the X,Y,Z,W - /// channels respectively. Additionally, Image views can use the { X32, Uint } - /// format where all four channels are packed into a single uint32. - UYVY = 0x9E, ///< YUV 4:2:2 packed format. The Image data is subsampled such that each 32bit - /// element contains two Y samples and one U and V sample. Valid Image view - /// formats are { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully - /// maps the entire YUV subresource, with the X,Y,Z,W channels mapped to the - /// U0,Y0,V0,Y1 channels respectively. Additionally, Image views can use the - /// { X32, Uint } format where all four channels are packed into a single - /// uint32. Image views can also use the { X8Y8_Z8Y8, Unorm } format to access - /// these as well. In this case, the width of the Image view would appear to be - /// twice as wide as it normally does, and the X0,Y0,Z0,Y1 channels map to the - /// U0,Y0,V0,Y1 channels respectively. - VYUY = 0x9F, ///< YUV 4:2:2 packed format. The image data is encoded just like the - /// @ref ChNumFormat::UYVY format, except with a different channel ordering. - /// Image views with X8Y8Z8W8 channel formats map the X,Y,Z,W channels to the - /// V0,Y0,U0,Y1 channels respectively. Image views with the X8Y8_Z8Y8 channel - /// format map the X0,Y0,Z0,Y1 channels to the V0,Y0,U0,Y1 channels - /// respectively. - YUY2 = 0xA0, ///< YUV 4:2:2 packed format. The image data is encoded just like the - /// @ref ChNumFormat::UYVY format, except with a different channel ordering. - /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,U0,Y1,V0 - /// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format - /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,U0,Y1,V0 channels. - YVY2 = 0xA1, ///< YUV 4:2:2 packed format. 
The image data is encoded just like the - /// @ref ChNumFormat::YUY2 format, except with a different channel ordering. - /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,V0,Y1,U0 - /// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format - /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,V0,Y1,U0 channels. - YV12 = 0xA2, ///< YVU 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y - /// plane is first, containg a uint8 per sample. Next is the V plane and the U - /// plane, both of which have a uint8 per sample. Valid Image view formats are - /// { X8, Unorm } and { X8, Uint }. Each view only has access to one of the Y, - /// V, or U planes. - NV11 = 0xA3, ///< YUV 4:1:1 planar format, with 8 bits per luma and chroma sample. The Y - /// plane is first, containing a uint8 per sample. Next is a UV plane which - /// has interleaved U and V samples, each stored as a uint8. Valid Image and - /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } - /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view - /// only has access to the Y plane. When using X8Y8, the view only has access - /// to the UV plane. - NV12 = 0xA4, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y - /// plane is first, containing a uint8 per sample. Next is a UV plane which - /// has interleaved U and V samples, each stored as a uint8. Valid Image and - /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } - /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view - /// only has access to the Y plane. When using X8Y8, the view only has access - /// to the UV plane. - NV21 = 0xA5, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. This is - /// identical to @ref ChNumFormat::NV12, except that the second plane swaps the - /// ordering of the U and V samples. 
Image views behave just like with - /// @ref ChNumFormat::NV12. - P016 = 0xA6, ///< YUV 4:2:0 planar format, with 16 bits per luma and chroma sample. The - /// plane ordering is identical to @ref ChNumFormat::NV12. Instead of uint8 - /// samples, this format uses 8.8 fixed point sample encoding. Image views - /// behave just like with @ref ChNumFormat::NV12, except R16 channel formats - /// are used for the Y plane, and X16Y16 channel formats are used for the UV - /// plane. - P010 = 0xA7, ///< YUV 4:2:0 planar format, with 10 bits per luma and chroma sample. This is - /// identical to @ref ChNumFormat::P016, except that the lowest 6 bits of each - /// luma and chroma sample are ignored. This allows the source data to be - /// interpreted as either P016 or P010 interchangably. - P210 = 0xA8, ///< YUV 4:2:2 planar format, with 10 bits per luma and chroma sample. This is - /// similar to @ref ChNumFormat::P010, except that the UV planes are sub-sampled - /// only in the horizontal direction, but still by a factor of 2 so the UV plane - /// ends up having the same number of lines as the Y plane. - X8_MM_Unorm = 0xA9, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces. Such as the Y plane or any plane in YV12. - X8_MM_Uint = 0xAA, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces. Such as the Y plane or any plane in YV12. - X8Y8_MM_Unorm = 0xAB, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces. - X8Y8_MM_Uint = 0xAC, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces. - X16_MM10_Unorm = 0xAD, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces (10-bit). Such as the Y plane or any plane in YV12. - X16_MM10_Uint = 0xAE, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces (10-bit). 
Such as the Y plane or any plane in YV12. - X16Y16_MM10_Unorm = 0xAF, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces (10-bit). - X16Y16_MM10_Uint = 0xB0, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces (10-bit). - P208 = 0xB1, ///< YUV 4:2:2 planar format, with 8 bits per luma and chroma sample. This is - /// similar to @ref ChNumFormat::NV12, except that the UV planes are sub-sampled - /// only in the horizontal direction, but still by a factor of 2 so the UV plane - /// ends up having the same number of lines as the Y plane. This format is - /// sometimes referred to as NV16. - X16_MM12_Unorm = 0xB2, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces (12-bit). - X16_MM12_Uint = 0xB3, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar - /// surfaces (12-bit). - X16Y16_MM12_Unorm = 0xB4, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces (12-bit). - X16Y16_MM12_Uint = 0xB5, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar - /// surfaces (12-bit). - P012 = 0xB6, ///< YUV 4:2:0 planar format, with 12 bits per luma and chroma sample. This is - /// identical to @ref ChNumFormat::P010, except that the lowest 4 bits of each - /// luma and chroma sample are ignored. - P212 = 0xB7, ///< YUV 4:2:2 planar format, with 12 bits per luma and chroma sample. This is - /// identical to @ref ChNumFormat::P210, except that the lowest 4 bits of each - /// luma and chroma sample are ignored. - P412 = 0xB8, ///< YUV 4:4:4 planar format, with 12 bits per luma and chroma sample. It consists - /// of a Y-plane followed by an interleaved UV plane. - X10Y10Z10W2_Float = 0xB9, ///< RGBA format with three 10-bit floats (6e4) and a 2-bit unorm as alpha. - Y216 = 0xBA, ///< YUV 4:2:2 packed, with 16 bits per luma or chroma sample. No alpha. 
- Y210 = 0xBB, ///< YUV 4:2:2 packed, with 10 bits per luma or chroma sample. No alpha. - /// Same memory layout as @ref ChNumFormat::Y216. - /// The lowest 6 bits of each sample are ignored. - Y416 = 0xBC, ///< YUV 4:4:4 packed, with 16 bits per luma or chroma sample. - Y410 = 0xBD, ///< YUV 4:4:4 packed, with 10 bits per luma or chroma sample and 2 bits for alpha. - _ReservedBE = 0xBE, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 - P216 = 0xBF, ///< YUV 4:2:2 planar format, with 16 bits per luma and chroma sample. It consists - /// of a Y-plane followed by interleaved UV plane. -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 - YUV_420P10 = 0xC0, ///< YUV 4:2:0 tri-planar format, with 10 bits per luma and chroma sample. - YUV_422P10 = 0xC1, ///< YUV 4:2:2 tri-planar format, with 10 bits per luma and chroma sample. - YUV_444P10 = 0xC2, ///< YUV 4:4:4 tri-planar format, with 10 bits per luma and chroma sample. - YUV_420P12 = 0xC3, ///< YUV 4:2:0 tri-planar format, with 12 bits per luma and chroma sample. - YUV_422P12 = 0xC4, ///< YUV 4:2:2 tri-planar format, with 12 bits per luma and chroma sample. - YUV_444P12 = 0xC5, ///< YUV 4:4:4 tri-planar format, with 12 bits per luma and chroma sample. - YUV_420P16 = 0xC6, ///< YUV 4:2:0 tri-planar format, with 16 bits per luma and chroma sample. - YUV_422P16 = 0xC7, ///< YUV 4:2:2 tri-planar format, with 16 bits per luma and chroma sample. - YUV_444P16 = 0xC8, ///< YUV 4:4:4 tri-planar format, with 16 bits per luma and chroma sample. -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 - YV16 = 0xC9, ///< YVU 4:2:2 tri-planar format, with 8 bits per luma and chroma sample. This - /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled in - /// vertical direction. - YV24 = 0xCA, ///< YVU 4:4:4 tri-planar format, with 8 bits per luma and chroma sample. This - /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled. 
- NV24 = 0xCB, ///< YUV 4:4:4 bi-planar format, with 8 bits per luma and chroma sample. This - /// is similar to @ref ChNumFormat::NV12, except chroma is not subsampled. - /// This format is sometimes referred to as P408. - P410 = 0xCC, ///< YUV 4:4:4 planar format, with 10 bits per luma and chroma sample. It consists - /// of a Y-plane followed by interleaved UV plane. - P416 = 0xCD, ///< YUV 4:4:4 planar format, with 16 bits per luma and chroma sample. It consists - /// of a Y-plane followed by interleaved UV plane. -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 929 - X16Y16Z16W16_MM10_Unorm = 0xCE, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of - /// each component are ignored. - X16Y16Z16W16_MM10_Uint = 0xCF, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of - /// each component are ignored. - X16Y16Z16W16_MM12_Unorm = 0xD0, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of - /// each component are ignored. - X16Y16Z16W16_MM12_Uint = 0xD1, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of - /// each component are ignored. -#endif - Count, - -}; - -/// Specifies which channel of a resource should be mapped to a particular component of an image view. -/// -/// @ingroup ResourceBinding -enum class ChannelSwizzle : Util::uint8 -{ - Zero = 0x0, ///< Ignore resource data and always fetch a 0 into this component. - One = 0x1, ///< Ignore resource data and always fetch a 1 into this component. - X = 0x2, ///< Use the X channel from resource for this component. - Y = 0x3, ///< Use the Y channel from resource for this component. - Z = 0x4, ///< Use the Z channel from resource for this component. - W = 0x5, ///< Use the W channel from resource for this component. - Count -}; - -/// Specifies a mapping for each component of an image or buffer view to a channel in its associated resource. 
-/// -/// @ingroup ResourceBinding -struct ChannelMapping -{ - union - { - struct - { - ChannelSwizzle r; ///< Red component swizzle. - ChannelSwizzle g; ///< Green component swizzle. - ChannelSwizzle b; ///< Blue component swizzle. - ChannelSwizzle a; ///< Alpha component swizzle. - }; - ChannelSwizzle swizzle[4]; ///< All four swizzles packed into one array. - Util::uint32 swizzleValue; - }; -}; - -/// Specifies a pixel format for an image or memory view and its corresponding channel swizzle. -struct SwizzledFormat -{ - ChNumFormat format; ///< Pixel format. - ChannelMapping swizzle; ///< Compatible channel swizzle for the above pixel format. -}; - -inline constexpr bool operator==(const SwizzledFormat& lhs, const SwizzledFormat& rhs) -{ - return (lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue); -} - -/// Constant for undefined formats. -constexpr SwizzledFormat UndefinedSwizzledFormat = -{ - ChNumFormat::Undefined, - { { { ChannelSwizzle::X, ChannelSwizzle::Zero, ChannelSwizzle::Zero, ChannelSwizzle::One } } }, -}; - -/// Flags structure reporting available capabilities of a particular format. -enum FormatFeatureFlags : Util::uint32 -{ - FormatFeatureCopy = 0x00001, ///< Images of this format can be used as a copy source or destination. - FormatFeatureFormatConversion = 0x00002, ///< Images of this format support format conversion in copy - /// operations. - FormatFeatureImageShaderRead = 0x00004, ///< Images of this format can be read from a shader. - FormatFeatureImageShaderWrite = 0x00008, ///< Images of this format can be written from a shader. - FormatFeatureImageShaderAtomics = 0x00010, ///< Images of this format can be written atomically from a shader. - FormatFeatureMemoryShaderRead = 0x00020, ///< Memory views of this format can be read from a shader. - FormatFeatureMemoryShaderWrite = 0x00040, ///< Memory views of this format can be written from a shader. 
- FormatFeatureMemoryShaderAtomics = 0x00080, ///< Memory views of this format can be written atomically from a - /// shader. - FormatFeatureColorTargetWrite = 0x00100, ///< Images of this format can be bound as a color target. - FormatFeatureColorTargetBlend = 0x00200, ///< Images of this format can be bound as a color target for blending. - FormatFeatureDepthTarget = 0x00400, ///< Images of this format can be bound as a depth target. - FormatFeatureStencilTarget = 0x00800, ///< Images of this format can be bound as a stencil target. - FormatFeatureMsaaTarget = 0x01000, ///< Images of this format can support multisampling. - FormatFeatureWindowedPresent = 0x02000, ///< Images of this format can support windowed-mode presents. - /// Fullscreen present capability is queried using the @ref - /// IScreen::GetScreenModeList method. - FormatFeatureImageFilterLinear = 0x04000, ///< Images of this format can be linearly filtered. - FormatFeatureImageFilterMinMax = 0x08000, ///< Images of this format can be min/max filtered. - FormatFeatureFormatConversionSrc = 0x10000, ///< Images of this format support format conversion in copy - /// operations as the source image. - /// @note This is aliased to FormatFeatureFormatConversionDst for - /// backwards compatibility. - FormatFeatureFormatConversionDst = 0x20000, ///< Images of this format support format conversion in copy - /// operations as the destination image. - /// @note This is aliased to FormatFeatureFormatConversionSrc for - /// backwards compatibility. -}; - -/// Enumeration for indexing into the format properties table based on tiling. -enum FormatPropertiesTiling : Util::uint32 -{ - IsLinear = 0, ///< Format properties requested is for linearly-tiled surfaces. - IsNonLinear, ///< Format properties requested is for non-linearly tiled surfaces. - Count, ///< Number of format property tile types. -}; - -/// The format properties lookup table. 
Contains information about which device access features are available for all -/// formats and tiling modes. The tiling features for non-linear tiling modes are identical so we only store linear -/// and non-linear tiling features. From left to right, it is indexed by format and "is-non-linear". -/// Returned by IDevice::GetFormatProperties(). -struct MergedFormatPropertiesTable -{ - FormatFeatureFlags features[static_cast(ChNumFormat::Count)][FormatPropertiesTiling::Count]; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFormat.h + * @brief Common include for the Platform Abstraction Library (PAL) interface. Defines format types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" + +/// Library-wide namespace encapsulating all PAL entities. +namespace Pal +{ + +/// Specifies the format for an image or GPU memory view. +/// +/// This defines the bit layout of the channels and how the value in each channel is interpreted. +/// +/// Channels are listed in DX10+-style order, where the least significant channels are listed first. For example, for +/// a uint32 val with an X8Y8Z8W8 value: X = val & 0xFF, Y = (val >> 8) & 0xFF, Z = (val >> 16) & 0xFF, +/// W = (val >> 24) & 0xFF. Enums without a more detailed explanation can be decoded in this manner. Multimedia, +/// or "YUV" formats are all exceptions to this rule. Each of those formats explicitly describes how their channels are +/// organized. +/// +/// Many of the multimedia (i.e., "YUV") formats are encoded such that the chrominance (chroma, CbCr, UV) samples are +/// stored at a lower resolution than the luminance (luma, Y) samples as a form of compression. The ratio of the +/// subsampling is often referred to using an A:B:C notation, where the trio of numbers A,B,C are used to describe the +/// number of luma and chroma samples in a hypothetical region which is A pixels wide and 2 pixels high. The three +/// numbers usually refer to the following quantities by convention: +/// A --> Width of the conceptual region of pixels, and is usually 4. +/// B --> Number of chroma samples in the first of two rows of A pixels. 
+/// C --> Number of changes of chroma samples between the first and second rows of A pixels. +/// +/// Common examples of this notation are described below: +/// 4:4:4 --> No chroma subsampling because luma and chroma both have 4 samples per row, and the number of chroma and +/// luma rows is the same. +/// 4:1:1 --> Each row of 4 pixels has 1 chroma sample, and the number of chroma and luma rows is the same. +/// 4:2:0 --> Each row of 4 pixels has 2 chroma samples, and there is only 1 chroma row for every 2 luma rows. +/// 4:2:2 --> Each row of 4 pixels has 2 chroma samples, and the number of chroma and luma rows is the same. +/// +/// Because of the subsampling ratios for multimedia formats, there are some restrictions on what dimensions can be used +/// when creating Images of these formats. 4:1:1 formats must have widths specified as a multiple of 4. 4:2:0 formats +/// must have widths and heights specified as multiples of 2. 4:2:2 formats must have widths specified as a multiple of +/// 2. 4:4:4 formats have no dimensional restrictions. +/// +/// Additionally, the YUV formats are broadly grouped into two categories: packed and planar formats. Packed formats +/// interleave the luma and chroma samples in each row of pixels. Planar formats are organized so that all of the luma +/// samples are together, followed by all of the chroma samples. Some planar formats interleave the U and V chroma +/// data, while some choose to have separate U and V planes. Both packed and planar formats can have any subsampling +/// ratio between the luma and chroma data. +enum class ChNumFormat : Util::uint32 +{ + Undefined = 0x0, ///< Used in situations where no format is needed, like raw memory views, or to + /// indicate no color/depth target will be attached when creating a graphics + /// pipeline. 
+ X1_Unorm = 0x1, ///< _Untested._ + X1_Uscaled = 0x2, ///< _Untested._ + X4Y4_Unorm = 0x3, + X4Y4_Uscaled = 0x4, + L4A4_Unorm = 0x5, + X4Y4Z4W4_Unorm = 0x6, + X4Y4Z4W4_Uscaled = 0x7, + X5Y6Z5_Unorm = 0x8, + X5Y6Z5_Uscaled = 0x9, + X5Y5Z5W1_Unorm = 0xA, + X5Y5Z5W1_Uscaled = 0xB, + X1Y5Z5W5_Unorm = 0xC, + X1Y5Z5W5_Uscaled = 0xD, + X8_Unorm = 0xE, + X8_Snorm = 0xF, + X8_Uscaled = 0x10, + X8_Sscaled = 0x11, + X8_Uint = 0x12, + X8_Sint = 0x13, + X8_Srgb = 0x14, + A8_Unorm = 0x15, + L8_Unorm = 0x16, + P8_Unorm = 0x17, + X8Y8_Unorm = 0x18, + X8Y8_Snorm = 0x19, + X8Y8_Uscaled = 0x1A, + X8Y8_Sscaled = 0x1B, + X8Y8_Uint = 0x1C, + X8Y8_Sint = 0x1D, + X8Y8_Srgb = 0x1E, + L8A8_Unorm = 0x1F, + X8Y8Z8W8_Unorm = 0x20, + X8Y8Z8W8_Snorm = 0x21, + X8Y8Z8W8_Uscaled = 0x22, + X8Y8Z8W8_Sscaled = 0x23, + X8Y8Z8W8_Uint = 0x24, + X8Y8Z8W8_Sint = 0x25, + X8Y8Z8W8_Srgb = 0x26, + U8V8_Snorm_L8W8_Unorm = 0x27, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats + /// are X8Y8Z8W8_Snorm (to target U8V8_Snorm) and X8Y8Z8W8_Unorm (to target + /// L8W8_Unorm). + X10Y11Z11_Float = 0x28, + X11Y11Z10_Float = 0x29, + X10Y10Z10W2_Unorm = 0x2A, + X10Y10Z10W2_Snorm = 0x2B, + X10Y10Z10W2_Uscaled = 0x2C, + X10Y10Z10W2_Sscaled = 0x2D, + X10Y10Z10W2_Uint = 0x2E, + X10Y10Z10W2_Sint = 0x2F, + X10Y10Z10W2Bias_Unorm = 0x30, ///< A four-component, 32-bit 2.8-biased fixed-point format that supports 10 + /// bits for each color channel and 2-bit alpha. A shader must be aware of + /// *Bias* and must perform its own bias and scale on any data that is read + /// from or written. + U10V10W10_Snorm_A2_Unorm = 0X31, ///< Mixed signed/unsigned format. Valid Image and Color-Target View formats + /// are X10Y10Z10W2_Snorm (to target U10V10W10_Snorm) and X10Y10Z10W2_Unorm + /// (to target A2_Unorm). 
+ X16_Unorm = 0x32, + X16_Snorm = 0x33, + X16_Uscaled = 0x34, + X16_Sscaled = 0x35, + X16_Uint = 0x36, + X16_Sint = 0x37, + X16_Float = 0x38, + L16_Unorm = 0x39, + X16Y16_Unorm = 0x3A, + X16Y16_Snorm = 0x3B, + X16Y16_Uscaled = 0x3C, + X16Y16_Sscaled = 0x3D, + X16Y16_Uint = 0x3E, + X16Y16_Sint = 0x3F, + X16Y16_Float = 0x40, + X16Y16Z16W16_Unorm = 0x41, + X16Y16Z16W16_Snorm = 0x42, + X16Y16Z16W16_Uscaled = 0x43, + X16Y16Z16W16_Sscaled = 0x44, + X16Y16Z16W16_Uint = 0x45, + X16Y16Z16W16_Sint = 0x46, + X16Y16Z16W16_Float = 0x47, + X32_Uint = 0x48, + X32_Sint = 0x49, + X32_Float = 0x4A, + X32Y32_Uint = 0x4B, + X32Y32_Sint = 0x4C, + X32Y32_Float = 0x4D, + X32Y32Z32_Uint = 0x4E, + X32Y32Z32_Sint = 0x4F, + X32Y32Z32_Float = 0x50, + X32Y32Z32W32_Uint = 0x51, + X32Y32Z32W32_Sint = 0x52, + X32Y32Z32W32_Float = 0x53, + D16_Unorm_S8_Uint = 0x54, + D32_Float_S8_Uint = 0x55, + X9Y9Z9E5_Float = 0x56, ///< Three partial-precision floating-point numbers encoded into a single 32-bit + /// value all sharing the same 5-bit exponent (variant of s10e5, which is sign + /// bit, 10-bit mantissa, and 5-bit biased (15) exponent). There is no sign + /// bit, and there is a shared 5-bit biased (15) exponent and a 9-bit mantissa + /// for each channelShared exponent format. + Bc1_Unorm = 0x57, ///< BC1 compressed texture format. + Bc1_Srgb = 0x58, ///< BC1 compressed texture format. + Bc2_Unorm = 0x59, ///< BC2 compressed texture format. + Bc2_Srgb = 0x5A, ///< BC2 compressed texture format. + Bc3_Unorm = 0x5B, ///< BC3 compressed texture format. + Bc3_Srgb = 0x5C, ///< BC3 compressed texture format. + Bc4_Unorm = 0x5D, ///< BC4 compressed texture format. + Bc4_Snorm = 0x5E, ///< BC4 compressed texture format. + Bc5_Unorm = 0x5F, ///< BC5 compressed texture format. + Bc5_Snorm = 0x60, ///< BC5 compressed texture format. + Bc6_Ufloat = 0x61, ///< BC6 unsigned compressed texture format. + Bc6_Sfloat = 0x62, ///< BC6 signed compressed texture format. 
+ Bc7_Unorm = 0x63, ///< BC7 compressed texture format. + Bc7_Srgb = 0x64, ///< BC7 compressed texture format. + Etc2X8Y8Z8_Unorm = 0x65, + Etc2X8Y8Z8_Srgb = 0x66, + Etc2X8Y8Z8W1_Unorm = 0x67, + Etc2X8Y8Z8W1_Srgb = 0x68, + Etc2X8Y8Z8W8_Unorm = 0x69, + Etc2X8Y8Z8W8_Srgb = 0x6A, + Etc2X11_Unorm = 0x6B, + Etc2X11_Snorm = 0x6C, + Etc2X11Y11_Unorm = 0x6D, + Etc2X11Y11_Snorm = 0x6E, + AstcLdr4x4_Unorm = 0x6F, + AstcLdr4x4_Srgb = 0x70, + AstcLdr5x4_Unorm = 0x71, + AstcLdr5x4_Srgb = 0x72, + AstcLdr5x5_Unorm = 0x73, + AstcLdr5x5_Srgb = 0x74, + AstcLdr6x5_Unorm = 0x75, + AstcLdr6x5_Srgb = 0x76, + AstcLdr6x6_Unorm = 0x77, + AstcLdr6x6_Srgb = 0x78, + AstcLdr8x5_Unorm = 0x79, + AstcLdr8x5_Srgb = 0x7A, + AstcLdr8x6_Unorm = 0x7B, + AstcLdr8x6_Srgb = 0x7C, + AstcLdr8x8_Unorm = 0x7D, + AstcLdr8x8_Srgb = 0x7E, + AstcLdr10x5_Unorm = 0x7F, + AstcLdr10x5_Srgb = 0x80, + AstcLdr10x6_Unorm = 0x81, + AstcLdr10x6_Srgb = 0x82, + AstcLdr10x8_Unorm = 0x83, + AstcLdr10x8_Srgb = 0x84, + AstcLdr10x10_Unorm = 0x85, + AstcLdr10x10_Srgb = 0x86, + AstcLdr12x10_Unorm = 0x87, + AstcLdr12x10_Srgb = 0x88, + AstcLdr12x12_Unorm = 0x89, + AstcLdr12x12_Srgb = 0x8A, + AstcHdr4x4_Float = 0x8B, + AstcHdr5x4_Float = 0x8C, + AstcHdr5x5_Float = 0x8D, + AstcHdr6x5_Float = 0x8E, + AstcHdr6x6_Float = 0x8F, + AstcHdr8x5_Float = 0x90, + AstcHdr8x6_Float = 0x91, + AstcHdr8x8_Float = 0x92, + AstcHdr10x5_Float = 0x93, + AstcHdr10x6_Float = 0x94, + AstcHdr10x8_Float = 0x95, + AstcHdr10x10_Float = 0x96, + AstcHdr12x10_Float = 0x97, + AstcHdr12x12_Float = 0x98, + X8Y8_Z8Y8_Unorm = 0x99, ///< _Untested._ + X8Y8_Z8Y8_Uscaled = 0x9A, ///< _Untested._ + Y8X8_Y8Z8_Unorm = 0x9B, ///< _Untested._ + Y8X8_Y8Z8_Uscaled = 0x9C, ///< _Untested._ + AYUV = 0x9D, ///< YUV 4:4:4 packed format. Valid Image and Color-Target view formats are + /// { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully maps the + /// entire YUV subresource, with the V,U,Y,A channels mapped to the X,Y,Z,W + /// channels respectively. 
Additionally, Image views can use the { X32, Uint } + /// format where all four channels are packed into a single uint32. + UYVY = 0x9E, ///< YUV 4:2:2 packed format. The Image data is subsampled such that each 32bit + /// element contains two Y samples and one U and V sample. Valid Image view + /// formats are { X8Y8Z8W8, Unorm } and { X8Y8Z8W8, Uint }. Each view fully + /// maps the entire YUV subresource, with the X,Y,Z,W channels mapped to the + /// U0,Y0,V0,Y1 channels respectively. Additionally, Image views can use the + /// { X32, Uint } format where all four channels are packed into a single + /// uint32. Image views can also use the { X8Y8_Z8Y8, Unorm } format to access + /// these as well. In this case, the width of the Image view would appear to be + /// twice as wide as it normally does, and the X0,Y0,Z0,Y1 channels map to the + /// U0,Y0,V0,Y1 channels respectively. + VYUY = 0x9F, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::UYVY format, except with a different channel ordering. + /// Image views with X8Y8Z8W8 channel formats map the X,Y,Z,W channels to the + /// V0,Y0,U0,Y1 channels respectively. Image views with the X8Y8_Z8Y8 channel + /// format map the X0,Y0,Z0,Y1 channels to the V0,Y0,U0,Y1 channels + /// respectively. + YUY2 = 0xA0, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::UYVY format, except with a different channel ordering. + /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,U0,Y1,V0 + /// channels respectively. Image views can use the { Y8X8_Y8Z8, Unorm } format + /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,U0,Y1,V0 channels. + YVY2 = 0xA1, ///< YUV 4:2:2 packed format. The image data is encoded just like the + /// @ref ChNumFormat::YUY2 format, except with a different channel ordering. + /// X8Y8Z8W8 Image view formats map the X,Y,Z,W channels to the Y0,V0,Y1,U0 + /// channels respectively. 
Image views can use the { Y8X8_Y8Z8, Unorm } format + /// where the Y0,X0,Y1,Z0 channels are mapped to the Y0,V0,Y1,U0 channels. + YV12 = 0xA2, ///< YVU 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is the V plane and the U + /// plane, both of which have a uint8 per sample. Valid Image view formats are + /// { X8, Unorm } and { X8, Uint }. Each view only has access to one of the Y, + /// V, or U planes. + NV11 = 0xA3, ///< YUV 4:1:1 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is a UV plane which + /// has interleaved U and V samples, each stored as a uint8. Valid Image and + /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } + /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view + /// only has access to the Y plane. When using X8Y8, the view only has access + /// to the UV plane. + NV12 = 0xA4, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. The Y + /// plane is first, containing a uint8 per sample. Next is a UV plane which + /// has interleaved U and V samples, each stored as a uint8. Valid Image and + /// Color-Target view formats are { X8, Unorm }, { X8, Uint }, { X8Y8, Unorm } + /// and { X8Y8, Uint }. When using an X8 channel format for the View, the view + /// only has access to the Y plane. When using X8Y8, the view only has access + /// to the UV plane. + NV21 = 0xA5, ///< YUV 4:2:0 planar format, with 8 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::NV12, except that the second plane swaps the + /// ordering of the U and V samples. Image views behave just like with + /// @ref ChNumFormat::NV12. + P016 = 0xA6, ///< YUV 4:2:0 planar format, with 16 bits per luma and chroma sample. The + /// plane ordering is identical to @ref ChNumFormat::NV12.
Instead of uint8 + /// samples, this format uses 8.8 fixed point sample encoding. Image views + /// behave just like with @ref ChNumFormat::NV12, except R16 channel formats + /// are used for the Y plane, and X16Y16 channel formats are used for the UV + /// plane. + P010 = 0xA7, ///< YUV 4:2:0 planar format, with 10 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P016, except that the lowest 6 bits of each + /// luma and chroma sample are ignored. This allows the source data to be + /// interpreted as either P016 or P010 interchangeably. + P210 = 0xA8, ///< YUV 4:2:2 planar format, with 10 bits per luma and chroma sample. This is + /// similar to @ref ChNumFormat::P010, except that the UV planes are sub-sampled + /// only in the horizontal direction, but still by a factor of 2 so the UV plane + /// ends up having the same number of lines as the Y plane. + X8_MM_Unorm = 0xA9, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces. Such as the Y plane or any plane in YV12. + X8_MM_Uint = 0xAA, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces. Such as the Y plane or any plane in YV12. + X8Y8_MM_Unorm = 0xAB, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces. + X8Y8_MM_Uint = 0xAC, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces. + X16_MM10_Unorm = 0xAD, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (10-bit). Such as the Y plane or any plane in YV12. + X16_MM10_Uint = 0xAE, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (10-bit). Such as the Y plane or any plane in YV12. + X16Y16_MM10_Unorm = 0xAF, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (10-bit).
+ X16Y16_MM10_Uint = 0xB0, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (10-bit). + P208 = 0xB1, ///< YUV 4:2:2 planar format, with 8 bits per luma and chroma sample. This is + /// similar to @ref ChNumFormat::NV12, except that the UV planes are sub-sampled + /// only in the horizontal direction, but still by a factor of 2 so the UV plane + /// ends up having the same number of lines as the Y plane. This format is + /// sometimes referred to as NV16. + X16_MM12_Unorm = 0xB2, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (12-bit). + X16_MM12_Uint = 0xB3, ///< Multi-media format used with DCC for non-interleaved planes in YUV planar + /// surfaces (12-bit). + X16Y16_MM12_Unorm = 0xB4, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (12-bit). + X16Y16_MM12_Uint = 0xB5, ///< Multi-media format used with DCC for the interleaved UV plane in YUV planar + /// surfaces (12-bit). + P012 = 0xB6, ///< YUV 4:2:0 planar format, with 12 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P010, except that the lowest 4 bits of each + /// luma and chroma sample are ignored. + P212 = 0xB7, ///< YUV 4:2:2 planar format, with 12 bits per luma and chroma sample. This is + /// identical to @ref ChNumFormat::P210, except that the lowest 4 bits of each + /// luma and chroma sample are ignored. + P412 = 0xB8, ///< YUV 4:4:4 planar format, with 12 bits per luma and chroma sample. It consists + /// of a Y-plane followed by an interleaved UV plane. + X10Y10Z10W2_Float = 0xB9, ///< RGBA format with three 10-bit floats (6e4) and a 2-bit unorm as alpha. + Y216 = 0xBA, ///< YUV 4:2:2 packed, with 16 bits per luma or chroma sample. No alpha. + Y210 = 0xBB, ///< YUV 4:2:2 packed, with 10 bits per luma or chroma sample. No alpha. + /// Same memory layout as @ref ChNumFormat::Y216. 
+ /// The lowest 6 bits of each sample are ignored. + Y416 = 0xBC, ///< YUV 4:4:4 packed, with 16 bits per luma or chroma sample. + Y410 = 0xBD, ///< YUV 4:4:4 packed, with 10 bits per luma or chroma sample and 2 bits for alpha. + _ReservedBE = 0xBE, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 + P216 = 0xBF, ///< YUV 4:2:2 planar format, with 16 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + YUV_420P10 = 0xC0, ///< YUV 4:2:0 tri-planar format, with 10 bits per luma and chroma sample. + YUV_422P10 = 0xC1, ///< YUV 4:2:2 tri-planar format, with 10 bits per luma and chroma sample. + YUV_444P10 = 0xC2, ///< YUV 4:4:4 tri-planar format, with 10 bits per luma and chroma sample. + YUV_420P12 = 0xC3, ///< YUV 4:2:0 tri-planar format, with 12 bits per luma and chroma sample. + YUV_422P12 = 0xC4, ///< YUV 4:2:2 tri-planar format, with 12 bits per luma and chroma sample. + YUV_444P12 = 0xC5, ///< YUV 4:4:4 tri-planar format, with 12 bits per luma and chroma sample. + YUV_420P16 = 0xC6, ///< YUV 4:2:0 tri-planar format, with 16 bits per luma and chroma sample. + YUV_422P16 = 0xC7, ///< YUV 4:2:2 tri-planar format, with 16 bits per luma and chroma sample. + YUV_444P16 = 0xC8, ///< YUV 4:4:4 tri-planar format, with 16 bits per luma and chroma sample. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + YV16 = 0xC9, ///< YVU 4:2:2 tri-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled in + /// vertical direction. + YV24 = 0xCA, ///< YVU 4:4:4 tri-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::YV12, except chroma is not subsampled. + NV24 = 0xCB, ///< YUV 4:4:4 bi-planar format, with 8 bits per luma and chroma sample. This + /// is similar to @ref ChNumFormat::NV12, except chroma is not subsampled. 
+ /// This format is sometimes referred to as P408. + P410 = 0xCC, ///< YUV 4:4:4 planar format, with 10 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. + P416 = 0xCD, ///< YUV 4:4:4 planar format, with 16 bits per luma and chroma sample. It consists + /// of a Y-plane followed by interleaved UV plane. +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 929 + X16Y16Z16W16_MM10_Unorm = 0xCE, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of + /// each component are ignored. + X16Y16Z16W16_MM10_Uint = 0xCF, ///< A four component format with 16 bits of storage per component. Lowest 6 bits of + /// each component are ignored. + X16Y16Z16W16_MM12_Unorm = 0xD0, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of + /// each component are ignored. + X16Y16Z16W16_MM12_Uint = 0xD1, ///< A four component format with 16 bits of storage per component. Lowest 4 bits of + /// each component are ignored. +#endif + Count, + +}; + +/// Specifies which channel of a resource should be mapped to a particular component of an image view. +/// +/// @ingroup ResourceBinding +enum class ChannelSwizzle : Util::uint8 +{ + Zero = 0x0, ///< Ignore resource data and always fetch a 0 into this component. + One = 0x1, ///< Ignore resource data and always fetch a 1 into this component. + X = 0x2, ///< Use the X channel from resource for this component. + Y = 0x3, ///< Use the Y channel from resource for this component. + Z = 0x4, ///< Use the Z channel from resource for this component. + W = 0x5, ///< Use the W channel from resource for this component. + Count +}; + +/// Specifies a mapping for each component of an image or buffer view to a channel in its associated resource. +/// +/// @ingroup ResourceBinding +struct ChannelMapping +{ + union + { + struct + { + ChannelSwizzle r; ///< Red component swizzle. + ChannelSwizzle g; ///< Green component swizzle. 
+ ChannelSwizzle b; ///< Blue component swizzle. + ChannelSwizzle a; ///< Alpha component swizzle. + }; + ChannelSwizzle swizzle[4]; ///< All four swizzles packed into one array. + Util::uint32 swizzleValue; + }; +}; + +/// Specifies a pixel format for an image or memory view and its corresponding channel swizzle. +struct SwizzledFormat +{ + ChNumFormat format; ///< Pixel format. + ChannelMapping swizzle; ///< Compatible channel swizzle for the above pixel format. +}; + +inline constexpr bool operator==(const SwizzledFormat& lhs, const SwizzledFormat& rhs) +{ + return (lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue); +} + +/// Constant for undefined formats. +constexpr SwizzledFormat UndefinedSwizzledFormat = +{ + ChNumFormat::Undefined, + { { { ChannelSwizzle::X, ChannelSwizzle::Zero, ChannelSwizzle::Zero, ChannelSwizzle::One } } }, +}; + +/// Flags structure reporting available capabilities of a particular format. +enum FormatFeatureFlags : Util::uint32 +{ + FormatFeatureCopy = 0x00001, ///< Images of this format can be used as a copy source or destination. + FormatFeatureFormatConversion = 0x00002, ///< Images of this format support format conversion in copy + /// operations. + FormatFeatureImageShaderRead = 0x00004, ///< Images of this format can be read from a shader. + FormatFeatureImageShaderWrite = 0x00008, ///< Images of this format can be written from a shader. + FormatFeatureImageShaderAtomics = 0x00010, ///< Images of this format can be written atomically from a shader. + FormatFeatureMemoryShaderRead = 0x00020, ///< Memory views of this format can be read from a shader. + FormatFeatureMemoryShaderWrite = 0x00040, ///< Memory views of this format can be written from a shader. + FormatFeatureMemoryShaderAtomics = 0x00080, ///< Memory views of this format can be written atomically from a + /// shader. + FormatFeatureColorTargetWrite = 0x00100, ///< Images of this format can be bound as a color target. 
+ FormatFeatureColorTargetBlend = 0x00200, ///< Images of this format can be bound as a color target for blending. + FormatFeatureDepthTarget = 0x00400, ///< Images of this format can be bound as a depth target. + FormatFeatureStencilTarget = 0x00800, ///< Images of this format can be bound as a stencil target. + FormatFeatureMsaaTarget = 0x01000, ///< Images of this format can support multisampling. + FormatFeatureWindowedPresent = 0x02000, ///< Images of this format can support windowed-mode presents. + /// Fullscreen present capability is queried using the @ref + /// IScreen::GetScreenModeList method. + FormatFeatureImageFilterLinear = 0x04000, ///< Images of this format can be linearly filtered. + FormatFeatureImageFilterMinMax = 0x08000, ///< Images of this format can be min/max filtered. + FormatFeatureFormatConversionSrc = 0x10000, ///< Images of this format support format conversion in copy + /// operations as the source image. + /// @note This is aliased to FormatFeatureFormatConversionDst for + /// backwards compatibility. + FormatFeatureFormatConversionDst = 0x20000, ///< Images of this format support format conversion in copy + /// operations as the destination image. + /// @note This is aliased to FormatFeatureFormatConversionSrc for + /// backwards compatibility. +}; + +/// Enumeration for indexing into the format properties table based on tiling. +enum FormatPropertiesTiling : Util::uint32 +{ + IsLinear = 0, ///< Format properties requested is for linearly-tiled surfaces. + IsNonLinear, ///< Format properties requested is for non-linearly tiled surfaces. + Count, ///< Number of format property tile types. +}; + +/// The format properties lookup table. Contains information about which device access features are available for all +/// formats and tiling modes. The tiling features for non-linear tiling modes are identical so we only store linear +/// and non-linear tiling features. From left to right, it is indexed by format and "is-non-linear". 
+/// Returned by IDevice::GetFormatProperties(). +struct MergedFormatPropertiesTable +{ + FormatFeatureFlags features[static_cast(ChNumFormat::Count)][FormatPropertiesTiling::Count]; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h b/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h index 9884191bd7..023a7c0eeb 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palFormatInfo.h @@ -1,881 +1,881 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palFormatInfo.h - * @brief Defines the Platform Abstraction Library (PAL) Format utility functions. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palDevice.h" -#include "palImage.h" -#include "palInlineFuncs.h" -#include "palMath.h" - -namespace Pal -{ - -/// Namespace encapsulating all PAL format utility functions. -namespace Formats -{ - -/// Specifies flags which indicate properties of each PAL channel format. -enum PropertyFlags : uint32 -{ - BitCountInaccurate = 0x1, ///< Indicates that format's bit count array is inaccurate - BlockCompressed = 0x2, ///< Indicates channel format is block-compressed - MacroPixelPacked = 0x4, ///< Indicates channel format has multiple pixels' data packed together into - /// one "macro pixel" - YuvPlanar = 0x8, ///< Indicates channel format is YUV-planar - YuvPacked = 0x10, ///< Indicates channel format is YUV packed -}; - -/// Specifies numeric support of a specified format. -enum class NumericSupportFlags : uint32 -{ - Undefined, ///< No numeric support. - Unorm, ///< Unsigned normalized. - Snorm, ///< Signed normalized. - Uscaled, ///< _Untested._ Treated as an unsigned integer inside the resource, but received by - /// the shader as a floating point number. - Sscaled, ///< _Untested._ Treated as a signed integer inside the resource, but received by - /// the shader as a floating point number. - Uint, ///< Unsigned integer. - Sint, ///< Signed integer. - Float, ///< Floating point number. - Srgb, ///< sRGB. - DepthStencil, ///< Depth/stencil support. - Yuv, ///< YUV support. 
-}; - -/// Specifies flags which indicate the presence of each color channel in a PAL channel format. -enum ChannelFlags : uint32 -{ - X = 0x1, ///< Indicates the X channel is present. - Y = 0x2, ///< Indicates the Y channel is present. - Z = 0x4, ///< Indicates the Z channel is present. - W = 0x8, ///< Indicates the W channel is present. -}; - -/// An entry in the channel-format info lookup table. Contains intrinsic properties describing a channel format. -struct FormatInfo -{ - uint32 bitsPerPixel; ///< Total count of bits in a signel pixel (or block). - uint32 componentCount; ///< Number of color components (channels) present. - - uint32 bitCount[4]; ///< Number of bits for each component in the format. These members are - /// only reliable if the 'bitCountInaccurate' flag is not set. - /// Listed in order: X, Y, Z, and W. - - uint32 channelMask; ///< Mask of @ref ChannelFlags values indicating which channels are present. - uint32 properties; ///< Mask of @ref PropertyFlags values indicating which properties a format - /// has. - NumericSupportFlags numericSupport; ///< Which numeric format this format represents. Used for easy identification. -}; - -/// BC block dimension (4x4) -static constexpr uint32 CompressedBcBlockDim = 4; - -/// ETC block dimension (4x4) -static constexpr uint32 CompressedEtcBlockDim = 4; - -/// Lookup table for intrinsic properties describing each channel format. Callers should access the members of this -/// table via BitsPerPixel() and related functions. -extern const FormatInfo FormatInfoTable[static_cast(ChNumFormat::Count)]; - -/// Convert a floating-point representation of a color value in RGBA order to the appropriate bit representation for -/// each channel based on the specified format. Swizzling is enabled by default to maintain backwards compatability. -/// There will be no swizzling functionality going forwards. 
-extern void ConvertColor( - SwizzledFormat format, - const float* pColorIn, - uint32* pColorOut); - -/// Convert an unsigned integer representation of a color value in YUVA order to the appropriate bit representation for -/// each channel based on the specified format. -extern void ConvertYuvColor( - SwizzledFormat format, - uint32 plane, - const uint32* pColorIn, - uint32* pColorOut); - -/// Packs a clear color value in RGBA order to a single element of the provided format and stores it in the -/// memory provided. Swizzling is enabled by default to maintain backwards compatability. There will be -/// no swizzling functionality going forwards. -extern void PackRawClearColor( - SwizzledFormat format, - const uint32* pColor, - void* pBufferMemory); - -/// Swizzles the color according to the provided format swizzle. -extern void SwizzleColor(SwizzledFormat format, const uint32* pColorIn, uint32* pColorOut); - -/// Compares two SwizzledFormats and checks for equality. -/// -/// @param lhs [in] Left hand side of comparison -/// @param rhs [in] Right hand side of comparison -/// -/// @return True if the formats are equal, false otherwise. -constexpr bool IsSameFormat( - const SwizzledFormat& lhs, - const SwizzledFormat& rhs) -{ - return ((lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue)); -} - -/// Queries the number of components for a particular channel format. -/// -/// @param [in] format The channel format to query for. -/// -/// @returns The number of components of the specified channel format. -inline uint32 NumComponents( - ChNumFormat format) -{ - return FormatInfoTable[static_cast(format)].componentCount; -} - -/// Queries the component mask for a particular format. -/// -/// @param [in] format The format to query for. -/// -/// @returns The component mask of @ref ChannelFlags for the specified format. 
-inline uint32 ComponentMask( - ChNumFormat format) -{ - uint32 mask = FormatInfoTable[static_cast(format)].channelMask; - PAL_ASSERT((mask & 0xF) == mask); - return mask; -} - -/// Checks if a format is undefined. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is undefined. False otherwise. -constexpr bool IsUndefined( - ChNumFormat format) -{ - return (format == ChNumFormat::Undefined); -} - -/// Checks if a format's numeric representation is unsigned normalized. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is unsigned normalized. False otherwise. -inline bool IsUnorm( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Unorm); -} - -/// Checks if a format's numeric representation is signed normalized. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is signed normalized. False otherwise. -inline bool IsSnorm( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Snorm); -} - -/// Checks if a format's numeric representation is unsigned scaled. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is unsigned scaled. False otherwise. -inline bool IsUscaled( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uscaled); -} - -/// Checks if a format's numeric representation is signed scaled. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is signed scaled. False otherwise. -inline bool IsSscaled( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sscaled); -} - -/// Checks if a format's numeric representation is unsigned integer. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is unsigned integer. 
False otherwise. -inline bool IsUint( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uint); -} - -/// Checks if a format's numeric representation is signed integer. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is signed integer. False otherwise. -inline bool IsSint( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sint); -} - -/// Checks if a format's numeric representation is floating point. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is floating point. False otherwise. -inline bool IsFloat( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Float); -} - -/// Checks if a format's numeric representation is gamma-corrected sRGB. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is sRGB. False otherwise. -inline bool IsSrgb( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Srgb); -} - -/// Checks if a format's numeric representation is normalized. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is normalized. False otherwise. -inline bool IsNormalized( - ChNumFormat format) -{ - return IsUnorm(format) || IsSnorm(format); -} - -/// Checks if a format's numeric representation is an integer format. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is an integer format. False otherwise. -inline bool IsInteger( - ChNumFormat format) -{ - return IsUint(format) || IsSint(format); -} - -/// Checks if a format is a depth/stencil only format. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is a depth/stencil only format. False otherwise. 
-inline bool IsDepthStencilOnly( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::DepthStencil); -} - -/// Checks if the specified format is one of the YUV-planar ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is YUV-planar. False otherwise. -inline bool IsYuvPlanar( - ChNumFormat format) -{ - return ((FormatInfoTable[static_cast(format)].properties & YuvPlanar) != 0); -} - -/// Checks if the specified format is one of the YUV-packed ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is YUV-packed. False otherwise. -inline bool IsYuvPacked( - ChNumFormat format) -{ - return ((FormatInfoTable[static_cast(format)].properties & YuvPacked) != 0); -} - -/// Checks if the specified format is one of the YUV ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is for YUV data. False otherwise. -inline bool IsYuv( - ChNumFormat format) -{ - return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Yuv); -} - -/// Checks if a format has alpha. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format has an alpha channel. False otherwise. -constexpr bool HasAlpha( - SwizzledFormat format) -{ - return ((format.format == ChNumFormat::A8_Unorm) || - (format.format == ChNumFormat::L4A4_Unorm) || - (format.format == ChNumFormat::L8A8_Unorm) || - ((ComponentMask(format.format) & ChannelFlags::W) != 0) || - ((format.swizzle.a != ChannelSwizzle::Zero) && (format.swizzle.a != ChannelSwizzle::One))); -} - -/// Checks if a format has an unused alpha channel. -/// -/// @param [in] format Pixel format. -/// -/// @returns True if the pixel format is a four channel format and has an unused alpha channel. False otherwise. 
-inline bool HasUnusedAlpha( - SwizzledFormat format) -{ - return ((NumComponents(format.format) == 4) && - (format.swizzle.r != ChannelSwizzle::W) && - (format.swizzle.g != ChannelSwizzle::W) && - (format.swizzle.b != ChannelSwizzle::W) && - (format.swizzle.a != ChannelSwizzle::W)); -} - -/// Converts format into its Unorm equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToUnorm(ChNumFormat format); - -/// Converts format into its Snorm equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToSnorm(ChNumFormat format); - -/// Converts format into its Uscaled equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToUscaled(ChNumFormat format); - -/// Converts format into its Sscaled equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToSscaled(ChNumFormat format); - -/// Converts format into its Uint equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Uint format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToUint(ChNumFormat format); - -/// Converts format into its Sint equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Sint format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToSint(ChNumFormat format); - -/// Converts format into its Float equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Float format equivalent of input format. Undefined if none exist. 
-extern ChNumFormat PAL_STDCALL ConvertToFloat(ChNumFormat format); - -/// Converts format into its Srgb equivalent. -/// -/// @param [in] format Pixel format. -/// -/// @returns Srgb format equivalent of input format. Undefined if none exist. -extern ChNumFormat PAL_STDCALL ConvertToSrgb(ChNumFormat format); - -/// Converts source numeric format to the provided destination numeric format. -/// -/// @param [in] srcFormat Source Pixel format. -/// @param [in] dstFormat Destination Pixel format. -/// -/// @returns Source format with equivalent numeric format of destination format. Undefined if none exist. -extern ChNumFormat ConvertToDstNumFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); - -/// Determines whether the srcFormat and the dstFormat have the same channel formats. -/// -/// @param [in] srcFormat Source channel pixel format. -/// @param [in] dstFormat Destination channel pixel format. -/// -/// @returns True if both formats share the same channel format. False otherwise. -extern bool ShareChFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); - -/// Determines whether the srcFormat and the dstFormat have the same numeric formats. -/// -/// @param [in] srcFormat Source channel pixel format. -/// @param [in] dstFormat Destination channel pixel format. -/// -/// @returns True if both formats share the same numeric format. False otherwise. -inline bool HaveSameNumFmt( - ChNumFormat srcFormat, - ChNumFormat dstFormat) -{ - return (FormatInfoTable[static_cast(srcFormat)].numericSupport == - FormatInfoTable[static_cast(dstFormat)].numericSupport); -} - -/// Returns the block dimension for a compressed format. -/// -/// @param [in] format Format. -/// -/// @returns Corresponding block dimensions for the compressed format. 
-inline Extent3d CompressedBlockDim( - ChNumFormat format) -{ - Extent3d blockDim = {}; - - switch (format) - { - case ChNumFormat::Bc1_Unorm: - case ChNumFormat::Bc1_Srgb: - case ChNumFormat::Bc2_Unorm: - case ChNumFormat::Bc2_Srgb: - case ChNumFormat::Bc3_Unorm: - case ChNumFormat::Bc3_Srgb: - case ChNumFormat::Bc4_Unorm: - case ChNumFormat::Bc4_Snorm: - case ChNumFormat::Bc5_Unorm: - case ChNumFormat::Bc5_Snorm: - case ChNumFormat::Bc6_Ufloat: - case ChNumFormat::Bc6_Sfloat: - case ChNumFormat::Bc7_Unorm: - case ChNumFormat::Bc7_Srgb: - blockDim.width = CompressedBcBlockDim; - blockDim.height = CompressedBcBlockDim; - blockDim.depth = 1; - break; - case ChNumFormat::Etc2X8Y8Z8_Unorm: - case ChNumFormat::Etc2X8Y8Z8_Srgb: - case ChNumFormat::Etc2X8Y8Z8W1_Unorm: - case ChNumFormat::Etc2X8Y8Z8W1_Srgb: - case ChNumFormat::Etc2X8Y8Z8W8_Unorm: - case ChNumFormat::Etc2X8Y8Z8W8_Srgb: - case ChNumFormat::Etc2X11_Unorm: - case ChNumFormat::Etc2X11_Snorm: - case ChNumFormat::Etc2X11Y11_Unorm: - case ChNumFormat::Etc2X11Y11_Snorm: - blockDim.width = CompressedEtcBlockDim; - blockDim.height = CompressedEtcBlockDim; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr4x4_Unorm: - case ChNumFormat::AstcLdr4x4_Srgb: - case ChNumFormat::AstcHdr4x4_Float: - blockDim.width = 4; - blockDim.height = 4; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr5x4_Unorm: - case ChNumFormat::AstcLdr5x4_Srgb: - case ChNumFormat::AstcHdr5x4_Float: - blockDim.width = 5; - blockDim.height = 4; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr5x5_Unorm: - case ChNumFormat::AstcLdr5x5_Srgb: - case ChNumFormat::AstcHdr5x5_Float: - blockDim.width = 5; - blockDim.height = 5; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr6x5_Unorm: - case ChNumFormat::AstcLdr6x5_Srgb: - case ChNumFormat::AstcHdr6x5_Float: - blockDim.width = 6; - blockDim.height = 5; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr6x6_Unorm: - case ChNumFormat::AstcLdr6x6_Srgb: - case 
ChNumFormat::AstcHdr6x6_Float: - blockDim.width = 6; - blockDim.height = 6; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr8x5_Unorm: - case ChNumFormat::AstcLdr8x5_Srgb: - case ChNumFormat::AstcHdr8x5_Float: - blockDim.width = 8; - blockDim.height = 5; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr8x6_Unorm: - case ChNumFormat::AstcLdr8x6_Srgb: - case ChNumFormat::AstcHdr8x6_Float: - blockDim.width = 8; - blockDim.height = 6; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr8x8_Unorm: - case ChNumFormat::AstcLdr8x8_Srgb: - case ChNumFormat::AstcHdr8x8_Float: - blockDim.width = 8; - blockDim.height = 8; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr10x5_Unorm: - case ChNumFormat::AstcLdr10x5_Srgb: - case ChNumFormat::AstcHdr10x5_Float: - blockDim.width = 10; - blockDim.height = 5; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr10x6_Unorm: - case ChNumFormat::AstcLdr10x6_Srgb: - case ChNumFormat::AstcHdr10x6_Float: - blockDim.width = 10; - blockDim.height = 6; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr10x8_Unorm: - case ChNumFormat::AstcLdr10x8_Srgb: - case ChNumFormat::AstcHdr10x8_Float: - blockDim.width = 10; - blockDim.height = 8; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr10x10_Unorm: - case ChNumFormat::AstcLdr10x10_Srgb: - case ChNumFormat::AstcHdr10x10_Float: - blockDim.width = 10; - blockDim.height = 10; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr12x10_Unorm: - case ChNumFormat::AstcLdr12x10_Srgb: - case ChNumFormat::AstcHdr12x10_Float: - blockDim.width = 12; - blockDim.height = 10; - blockDim.depth = 1; - break; - case ChNumFormat::AstcLdr12x12_Unorm: - case ChNumFormat::AstcLdr12x12_Srgb: - case ChNumFormat::AstcHdr12x12_Float: - blockDim.width = 12; - blockDim.height = 12; - blockDim.depth = 1; - break; - default: - // This function should not be called on a non-compressed format. 
- PAL_ASSERT_ALWAYS(); - break; - } - - return blockDim; -} - -/// Convert a compressed format block coordinate to texels. -/// -/// @param [in] format Format. -/// @param [in] width Block width. -/// @param [in] height Block height. -/// @param [in] depth Block depth. -/// -/// @returns Structure containing the texel width, height and depth -inline Extent3d CompressedBlocksToTexels( - ChNumFormat format, - uint32 width, - uint32 height, - uint32 depth) -{ - Extent3d dims = CompressedBlockDim(format); - dims.width *= width; - dims.height *= height; - dims.depth *= depth; - return dims; -} - -/// Convert a compressed format texel coordinate to blocks. -/// -/// @param [in] format Format. -/// @param [in] width Texel width. -/// @param [in] height Texel height. -/// @param [in] depth Texel depth. -/// -/// @returns Structure containing the block width, height and depth -inline Extent3d CompressedTexelsToBlocks( - ChNumFormat format, - uint32 width, - uint32 height, - uint32 depth) -{ - Extent3d dims = CompressedBlockDim(format); - dims.width = Util::RoundUpQuotient(width, dims.width); - dims.height = Util::RoundUpQuotient(height, dims.height); - dims.depth = Util::RoundUpQuotient(depth, dims.depth); - return dims; -} - -/// Queries the number of bits in a pixel or element for the given format. -/// -/// @param format The format to query for. -/// -/// @return The number of bits per pixel for the given channel format. -inline uint32 BitsPerPixel( - ChNumFormat format) -{ - return FormatInfoTable[static_cast(format)].bitsPerPixel; -} - -/// Queries the number of bits in a pixel or element for the given format. -/// -/// @param format The format to query for. -/// -/// @return The number of bytes per pixel for the given channel format. -inline uint32 BytesPerPixel( - ChNumFormat format) -{ - return (BitsPerPixel(format) >> 3); -} - -/// Checks if the specified channel swizzle is allowed with the given format. 
-/// -/// @param [in] format The pixel format to check against. -/// @param [in] swizzle The specified channel swizzle to check with. -/// -/// @returns True if the specified channel swizzle is valid for the given format. False otherwise. -inline bool IsValidChannelSwizzle( - ChNumFormat format, - ChannelSwizzle swizzle) -{ - const uint32 mask = ComponentMask(format); - - bool valid = false; - switch (swizzle) - { - case ChannelSwizzle::Zero: - case ChannelSwizzle::One: - valid = true; - break; - case ChannelSwizzle::X: - valid = ((mask & ChannelFlags::X) != 0); - break; - case ChannelSwizzle::Y: - valid = ((mask & ChannelFlags::Y) != 0); - break; - case ChannelSwizzle::Z: - valid = ((mask & ChannelFlags::Z) != 0); - break; - case ChannelSwizzle::W: - valid = ((mask & ChannelFlags::W) != 0); - break; - default: - PAL_NEVER_CALLED(); - break; - } - - return valid; -} - -/// Queries the per-component bit counts for a particular format. -/// -/// @param [in] format The format to query for. -/// -/// @returns The corresponding component swizzles for the specified format. Returned as an array of four counts. -inline const uint32* ComponentBitCounts( - ChNumFormat format) -{ - return &FormatInfoTable[static_cast(format)].bitCount[0]; -} - -/// Determines the maximum bit-count of any component in the format. -/// -/// @param [in] format The channel format to query for. -/// -/// @returns The maximum bit-count of any component in the format. -inline uint32 MaxComponentBitCount( - ChNumFormat format) -{ - const FormatInfo& info = FormatInfoTable[static_cast(format)]; - - return Util::Max(Util::Max(info.bitCount[0], info.bitCount[1]), Util::Max(info.bitCount[2], info.bitCount[3])); -} - -/// Checks if the specified format is one of the block-compressed ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is block-compressed. False otherwise. 
-inline bool IsBlockCompressed( - ChNumFormat format) -{ - return ((FormatInfoTable[static_cast(format)].properties & BlockCompressed) != 0); -} - -/// Checks if the specified format is one of the macro-pixel-packed ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is macro-pixel-packed. False otherwise. -inline bool IsMacroPixelPacked( - ChNumFormat format) -{ - return ((FormatInfoTable[static_cast(format)].properties & MacroPixelPacked) != 0); -} - -/// Checks if the specified format is one of the rgb macro-pixel-packed ones. -/// -/// @param [in] format The format to check. -/// -/// @returns True if the specified format is a rgb macro-pixel-packed. False otherwise. -inline bool IsMacroPixelPackedRgbOnly( - ChNumFormat format) -{ - return (IsMacroPixelPacked(format) && (IsYuv(format) == false)); -} - -/// Returns the base-2 logarithm of of the subsampling ratio between the luma plane and chroma plane(s) of a YUV planar -/// format. The dimensions of the luma plane should be right-shifted by these amounts to determine the dimensions of the -/// chroma plane(s). -/// -/// @param [in] format Format. -/// @param [in] plane Image plane to query for. -/// -/// @returns Corresponding scaling factors between the luma plane and chroma plane(s). -inline Extent3d Log2SubsamplingRatio( - ChNumFormat format, - uint32 plane) -{ - // All planes for formats which are not YUV planar, and the 0th plane of a YUV planar format (the luma plane) are - // sampled at full rate, so the ratio is { log2(1), log2(1), log2(1) }, which equates to { 0,0,0 }. - Extent3d ratio = { }; - - if (IsYuvPlanar(format) && (plane != 0)) - { - PAL_ASSERT((plane == 1) || (plane == 2)); - switch (format) - { - // 4:4:4 formats have the same number of samples in every direction. 
- case ChNumFormat::P412: -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 - case ChNumFormat::YUV_444P10: - case ChNumFormat::YUV_444P12: - case ChNumFormat::YUV_444P16: -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 - case ChNumFormat::YV24: - case ChNumFormat::NV24: - case ChNumFormat::P410: - case ChNumFormat::P416: -#endif - break; - // 4:2:0 formats have 1/2 as many samples in both the horizontal and vertical directions. - case ChNumFormat::YV12: - case ChNumFormat::NV12: - case ChNumFormat::NV21: - case ChNumFormat::P010: - case ChNumFormat::P012: - case ChNumFormat::P016: -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 - case ChNumFormat::YUV_420P10: - case ChNumFormat::YUV_420P12: - case ChNumFormat::YUV_420P16: -#endif - ratio.width = 1; // log2(1/2) = -1 - ratio.height = 1; - break; - // 4:2:2 formats have 1/2 as many samples in the horizontal direction, and the same number of samples - // in the vertical direction. - case ChNumFormat::P208: - case ChNumFormat::P210: - case ChNumFormat::P212: -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 - case ChNumFormat::P216: -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 - case ChNumFormat::YUV_422P10: - case ChNumFormat::YUV_422P12: - case ChNumFormat::YUV_422P16: -#endif -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 - case ChNumFormat::YV16: -#endif - ratio.width = 1; - break; - // 4:1:1 formats have 1/4 as many samples in the horizontal direction, and the same number of samples - // in the vertical direction. - case ChNumFormat::NV11: - ratio.width = 2; // log2(1/4) = -2 - break; - default: - PAL_NEVER_CALLED(); // Did we miss a new YUV planar format? - break; - } - } - - return ratio; -} - -/// Converts a linearly-scaled color value to gamma-corrected sRGB. -/// -/// @param [in] linear Linear color value -/// -/// @returns Gamma-corrected sRGB color value -extern float LinearToGamma(float linear); - -/// Converts a gamma-corrected sRGB color value to linear color space. 
-/// -/// @param [in] gammaCorrectedVal Gamma-corrected sRGB color value -/// -/// @returns Linear color value -extern float GammaToLinear(float gammaCorrectedVal); - -/// Checks to see if a given format is a MM format -/// -/// -/// @returns bool is it an MM format -extern bool IsMmFormat(ChNumFormat format); - -/// Checks to see if a given format is a MM12 format -/// -/// -/// @returns bool is it an MM12 format -extern bool IsMm12Format(ChNumFormat format); - -/// Checks to see if a given format is a MM10 format -/// -/// -/// @returns bool is it an MM10 format -extern bool IsMm10Format(ChNumFormat format); - -} // Formats -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFormatInfo.h + * @brief Defines the Platform Abstraction Library (PAL) Format utility functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palDevice.h" +#include "palImage.h" +#include "palInlineFuncs.h" +#include "palMath.h" + +namespace Pal +{ + +/// Namespace encapsulating all PAL format utility functions. +namespace Formats +{ + +/// Specifies flags which indicate properties of each PAL channel format. +enum PropertyFlags : uint32 +{ + BitCountInaccurate = 0x1, ///< Indicates that format's bit count array is inaccurate + BlockCompressed = 0x2, ///< Indicates channel format is block-compressed + MacroPixelPacked = 0x4, ///< Indicates channel format has multiple pixels' data packed together into + /// one "macro pixel" + YuvPlanar = 0x8, ///< Indicates channel format is YUV-planar + YuvPacked = 0x10, ///< Indicates channel format is YUV packed +}; + +/// Specifies numeric support of a specified format. +enum class NumericSupportFlags : uint32 +{ + Undefined, ///< No numeric support. + Unorm, ///< Unsigned normalized. + Snorm, ///< Signed normalized. + Uscaled, ///< _Untested._ Treated as an unsigned integer inside the resource, but received by + /// the shader as a floating point number. + Sscaled, ///< _Untested._ Treated as a signed integer inside the resource, but received by + /// the shader as a floating point number. + Uint, ///< Unsigned integer. + Sint, ///< Signed integer. + Float, ///< Floating point number. + Srgb, ///< sRGB. + DepthStencil, ///< Depth/stencil support. + Yuv, ///< YUV support. 
+}; + +/// Specifies flags which indicate the presence of each color channel in a PAL channel format. +enum ChannelFlags : uint32 +{ + X = 0x1, ///< Indicates the X channel is present. + Y = 0x2, ///< Indicates the Y channel is present. + Z = 0x4, ///< Indicates the Z channel is present. + W = 0x8, ///< Indicates the W channel is present. +}; + +/// An entry in the channel-format info lookup table. Contains intrinsic properties describing a channel format. +struct FormatInfo +{ + uint32 bitsPerPixel; ///< Total count of bits in a signel pixel (or block). + uint32 componentCount; ///< Number of color components (channels) present. + + uint32 bitCount[4]; ///< Number of bits for each component in the format. These members are + /// only reliable if the 'bitCountInaccurate' flag is not set. + /// Listed in order: X, Y, Z, and W. + + uint32 channelMask; ///< Mask of @ref ChannelFlags values indicating which channels are present. + uint32 properties; ///< Mask of @ref PropertyFlags values indicating which properties a format + /// has. + NumericSupportFlags numericSupport; ///< Which numeric format this format represents. Used for easy identification. +}; + +/// BC block dimension (4x4) +static constexpr uint32 CompressedBcBlockDim = 4; + +/// ETC block dimension (4x4) +static constexpr uint32 CompressedEtcBlockDim = 4; + +/// Lookup table for intrinsic properties describing each channel format. Callers should access the members of this +/// table via BitsPerPixel() and related functions. +extern const FormatInfo FormatInfoTable[static_cast(ChNumFormat::Count)]; + +/// Convert a floating-point representation of a color value in RGBA order to the appropriate bit representation for +/// each channel based on the specified format. Swizzling is enabled by default to maintain backwards compatability. +/// There will be no swizzling functionality going forwards. 
+extern void ConvertColor( + SwizzledFormat format, + const float* pColorIn, + uint32* pColorOut); + +/// Convert an unsigned integer representation of a color value in YUVA order to the appropriate bit representation for +/// each channel based on the specified format. +extern void ConvertYuvColor( + SwizzledFormat format, + uint32 plane, + const uint32* pColorIn, + uint32* pColorOut); + +/// Packs a clear color value in RGBA order to a single element of the provided format and stores it in the +/// memory provided. Swizzling is enabled by default to maintain backwards compatability. There will be +/// no swizzling functionality going forwards. +extern void PackRawClearColor( + SwizzledFormat format, + const uint32* pColor, + void* pBufferMemory); + +/// Swizzles the color according to the provided format swizzle. +extern void SwizzleColor(SwizzledFormat format, const uint32* pColorIn, uint32* pColorOut); + +/// Compares two SwizzledFormats and checks for equality. +/// +/// @param lhs [in] Left hand side of comparison +/// @param rhs [in] Right hand side of comparison +/// +/// @return True if the formats are equal, false otherwise. +constexpr bool IsSameFormat( + const SwizzledFormat& lhs, + const SwizzledFormat& rhs) +{ + return ((lhs.format == rhs.format) && (lhs.swizzle.swizzleValue == rhs.swizzle.swizzleValue)); +} + +/// Queries the number of components for a particular channel format. +/// +/// @param [in] format The channel format to query for. +/// +/// @returns The number of components of the specified channel format. +inline uint32 NumComponents( + ChNumFormat format) +{ + return FormatInfoTable[static_cast(format)].componentCount; +} + +/// Queries the component mask for a particular format. +/// +/// @param [in] format The format to query for. +/// +/// @returns The component mask of @ref ChannelFlags for the specified format. 
+inline uint32 ComponentMask( + ChNumFormat format) +{ + uint32 mask = FormatInfoTable[static_cast(format)].channelMask; + PAL_ASSERT((mask & 0xF) == mask); + return mask; +} + +/// Checks if a format is undefined. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is undefined. False otherwise. +constexpr bool IsUndefined( + ChNumFormat format) +{ + return (format == ChNumFormat::Undefined); +} + +/// Checks if a format's numeric representation is unsigned normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned normalized. False otherwise. +inline bool IsUnorm( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Unorm); +} + +/// Checks if a format's numeric representation is signed normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed normalized. False otherwise. +inline bool IsSnorm( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Snorm); +} + +/// Checks if a format's numeric representation is unsigned scaled. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned scaled. False otherwise. +inline bool IsUscaled( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uscaled); +} + +/// Checks if a format's numeric representation is signed scaled. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed scaled. False otherwise. +inline bool IsSscaled( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sscaled); +} + +/// Checks if a format's numeric representation is unsigned integer. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is unsigned integer. 
False otherwise. +inline bool IsUint( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Uint); +} + +/// Checks if a format's numeric representation is signed integer. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is signed integer. False otherwise. +inline bool IsSint( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Sint); +} + +/// Checks if a format's numeric representation is floating point. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is floating point. False otherwise. +inline bool IsFloat( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Float); +} + +/// Checks if a format's numeric representation is gamma-corrected sRGB. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is sRGB. False otherwise. +inline bool IsSrgb( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Srgb); +} + +/// Checks if a format's numeric representation is normalized. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is normalized. False otherwise. +inline bool IsNormalized( + ChNumFormat format) +{ + return IsUnorm(format) || IsSnorm(format); +} + +/// Checks if a format's numeric representation is an integer format. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is an integer format. False otherwise. +inline bool IsInteger( + ChNumFormat format) +{ + return IsUint(format) || IsSint(format); +} + +/// Checks if a format is a depth/stencil only format. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is a depth/stencil only format. False otherwise. 
+inline bool IsDepthStencilOnly( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::DepthStencil); +} + +/// Checks if the specified format is one of the YUV-planar ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is YUV-planar. False otherwise. +inline bool IsYuvPlanar( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & YuvPlanar) != 0); +} + +/// Checks if the specified format is one of the YUV-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is YUV-packed. False otherwise. +inline bool IsYuvPacked( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & YuvPacked) != 0); +} + +/// Checks if the specified format is one of the YUV ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is for YUV data. False otherwise. +inline bool IsYuv( + ChNumFormat format) +{ + return (FormatInfoTable[static_cast(format)].numericSupport == NumericSupportFlags::Yuv); +} + +/// Checks if a format has alpha. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format has an alpha channel. False otherwise. +constexpr bool HasAlpha( + SwizzledFormat format) +{ + return ((format.format == ChNumFormat::A8_Unorm) || + (format.format == ChNumFormat::L4A4_Unorm) || + (format.format == ChNumFormat::L8A8_Unorm) || + ((ComponentMask(format.format) & ChannelFlags::W) != 0) || + ((format.swizzle.a != ChannelSwizzle::Zero) && (format.swizzle.a != ChannelSwizzle::One))); +} + +/// Checks if a format has an unused alpha channel. +/// +/// @param [in] format Pixel format. +/// +/// @returns True if the pixel format is a four channel format and has an unused alpha channel. False otherwise. 
+inline bool HasUnusedAlpha( + SwizzledFormat format) +{ + return ((NumComponents(format.format) == 4) && + (format.swizzle.r != ChannelSwizzle::W) && + (format.swizzle.g != ChannelSwizzle::W) && + (format.swizzle.b != ChannelSwizzle::W) && + (format.swizzle.a != ChannelSwizzle::W)); +} + +/// Converts format into its Unorm equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUnorm(ChNumFormat format); + +/// Converts format into its Snorm equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSnorm(ChNumFormat format); + +/// Converts format into its Uscaled equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUscaled(ChNumFormat format); + +/// Converts format into its Sscaled equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSscaled(ChNumFormat format); + +/// Converts format into its Uint equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Uint format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToUint(ChNumFormat format); + +/// Converts format into its Sint equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Sint format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSint(ChNumFormat format); + +/// Converts format into its Float equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Float format equivalent of input format. Undefined if none exist. 
+extern ChNumFormat PAL_STDCALL ConvertToFloat(ChNumFormat format); + +/// Converts format into its Srgb equivalent. +/// +/// @param [in] format Pixel format. +/// +/// @returns Srgb format equivalent of input format. Undefined if none exist. +extern ChNumFormat PAL_STDCALL ConvertToSrgb(ChNumFormat format); + +/// Converts source numeric format to the provided destination numeric format. +/// +/// @param [in] srcFormat Source Pixel format. +/// @param [in] dstFormat Destination Pixel format. +/// +/// @returns Source format with equivalent numeric format of destination format. Undefined if none exist. +extern ChNumFormat ConvertToDstNumFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); + +/// Determines whether the srcFormat and the dstFormat have the same channel formats. +/// +/// @param [in] srcFormat Source channel pixel format. +/// @param [in] dstFormat Destination channel pixel format. +/// +/// @returns True if both formats share the same channel format. False otherwise. +extern bool ShareChFmt(ChNumFormat srcFormat, ChNumFormat dstFormat); + +/// Determines whether the srcFormat and the dstFormat have the same numeric formats. +/// +/// @param [in] srcFormat Source channel pixel format. +/// @param [in] dstFormat Destination channel pixel format. +/// +/// @returns True if both formats share the same numeric format. False otherwise. +inline bool HaveSameNumFmt( + ChNumFormat srcFormat, + ChNumFormat dstFormat) +{ + return (FormatInfoTable[static_cast(srcFormat)].numericSupport == + FormatInfoTable[static_cast(dstFormat)].numericSupport); +} + +/// Returns the block dimension for a compressed format. +/// +/// @param [in] format Format. +/// +/// @returns Corresponding block dimensions for the compressed format. 
+inline Extent3d CompressedBlockDim( + ChNumFormat format) +{ + Extent3d blockDim = {}; + + switch (format) + { + case ChNumFormat::Bc1_Unorm: + case ChNumFormat::Bc1_Srgb: + case ChNumFormat::Bc2_Unorm: + case ChNumFormat::Bc2_Srgb: + case ChNumFormat::Bc3_Unorm: + case ChNumFormat::Bc3_Srgb: + case ChNumFormat::Bc4_Unorm: + case ChNumFormat::Bc4_Snorm: + case ChNumFormat::Bc5_Unorm: + case ChNumFormat::Bc5_Snorm: + case ChNumFormat::Bc6_Ufloat: + case ChNumFormat::Bc6_Sfloat: + case ChNumFormat::Bc7_Unorm: + case ChNumFormat::Bc7_Srgb: + blockDim.width = CompressedBcBlockDim; + blockDim.height = CompressedBcBlockDim; + blockDim.depth = 1; + break; + case ChNumFormat::Etc2X8Y8Z8_Unorm: + case ChNumFormat::Etc2X8Y8Z8_Srgb: + case ChNumFormat::Etc2X8Y8Z8W1_Unorm: + case ChNumFormat::Etc2X8Y8Z8W1_Srgb: + case ChNumFormat::Etc2X8Y8Z8W8_Unorm: + case ChNumFormat::Etc2X8Y8Z8W8_Srgb: + case ChNumFormat::Etc2X11_Unorm: + case ChNumFormat::Etc2X11_Snorm: + case ChNumFormat::Etc2X11Y11_Unorm: + case ChNumFormat::Etc2X11Y11_Snorm: + blockDim.width = CompressedEtcBlockDim; + blockDim.height = CompressedEtcBlockDim; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr4x4_Unorm: + case ChNumFormat::AstcLdr4x4_Srgb: + case ChNumFormat::AstcHdr4x4_Float: + blockDim.width = 4; + blockDim.height = 4; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr5x4_Unorm: + case ChNumFormat::AstcLdr5x4_Srgb: + case ChNumFormat::AstcHdr5x4_Float: + blockDim.width = 5; + blockDim.height = 4; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr5x5_Unorm: + case ChNumFormat::AstcLdr5x5_Srgb: + case ChNumFormat::AstcHdr5x5_Float: + blockDim.width = 5; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr6x5_Unorm: + case ChNumFormat::AstcLdr6x5_Srgb: + case ChNumFormat::AstcHdr6x5_Float: + blockDim.width = 6; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr6x6_Unorm: + case ChNumFormat::AstcLdr6x6_Srgb: + case 
ChNumFormat::AstcHdr6x6_Float: + blockDim.width = 6; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x5_Unorm: + case ChNumFormat::AstcLdr8x5_Srgb: + case ChNumFormat::AstcHdr8x5_Float: + blockDim.width = 8; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x6_Unorm: + case ChNumFormat::AstcLdr8x6_Srgb: + case ChNumFormat::AstcHdr8x6_Float: + blockDim.width = 8; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr8x8_Unorm: + case ChNumFormat::AstcLdr8x8_Srgb: + case ChNumFormat::AstcHdr8x8_Float: + blockDim.width = 8; + blockDim.height = 8; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x5_Unorm: + case ChNumFormat::AstcLdr10x5_Srgb: + case ChNumFormat::AstcHdr10x5_Float: + blockDim.width = 10; + blockDim.height = 5; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x6_Unorm: + case ChNumFormat::AstcLdr10x6_Srgb: + case ChNumFormat::AstcHdr10x6_Float: + blockDim.width = 10; + blockDim.height = 6; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x8_Unorm: + case ChNumFormat::AstcLdr10x8_Srgb: + case ChNumFormat::AstcHdr10x8_Float: + blockDim.width = 10; + blockDim.height = 8; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr10x10_Unorm: + case ChNumFormat::AstcLdr10x10_Srgb: + case ChNumFormat::AstcHdr10x10_Float: + blockDim.width = 10; + blockDim.height = 10; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr12x10_Unorm: + case ChNumFormat::AstcLdr12x10_Srgb: + case ChNumFormat::AstcHdr12x10_Float: + blockDim.width = 12; + blockDim.height = 10; + blockDim.depth = 1; + break; + case ChNumFormat::AstcLdr12x12_Unorm: + case ChNumFormat::AstcLdr12x12_Srgb: + case ChNumFormat::AstcHdr12x12_Float: + blockDim.width = 12; + blockDim.height = 12; + blockDim.depth = 1; + break; + default: + // This function should not be called on a non-compressed format. 
+ PAL_ASSERT_ALWAYS(); + break; + } + + return blockDim; +} + +/// Convert a compressed format block coordinate to texels. +/// +/// @param [in] format Format. +/// @param [in] width Block width. +/// @param [in] height Block height. +/// @param [in] depth Block depth. +/// +/// @returns Structure containing the texel width, height and depth +inline Extent3d CompressedBlocksToTexels( + ChNumFormat format, + uint32 width, + uint32 height, + uint32 depth) +{ + Extent3d dims = CompressedBlockDim(format); + dims.width *= width; + dims.height *= height; + dims.depth *= depth; + return dims; +} + +/// Convert a compressed format texel coordinate to blocks. +/// +/// @param [in] format Format. +/// @param [in] width Texel width. +/// @param [in] height Texel height. +/// @param [in] depth Texel depth. +/// +/// @returns Structure containing the block width, height and depth +inline Extent3d CompressedTexelsToBlocks( + ChNumFormat format, + uint32 width, + uint32 height, + uint32 depth) +{ + Extent3d dims = CompressedBlockDim(format); + dims.width = Util::RoundUpQuotient(width, dims.width); + dims.height = Util::RoundUpQuotient(height, dims.height); + dims.depth = Util::RoundUpQuotient(depth, dims.depth); + return dims; +} + +/// Queries the number of bits in a pixel or element for the given format. +/// +/// @param format The format to query for. +/// +/// @return The number of bits per pixel for the given channel format. +inline uint32 BitsPerPixel( + ChNumFormat format) +{ + return FormatInfoTable[static_cast(format)].bitsPerPixel; +} + +/// Queries the number of bytes in a pixel or element for the given format. +/// +/// @param format The format to query for. +/// +/// @return The number of bytes per pixel for the given channel format (BitsPerPixel() >> 3, i.e. rounded down to whole bytes). +inline uint32 BytesPerPixel( + ChNumFormat format) +{ + return (BitsPerPixel(format) >> 3); +} + +/// Checks if the specified channel swizzle is allowed with the given format.
+/// +/// @param [in] format The pixel format to check against. +/// @param [in] swizzle The specified channel swizzle to check with. +/// +/// @returns True if the specified channel swizzle is valid for the given format. False otherwise. +inline bool IsValidChannelSwizzle( + ChNumFormat format, + ChannelSwizzle swizzle) +{ + const uint32 mask = ComponentMask(format); + + bool valid = false; + switch (swizzle) + { + case ChannelSwizzle::Zero: + case ChannelSwizzle::One: + valid = true; + break; + case ChannelSwizzle::X: + valid = ((mask & ChannelFlags::X) != 0); + break; + case ChannelSwizzle::Y: + valid = ((mask & ChannelFlags::Y) != 0); + break; + case ChannelSwizzle::Z: + valid = ((mask & ChannelFlags::Z) != 0); + break; + case ChannelSwizzle::W: + valid = ((mask & ChannelFlags::W) != 0); + break; + default: + PAL_NEVER_CALLED(); + break; + } + + return valid; +} + +/// Queries the per-component bit counts for a particular format. +/// +/// @param [in] format The format to query for. +/// +/// @returns The corresponding component bit counts for the specified format. Returned as an array of four counts. +inline const uint32* ComponentBitCounts( + ChNumFormat format) +{ + return &FormatInfoTable[static_cast(format)].bitCount[0]; +} + +/// Determines the maximum bit-count of any component in the format. +/// +/// @param [in] format The channel format to query for. +/// +/// @returns The maximum bit-count of any component in the format. +inline uint32 MaxComponentBitCount( + ChNumFormat format) +{ + const FormatInfo& info = FormatInfoTable[static_cast(format)]; + + return Util::Max(Util::Max(info.bitCount[0], info.bitCount[1]), Util::Max(info.bitCount[2], info.bitCount[3])); +} + +/// Checks if the specified format is one of the block-compressed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is block-compressed. False otherwise.
+inline bool IsBlockCompressed( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & BlockCompressed) != 0); +} + +/// Checks if the specified format is one of the macro-pixel-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is macro-pixel-packed. False otherwise. +inline bool IsMacroPixelPacked( + ChNumFormat format) +{ + return ((FormatInfoTable[static_cast(format)].properties & MacroPixelPacked) != 0); +} + +/// Checks if the specified format is one of the rgb macro-pixel-packed ones. +/// +/// @param [in] format The format to check. +/// +/// @returns True if the specified format is macro-pixel-packed and is not a YUV format. False otherwise. +inline bool IsMacroPixelPackedRgbOnly( + ChNumFormat format) +{ + return (IsMacroPixelPacked(format) && (IsYuv(format) == false)); +} + +/// Returns the base-2 logarithm of the subsampling ratio between the luma plane and chroma plane(s) of a YUV planar +/// format. The dimensions of the luma plane should be right-shifted by these amounts to determine the dimensions of the +/// chroma plane(s). +/// +/// @param [in] format Format. +/// @param [in] plane Image plane to query for. +/// +/// @returns Corresponding scaling factors between the luma plane and chroma plane(s). +inline Extent3d Log2SubsamplingRatio( + ChNumFormat format, + uint32 plane) +{ + // All planes for formats which are not YUV planar, and the 0th plane of a YUV planar format (the luma plane) are + // sampled at full rate, so the ratio is { log2(1), log2(1), log2(1) }, which equates to { 0,0,0 }. + Extent3d ratio = { }; + + if (IsYuvPlanar(format) && (plane != 0)) + { + PAL_ASSERT((plane == 1) || (plane == 2)); + switch (format) + { + // 4:4:4 formats have the same number of samples in every direction.
+ case ChNumFormat::P412: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_444P10: + case ChNumFormat::YUV_444P12: + case ChNumFormat::YUV_444P16: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + case ChNumFormat::YV24: + case ChNumFormat::NV24: + case ChNumFormat::P410: + case ChNumFormat::P416: +#endif + break; + // 4:2:0 formats have 1/2 as many samples in both the horizontal and vertical directions. + case ChNumFormat::YV12: + case ChNumFormat::NV12: + case ChNumFormat::NV21: + case ChNumFormat::P010: + case ChNumFormat::P012: + case ChNumFormat::P016: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_420P10: + case ChNumFormat::YUV_420P12: + case ChNumFormat::YUV_420P16: +#endif + ratio.width = 1; // right-shift amount: log2(1/2) = -1, stored as the positive shift 1 + ratio.height = 1; + break; + // 4:2:2 formats have 1/2 as many samples in the horizontal direction, and the same number of samples + // in the vertical direction. + case ChNumFormat::P208: + case ChNumFormat::P210: + case ChNumFormat::P212: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 923 + case ChNumFormat::P216: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 924 + case ChNumFormat::YUV_422P10: + case ChNumFormat::YUV_422P12: + case ChNumFormat::YUV_422P16: +#endif +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 925 + case ChNumFormat::YV16: +#endif + ratio.width = 1; + break; + // 4:1:1 formats have 1/4 as many samples in the horizontal direction, and the same number of samples + // in the vertical direction. + case ChNumFormat::NV11: + ratio.width = 2; // right-shift amount: log2(1/4) = -2, stored as the positive shift 2 + break; + default: + PAL_NEVER_CALLED(); // Did we miss a new YUV planar format? + break; + } + } + + return ratio; +} + +/// Converts a linearly-scaled color value to gamma-corrected sRGB. +/// +/// @param [in] linear Linear color value +/// +/// @returns Gamma-corrected sRGB color value +extern float LinearToGamma(float linear); + +/// Converts a gamma-corrected sRGB color value to linear color space.
+/// +/// @param [in] gammaCorrectedVal Gamma-corrected sRGB color value +/// +/// @returns Linear color value +extern float GammaToLinear(float gammaCorrectedVal); + +/// Checks to see if a given format is a MM format +/// +/// +/// @returns bool is it an MM format +extern bool IsMmFormat(ChNumFormat format); + +/// Checks to see if a given format is a MM12 format +/// +/// +/// @returns bool is it an MM12 format +extern bool IsMm12Format(ChNumFormat format); + +/// Checks to see if a given format is a MM10 format +/// +/// +/// @returns bool is it an MM10 format +extern bool IsMm10Format(ChNumFormat format); + +} // Formats +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h index cd6ba2524a..17d7093043 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemory.h @@ -1,717 +1,711 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palGpuMemory.h - * @brief Defines the Platform Abstraction Library (PAL) IGpuMemory interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" - -#if defined(_WIN32) -struct _SECURITY_ATTRIBUTES; -#endif - -namespace Pal -{ - -// Forward declarations. -class IGpuMemory; -class IDevice; -class IImage; -enum class VaRange : uint32; - -/// Specifies Base Level priority per GPU memory allocation as a hint to the memory manager in the event it needs to -/// select allocations to page out of their preferred heaps. -enum class GpuMemPriority : uint32 -{ - Unused = 0x0, ///< Indicates that the allocation is not currently being used at all, and should be the first - /// choice to be paged out. - VeryLow = 0x1, ///< Lowest priority to keep in its preferred heap. - Low = 0x2, ///< Low priority to keep in its preferred heap. - Normal = 0x3, ///< Normal priority to keep in its preferred heap. - High = 0x4, ///< High priority to keep in its preferred heap (e.g., render targets). - VeryHigh = 0x5, ///< Highest priority to keep in its preferred heap. Last choice to be paged out (e.g., page - /// tables, displayable allocations). 
- Count -}; - -/// Specifies a finer granularity to the base Level priority per GPU memory allocation as a hint to the memory manager -/// in the event it needs to select allocations to page out of their preferred heaps. -enum class GpuMemPriorityOffset : uint32 -{ - Offset0 = 0x0, ///< Same priority as Base Level - Offset1 = 0x1, ///< Next priority from Base Level - Offset2 = 0x2, ///< Next priority from Base Level. - Offset3 = 0x3, ///< Next priority from Base Level. - Offset4 = 0x4, ///< Next priority from Base Level. - Offset5 = 0x5, ///< Next priority from Base Level. - Offset6 = 0x6, ///< Next priority from Base Level - Offset7 = 0x7, ///< Highest priority from Base Level - Count -}; - -/// Speicfies access mode for unmapped pages in a virtual Gpu Memory. -enum class VirtualGpuMemAccessMode : uint32 -{ - Undefined = 0x0, ///< Used in situations where no special accessMode needed. - NoAccess = 0x1, ///< All accesses of unmapped pages will trigger a GPU page fault. - ReadZero = 0x2, ///< Reads of unmapped pages return zero, and writes are discarded. - Count -}; - -/// Controls the behavior of this allocation with respect to the MALL. -enum class GpuMemMallPolicy : uint32 -{ - Default = 0x0, ///< MALL policy is decided by the driver. - Never = 0x1, ///< This allocation is never put through the MALL. - Always = 0x2, ///< This allocation is always put through the MALL. -}; - -/// Bitmask of cases where RPM view memory accesses will bypass the MALL. -enum RpmViewsBypassMall : uint32 -{ - RpmViewsBypassMallOff = 0x0, ///< Disable MALL bypass - RpmViewsBypassMallOnRead = 0x1, ///< Skip MALL for read access of views created in RPM - RpmViewsBypassMallOnWrite = 0x2, ///< Skip MALL for write access of views created in RPM - RpmViewsBypassMallOnCbDbWrite = 0x4, ///< Control the RPM CB/DB behavior - -}; - -/// Used for specifying a subregion of the allocation as having a different mall policy from the rest of the -/// allocation. 
-struct GpuMemMallRange -{ - uint32 startPage; ///< Starting 4k page that will obey the specified mallPolicy. - uint32 numPages; ///< Number of 4k pages that will obey the specified mallPolicy. -}; - -/// Specifies flags for @ref IGpuMemory creation. -union GpuMemoryCreateFlags -{ - struct - { - uint64 virtualAlloc : 1; ///< Create a _virtual_ as opposed to _real_ GPU memory allocation. - /// Only VA space will be allocated, and pages must be mapped via - /// IQueue::RemapVirtualMemoryPages(). - uint64 shareable : 1; ///< Memory can be shared between devices in the same process that - /// report the sharedMemory flag from - /// IDevice::GetMultiGpuCompatibility(). - uint64 interprocess : 1; ///< Memory will be visible to other processes - /// (they may choose to open it). - uint64 presentable : 1; ///< Memory can be bound to an image that will be used by presents. - uint64 flippable : 1; ///< Memory can be bound to an image that will be used by flip - /// presents. - uint64 stereo : 1; ///< Memory will be used for stereo (DXGI or AQBS stereo). - uint64 globallyCoherent : 1; ///< Memory needs to be globally coherent, - /// indicating the driver must manage both - /// CPU caches and GPU caches that are not flushed on - /// command buffer boundaries. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 - uint64 xdmaBuffer : 1; ///< GPU memory will be used for an XDMA cache buffer for - /// transferring data -#else - uint64 reserved1 : 1; ///< Delete this bit when the MAJOR_VERSION backcompat is removed. -#endif - /// between GPUs in a multi-GPU configuration. - uint64 turboSyncSurface : 1; ///< The memory will be used for TurboSync private swapchain primary. - uint64 typedBuffer : 1; ///< GPU memory will be permanently considered a single - /// typed buffer pseudo-object - /// with the properties given in typedBufferInfo. - uint64 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. 
- uint64 useReservedGpuVa : 1; ///< Use GPU virtual address previously reserved by another - /// memory object. It is invalid when using the shadow descriptor - /// table VA range. - uint64 autoPriority : 1; ///< Allow the platform to automatically determine the priority of - /// this GPU memory allocation. Flag is only valid if the device - /// reports that it supports this feature, and will result in an - /// error otherwise. - uint64 busAddressable : 1; ///< Create Bus Addressable memory. Allow memory to be used by other - /// device on the PCIe bus by exposing a write-only bus address. - uint64 sdiExternal : 1; ///< Create External Physical memory from an already allocated memory - /// on remote device. Similar to virtual allocations (no physical - /// backing) but have an immutable page mapping. The client must - /// specify surfaceBusAddr and markerBusAddr either at creation time - /// in GpuMemoryCreateInfo or by calling SetSdiRemoteBusAddress - /// once before using the GPU memory. The page mappings for an - /// allocation with this flag set must be initialized by including a - /// reference to it in the ppExternPhysMem list for the first - /// submission that references it. - uint64 sharedViaNtHandle : 1; ///< Memory will be shared by using Nt handle. - uint64 peerWritable : 1; ///< The memory can be open as peer memory and be writable. - uint64 tmzProtected : 1; ///< The memory is protected using TMZ (Trusted Memory Zone) or HSFB - /// (Hybrid Secure Framebuffer). It is not CPU accessible, - /// and GPU access is restricted by the hardware such that data - /// cannot be copied from protected memory into unprotected memory. - uint64 placeholder0 : 1; ///< Placeholder. - uint64 externalOpened : 1; ///< Specifies the GPUMemory is opened. - uint64 restrictedContent : 1; ///< Specifies the GPUMemory is protected content. - uint64 restrictedAccess : 1; ///< Specifies the GPUMemory is restricted shared access resource. 
- uint64 crossAdapter : 1; ///< Specifies the GPUMemory is shared cross-adapter resource. - uint64 cpuInvisible : 1; ///< By default, PAL makes every allocation CPU visible if all of its - /// preferred - ///< heaps are CPU visible. This flag can be used to override this - /// behavior when the client knows the memory will never be mapped - /// for CPU access. If this flag is set, calls to IGpuMemory::Map() - /// on this object will fail. - uint64 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. - /// But the memory still would be cached by other cache hierarchy - /// like L0, RB caches, L1, and L3. - uint64 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the - /// MALL. If this is set, then the mallPolicy enumeration must be set - /// to either "always" or "never". - uint64 explicitSync : 1; ///< If set, shared memory will skip syncs in the kernel and all - /// drivers that use this memory must handle syncs explicitly. - uint64 privPrimary : 1; ///< This is a private primary surface gpu memory. - uint64 privateScreen : 1; ///< GPU memory will be used for a private screen image. - uint64 kmdShareUmdSysMem : 1; ///< UMD will allocate/free a memory buffer to be shared with KMD. - uint64 deferCpuVaReservation : 1; ///< KMD will allocate with the "CpuVisibleOnDemand" alloc flag. - /// Ignored for non-CPU-visible allocations. - uint64 placeholder1 : 1; - uint64 startVaHintFlag : 1; ///< startVaHintFlag is set to 1 for passing startVaHint address - /// to set baseVirtAddr as startVaHint for memory allocation. -#if PAL_AMDGPU_BUILD - uint64 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes - /// the allocation upon creation, currently, only GpuHeapLocal and - /// GpuHeapInvisible are supported. - uint64 discardable : 1; ///< If set, this gpu memory object can be discarded under memory - /// pressure without keeping the content. 
-#else - uint64 placeholder2 : 2; -#endif - uint64 directCaptureSource : 1; ///< Memory will be mapped to DirectCapture resource's KMD-managed - /// private VA. - uint64 videoEncoder : 1; ///< Video encoder output butffer stream. - uint64 videoDecoder : 1; ///< Video decoder input butffer stream. - uint64 reserved : 26; ///< Reserved for future use. - }; - uint64 u64All; ///< Flags packed as 64-bit uint. -}; - -/// Specifies properties of a typed buffer pseudo-object. When this is specified in GpuMemoryCreateInfo along with the -/// typedBuffer flag, the GPU memory object has been permanently cast as a single typed buffer. A typed buffer is very -/// similar to a linear 3D image: it has a format, extent, and row/depth pitch values. -/// -/// Note that the typed buffer concept is used in other parts of the PAL interface and some of those instances may not -/// require a permanent typed buffer association. In such cases multiple typed buffers can be "bound" to one GPU memory -/// object at arbitrary offsets without any need to set the typedBuffer flag or fill out a TypedBufferCreateInfo. -struct TypedBufferCreateInfo -{ - SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. - Extent3d extent; ///< Dimensions in pixels WxHxD. - uint32 rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - uint32 depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. - bool depthIsSubres; ///< True if the depth slices should be treated as an array of 2D subresources. -}; - -/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateGpuMemory(). -/// -/// See the @ref IGpuMemory for additional restrictions on the size, alignment, vaRange, and descrVirtAddr fields. -struct GpuMemoryCreateInfo -{ - GpuMemoryCreateFlags flags; ///< GPU memory flags. - gpusize size; ///< Amount of GPU memory to allocate in bytes. 
- gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the - /// allocation granularity will be used. - VaRange vaRange; ///< Virtual address range for the GPU memory allocation. - - union - { - const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to the - /// memory object which previously reserved the GPU VA range to be used - /// by the new memory object. - gpusize descrVirtAddr; ///< Must be zero unless vaRange is ShadowDescriptorTable, in which case - /// it must specify the GPU VA of the corresponding DescriptorTable. - /// It doesn't need to be the base VA of the DescriptorTable allocation - /// but must be aligned to "alignment". - gpusize replayVirtAddr; ///< Must be zero unless vRange is CaptureReplay, in which case it must - /// specify the GPU VA of the corresponding memory object. - gpusize startVaHint; ///< Client passes a start VA hint to set as baseVirtAddr. If the given - /// hint is not properly aligned, find next higher aligned address as - /// hint. If the hint is available and within right vaRange where - /// vaRange is VaRange::Default then set baseVirtAddr as hint. If the - /// hint is unavailable, find the higher available address between - /// startVaHint and max vaRange. If any of the two cases are failed, - /// set baseVirtAddr as normal. - }; - - GpuMemPriority priority; ///< Hint to the OS paging process on how important it is to keep this - /// allocation in its preferred heap. - GpuMemPriorityOffset priorityOffset; ///< Offset from the base level priority. A higher offset means higher priority - /// within same base Level. Currently supported on Windows only. - GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via the - /// MALL (memory access last level). Only valid if "supportsMall" is set in - /// DeviceProperties. 
- GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. - /// Any pages outside of this range will use the opposite MALL policy from - /// what is specified in "mallPolicy". - - /// Describes how the allocation will be accessed. If heapAccess is set to something other than @ref - /// GpuHeapAccessExplicit, then PAL decides the appropriate heap to allocate memory from based on this member and - /// @ref heaps is ignored. Otherwise heap selection respects the selection in @ref heaps. - GpuHeapAccess heapAccess; - uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. - GpuHeap heaps[GpuHeapCount]; ///< List of allowed memory heaps, in order of preference. It will be ignored if - /// @ref heapAccess is to something other than @ref GpuHeapAccessExplicit. - - /// The pointer to an Image object the memory object will be bound to. It must be specified in special cases where - /// a memory object is permanently linked to an Image such as presentable images or shared resources on Windows. - /// For other cases, it's highly encouraged to provide the image object pointer. - /// - /// When @ref compression is set to Default, clients can benefit from PAL's internal logic to choose the optimal - /// GFX12-style distributed compression setting (enabled or disabled) based on properties of this image (e.g., usage - /// flags); otherwise PAL will disable the compression by default when detecting nullptr @pImage. - IImage* pImage; - - TypedBufferCreateInfo typedBufferInfo; ///< This struct must be filled out if the @tref typedBuffer flag is set. - /// This GPU memory will be permanently considered a typed buffer. - - VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages, WDDM only. - gpusize surfaceBusAddr; ///< Surface bus address of Bus Addresable Memory. - /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. 
- gpusize markerBusAddr; ///< Marker bus address of Bus Addresable Memory. The client can: - /// 1. Write to marker - /// 2. Let GPU wait until a value is written to marker before issuing - /// the next command. - /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. - - /// Client override for GFX12-style distributed compression. Only meaningful on devices that set the - /// supportDistributedCompression DeviceProperties flag. By default, PAL will apply a heuristic to determine - /// whether or not to enable compression based on properties of the memory allocation (e.g., CPU-visibility or - /// properties of the attached pImage). This mode does not apply to virtual IGpuMemory objects (such object - /// inherit their compression behavior from their backing physical memory on a page-by-page basis). - /// - /// @note If allocating memory intended to back multiple resources (e.g., a heap for DX12 "placed resources") where - /// pImage is nullptr, the client should set compression to Enable then rely on per-resource or per-view controls to - /// disable compression for resources as needed (use compressionMode fields in ImageCreateInfo, BufferViewInfo, - /// ImageViewInfo, etc.). - /// - /// If clients intends to enable distributed compression on buffers, buffer base alignment and size should be - /// aligned to DCC minimum compression unit size (256 bytes); otherwise there is potential corruption issue. - /// e.g. An allocation holding two buffers (placed resource or driver suballocation). Suppose the first buffer has - /// size 4Kibytes+128 bytes with compression enabled and the second buffer has size 128bytes with compression - /// disabled (buffer view compressionMode is ReadBypassWriteDisable). When compressing the trailing 128bytes of - /// the first buffer, it will compress the second buffer as well. Reading the second buffer will result in corrupted - /// content. - TriState compression; -}; - -/// Specifies properties for @ref IGpuMemory creation. 
Input structure to IDevice::CreatePinnedGpuMemory(). -/// -/// See the @ref IGpuMemory for additional restrictions on the size and vaRange fields. -struct PinnedGpuMemoryCreateInfo -{ - const void* pSysMem; ///< Pointer to the system memory that should be pinned for GPU access. Must be - /// aligned to realMemAllocGranularity in DeviceProperties. - size_t size; ///< Amount of system memory to pin for GPU access. - VaRange vaRange; ///< Virtual address range for the GPU memory allocation. - gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the - /// Platform's allocation granularity will be used. - GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be - /// accessed via the MALL (memory access last level). Only valid - /// if "supportsMall" is set in DeviceProperties. - GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive - /// is set. Any pages outside of this range will use the opposite - /// MALL policy from what is specified in "mallPolicy". -}; - -/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateSvmGpuMemory(). -/// -/// See the @ref IGpuMemory for additional restrictions on the size and alignment. -struct SvmGpuMemoryCreateInfo -{ - GpuMemoryCreateFlags flags; ///< GPU memory flags. - gpusize size; ///< Amount of SVM memory to allocate in bytes. - /// The total amount of SVM memory can't exceed the value set in - /// maxSvmSize when the platform is created. - gpusize alignment; ///< Byte alignment of the allocation's SVM VA. If zero, an - /// alignment matching the allocation granularity will be used. - const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to - /// the memory object which previously reserved the GPU VA range - /// to be used by the new memory object. - bool isUsedForKernel; ///< Memory will be used to store kernel and execute on gpu. 
- GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be - /// accessed via the MALL (memory access last level). Only valid - /// if "supportsMall" is set in DeviceProperties. - GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive - /// is set. Any pages outside of this range will use the opposite - /// MALL policy from what is specified in "mallPolicy". -}; - -/// Specifies parameters for opening a shared GPU memory object on another device. -struct GpuMemoryOpenInfo -{ - IGpuMemory* pSharedMem; ///< Shared GPU memory object from another device to open. -}; - -/// Specifies parameters for opening a GPU memory object on another device for peer-to-peer memory transfers. -struct PeerGpuMemoryOpenInfo -{ - IGpuMemory* pOriginalMem; ///< GPU memory object from another device to open for peer-to-peer memory transfers. -}; - -/// Specifies parameters for opening another non-PAL device's gpu memory for access from this device. Input structure to -/// IDevice::OpenExternalSharedGpuMemory(). -struct ExternalGpuMemoryOpenInfo -{ - ExternalResourceOpenInfo resourceInfo; ///< Information describing the external gpuMemory. - TypedBufferCreateInfo typedBufferInfo; ///< Information describing the typed buffer information. - GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via - /// the MALL (memory access last level). Only valid if "supportsMall" is - /// set in DeviceProperties. - GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. - /// Any pages outside of this range will use the opposite MALL policy - /// from what is specified in "mallPolicy". - union - { - struct - { - uint32 typedBuffer : 1; ///< GPU memory will be permanently considered a single typed buffer pseudo-object - /// with the properties given in typedBufferInfo. 
- uint32 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. - uint32 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the MALL. - /// If this is set, then the mallPolicy enumeration must be set to either - /// "always" or "never". - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< External Gpu memory open info flags. -}; - -/// The fundemental information that describes a GPU memory object that is stored directly in each IGpuMemory. -/// It can be accessed without a virtual call via IGpuMemory::Desc(). -struct GpuMemoryDesc -{ - gpusize gpuVirtAddr; ///< GPU virtual address of the GPU memory allocation. - gpusize size; ///< Size of the GPU memory allocation, in bytes. - gpusize clientSize; ///< Size of the client requested GPU memory allocation, in bytes. - gpusize alignment; ///< Required GPU virtual address alignment, in bytes. - uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. - GpuHeap heaps[GpuHeapCount]; ///< List of preferred memory heaps, in order of preference. - gpusize surfaceBusAddr; ///< Bus Address of SDI memory surface and marker. These will not be initialized - gpusize markerBusAddr; /// until the memory is made resident. Client needs to call - /// InitBusAddressableGpuMemory() to query and update before this is valid. - union - { - struct - { - uint32 isVirtual : 1; ///< GPU memory is not backed by physical memory and must be remapped before the - /// GPU can safely access it. Will also be set for sdiExternal allocations. See - /// GpuMemoryCreateFlags::sdiExternal - uint32 isPeer : 1; ///< GPU memory object was created with @ref IDevice::OpenPeerGpuMemory. - uint32 isShared : 1; ///< GPU memory object was created either with - /// @ref IDevice::OpenExternalSharedGpuMemory or OpenSharedGpuMemory. 
- /// This IGpuMemory references memory created either by another process or - /// another device with the exception of peer access. - uint32 isExternal : 1; ///< GPU memory object was created with @ref IDevice::OpenExternalSharedGpuMemory. - /// This IGpuMemory references memory that was created either by another process - /// or by a device that doesn't support sharedMemory with this object's device - /// (i.e., MDA sharing on Windows). - uint32 isSvmAlloc : 1; ///< GPU memory is allocated in system memory. - /// Valid only when IOMMUv2 is supported - uint32 isExecutable : 1; ///< GPU memory is used for execution. Valid only when IOMMUv2 is supported - uint32 isExternPhys : 1; ///< GPU memory is External Physical memory - - uint32 placeholder0 : 1; ///< Reserved for future memory flag - - uint32 isCompressed : 1; ///< Set for physical allocations where UMD requested PTE.D=1 to enable - /// GFX12-style distributed compression. - uint32 reserved : 23; ///< Reserved for future use - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory desc flags. - - uint64 uniqueId; ///< Unique ID given to each GPU memory object, allows client tracking of GPU memory allocations. -}; - -/// Defines GPU memory sub allocation info. Contains a GPU memory handle to the whole memory. And the offset and size -/// shows where is the sub allocated memory. -struct GpuMemSubAllocInfo -{ - gpusize address; ///< Start address of the memory, not including the offset. - gpusize offset; ///< Offset from the start address of the memory. - gpusize size; ///< Size of the memory. -}; - -/// Specifies a GPU memory object and flags with more specific usage details. An array of these structures is specified -/// to PAL residency operations. -/// -/// @see IDevice::AddGpuMemoryReferences -/// @see IQueue::Submit -struct GpuMemoryRef -{ - union - { - struct - { - uint32 readOnly : 1; ///< The allocation will not be written using this reference. 
- uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory reference flags. - - IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation. -}; - -/// Specifies a Display Output Post-Processing (DOPP) allocation that will be referenced by a submission along with -/// additional info describing how it will be used. -/// -/// @see IQueue::Submit -struct DoppRef -{ - union - { - struct - { - uint32 pfpa : 1; ///< Access to this DOPP allocation will be redirected to the primary pending - /// present (i.e., pre-flip primary access). If not set, access will - /// refer to the current onscreen primary. - uint32 lastPfpaCmd : 1; ///< This submission will be the last access of this pfpa allocation - /// for this frame. The pfpa interval will end once this submit - /// completes, allowing the corresponding vidPnSource to flip. - /// This flag is invalid if the pfpa flag is not set. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< GPU memory reference flags. - - IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation. -}; - -/// Specifies the types of the exporting memory. -enum class ExportHandleType : uint32 -{ - Default = 0, ///< Let PAL choose the export type -#if PAL_AMDGPU_BUILD - FileDescriptor, ///< Export using a Linux file descriptor - Kms, ///< Export through KMS -#endif -}; - -/// Specifies parameters for export a GPUMemory NT handle from its name. -struct GpuMemoryExportInfo -{ -#if PAL_KMT_BUILD - const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable - /// attribute. - const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT - /// handle with a name, and then the handle can be acquired - /// via this name. 
- uint32 accessFlags; ///< Desried access rights of GPU memory. -#endif - ExportHandleType exportType; ///< Type of handle to use for exporting the memory. -}; - -/** - *********************************************************************************************************************** - * @interface IGpuMemory - * @brief Interface representing a GPU-accessible memory allocation. - * - * Depending on creation parameters, this could correspond to: - * - * + A _real_ memory object, corresponding directly to a physical allocation made on this device (whether it resides in - * a local or non-local heap). - * + A _virtual_ memory object, only consisting of virtual address space that can be mapped on a page basis to pages in - * _real_ memory objects via IQueue::RemapVirtualMemoryPages. - * + Pinned memory, a _real_ memory object created by pinning down client system memory. - * + Peer memory, a _real_ memory object corresponding to GPU memory that is likely local to another GPU. Only copy - * operations (peer-to-peer transfers) are allowed with this memory. - * + Opened/shared memory, a _real_ memory object that is fully shared between multiple GPUs, residing in a non-local - * heap. - * + External shared memory, a _real_ memory object that was created by an external process and is fully shared between - * multiple GPUs. - * - * @see IDevice::CreateGpuMemory - * @see IDevice::CreatePinnedGpuMemory - * @see IDevice::OpenSharedGpuMemory - * @see IDevice::OpenPeerGpuMemory - * @see IDevice::OpenExternalSharedGpuMemory - * - * - * All of these kinds of GPU memory are assigned a set of fundemental properties specified in GpuMemoryDesc which are - * either specified by the client or by PAL. There are specific rules these properties must follow; those rules are - * documented here to avoid duplication. Violating these rules will cause the device's corresponding "get size" - * functions to return an error code, the create/open functions may not validate their arguments. 
- * - * - * With the exception of external memory objects being opened, PAL will adjust size and base alignments as necessary - * to meet device requirements. Typically this means going out to OS page boundaries. The client is no longer required -* to query device requirements and align for PAL. - * - * Note that the device alignment requirements apply equally to GPU VAs. However, other kinds of alignment - * restrictions (e.g., IGpuMemoryBindable's requirements) may only apply to one of those two properties. When creating - * GPU memory objects the client must be careful to set the "alignment" field to the alignment of the GPU VA. - * - * - * Second, the client can't directly specify a memory object's GPU VA but must specify its VA range, limiting which - * portions of the VA space can be used. Note that non-external shared and peer GPU memory objects will use the - * original memory's VA range. External shared GPU memory always uses the default VA range. - * - * The ShadowDescriptorTable VA range is special because it pairs the shadow GPU memory to an existing descriptor GPU - * memory. The client must specify the GPU VA of the corresponding DescriptorTable memory when creating a shadow GPU - * memory object via descrVirtAddr; it must satisfy the alignment requirements of the shadow GPU memory. Both GPU - * memory objects must be created on the same device. Note that descrVirtAddr can be offset into the descriptor - * allocation such that multiple shadow GPU memory objects correspond to one larger descriptor GPU memory object. - * - * - * The client can further influence the GPU VA of shared and peer GPU memory objects. If the globalGpuVa flag is set - * when the original GPU memory object is created, PAL will assign any shared or peer GPU memory objects that same VA. - * Note that globalGpuVa is only supported if globalGpuVaSupport is set in DeviceProperties. 
- *********************************************************************************************************************** - */ -class IGpuMemory : public IDestroyable -{ -public: - /// Sets a new priority for this GPU memory object. - /// - /// This call is not available for virtual or pinned memory. - /// - /// @param [in] priority New base priority for the GPU memory object. - /// - /// @param [in] priorityOffset New priority offset for the GPU memory object. This is a small bias that can be - /// used by the OS to raise the importance of an allocation when there are - /// multiple allocations in the same base priority level. You can think of it as - /// the fractional bits of the priority level. - /// - /// @returns Success if the priority was successfully updated. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnavailable if this is a virtual or pinned GPU memory object. - virtual Result SetPriority( - GpuMemPriority priority, - GpuMemPriorityOffset priorityOffset) = 0; - - /// Makes the GPU memory available for CPU access and gives the client a pointer to reference it. - /// - /// The allocation should be unmapped by the client once CPU access is complete, although it _is_ legal to keep an - /// allocation mapped while the GPU references the allocation from a command buffer. - /// - /// It is legal to map the allocation multiple times concurrently. Mapping is not available for pinned or virtual - /// memory objects. This call is thread safe for calls referencing this memory object. - /// - /// @see Unmap. - /// - /// @param [out] ppData CPU pointer to the GPU memory object. - /// - /// @returns Success if the map succeeded. Otherwise, *ppData will not be valid and one of the following errors may - /// be returned. - /// + ErrorInvalidPointer if ppData is null. - /// + ErrorGpuMemoryMapFailed if the object is busy and cannot be mapped by the OS. 
- /// + ErrorNotMappable if the memory object cannot be mapped due to some of its heaps not having the CPU - /// visible flag set. - /// + ErrorUnavailable if the memory object is not a real allocation. - virtual Result Map( - void** ppData) = 0; - - /// Removes CPU access from a previously mapped GPU memory object. - /// - /// This call is thread safe for calls referencing the same memory object. - /// - /// @see Map - /// - /// @returns Success if the unmap succeeded. Otherwise, one of the following errors may be returned: - /// + ErrorGpuMemoryUnmapFailed if the GPU memory object cannot be unlocked. - /// + ErrorUnavailable if the GPU memory object is not a real allocation. - virtual Result Unmap() = 0; - -#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD - /// Returns an OS-specific handle which can be used to refer to this GPU memory object across processes. This will - /// return a null or invalid handle if the object was not created with the @ref interprocess create flag set. - /// - /// @note This function is only available for Linux builds or KMT builds. - /// - /// @param [in] handleInfo The info is used to open handle. - /// - /// @returns An OS-specific handle which can be used to access the GPU memory object across processes. - virtual OsExternalHandle ExportExternalHandle(const GpuMemoryExportInfo& exportInfo) const = 0; -#endif - - /// Returns a structure containing some fundemental information that describes this GPU memory object. - /// - /// @returns A reference to this allocation's GpuMemoryDesc. - const GpuMemoryDesc& Desc() const { return m_desc; } - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. 
- /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - - /// Set SDI remote surface bus address and marker bus address. - /// - /// This GPU memory object must have been created with the sdiExternal flag set and with the GpuMemoryCreateInfo - /// surfaceBusAddr and markerBusAddr fields both set to zero. This function allows clients to defer setting those - /// addresses until after creation. It must be called exactly once to permanently bind the given SDI addresses to - /// this GPU memory object. - /// - /// @warning An sdiExternal GPU memory object is not complete until its given its SDI addresses! The gpuVirtAddr - /// field in this GPU memory's GpuMemoryDesc will not be valid until this function is called! - /// - /// @param [in] surfaceBusAddr Surface bus address of Bus Addressable Memory. - /// @param [in] markerBusAddr Marker bus address of Bus Addressable Memory. The client can write to the marker - /// and have the GPU wait until a value is written to marker before continuing. - /// - /// @returns Success if succeeded. Otherwise, one of the following errors may be returned: - /// + ErrorUnavailable if the GPU memory object is not external physical memory or it has already been set. - /// + ErrorInvalidValue if one of the input params is 0. - /// + One of the escape call failed error. - virtual Result SetSdiRemoteBusAddress(gpusize surfaceBusAddr, gpusize markerBusAddr) = 0; - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IGpuMemory() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. 
- virtual ~IGpuMemory() { } - - GpuMemoryDesc m_desc; ///< Information that describes this GPU memory object. - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuMemory.h + * @brief Defines the Platform Abstraction Library (PAL) IGpuMemory interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +#if defined(_WIN32) +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Pal +{ + +// Forward declarations. +class IGpuMemory; +class IDevice; +class IImage; +enum class VaRange : uint32; + +/// Specifies Base Level priority per GPU memory allocation as a hint to the memory manager in the event it needs to +/// select allocations to page out of their preferred heaps. +enum class GpuMemPriority : uint32 +{ + Unused = 0x0, ///< Indicates that the allocation is not currently being used at all, and should be the first + /// choice to be paged out. + VeryLow = 0x1, ///< Lowest priority to keep in its preferred heap. + Low = 0x2, ///< Low priority to keep in its preferred heap. + Normal = 0x3, ///< Normal priority to keep in its preferred heap. + High = 0x4, ///< High priority to keep in its preferred heap (e.g., render targets). + VeryHigh = 0x5, ///< Highest priority to keep in its preferred heap. Last choice to be paged out (e.g., page + /// tables, displayable allocations). + Count +}; + +/// Specifies a finer granularity to the base Level priority per GPU memory allocation as a hint to the memory manager +/// in the event it needs to select allocations to page out of their preferred heaps. +enum class GpuMemPriorityOffset : uint32 +{ + Offset0 = 0x0, ///< Same priority as Base Level + Offset1 = 0x1, ///< Next priority from Base Level + Offset2 = 0x2, ///< Next priority from Base Level. 
+ Offset3 = 0x3, ///< Next priority from Base Level. + Offset4 = 0x4, ///< Next priority from Base Level. + Offset5 = 0x5, ///< Next priority from Base Level. + Offset6 = 0x6, ///< Next priority from Base Level + Offset7 = 0x7, ///< Highest priority from Base Level + Count +}; + +/// Specifies access mode for unmapped pages in a virtual Gpu Memory. +enum class VirtualGpuMemAccessMode : uint32 +{ + Undefined = 0x0, ///< Used in situations where no special accessMode needed. + NoAccess = 0x1, ///< All accesses of unmapped pages will trigger a GPU page fault. + ReadZero = 0x2, ///< Reads of unmapped pages return zero, and writes are discarded. + Count +}; + +/// Controls the behavior of this allocation with respect to the MALL. +enum class GpuMemMallPolicy : uint32 +{ + Default = 0x0, ///< MALL policy is decided by the driver. + Never = 0x1, ///< This allocation is never put through the MALL. + Always = 0x2, ///< This allocation is always put through the MALL. +}; + +/// Bitmask of cases where RPM view memory accesses will bypass the MALL. +enum RpmViewsBypassMall : uint32 +{ + RpmViewsBypassMallOff = 0x0, ///< Disable MALL bypass + RpmViewsBypassMallOnRead = 0x1, ///< Skip MALL for read access of views created in RPM + RpmViewsBypassMallOnWrite = 0x2, ///< Skip MALL for write access of views created in RPM + RpmViewsBypassMallOnCbDbWrite = 0x4, ///< Control the RPM CB/DB behavior + +}; + +/// Used for specifying a subregion of the allocation as having a different mall policy from the rest of the +/// allocation. +struct GpuMemMallRange +{ + uint32 startPage; ///< Starting 4k page that will obey the specified mallPolicy. + uint32 numPages; ///< Number of 4k pages that will obey the specified mallPolicy. +}; + +/// Specifies flags for @ref IGpuMemory creation. +union GpuMemoryCreateFlags +{ + struct + { + uint64 virtualAlloc : 1; ///< Create a _virtual_ as opposed to _real_ GPU memory allocation.
+ /// Only VA space will be allocated, and pages must be mapped via + /// IQueue::RemapVirtualMemoryPages(). + uint64 shareable : 1; ///< Memory can be shared between devices in the same process that + /// report the sharedMemory flag from + /// IDevice::GetMultiGpuCompatibility(). + uint64 interprocess : 1; ///< Memory will be visible to other processes + /// (they may choose to open it). + uint64 presentable : 1; ///< Memory can be bound to an image that will be used by presents. + uint64 flippable : 1; ///< Memory can be bound to an image that will be used by flip + /// presents. + uint64 stereo : 1; ///< Memory will be used for stereo (DXGI or AQBS stereo). + uint64 globallyCoherent : 1; ///< Memory needs to be globally coherent, + /// indicating the driver must manage both + /// CPU caches and GPU caches that are not flushed on + /// command buffer boundaries. + uint64 xdmaBuffer : 1; ///< GPU memory will be used for an XDMA cache buffer for + /// transferring data + /// between GPUs in a multi-GPU configuration. + uint64 turboSyncSurface : 1; ///< The memory will be used for TurboSync private swapchain primary. + uint64 typedBuffer : 1; ///< GPU memory will be permanently considered a single + /// typed buffer pseudo-object + /// with the properties given in typedBufferInfo. + uint64 globalGpuVa : 1; ///< The GPU virtual address must be visible to all devices. + uint64 useReservedGpuVa : 1; ///< Use GPU virtual address previously reserved by another + /// memory object. It is invalid when using the shadow descriptor + /// table VA range. + uint64 autoPriority : 1; ///< Allow the platform to automatically determine the priority of + /// this GPU memory allocation. Flag is only valid if the device + /// reports that it supports this feature, and will result in an + /// error otherwise. + uint64 busAddressable : 1; ///< Create Bus Addressable memory. Allow memory to be used by other + /// device on the PCIe bus by exposing a write-only bus address. 
+ uint64 sdiExternal : 1; ///< Create External Physical memory from an already allocated memory + /// on remote device. Similar to virtual allocations (no physical + /// backing) but have an immutable page mapping. The client must + /// specify surfaceBusAddr and markerBusAddr either at creation time + /// in GpuMemoryCreateInfo or by calling SetSdiRemoteBusAddress + /// once before using the GPU memory. The page mappings for an + /// allocation with this flag set must be initialized by including a + /// reference to it in the ppExternPhysMem list for the first + /// submission that references it. + uint64 sharedViaNtHandle : 1; ///< Memory will be shared by using Nt handle. + uint64 peerWritable : 1; ///< The memory can be open as peer memory and be writable. + uint64 tmzProtected : 1; ///< The memory is protected using TMZ (Trusted Memory Zone) or HSFB + /// (Hybrid Secure Framebuffer). It is not CPU accessible, + /// and GPU access is restricted by the hardware such that data + /// cannot be copied from protected memory into unprotected memory. + uint64 placeholder0 : 1; ///< Placeholder. + uint64 externalOpened : 1; ///< Specifies the GPUMemory is opened. + uint64 restrictedContent : 1; ///< Specifies the GPUMemory is protected content. + uint64 restrictedAccess : 1; ///< Specifies the GPUMemory is restricted shared access resource. + uint64 crossAdapter : 1; ///< Specifies the GPUMemory is shared cross-adapter resource. + uint64 cpuInvisible : 1; ///< By default, PAL makes every allocation CPU visible if all of its + /// preferred + ///< heaps are CPU visible. This flag can be used to override this + /// behavior when the client knows the memory will never be mapped + /// for CPU access. If this flag is set, calls to IGpuMemory::Map() + /// on this object will fail. + uint64 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. + /// But the memory still would be cached by other cache hierarchy + /// like L0, RB caches, L1, and L3. 
+ uint64 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the + /// MALL. If this is set, then the mallPolicy enumeration must be set + /// to either "always" or "never". + uint64 explicitSync : 1; ///< If set, shared memory will skip syncs in the kernel and all + /// drivers that use this memory must handle syncs explicitly. + uint64 privPrimary : 1; ///< This is a private primary surface gpu memory. + uint64 privateScreen : 1; ///< GPU memory will be used for a private screen image. + uint64 kmdShareUmdSysMem : 1; ///< UMD will allocate/free a memory buffer to be shared with KMD. + uint64 deferCpuVaReservation : 1; ///< KMD will allocate with the "CpuVisibleOnDemand" alloc flag. + /// Ignored for non-CPU-visible allocations. + uint64 placeholder1 : 1; + uint64 startVaHintFlag : 1; ///< startVaHintFlag is set to 1 for passing startVaHint address + /// to set baseVirtAddr as startVaHint for memory allocation. +#if PAL_AMDGPU_BUILD + uint64 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes + /// the allocation upon creation, currently, only GpuHeapLocal and + /// GpuHeapInvisible are supported. + uint64 discardable : 1; ///< If set, this gpu memory object can be discarded under memory + /// pressure without keeping the content. +#else + uint64 placeholder2 : 2; +#endif + uint64 directCaptureSource : 1; ///< Memory will be mapped to DirectCapture resource's KMD-managed + /// private VA. + uint64 reserved : 28; ///< Reserved for future use. + }; + uint64 u64All; ///< Flags packed as 64-bit uint. +}; + +/// Specifies properties of a typed buffer pseudo-object. When this is specified in GpuMemoryCreateInfo along with the +/// typedBuffer flag, the GPU memory object has been permanently cast as a single typed buffer. A typed buffer is very +/// similar to a linear 3D image: it has a format, extent, and row/depth pitch values. 
+/// +/// Note that the typed buffer concept is used in other parts of the PAL interface and some of those instances may not +/// require a permanent typed buffer association. In such cases multiple typed buffers can be "bound" to one GPU memory +/// object at arbitrary offsets without any need to set the typedBuffer flag or fill out a TypedBufferCreateInfo. +struct TypedBufferCreateInfo +{ + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + Extent3d extent; ///< Dimensions in pixels WxHxD. + uint32 rowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + uint32 depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + bool depthIsSubres; ///< True if the depth slices should be treated as an array of 2D subresources. +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size, alignment, vaRange, and descrVirtAddr fields. +struct GpuMemoryCreateInfo +{ + GpuMemoryCreateFlags flags; ///< GPU memory flags. + gpusize size; ///< Amount of GPU memory to allocate in bytes. + gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the + /// allocation granularity will be used. + VaRange vaRange; ///< Virtual address range for the GPU memory allocation. + + union + { + const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to the + /// memory object which previously reserved the GPU VA range to be used + /// by the new memory object. + gpusize descrVirtAddr; ///< Must be zero unless vaRange is ShadowDescriptorTable, in which case + /// it must specify the GPU VA of the corresponding DescriptorTable. + /// It doesn't need to be the base VA of the DescriptorTable allocation + /// but must be aligned to "alignment". 
+ gpusize replayVirtAddr; ///< Must be zero unless vaRange is CaptureReplay, in which case it must + /// specify the GPU VA of the corresponding memory object. + gpusize startVaHint; ///< Client passes a start VA hint to set as baseVirtAddr. If the given + /// hint is not properly aligned, find next higher aligned address as + /// hint. If the hint is available and within right vaRange where + /// vaRange is VaRange::Default then set baseVirtAddr as hint. If the + /// hint is unavailable, find the higher available address between + /// startVaHint and max vaRange. If any of the two cases are failed, + /// set baseVirtAddr as normal. + }; + + GpuMemPriority priority; ///< Hint to the OS paging process on how important it is to keep this + /// allocation in its preferred heap. + GpuMemPriorityOffset priorityOffset; ///< Offset from the base level priority. A higher offset means higher priority + /// within same base Level. Currently supported on Windows only. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via the + /// MALL (memory access last level). Only valid if "supportsMall" is set in + /// DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. + /// Any pages outside of this range will use the opposite MALL policy from + /// what is specified in "mallPolicy". + + /// Describes how the allocation will be accessed. If heapAccess is set to something other than @ref + /// GpuHeapAccessExplicit, then PAL decides the appropriate heap to allocate memory from based on this member and + /// @ref heaps is ignored. Otherwise heap selection respects the selection in @ref heaps. + GpuHeapAccess heapAccess; + uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. + GpuHeap heaps[GpuHeapCount]; ///< List of allowed memory heaps, in order of preference.
It will be ignored if + /// @ref heapAccess is set to something other than @ref GpuHeapAccessExplicit. + + /// The pointer to an Image object the memory object will be bound to. It must be specified in special cases where + /// a memory object is permanently linked to an Image such as presentable images or shared resources on Windows. + /// For other cases, it's highly encouraged to provide the image object pointer. + /// + /// When @ref compression is set to Default, clients can benefit from PAL's internal logic to choose the optimal + /// GFX12-style distributed compression setting (enabled or disabled) based on properties of this image (e.g., usage + /// flags); otherwise PAL will disable the compression by default when detecting nullptr @ref pImage. + IImage* pImage; + + TypedBufferCreateInfo typedBufferInfo; ///< This struct must be filled out if the @ref typedBuffer flag is set. + /// This GPU memory will be permanently considered a typed buffer. + + VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages, WDDM only. + gpusize surfaceBusAddr; ///< Surface bus address of Bus Addressable Memory. + /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. + gpusize markerBusAddr; ///< Marker bus address of Bus Addressable Memory. The client can: + /// 1. Write to marker + /// 2. Let GPU wait until a value is written to marker before issuing + /// the next command. + /// Only valid when GpuMemoryCreateFlags::sdiExternal is set. + + /// Client override for GFX12-style distributed compression. Only meaningful on devices that set the + /// supportDistributedCompression DeviceProperties flag. By default, PAL will apply a heuristic to determine + /// whether or not to enable compression based on properties of the memory allocation (e.g., CPU-visibility or + /// properties of the attached pImage).
This mode does not apply to virtual IGpuMemory objects (such objects + /// inherit their compression behavior from their backing physical memory on a page-by-page basis). + /// + /// @note If allocating memory intended to back multiple resources (e.g., a heap for DX12 "placed resources") where + /// pImage is nullptr, the client should set compression to Enable then rely on per-resource or per-view controls to + /// disable compression for resources as needed (use compressionMode fields in ImageCreateInfo, BufferViewInfo, + /// ImageViewInfo, etc.). + /// + /// If clients intend to enable distributed compression on buffers, buffer base alignment and size should be + /// aligned to DCC minimum compression unit size (256 bytes); otherwise there is a potential corruption issue. + /// e.g. An allocation holding two buffers (placed resource or driver suballocation). Suppose the first buffer has + /// size 4Kibytes+128 bytes with compression enabled and the second buffer has size 128bytes with compression + /// disabled (buffer view compressionMode is ReadBypassWriteDisable). When compressing the trailing 128bytes of + /// the first buffer, it will compress the second buffer as well. Reading the second buffer will result in corrupted + /// content. + TriState compression; +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreatePinnedGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size and vaRange fields. +struct PinnedGpuMemoryCreateInfo +{ + const void* pSysMem; ///< Pointer to the system memory that should be pinned for GPU access. Must be + /// aligned to realMemAllocGranularity in DeviceProperties. + size_t size; ///< Amount of system memory to pin for GPU access. + VaRange vaRange; ///< Virtual address range for the GPU memory allocation. + gpusize alignment; ///< Byte alignment of the allocation's GPU VA. If zero, an alignment matching the + /// Platform's allocation granularity will be used.
+ GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be + /// accessed via the MALL (memory access last level). Only valid + /// if "supportsMall" is set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive + /// is set. Any pages outside of this range will use the opposite + /// MALL policy from what is specified in "mallPolicy". +}; + +/// Specifies properties for @ref IGpuMemory creation. Input structure to IDevice::CreateSvmGpuMemory(). +/// +/// See the @ref IGpuMemory for additional restrictions on the size and alignment. +struct SvmGpuMemoryCreateInfo +{ + GpuMemoryCreateFlags flags; ///< GPU memory flags. + gpusize size; ///< Amount of SVM memory to allocate in bytes. + /// The total amount of SVM memory can't exceed the value set in + /// maxSvmSize when the platform is created. + gpusize alignment; ///< Byte alignment of the allocation's SVM VA. If zero, an + /// alignment matching the allocation granularity will be used. + const IGpuMemory* pReservedGpuVaOwner; ///< Must be zero unless "useReservedGpuVa" is true. It points to + /// the memory object which previously reserved the GPU VA range + /// to be used by the new memory object. + bool isUsedForKernel; ///< Memory will be used to store kernels and execute them on the GPU. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be + /// accessed via the MALL (memory access last level). Only valid + /// if "supportsMall" is set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive + /// is set. Any pages outside of this range will use the opposite + /// MALL policy from what is specified in "mallPolicy". +}; + +/// Specifies parameters for opening a shared GPU memory object on another device. +struct GpuMemoryOpenInfo +{ + IGpuMemory* pSharedMem; ///< Shared GPU memory object from another device to open.
+}; + +/// Specifies parameters for opening a GPU memory object on another device for peer-to-peer memory transfers. +struct PeerGpuMemoryOpenInfo +{ + IGpuMemory* pOriginalMem; ///< GPU memory object from another device to open for peer-to-peer memory transfers. +}; + +/// Specifies parameters for opening another non-PAL device's gpu memory for access from this device. Input structure to +/// IDevice::OpenExternalSharedGpuMemory(). +struct ExternalGpuMemoryOpenInfo +{ + ExternalResourceOpenInfo resourceInfo; ///< Information describing the external gpuMemory. + TypedBufferCreateInfo typedBufferInfo; ///< Information describing the typed buffer. + GpuMemMallPolicy mallPolicy; ///< Used to control whether or not this allocation will be accessed via + /// the MALL (memory access last level). Only valid if "supportsMall" is + /// set in DeviceProperties. + GpuMemMallRange mallRange; ///< These parameters are only meaningful if flags.mallRangeActive is set. + /// Any pages outside of this range will use the opposite MALL policy + /// from what is specified in "mallPolicy". + union + { + struct + { + uint32 typedBuffer : 1; ///< GPU memory will be permanently considered a single typed buffer pseudo-object + /// with the properties given in typedBufferInfo. + uint32 gl2Uncached : 1; ///< Specifies the GPU Memory is un-cached on GPU L2 cache. + uint32 mallRangeActive : 1; ///< If set, then this allocation will be partially allocated in the MALL. + /// If this is set, then the mallPolicy enumeration must be set to either + /// "always" or "never". + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External Gpu memory open info flags. +}; + +/// The fundamental information that describes a GPU memory object that is stored directly in each IGpuMemory. +/// It can be accessed without a virtual call via IGpuMemory::Desc().
+struct GpuMemoryDesc +{ + gpusize gpuVirtAddr; ///< GPU virtual address of the GPU memory allocation. + gpusize size; ///< Size of the GPU memory allocation, in bytes. + gpusize clientSize; ///< Size of the client requested GPU memory allocation, in bytes. + gpusize alignment; ///< Required GPU virtual address alignment, in bytes. + uint32 heapCount; ///< Number of entries in heaps[]. Must be 0 for virtual allocations. + GpuHeap heaps[GpuHeapCount]; ///< List of preferred memory heaps, in order of preference. + gpusize surfaceBusAddr; ///< Bus Address of SDI memory surface and marker. These will not be initialized + gpusize markerBusAddr; ///< until the memory is made resident. Client needs to call + /// InitBusAddressableGpuMemory() to query and update before this is valid. + union + { + struct + { + uint32 isVirtual : 1; ///< GPU memory is not backed by physical memory and must be remapped before the + /// GPU can safely access it. Will also be set for sdiExternal allocations. See + /// @ref GpuMemoryCreateFlags::sdiExternal. + uint32 isPeer : 1; ///< GPU memory object was created with @ref IDevice::OpenPeerGpuMemory. + uint32 isShared : 1; ///< GPU memory object was created either with + /// @ref IDevice::OpenExternalSharedGpuMemory or OpenSharedGpuMemory. + /// This IGpuMemory references memory created either by another process or + /// another device with the exception of peer access. + uint32 isExternal : 1; ///< GPU memory object was created with @ref IDevice::OpenExternalSharedGpuMemory. + /// This IGpuMemory references memory that was created either by another process + /// or by a device that doesn't support sharedMemory with this object's device + /// (i.e., MDA sharing on Windows). + uint32 isSvmAlloc : 1; ///< GPU memory is allocated in system memory. + /// Valid only when IOMMUv2 is supported. + uint32 isExecutable : 1; ///< GPU memory is used for execution.
Valid only when IOMMUv2 is supported. + uint32 isExternPhys : 1; ///< GPU memory is External Physical memory. + + uint32 placeholder0 : 1; ///< Reserved for future memory flag. + + uint32 isCompressed : 1; ///< Set for physical allocations where UMD requested PTE.D=1 to enable + /// GFX12-style distributed compression. + uint32 reserved : 23; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory desc flags. + + uint64 uniqueId; ///< Unique ID given to each GPU memory object, allows client tracking of GPU memory allocations. +}; + +/// Defines GPU memory sub-allocation info. Contains the start address of the whole memory; the offset and size +/// show where the sub-allocated memory is located within it. +struct GpuMemSubAllocInfo +{ + gpusize address; ///< Start address of the memory, not including the offset. + gpusize offset; ///< Offset from the start address of the memory. + gpusize size; ///< Size of the memory. +}; + +/// Specifies a GPU memory object and flags with more specific usage details. An array of these structures is specified +/// to PAL residency operations. +/// +/// @see IDevice::AddGpuMemoryReferences +/// @see IQueue::Submit +struct GpuMemoryRef +{ + union + { + struct + { + uint32 readOnly : 1; ///< The allocation will not be written using this reference. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< GPU memory reference flags. + + IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation. +}; + +/// Specifies a Display Output Post-Processing (DOPP) allocation that will be referenced by a submission along with +/// additional info describing how it will be used. +/// +/// @see IQueue::Submit +struct DoppRef +{ + union + { + struct + { + uint32 pfpa : 1; ///< Access to this DOPP allocation will be redirected to the primary pending + /// present (i.e., pre-flip primary access).
If not set, access will + /// refer to the current onscreen primary. + uint32 lastPfpaCmd : 1; ///< This submission will be the last access of this pfpa allocation + /// for this frame. The pfpa interval will end once this submit + /// completes, allowing the corresponding vidPnSource to flip. + /// This flag is invalid if the pfpa flag is not set. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< DOPP reference flags. + + IGpuMemory* pGpuMemory; ///< The GPU memory object referenced by this residency operation. +}; + +/// Specifies the type of handle used when exporting GPU memory. +enum class ExportHandleType : uint32 +{ + Default = 0, ///< Let PAL choose the export type +#if PAL_AMDGPU_BUILD + FileDescriptor, ///< Export using a Linux file descriptor + Kms, ///< Export through KMS +#endif +}; + +/// Specifies parameters for exporting a GPU memory handle (on KMT builds, optionally a named NT handle). +struct GpuMemoryExportInfo +{ +#if PAL_KMT_BUILD + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< Name for the NT handle; if the object is exported as an NT + /// handle with a name, the handle can then be acquired + /// via this name. + uint32 accessFlags; ///< Desired access rights of GPU memory. +#endif + ExportHandleType exportType; ///< Type of handle to use for exporting the memory. +}; + +/** + *********************************************************************************************************************** + * @interface IGpuMemory + * @brief Interface representing a GPU-accessible memory allocation. + * + * Depending on creation parameters, this could correspond to: + * + * + A _real_ memory object, corresponding directly to a physical allocation made on this device (whether it resides in + * a local or non-local heap).
+ * + A _virtual_ memory object, only consisting of virtual address space that can be mapped on a page basis to pages in + * _real_ memory objects via IQueue::RemapVirtualMemoryPages. + * + Pinned memory, a _real_ memory object created by pinning down client system memory. + * + Peer memory, a _real_ memory object corresponding to GPU memory that is likely local to another GPU. Only copy + * operations (peer-to-peer transfers) are allowed with this memory. + * + Opened/shared memory, a _real_ memory object that is fully shared between multiple GPUs, residing in a non-local + * heap. + * + External shared memory, a _real_ memory object that was created by an external process and is fully shared between + * multiple GPUs. + * + * @see IDevice::CreateGpuMemory + * @see IDevice::CreatePinnedGpuMemory + * @see IDevice::OpenSharedGpuMemory + * @see IDevice::OpenPeerGpuMemory + * @see IDevice::OpenExternalSharedGpuMemory + * + * + * All of these kinds of GPU memory are assigned a set of fundamental properties specified in GpuMemoryDesc which are + * either specified by the client or by PAL. There are specific rules these properties must follow; those rules are + * documented here to avoid duplication. Violating these rules will cause the device's corresponding "get size" + * functions to return an error code; the create/open functions may not validate their arguments. + * + * + * With the exception of external memory objects being opened, PAL will adjust size and base alignments as necessary + * to meet device requirements. Typically this means going out to OS page boundaries. The client is no longer required + * to query device requirements and align for PAL. + * + * Note that the device alignment requirements apply equally to GPU VAs. However, other kinds of alignment + * restrictions (e.g., IGpuMemoryBindable's requirements) may only apply to one of those two properties.
When creating + * GPU memory objects the client must be careful to set the "alignment" field to the alignment of the GPU VA. + * + * + * Additionally, the client can't directly specify a memory object's GPU VA but must specify its VA range, limiting which + * portions of the VA space can be used. Note that non-external shared and peer GPU memory objects will use the + * original memory's VA range. External shared GPU memory always uses the default VA range. + * + * The ShadowDescriptorTable VA range is special because it pairs the shadow GPU memory to an existing descriptor GPU + * memory. The client must specify the GPU VA of the corresponding DescriptorTable memory when creating a shadow GPU + * memory object via descrVirtAddr; it must satisfy the alignment requirements of the shadow GPU memory. Both GPU + * memory objects must be created on the same device. Note that descrVirtAddr can be offset into the descriptor + * allocation such that multiple shadow GPU memory objects correspond to one larger descriptor GPU memory object. + * + * + * The client can further influence the GPU VA of shared and peer GPU memory objects. If the globalGpuVa flag is set + * when the original GPU memory object is created, PAL will assign any shared or peer GPU memory objects that same VA. + * Note that globalGpuVa is only supported if globalGpuVaSupport is set in DeviceProperties. + *********************************************************************************************************************** + */ +class IGpuMemory : public IDestroyable +{ +public: + /// Sets a new priority for this GPU memory object. + /// + /// This call is not available for virtual or pinned memory. + /// + /// @param [in] priority New base priority for the GPU memory object. + /// + /// @param [in] priorityOffset New priority offset for the GPU memory object.
This is a small bias that can be + /// used by the OS to raise the importance of an allocation when there are + /// multiple allocations in the same base priority level. You can think of it as + /// the fractional bits of the priority level. + /// + /// @returns Success if the priority was successfully updated. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnavailable if this is a virtual or pinned GPU memory object. + virtual Result SetPriority( + GpuMemPriority priority, + GpuMemPriorityOffset priorityOffset) = 0; + + /// Makes the GPU memory available for CPU access and gives the client a pointer to reference it. + /// + /// The allocation should be unmapped by the client once CPU access is complete, although it _is_ legal to keep an + /// allocation mapped while the GPU references the allocation from a command buffer. + /// + /// It is legal to map the allocation multiple times concurrently. Mapping is not available for pinned or virtual + /// memory objects. This call is thread safe for calls referencing this memory object. + /// + /// @see Unmap. + /// + /// @param [out] ppData CPU pointer to the GPU memory object. + /// + /// @returns Success if the map succeeded. Otherwise, *ppData will not be valid and one of the following errors may + /// be returned. + /// + ErrorInvalidPointer if ppData is null. + /// + ErrorGpuMemoryMapFailed if the object is busy and cannot be mapped by the OS. + /// + ErrorNotMappable if the memory object cannot be mapped due to some of its heaps not having the CPU + /// visible flag set. + /// + ErrorUnavailable if the memory object is not a real allocation. + virtual Result Map( + void** ppData) = 0; + + /// Removes CPU access from a previously mapped GPU memory object. + /// + /// This call is thread safe for calls referencing the same memory object. + /// + /// @see Map + /// + /// @returns Success if the unmap succeeded. 
Otherwise, one of the following errors may be returned: + /// + ErrorGpuMemoryUnmapFailed if the GPU memory object cannot be unlocked. + /// + ErrorUnavailable if the GPU memory object is not a real allocation. + virtual Result Unmap() = 0; + +#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD + /// Returns an OS-specific handle which can be used to refer to this GPU memory object across processes. This will + /// return a null or invalid handle if the object was not created with the @ref interprocess create flag set. + /// + /// @note This function is only available for Linux builds or KMT builds. + /// + /// @param [in] exportInfo Parameters describing how the handle is exported (see @ref GpuMemoryExportInfo). + /// + /// @returns An OS-specific handle which can be used to access the GPU memory object across processes. + virtual OsExternalHandle ExportExternalHandle(const GpuMemoryExportInfo& exportInfo) const = 0; +#endif + + /// Returns a structure containing some fundamental information that describes this GPU memory object. + /// + /// @returns A reference to this allocation's GpuMemoryDesc. + const GpuMemoryDesc& Desc() const { return m_desc; } + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Set SDI remote surface bus address and marker bus address. + /// + /// This GPU memory object must have been created with the sdiExternal flag set and with the GpuMemoryCreateInfo + /// surfaceBusAddr and markerBusAddr fields both set to zero.
This function allows clients to defer setting those + /// addresses until after creation. It must be called exactly once to permanently bind the given SDI addresses to + /// this GPU memory object. + /// + /// @warning An sdiExternal GPU memory object is not complete until it's given its SDI addresses! The gpuVirtAddr + /// field in this GPU memory's GpuMemoryDesc will not be valid until this function is called! + /// + /// @param [in] surfaceBusAddr Surface bus address of Bus Addressable Memory. + /// @param [in] markerBusAddr Marker bus address of Bus Addressable Memory. The client can write to the marker + /// and have the GPU wait until a value is written to marker before continuing. + /// + /// @returns Success if succeeded. Otherwise, one of the following errors may be returned: + /// + ErrorUnavailable if the GPU memory object is not external physical memory or it has already been set. + /// + ErrorInvalidValue if one of the input params is 0. + /// + One of the escape call failure errors. + virtual Result SetSdiRemoteBusAddress(gpusize surfaceBusAddr, gpusize markerBusAddr) = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method. + IGpuMemory() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IGpuMemory() { } + + GpuMemoryDesc m_desc; ///< Information that describes this GPU memory object. + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object.
+ void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h index 5f0f499d12..086556f815 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palGpuMemoryBindable.h @@ -1,139 +1,139 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palGpuMemoryBindable.h - * @brief Defines the Platform Abstraction Library (PAL) IGpuMemoryBindable interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" - -namespace Pal -{ - -// Forward declarations. -class IGpuMemory; - -/// Reports required properties of a GPU memory object bound to a specific object. The client must query these -/// properties via IGpuMemoryBindable::GetGpuMemoryRequirements() and bind an @ref IGpuMemory object matching these -/// requirements to the @ref IGpuMemoryBindable object using IGpuMemoryBindable::BindGpuMemory(). -struct GpuMemoryRequirements -{ - union - { - struct - { - uint32 cpuAccess : 1; ///< CPU access is required. If set, the client must not set cpuInvisible in - /// GpuMemoryCreateFlags and must provide CPU visible heaps or CPU visible heap - /// access mode. If not set, it's strongly recommended to set cpuInvisible. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Flags specifying required GPU memory properties. - - gpusize size; ///< Amount of GPU memory required, in bytes. - gpusize alignment; ///< Required GPU memory virtual address alignment, in bytes. - uint32 heapCount; ///< Number of valid entries in heaps[]. - GpuHeap heaps[GpuHeapCount]; ///< List of allowed heaps for the GPU memory in order of predicted performance. 
-}; - -/** - *********************************************************************************************************************** - * @interface IGpuMemoryBindable - * @brief Interface inherited by objects that may require GPU memory be bound to them. - * - * In the future, PAL may discover a need to allocate GPU memory for a class that currently doesn't require it. In that - * situation, that class will be updated to inherit from IGpuMemoryBindable. This change would break backward - * compatibility and would result in the major interface version being incremented. - *********************************************************************************************************************** - */ -class IGpuMemoryBindable : public IDestroyable -{ -public: - /// Queries the GPU memory properties required by this object. The client should query properties with this method, - /// create/sub-allocate a memory range matching the requirements, then bind the memory to the object via - /// @ref BindGpuMemory(). - /// - /// @note Not all objects may actually need GPU memory, and in that case the memory properties will reflect a 0 size - /// and alignment. - /// - /// @param [out] pGpuMemReqs Required properties of GPU memory to be bound to this object. Includes properties like - /// size, alignment, and allowed heaps. - virtual void GetGpuMemoryRequirements( - GpuMemoryRequirements* pGpuMemReqs) const = 0; - - /// Binds GPU memory to this object according to the requirements queried via GetGpuMemoryRequirements(). - /// - /// Binding memory to objects other than images automatically initializes the object memory as necessary. Image - /// objects used as color or depth-stencil targets have to be explicitly initialized in command buffers using a - /// ICmdBuffer::CmdReleaseThenAcquire() command to transition them out of the LayoutUninitializedTarget usage. - /// - /// Binding memory to an object automatically unbinds any previously bound memory. 
There is no need to bind null to - /// an object to explicitly unbind a previously bound allocation before binding a new allocation. - /// - /// This call is invalid on objects that have no memory requirements, even if binding null. - /// - /// @param [in] pGpuMemory GPU memory to be bound. If null, the previous binding will be released. - /// @param [in] offset Offset into the GPU memory where the object's memory range should begin. This allows - /// sub-allocating many object's GPU memory from the same IGpuMemory object. - /// - /// @returns Success if the specified GPU memory was successfully bound to the object. Otherwise, one of the - /// following errors may be returned: - /// + ErrorUnavailable if binding a non-image to a virtual allocation. - /// + ErrorInvalidAlignment if the offset does not match the alignment requirements of the object. - /// + ErrorInvalidMemorySize if the object's required memory size does not fit completely within the given - /// memory object at the specified offset. - virtual Result BindGpuMemory( - IGpuMemory* pGpuMemory, - gpusize offset) = 0; - - /// Returns the GPU memory object and offset that this object is bound to or nullptr and 0 if not bound. - /// - /// @param [out] ppGpuMemory Returns the GPU memory object to the address specified in this pointer. - /// Returns nullptr if this object is not bound to any GPU memory. - /// @param [out] pOffset Returns the GPU memory offset to the address specified in this pointer. - /// Returns 0 if this object is not bound to any GPU memory. - /// - /// @returns Success if the GPU memory and offset was successfully returned. Otherwise, one of the following errors - /// may be returned: - /// + ErrorGpuMemoryNotBound if this object is not bound to any GPU memory. - /// + ErrorInvalidPointer if either ppGpuMemory or pOffset is nullptr. 
- /// + ErrorUnavailable if binding is not supported in the derived class - virtual Result GetGpuMemory( - IGpuMemory** ppGpuMemory, - gpusize* pOffset) const = 0; - -protected: - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IGpuMemoryBindable() { } -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuMemoryBindable.h + * @brief Defines the Platform Abstraction Library (PAL) IGpuMemoryBindable interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +// Forward declarations. +class IGpuMemory; + +/// Reports required properties of a GPU memory object bound to a specific object. The client must query these +/// properties via IGpuMemoryBindable::GetGpuMemoryRequirements() and bind an @ref IGpuMemory object matching these +/// requirements to the @ref IGpuMemoryBindable object using IGpuMemoryBindable::BindGpuMemory(). +struct GpuMemoryRequirements +{ + union + { + struct + { + uint32 cpuAccess : 1; ///< CPU access is required. If set, the client must not set cpuInvisible in + /// GpuMemoryCreateFlags and must provide CPU visible heaps or CPU visible heap + /// access mode. If not set, it's strongly recommended to set cpuInvisible. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Flags specifying required GPU memory properties. + + gpusize size; ///< Amount of GPU memory required, in bytes. + gpusize alignment; ///< Required GPU memory virtual address alignment, in bytes. + uint32 heapCount; ///< Number of valid entries in heaps[]. + GpuHeap heaps[GpuHeapCount]; ///< List of allowed heaps for the GPU memory in order of predicted performance. 
+}; + +/** + *********************************************************************************************************************** + * @interface IGpuMemoryBindable + * @brief Interface inherited by objects that may require GPU memory be bound to them. + * + * In the future, PAL may discover a need to allocate GPU memory for a class that currently doesn't require it. In that + * situation, that class will be updated to inherit from IGpuMemoryBindable. This change would break backward + * compatibility and would result in the major interface version being incremented. + *********************************************************************************************************************** + */ +class IGpuMemoryBindable : public IDestroyable +{ +public: + /// Queries the GPU memory properties required by this object. The client should query properties with this method, + /// create/sub-allocate a memory range matching the requirements, then bind the memory to the object via + /// @ref BindGpuMemory(). + /// + /// @note Not all objects may actually need GPU memory, and in that case the memory properties will reflect a 0 size + /// and alignment. + /// + /// @param [out] pGpuMemReqs Required properties of GPU memory to be bound to this object. Includes properties like + /// size, alignment, and allowed heaps. + virtual void GetGpuMemoryRequirements( + GpuMemoryRequirements* pGpuMemReqs) const = 0; + + /// Binds GPU memory to this object according to the requirements queried via GetGpuMemoryRequirements(). + /// + /// Binding memory to objects other than images automatically initializes the object memory as necessary. Image + /// objects used as color or depth-stencil targets have to be explicitly initialized in command buffers using a + /// ICmdBuffer::CmdReleaseThenAcquire() command to transition them out of the LayoutUninitializedTarget usage. + /// + /// Binding memory to an object automatically unbinds any previously bound memory. 
There is no need to bind null to + /// an object to explicitly unbind a previously bound allocation before binding a new allocation. + /// + /// This call is invalid on objects that have no memory requirements, even if binding null. + /// + /// @param [in] pGpuMemory GPU memory to be bound. If null, the previous binding will be released. + /// @param [in] offset Offset into the GPU memory where the object's memory range should begin. This allows + /// sub-allocating many object's GPU memory from the same IGpuMemory object. + /// + /// @returns Success if the specified GPU memory was successfully bound to the object. Otherwise, one of the + /// following errors may be returned: + /// + ErrorUnavailable if binding a non-image to a virtual allocation. + /// + ErrorInvalidAlignment if the offset does not match the alignment requirements of the object. + /// + ErrorInvalidMemorySize if the object's required memory size does not fit completely within the given + /// memory object at the specified offset. + virtual Result BindGpuMemory( + IGpuMemory* pGpuMemory, + gpusize offset) = 0; + + /// Returns the GPU memory object and offset that this object is bound to or nullptr and 0 if not bound. + /// + /// @param [out] ppGpuMemory Returns the GPU memory object to the address specified in this pointer. + /// Returns nullptr if this object is not bound to any GPU memory. + /// @param [out] pOffset Returns the GPU memory offset to the address specified in this pointer. + /// Returns 0 if this object is not bound to any GPU memory. + /// + /// @returns Success if the GPU memory and offset was successfully returned. Otherwise, one of the following errors + /// may be returned: + /// + ErrorGpuMemoryNotBound if this object is not bound to any GPU memory. + /// + ErrorInvalidPointer if either ppGpuMemory or pOffset is nullptr. 
+ /// + ErrorUnavailable if binding is not supported in the derived class + virtual Result GetGpuMemory( + IGpuMemory** ppGpuMemory, + gpusize* pOffset) const = 0; + +protected: + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IGpuMemoryBindable() { } +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palImage.h b/shared/amdgpu-windows-interop/pal/inc/core/palImage.h index 986efcf70c..0fcdb16204 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palImage.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palImage.h @@ -1,1029 +1,983 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palImage.h - * @brief Defines the Platform Abstraction Library (PAL) IImage interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palGpuMemoryBindable.h" - -namespace Pal -{ - -// Forward declarations. -class IImage; -class IPrivateScreen; -class IScreen; -class ISwapChain; -enum class CompressionMode : uint32; -enum class ClientCompressionMode : uint32; -enum SwizzleMode : uint32; -struct ImageCopyRegion; - -/// When used as the value of the viewFormatCount parameter of image creation it indicates that all compatible formats -/// can be used for views of the created image. -constexpr uint32 AllCompatibleFormats = UINT32_MAX; - -/// Specifies dimensionality of an image (i.e., 1D, 2D, or 3D). -enum class ImageType : uint32 -{ - Tex1d = 0x0, - Tex2d = 0x1, - Tex3d = 0x2, - Count -}; - -/// Specifies the tiling (address swizzling) to use for an image. When a linear tiled image is mapped its contents will -/// be laid out in row-major ordering. All other tiling modes require the use of swizzles equation to locate texels. -enum class ImageTiling : uint32 -{ - Linear = 0x0, ///< Image is laid out in scan-line (row-major) order. - Optimal = 0x1, ///< Image is laid out in a GPU-optimal order. - Standard64Kb = 0x2, ///< Image is laid out in the cross-IHV, 64KB, standard swizzle tiling. 
- Count -}; - -/// Hints to pal to identify a preference for how this image is organized. This is a preference setting, and may be -/// ignored if pal believes better options exist. -enum class ImageTilingPattern : uint32 -{ - Default = 0x0, ///< No swizzle mode is preferred. - Standard = 0x1, ///< Prefer standard swizzle modes. - XMajor = 0x2, ///< Prefer x-coordinate major swizzle modes. - YMajor = 0x3, ///< Prefer y-coordinate major swizzle modes. - Interleaved = 0x4, ///< Prefer interleaved coordinate swizzle modes. - Count -}; - -/// Hints to pal to select the appropriate tiling mode for a optimization target. -enum class TilingOptMode : uint32 -{ - Balanced = 0x0, ///< Balance memory foorprint and rendering performance. - OptForSpace = 0x1, ///< Optimize tiling mode for saving memory footprint - OptForSpeed = 0x2, ///< Optimize tiling mode for rendering performance. - Count -}; - -/// Image metadata modes. -enum class MetadataMode : uint16 -{ - Default = 0, ///< Default behavior. PAL chooses if metadata should be present or not. - ForceEnabled, ///< Optimization Hint: The client would prefer Metadata if possible. Useful for scenarios where - /// metadata isn't an obvious win and clients can enable based on some heuristic or app-detect. - Disabled, ///< The Image will not contain any compression metadata. - FmaskOnly, ///< The color msaa Image will only contain Cmask/Fmask metadata; this mode is only valid for color - /// msaa Image. On GPUs with GFX12-style distributed compression (see supportDistributedCompression - /// flag in @ref DeviceProperties), metadataMode only controls UMD metadata (Hi-Z and Hi-S). On such - /// GPUs, the FmaskOnly enum is ignored and treated like Default. - Count, -}; - -/// Image metadata TC compat modes. -enum class MetadataTcCompatMode : uint16 -{ - Default = 0, ///< Default behavior. PAL chooses if TC compat should be enabled (if compressed). 
- ForceEnabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is TC compat. - Disabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is not TC compat. - Count, -}; - -/// Image shared metadata support level -enum class MetadataSharingLevel : uint32 -{ - FullExpand = 0, ///< The metadata need to be fully expanded at ownership transition time. - ReadOnly = 1, ///< The metadata are expected to have read-only usage after the ownership is transitioned. - FullOptimal = 2, ///< The metadata can remain as-is if possible at ownership transition time. -}; - -/// Specifies the type of PRT map image being created. -enum class PrtMapType : uint32 -{ - None = 0, ///< This is not an auxillary image used for PRT plus functionality. - Residency = 1, ///< Image data is really a low-resolution map containing the finest populated LOD - /// for a particular UV space region. - SamplingStatus = 2, ///< Indicates the validity of a given tile on a per-mip level basis. - Count, -}; - -/// Specifies how to interpret a clear color. -enum class ClearColorType : uint32 -{ - Uint = 0, ///< The color is stored as an unsigned integer in RGBA order in u32Color. It will be swizzled and - /// compacted before it is written to memory. - Sint = 1, ///< The color is stored as a signed integer in RGBA order in i32Color. It will be swizzled and - /// compacted before it is written to memory. - Float = 2, ///< The color is stored as floating point in RGBA order. It will be swizzled and converted to the - /// appropriate numeric format before it is written to memory. - Yuv = 3, ///< The color is stored as an unsigned integer in YUVA order in u32Color. It will be swizzled and - /// compacted before it is written to memory. The client must clamp the clear color within the - /// valid range, e.g. [0, 255] for 8-bit. - Count -}; - -/// Contains everything necessary to store and interpret a clear color. 
-struct ClearColor -{ - ClearColorType type; ///< How to interpret this clear color. - uint8 disabledChannelMask; ///< This 4 bits are used to selectively disable the A,B,G,R channels - /// from being written. 0 means write ABRG. 0xF means write nothing. - /// 0x8 means write Blue, Green, Red. 0x7 means write Alpha. etc... - - union - { - uint32 u32Color[4]; ///< The clear color, interpreted as four unsigned integers. - float f32Color[4]; ///< The clear color, interpreted as four floating point values. - }; -}; - -/// Specifies a set of image creation flags. -union ImageCreateFlags -{ - struct - { - uint32 invariant : 1; ///< Images with this flag set and all other creation identical are - /// guaranteed to have a consistent data layout. - uint32 cloneable : 1; ///< Image is valid as a source or destination of a clone operation. - /// See @ref IDevice::ImagePrefersCloneCopy() for more details. - uint32 shareable : 1; ///< Image can be shared between compatible devices. - uint32 presentable : 1; ///< Indicates this image can be used in presents. - uint32 flippable : 1; ///< Image can be used for flip presents. - uint32 stereo : 1; ///< Indicates AMD quad buffer stereo extension (AQBS extension) image - uint32 dxgiStereo : 1; ///< Indicates DXGI stereo (Win8 stereo) image - uint32 cubemap : 1; ///< Image will be used as a cubemap. - uint32 prt : 1; ///< Image is a partially resident texture (aka, sparse image or tiled - /// resource) - uint32 needSwizzleEqs : 1; ///< Image requires valid swizzle equations. - uint32 perSubresInit : 1; ///< The image may have its subresources initialized independently using - /// barrier calls out of the uninitialized layout. - uint32 separateDepthPlaneInit : 1; ///< If set, the caller may transition the stencil and depth planes from - /// "Uninitialized" state at any time. Otherwise, both planes must be - /// transitioned in the same barrier call. Only meaningful if - /// "perSubresInit" is set. 
-#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 957 - uint32 repetitiveResolve : 1; ///< Optimization: Is this image resolved multiple times to an image which - /// is mostly similar to this image? -#else - uint32 reservedRepResolve : 1; ///< Reserved for future use. -#endif - uint32 preferSwizzleEqs : 1; ///< Image prefers valid swizzle equations, but an invalid swizzle - /// equation is also acceptable. - uint32 fixedTileSwizzle : 1; ///< Fix this image's tile swizzle to ImageCreateInfo::tileSwizzle. This - /// is only supported for single-sampled color images. - uint32 videoReferenceOnly : 1; ///< Image is used by video hardware for reference buffer only. - /// It uses a different tiling format than the decoder output buffer. - uint32 optimalShareable : 1; ///< Indicates metadata information is to be added into private data on - /// creation time and honored on open time. - uint32 sampleLocsAlwaysKnown : 1; ///< Sample pattern is always known in client driver for MSAA depth image. - uint32 fullResolveDstOnly : 1; ///< Indicates any ICmdBuffer::CmdResolveImage using this image as a - /// desination will overwrite the entire image (width and height of - /// resolve region is same as width and height of resolve dst). -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 960 - uint32 fullCopyDstOnly : 1; ///< Indicates any copy to this image will overwrite the entire image. - /// A perf optimization of using post-copy metadata fixup to replace heavy - /// expand at barrier to LayoutCopyDst. Unsafe to enable it if there is - /// potential partial copy to the image. -#else - uint32 reserved956 : 1; -#endif - uint32 pipSwapChain : 1; ///< Indicates this image is PIP swap-chain. It is only supported on - /// Windows platforms. - uint32 view3dAs2dArray : 1; ///< If set client can view 3D image as 2D with its depth as array slices. - /// Note that not all 3D images supports it. The image creation will - /// return error if we fail to create a compatible image. 
- - uint32 tmzProtected : 1; ///< Indicate this image is protected or not. - uint32 sharedWithMesa : 1; ///< Indicate this Image was opened from a Mesa shared Image - uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes - uint32 hasModifier : 1; ///< Set if the image uses drm format modifier. - uint32 disableDccStateTracking : 1; ///< Disable a PAL optimization which is commonly broken by app bugs. - /// Setting this flag may increase DCC decompress overhead. -#if PAL_CLIENT_EXAMPLE - uint32 useFixedSwizzleMode : 1; ///< If set, require the fixed swizzle mode provided. - /// Fails creation on incompatible swizzles. -#else - uint32 reservedSwMode : 1; ///< Reserved for future use. -#endif - uint32 reserved : 4; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies a set of ways an image might be used by the GPU (color target, shader read, etc.). -union ImageUsageFlags -{ - struct - { - uint32 shaderRead : 1; ///< Image will be read from shader (i.e., texture). - uint32 shaderWrite : 1; ///< Image will be written from a shader (i.e., UAV). - uint32 resolveSrc : 1; ///< Image will be used as resolve source image - uint32 resolveDst : 1; ///< Image will be used as resolve dst image - uint32 colorTarget : 1; ///< Image will be bound as a color target. - uint32 depthStencil : 1; ///< Image will be bound as a depth/stencil target. - uint32 noStencilShaderRead : 1; ///< Image will be neither read as stencil nor resolved on stencil plane. - /// Note that if resolveSrc bit has been set to indicate that the image - /// could be adopted as a resolveSrc image and there could be stencil - /// resolve, noStencilShaderRead must be set to 0, since shader-read - /// based stencil resolve might be performed. 
- uint32 hiZNeverInvalid : 1; ///< Hint to PAL indicating the client will guarantee that no operations - /// performed on this Image while it is in a decompressed state will cause - /// Hi-Z metadata to become invalid. This allows PAL to avoid an expensive - /// resummarization blit in some resource barriers. - uint32 depthAsZ24 : 1; ///< Use a 24-bit format for HW programming of a native 32-bit surface. - /// If set, border color and Z-reference values are treated as Z-24. - uint32 firstShaderWritableMip : 4; ///< Only relevant if the shaderWrite flag is set. Typically set to 0 so - /// entire image is writable. If non0, such as an image where only level0 - /// is used as a color target and compute is used to generate mipmaps,PAL - /// may be able to enable additional compression on the baseLevels which - /// are used exclusively as color target and shader read. - uint32 cornerSampling : 1; ///< Set if this image will use corner sampling in image-read scenarios. - /// With corner sampling, the extent refers to the number of pixel corners - /// which will be one more than the number of pixels. Border color is - /// ignored when corner sampling is enabled. - - uint32 vrsDepth : 1; ///< Set if this depth image will be bound when VRS rendering is enabled. - uint32 disableOptimizedDisplay: 1; ///< Do not create Display Dcc - uint32 useLossy : 1; ///< Set if this image may use lossy compression. - uint32 stencilOnlyTarget : 1; ///< This must be set if a stencil-only IDepthStencilView will be created - ///< for this image. - uint32 vrsRateImage : 1; ///< This image is potentially used with CmdBindSampleRateImage - uint32 videoDecoder : 1; ///< Indicating this Image is video decoder target - uint32 videoEncoder : 1; ///< Indicating this Image is video encoder input. - uint32 reserved : 11; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies properties for @ref IImage creation. 
Input structure to IDevice::CreateImage(). -/// -/// Note that by default PAL may instruct the hardware to swizzle the contents of an image in memory; if this occurs -/// two images created with identical properties will not map their texels to the same offsets in GPU memory and may -/// even have different sizes. At the expense of performance this behavior can be limited by setting the invariant flag, -/// which guarantees that images with identical properties will have identical GPU memory layouts. -/// -/// For single-sampled color images, there is a middle ground between these two modes. If the fixedTileSwizzle flag is -/// set, PAL will use the tileSwizzle property instead of generating its own swizzle value. The tileSwizzle value must -/// be obtained from the base subresource of a single-sampled color image with identical properties (excluding -/// fixedTileSwizzle and tileSwizzle). This allows the client to force certain similar images to share the same GPU -/// memory layouts without forcing all similar images to a single GPU memory layout. -struct ImageCreateInfo -{ - ImageCreateFlags flags; ///< Image creation flags. - ImageUsageFlags usageFlags; ///< Image usage flags. - ImageType imageType; ///< Dimensionality of image (1D/2D/3D). - SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. - Extent3d extent; ///< Dimensions in pixels WxHxD. - uint32 mipLevels; ///< Number of mipmap levels. Cannot be 0. - uint32 arraySize; ///< Number of slices. Set to 1 for non-array images. - uint32 samples; ///< Number of coverage samples. Set to 1 for single sample images. Must be - /// greater than or equal to the number of fragments. - uint32 fragments; ///< Number of color/depth fragments. Set to 1 for single sample images. - ImageTiling tiling; ///< Controls layout of pixels in the image. - ImageTilingPattern tilingPreference; ///< Controls preferred tile swizzle organization for this image. 
- TilingOptMode tilingOptMode; ///< Hints to pal to select the appropriate tiling mode. - uint32 tileSwizzle; ///< If fixedTileSwizzle is set, use this value for the image's base swizzle. -#if PAL_CLIENT_EXAMPLE - SwizzleMode fixedSwizzleMode; ///< For directed image tests, force a particular swizzle mode. -#endif - - /// Metadata behavior mode for this image. On GPUs with GFX12-style distributed compression - /// (see supportDistributedCompression flag in @ref DeviceProperties), metadataMode only controls UMD metadata - /// (Hi-Z and Hi-S). On such GPUs, the FmaskOnly enum is ignored and treated like Default. - MetadataMode metadataMode; - MetadataTcCompatMode metadataTcCompatMode; ///< TC compat mode for this image. - - /// Distributed compression contains GL2/DF DCC compression and RB backend client compression which includes - /// fragment client compression (previous FMASK compression alike) on color MSAA images and Z Plane client - /// compression on depth stencil images. Only relevant if the backing memory pages enable compression, controllable - /// by client with @ref GpuMemoryCreateInfo::compression. - CompressionMode compressionMode; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 876 - /// Client compression is part of distributed compression (aka physical compression); it can only be enabled if - /// physical compression is enabled. - /// - /// On Gfx12, controls (legacy FMask based) color fragment compression and Z plane compression. - ClientCompressionMode clientCompressionMode; ///< Controls client compression behavior for this resource. -#else - TriState clientCompressionMode; ///< Controls client compression behavior for this resource. -#endif - - uint32 maxBaseAlign; ///< Maximum address alignment for this image or zero for an unbounded alignment. - float imageMemoryBudget; ///< The memoryBudget value used in SW addrlib to determine the minSizeBlk for textures. - /// It must be >= 0.0. 
When in [0.0, 1.0) addrlib uses legacy logic to decide minSizeBlk. - /// When == 1.0, addrlib uses minimizeAlign. When > 1.0, addrlib applies memory budget - /// algorithm. Despite 1.5 in tests show significant texture allocation size reduction, - /// default value 0.0 (legacy behavior) is recommended if not specified by client. - - struct - { - PrtMapType mapType; ///< Indicates what sort of PRT metadata is stored in this image. If this image is PRT - /// metadata, then it can only be associated with an image that is a power-of-two - /// multiple bigger (or the same size). Image properties needs to include - /// "PrtFeaturePrtPlus" to create PRT map images. Format must be set to X8_Unorm for - /// residency map and sampling-status map types. - Extent3d lodRegion; ///< Useful only if mapType is not "none". Defines the region size of the parent image - /// that one pixel of this image matches with. The map image can only be paired with a - /// parent image of matching dimensions. This parameter can be left at zero. - } prtPlus; - - /// The following "pitch" members must be zeroed unless the client is creating a @ref ImageTiling::Linear image and - /// wishes to directly specify the image's row and depth pitches. In that case, they must be integer multiples of - /// the alignments given by @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. - uint32 rowPitch; ///< The image must have this row pitch for the first mip level (in bytes). - uint32 depthPitch; ///< The image must have this depth pitch for the first mip level (in bytes). - - Rational refreshRate; ///< The expected refresh rate when presenting this flippable or stereo image. - - /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). - /// If the client wishes to create image views with other formats they must fill out the following fields. 
- /// - /// Valid combinations of these fields include: - /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. - /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. - /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and - /// viewFormatCount is the length of that array. - /// - /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through - /// @ref GetImageCreateInfo. - uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. - const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. - -#if defined(__unix__) - uint64 modifier; ///< Drm format modifier. Ignored if flags.hasModifier unset. - uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. - gpusize modifierMemoryPlaneOffset[3]; ///< Offset of main surface, display Dcc surface and gfx Dcc surface. 
-#endif -}; - -inline constexpr bool operator==(const ImageCreateInfo& lhs, const ImageCreateInfo& rhs) -{ - bool same = (lhs.flags.u32All == rhs.flags.u32All) && - (lhs.usageFlags.u32All == rhs.usageFlags.u32All) && - (lhs.imageType == rhs.imageType) && - (lhs.swizzledFormat == rhs.swizzledFormat) && - (lhs.extent == rhs.extent) && - (lhs.mipLevels == rhs.mipLevels) && - (lhs.arraySize == rhs.arraySize) && - (lhs.samples == rhs.samples) && - (lhs.fragments == rhs.fragments) && - (lhs.tiling == rhs.tiling) && - (lhs.tilingPreference == rhs.tilingPreference) && - (lhs.tilingOptMode == rhs.tilingOptMode) && - (lhs.tileSwizzle == rhs.tileSwizzle) && -#if PAL_CLIENT_EXAMPLE - (lhs.fixedSwizzleMode == rhs.fixedSwizzleMode) && -#endif - (lhs.metadataMode == rhs.metadataMode) && - (lhs.metadataTcCompatMode == rhs.metadataTcCompatMode) && - (lhs.compressionMode == rhs.compressionMode) && - (lhs.clientCompressionMode == rhs.clientCompressionMode) && - (lhs.maxBaseAlign == rhs.maxBaseAlign) && - (lhs.imageMemoryBudget == rhs.imageMemoryBudget) && - (lhs.prtPlus.mapType == rhs.prtPlus.mapType) && - (lhs.prtPlus.lodRegion == rhs.prtPlus.lodRegion) && - (lhs.rowPitch == rhs.rowPitch) && - (lhs.depthPitch == rhs.depthPitch) && - (lhs.refreshRate.numerator == rhs.refreshRate.numerator) && - (lhs.refreshRate.denominator == rhs.refreshRate.denominator) && - (lhs.viewFormatCount == rhs.viewFormatCount) && - true; - -#if defined(__unix__) - if (same && (lhs.flags.hasModifier != 0)) - { - same = (lhs.modifier == rhs.modifier) && - (lhs.modifierPlaneCount == rhs.modifierPlaneCount) && - (memcmp(&lhs.modifierMemoryPlaneOffset[0], - &rhs.modifierMemoryPlaneOffset[0], - sizeof(gpusize) * lhs.modifierPlaneCount) == 0); - } -#endif - - if (same && (lhs.viewFormatCount > 0) && (lhs.viewFormatCount != AllCompatibleFormats)) - { - same = (memcmp(lhs.pViewFormats, rhs.pViewFormats, lhs.viewFormatCount * sizeof(SwizzledFormat)) == 0); - } - - return same; -} - -/// Specifies properties for 
presentable @ref IImage creation. Input structure to IDevice::CreatePresentableImage(). -struct PresentableImageCreateInfo -{ - union - { - struct - { - uint32 fullscreen : 1; ///< Image supports fullscreen presentation. - uint32 stereo : 1; ///< Image supports stereoscopic rendering and display. - /// Implies an array size of 2. Fullscreen must be set. - uint32 turbosync : 1; ///< Image supports turbosync flip - uint32 peerWritable : 1; ///< Indicates if the memory allocated will be writable by other devices - uint32 tmzProtected : 1; ///< Indicates this presenatble image's memory is tmz Protected. -#if PAL_AMDGPU_BUILD - uint32 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes - /// the allocation upon creation, currently, only GpuHeapLocal and - /// GpuHeapInvisible are supported. -#else - uint32 placeholder0 : 1; ///< Placeholder. -#endif - uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes. - uint32 reserved : 25; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Presentable image creation flags. - - SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. - ImageUsageFlags usage; ///< Image usage flags. - Extent2d extent; ///< Width/height of the image. - CompressionMode compressionMode; ///< Specify GFX12-style distributed compression behavior for this resource. - /// Only relevant if the backing memory pages enable compression (controllable by - /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). - const IScreen* pScreen; ///< Target screen for fullscreen presentable images. Can be null if the fullscreen - /// flag is 0. - OsDisplayHandle hDisplay; ///< Display handle of the local display system only for WSI. - OsWindowHandle hWindow; ///< Window handle only for WSI. - ISwapChain* pSwapChain; ///< SwapChain object which the presentable image belongs to. 
- - /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). - /// If the client wishes to create image views with other formats they must fill out the following fields. - /// - /// Valid combinations of these fields include: - /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. - /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. - /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and - /// viewFormatCount is the length of that array. - /// - /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through - /// @ref GetImageCreateInfo. - uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. - const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. -}; - -/// Specifies properties for private screen @ref IImage image creation. Input structure to -/// IDevice::CreatePrivateScreenImage(). -struct PrivateScreenImageCreateInfo -{ - union - { - struct - { - uint32 invariant : 1; ///< Images with this flag set and all other creation identical are guaranteed - /// to have a consistent data layout. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Private screen image creation flags. - - SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. - ImageUsageFlags usage; ///< Image usage flags. - Extent2d extent; ///< Width/height of the image. - IPrivateScreen* pScreen; ///< Private screen this image is created on (then this image can be used to be - /// presented on this private screen). - CompressionMode compressionMode;///< Specify GFX12-style distributed compression behavior for this resource. 
- /// Only relevant if the backing memory pages enable compression (controllable by - /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). - - /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). - /// If the client wishes to create image views with other formats they must fill out the following fields. - /// - /// Valid combinations of these fields include: - /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. - /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. - /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and - /// viewFormatCount is the length of that array. - /// - /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through - /// @ref GetImageCreateInfo. - uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. - const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. -}; - -/// Specifies parameters for opening another device's image for peer access from this device. Input structure to -/// IDevice::OpenPeerImage(). -struct PeerImageOpenInfo -{ - const IImage* pOriginalImage; ///< Other device's image to be opened for peer access. -}; - -/// Specifies parameters for opening another non-PAL device's image for access from this device. Input structure to -/// IDevice::OpenExternalSharedImage(). -struct ExternalImageOpenInfo -{ - ExternalResourceOpenInfo resourceInfo; ///< Information describing the external image. - Extent3d extent; ///< Expected extent for the external image. This reference value would be - /// ignored and use extents from shared metadata if any dimension of the - /// reference extent is zero. - SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. 
Or UndefinedFormat to infer the - /// format internally. - ImageCreateFlags flags; ///< Image Creation flags. - ImageUsageFlags usage; ///< Image usage flags. - IPrivateScreen* pScreen; ///< Private screen this image is created on, or null. - gpusize gpuMemOffset; ///< GpuMemory offset -#if defined(__unix__) - gpusize dccOffset; ///< Offset of gfx Dcc surface if nonzero. - gpusize displayDccOffset; ///< Offset of display Dcc surface if nonzero. - uint64 modifier; ///< Drm format modifier, if flags.hasModifier is set. - uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. -#endif - /// The following members must be set to zero unless the client is opening a @ref ImageTiling::Linear image with - /// specified row and depth pitches. In that case, they must be integer multiples of the alignments given by - /// @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. - gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines - /// of the subresource. - gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive - /// slices. -}; - -/// Reports the overall GPU memory layout of the entire image. Output structure for IImage::GetMemoryLayout(). Unused -/// sections will have a size of zero, an offset of zero, and an alignment of one. The layout is split into: -/// + Image Data: The raw texel values for all subresources of the image. -/// + Image Metadata: Additional data that will be used to optimize GPU operations that access the image. -/// + Image Metadata Header: A special subsection of the metadata for small bits of data with weaker alignment. -struct ImageMemoryLayout -{ - gpusize dataSize; ///< The size, in bytes, of the image's core data section. - gpusize dataAlignment; ///< The alignment, in bytes, of the image's core data section. - - gpusize metadataOffset; ///< The offset, in bytes, of the image's metadata section. 
- gpusize metadataSize; ///< The size, in bytes, of the image's metadata section. - gpusize metadataAlignment; ///< The alignment, in bytes, of the image's metadata section. - - gpusize metadataHeaderOffset; ///< The offset, in bytes, of the image's metadata header. - gpusize metadataHeaderSize; ///< The size, in bytes, of the image's metadata header. - gpusize metadataHeaderAlignment; ///< The alignment, in bytes, of the image's metadata header. - - uint8 swizzleEqIndices[2]; ///< Which swizzle equations this image uses or InvalidSwizzleEqIndex if - /// there are no swizzle equations for this image's layout. - uint8 swizzleEqTransitionMip; ///< Before this mip level, the image uses swizzleEqIndices[0]; from this - /// mip level onwards, the image uses swizzleEqIndices[1]. - uint8 swizzleEqTransitionPlane; ///< Before this mip plane, the image uses swizzleEqIndices[0]; from this - /// plane onward, the image uses swizzleEqIndices[1]. - - uint32 prtTileWidth; ///< Width, in texels, of a PRT tile - uint32 prtTileHeight; ///< Height, in texels, of a PRT tile - uint32 prtTileDepth; ///< Depth, in texels, of a PRT tile - uint32 prtMinPackedLod; ///< First mip level that is packed into the PRT mip tail. - uint32 prtMipTailTileCount; ///< Number of tiles in the packed mip tail. This may either indicate the - /// size per slice or per image depending on the support for - /// PrtFeaturePerLayerMipTail (@see PrtFeatureFlags) - uint32 stereoLineOffset; ///< Y offset to the right eye data, in texels -}; - -/// Collection of bitmasks specifying which operations are currently allowed on an image, and which queues are allowed -/// to perform those operations. Based on this information, PAL can determine the best compression state of the image. -struct ImageLayout -{ - uint32 usages : 24; ///< Bitmask of @ref ImageLayoutUsageFlags values. - uint32 engines : 8; ///< Bitmask of @ref ImageLayoutEngineFlags values. 
-}; - -/** -**************************************************************************************************** -* @brief -* Enumerates swizzle modes useable on any supported GPU. -* @note -* For details please check _AddrSwizzleMode -* -**************************************************************************************************** -*/ -enum SwizzleMode : uint32 -{ - SwizzleModeLinear = 0, - SwizzleMode256BS, - SwizzleMode256BD, - SwizzleMode256BR, - SwizzleMode4KbZ, - SwizzleMode4KbS, - SwizzleMode4KbD, - SwizzleMode4KbR, - SwizzleMode64KbZ, - SwizzleMode64KbS, - SwizzleMode64KbD, - SwizzleMode64KbR, - SwizzleMode64KbZT, - SwizzleMode64KbST, - SwizzleMode64KbDT, - SwizzleMode64KbRT, - SwizzleMode4KbZX, - SwizzleMode4KbSX, - SwizzleMode4KbDX, - SwizzleMode4KbRX, - SwizzleMode64KbZX, - SwizzleMode64KbSX, - SwizzleMode64KbDX, - SwizzleMode64KbRX, - SwizzleMode256KbVarZX, - SwizzleMode256KbVarSX, - SwizzleMode256KbVarDX, - SwizzleMode256KbVarRX, - - ///< the meaning of swizzle mode varies by generation, - /// do not compare directly with the _R / _S / _D / _S types". - SwizzleMode256B2D, - SwizzleMode4Kb2D, - SwizzleMode4Kb3D, - SwizzleMode64Kb2D, - SwizzleMode64Kb3D, - SwizzleMode256Kb2D, - SwizzleMode256Kb3D, - SwizzleMode64Kb2Dz, - SwizzleMode256Kb2Dz, - SwizzleModeCount, -}; - -/// Reports position and memory layout information for a specific subresource in an image. Output structure for -/// IImage::GetSubresourceLayout(). -struct SubresLayout -{ - uint32 elementBytes; ///< size of each element in bytes - gpusize offset; ///< Offset in bytes from the base of the image's GPU memory where the subresource starts. - gpusize swizzleOffset; ///< Offset in bytes used for supporting parameterized swizzle - gpusize size; ///< Size of the subresource in bytes. - gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines of the subresource. 
- gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. - uint32 tileToken; ///< Token representing various tiling information necessary for determining compatible - /// optimally tiled copies. - uint32 tileSwizzle; ///< Bank/Pipe swizzle bits for macro-tiling modes. - Extent3d blockSize; ///< Size of a tile block in texels - micro tile for 1D tiling and macro tile for 2D tiling. - Offset3d mipTailCoord; ///< coords of the subresource within the mip tail - - Extent3d extentTexels; ///< Unpadded extent of the subresource in texels. - Extent3d extentElements; ///< Unpadded extent of the subresource in elements. - Extent3d paddedExtent; ///< Extent of the subresource in elements, including all internal padding for this subresource. - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 912 - /// Reports supported engines and usages for this subresource while it can remain in its optimal compression state. - /// Clients using CmdRelease()/CmdAcquire() without complete knowledge of the application's next usage during - /// CmdRelease() or its previous usage at CmdAcquire() can treat this layout as a performant target for an - /// intermediate state that will avoid unnecessary decompressions. - /// - /// This value is only valid if supportSplitReleaseAcquire is set in @ref DeviceProperties. - ImageLayout defaultGfxLayout; -#endif - - SwizzledFormat planeFormat; ///< Swizzled format for plane. Planar resource like D32-S8 - /// will have different swizzled format per plane. - SwizzleMode swizzleMode; ///< Swizzle mode for plane, based on AddrSwizzleMode - uint32 hwSwizzleMode; ///< Hardware Swizzle enum, enum type dependent on gfx version -}; - -/// Selects a specific subresource of an image resource. -/// -/// Most images only have a single data plane but in some cases conceptually related data will be stored in physically -/// separate locations which we call planes. If an image only has a single plane it will always be plane 0. 
-/// We define the following fixed mappings for all multi-plane formats. -/// + Depth-stencil: if the image format contains depth and stencil data, plane 0 is depth and plane 1 is stencil. -/// + YUV-planar: if the image format is @ref YuvPlanar it has either two or three planes. The luma plane -/// is always plane 0. If the format is @ref ChNumFormat::YV12 it has three planes where plane 1 is the -/// red-difference chrominance plane and plane 2 is the blue-difference chrominance plane. Otherwise, plane 1 -/// interleaves blue-difference and red-difference chrominance values. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 886 -struct SubresId -{ - uint8 plane; ///< Selects a data plane. - uint8 mipLevel; ///< Selects a mip level. - uint16 arraySlice; ///< Selects an array slice. -}; - -/// Defines a range of subresources. -struct SubresRange -{ - SubresId startSubres; ///< First subresource in the range. - uint8 numPlanes; ///< Number of planes in the range. - uint8 numMips; ///< Number of mip levels in the range. - uint16 numSlices; ///< Number of slices in the range. -}; - -#else -struct SubresId -{ - uint32 plane; ///< Selects a data plane. - uint32 mipLevel; ///< Selects a mip level. - uint32 arraySlice; ///< Selects an array slice. -}; - -/// Defines a range of subresources. -struct SubresRange -{ - SubresId startSubres; ///< First subresource in the range. - uint32 numPlanes; ///< Number of planes in the range. - uint32 numMips; ///< Number of mip levels in the range. - uint32 numSlices; ///< Number of slices in the range. -}; - -#endif - -/// A variant struct of MemoryImageCopyRegion -/// Specifies parameters for a copy from CPU memory to Image. -/// An input for Image::CopyMemoryToImage(). -struct MemoryToImageCopyRegion -{ - SubresId imageSubres; ///< Selects the image subresource. - Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. - Extent3d imageExtent; ///< Size of the image region in elements. 
- uint32 numSlices; ///< Number of slices the copy will span. - const void* pHostPtr; ///< Pointer to the host memory copy from. - gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. -}; - -/// A variant struct of MemoryImageCopyRegion -/// Specifies parameters for a copy from Image to CPU memory. -/// An input for Image::CopyImageToMemory(). -struct ImageToMemoryCopyRegion -{ - SubresId imageSubres; ///< Selects the image subresource. - Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. - Extent3d imageExtent; ///< Size of the image region in elements. - uint32 numSlices; ///< Number of slices the copy will span. - void* pHostPtr; ///< Pointer to the host memory copy to. - gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. - gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. -}; - -inline constexpr bool operator==(const SubresId& lhs, const SubresId& rhs) -{ - return (lhs.plane == rhs.plane) && - (lhs.mipLevel == rhs.mipLevel) && - (lhs.arraySlice == rhs.arraySlice); -} - -inline constexpr bool operator!=(const SubresId& lhs, const SubresId& rhs) -{ - return ((lhs == rhs) == false); -} - -inline constexpr bool operator==(const SubresRange& lhs, const SubresRange& rhs) -{ - return (lhs.startSubres == rhs.startSubres) && - (lhs.numPlanes == rhs.numPlanes) && - (lhs.numMips == rhs.numMips) && - (lhs.numSlices == rhs.numSlices); -} - -/// Determines if two subresource ranges are overlapped. -/// -/// @returns True if two subresource ranges are overlapped, false otherwise. 
-inline constexpr bool OverlappedSubresRanges( - const SubresRange& a, - const SubresRange& b) -{ - const SubresId aStart = a.startSubres; - const SubresId bStart = b.startSubres; - - return (aStart.plane < (bStart.plane + b.numPlanes)) && - (bStart.plane < (aStart.plane + a.numPlanes)) && - (aStart.mipLevel < (bStart.mipLevel + b.numMips)) && - (bStart.mipLevel < (aStart.mipLevel + a.numMips)) && - (aStart.arraySlice < (bStart.arraySlice + b.numSlices)) && - (bStart.arraySlice < (aStart.arraySlice + a.numSlices)); -} - -/** - *********************************************************************************************************************** - * @interface IImage - * @brief Represents an image resource that can be accessed by the GPU. - * - * @see IDevice::CreateImage() - * @see IDevice::OpenPeerImage() - *********************************************************************************************************************** - */ -class IImage : public IGpuMemoryBindable -{ -public: - /// Reports information on the layout of the image in memory such as core data size and metadata alignment. - /// - /// @returns the reference to ImageCreateInfo - virtual const ImageMemoryLayout& GetMemoryLayout() const = 0; - - /// Reports information on the full range of the image's subresources. - /// - /// @returns Reports info on the full range of the image's subresources such as number of mips and planes. - virtual SubresRange GetFullSubresourceRange() const = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 953 - /// Reports information on the full range of the image's subresources. - /// - /// @param [out] pRange Reports info on the full range of the image's subresources such as number of mips and - /// planes. - /// - /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be - /// returned: - /// + ErrorInvalidPointer if pRange is null. 
- Result GetFullSubresourceRange(SubresRange* pRange) const - { - Result result = Result::ErrorInvalidPointer; - if (pRange != nullptr) - { - *pRange = GetFullSubresourceRange(); - result = Result::Success; - } - return result; - } -#endif - - /// Reports information on the layout of the specified subresource in memory. - /// - /// @param [in] subresId Selects a subresource from the image (aspect/mip/slice). - /// @param [out] pLayout Reports info on the subresource layout such as size and pitch. - /// - /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be - /// returned: - /// + ErrorInvalidPointer if pLayout is null. - /// + ErrorInvalidValue is the subresId is out of range for this image. - virtual Result GetSubresourceLayout( - SubresId subresId, - SubresLayout* pLayout) const = 0; - -#if defined(__unix__) - /// Reports information on the memory plane layout of the specified subresource in memory for image with modifier. - /// - /// @param [in] memoryPlane Selects a memory plane from the image. - /// @param [out] pLayout Reports info on the subresource layout such as size and pitch. - /// - /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be - /// returned: - /// + ErrorInvalidValue is the memory plane is out of range for this image. - virtual Result GetModifierSubresourceLayout( - uint32 memoryPlane, - SubresLayout* pLayout) const = 0; -#endif - - /// Reports the create info of image. - /// - /// @returns the reference to ImageCreateInfo - const ImageCreateInfo& GetImageCreateInfo() const { return m_createInfo; } - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. 
- /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -#if defined(_WIN32) - /// This method checks if the image is an opened cross-adapter shared image on MS hybrid graphics system. - /// - /// @returns True if the image is an opened cross-adapter shared image. False otherwise. - virtual bool IsCrossAdapter() const = 0; - - /// Returns a special resource ID. Can be used to associate an opened resource with its original resource. - /// - /// @returns The optimal sharing ID. - virtual uint64 GetOptimalSharingId() const = 0; -#endif - - /// Sets level of optimal sharing by opening APIs using this optimal sharable image and pass this information to the - /// creator. This function is supposed to be called by openers only. The call by creator is ignored. - /// - /// @param [in] level Level to be set to specified client API. - virtual void SetOptimalSharingLevel( - MetadataSharingLevel level) = 0; - - /// Returns support level set by all possible opening APIs. - /// - /// @returns A summarized supporting level. - virtual MetadataSharingLevel GetOptimalSharingLevel() const = 0; - - /// Gives the client access to the resource ID used for internal Pal events. - /// EX: Resource Create, Resource Bind, Resource Destroy. - /// - /// @returns The Resource ID. - virtual const void* GetResourceId() const = 0; - - /// Copies data directly from CPU memory to an Image. - /// - /// @param [in] pRegions Pointer to an array of Image regions to copy from. - /// @param [in] regionCount Number of regions to copy. - /// @param [in] useMemcpy Indicates that it could be copied more efficiently from host memory to image with - /// image data is already swizzled in host memory - /// - /// @returns Success if the copy between the image and memory was successfully performed. 
Otherwise, one of the - /// returned: - /// + ErrorInvalidPointer if the memory object bound to image is null. - /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. - /// + ErrorOutOfMemory if out of system memory. - /// + ErrorInvalidValue if copy between image and memory failed. - virtual Result CopyMemoryToImage( - const MemoryToImageCopyRegion* pRegions, - const uint32 regionCount, - bool useMemcpy) const = 0; - - /// Copies data directly from an Image to CPU memory. - /// - /// @param [in] pRegions Pointer to an array of Image regions to copy to. - /// @param [in] regionCount Number of regions to copy. - /// @param [in] useMemcpy Indicates that it could be copied more efficiently from image to host memory and the - /// image data will be obtained while retaining the physical layout of the image. - /// - /// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the - /// returned: - /// + ErrorInvalidPointer if the memory object bound to image is null. - /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. - /// + ErrorOutOfMemory if out of system memory. - /// + ErrorInvalidValue if copy between image and memory failed. - virtual Result CopyImageToMemory( - const ImageToMemoryCopyRegion* pRegions, - const uint32 regionCount, - bool useMemcpy) const = 0; - - /// Copies data between two images with specified regions. - /// - /// @param [in] pDstImage Pointer to the destination image where the data will be copied. - /// @param [in] pImgRegions Pointer to an array of regions specifying the area of image to be copied. - /// @param [in] regionCount Number of regions to copy between the source and destination images. - /// - /// @returns Success if the copy operation was successfully performed. Otherwise, one of the following error codes - /// may be returned: - /// + ErrorInvalidPointer if any of the input pointers are null. 
- /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. - /// + ErrorOutOfMemory if there is insufficient memory to perform the operation. - /// + ErrorInvalidValue if copy between images failed. - virtual Result CopyBetweenImages( - IImage* pDstImage, - const ImageCopyRegion* pImgRegions, - const uint32 regionCount) const = 0; - - /// Check if the provided layout transition is compatible (no layout transition blt necessary) or not (requires - /// layout transition blt). - /// - /// @param [in] subresRange Image subresource range. - /// @param [in] oldLayout Specifies the current image layout based on bitmasks of allowed operations and - /// engines up to this point. These masks imply the previous compression state. No - /// usage flags should ever be set in oldLayout.usages that correspond to usages - /// that are not supported by the engine that is performing the transition. The engine - /// type performing the transition must be set in oldLayout.engines. - /// @param [in] newLayout Specifies the upcoming image layout based on bitmasks of allowed operations and - /// engines after this point. These masks imply the upcoming compression state. - /// A difference between oldLayoutUsageMask and newLayoutUsageMask may result in layout - /// transition blt (e.g. decompression) and returns compatible = false. - /// - /// @returns True if the layout transition is compatible which indicates no need layout transition blt. - /// False otherwise if layout transition is incompatible and requires layout transition blt. - virtual bool IsLayoutTransitionCompatible( - const SubresRange subresRange, - const ImageLayout oldLayout, - const ImageLayout newLayout) const = 0; - -protected: - /// @internal Constructor. - /// - /// @param [in] createInfo App-specified parameters describing the desired image properties. - IImage(const ImageCreateInfo& createInfo) : m_createInfo(createInfo), m_pClientData(nullptr) { } - - /// @internal Destructor. 
Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IImage() { } - - /// Retained Image create info - const ImageCreateInfo m_createInfo; - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palImage.h + * @brief Defines the Platform Abstraction Library (PAL) IImage interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +// Forward declarations. +class IImage; +class IPrivateScreen; +class IScreen; +class ISwapChain; +enum class CompressionMode : uint32; +enum class ClientCompressionMode : uint32; +enum SwizzleMode : uint32; +struct ImageCopyRegion; + +/// When used as the value of the viewFormatCount parameter of image creation it indicates that all compatible formats +/// can be used for views of the created image. +constexpr uint32 AllCompatibleFormats = UINT32_MAX; + +/// Specifies dimensionality of an image (i.e., 1D, 2D, or 3D). +enum class ImageType : uint32 +{ + Tex1d = 0x0, + Tex2d = 0x1, + Tex3d = 0x2, + Count +}; + +/// Specifies the tiling (address swizzling) to use for an image. When a linear tiled image is mapped its contents will +/// be laid out in row-major ordering. All other tiling modes require the use of swizzles equation to locate texels. +enum class ImageTiling : uint32 +{ + Linear = 0x0, ///< Image is laid out in scan-line (row-major) order. + Optimal = 0x1, ///< Image is laid out in a GPU-optimal order. + Standard64Kb = 0x2, ///< Image is laid out in the cross-IHV, 64KB, standard swizzle tiling. + Count +}; + +/// Hints to pal to identify a preference for how this image is organized. This is a preference setting, and may be +/// ignored if pal believes better options exist. +enum class ImageTilingPattern : uint32 +{ + Default = 0x0, ///< No swizzle mode is preferred. 
+ Standard = 0x1, ///< Prefer standard swizzle modes. + XMajor = 0x2, ///< Prefer x-coordinate major swizzle modes. + YMajor = 0x3, ///< Prefer y-coordinate major swizzle modes. + Interleaved = 0x4, ///< Prefer interleaved coordinate swizzle modes. + Count +}; + +/// Hints to pal to select the appropriate tiling mode for a optimization target. +enum class TilingOptMode : uint32 +{ + Balanced = 0x0, ///< Balance memory foorprint and rendering performance. + OptForSpace = 0x1, ///< Optimize tiling mode for saving memory footprint + OptForSpeed = 0x2, ///< Optimize tiling mode for rendering performance. + Count +}; + +/// Image metadata modes. +enum class MetadataMode : uint16 +{ + Default = 0, ///< Default behavior. PAL chooses if metadata should be present or not. + ForceEnabled, ///< Optimization Hint: The client would prefer Metadata if possible. Useful for scenarios where + /// metadata isn't an obvious win and clients can enable based on some hueristic or app-detect. + Disabled, ///< The Image will not contain any compression metadata. + FmaskOnly, ///< The color msaa Image will only contain Cmask/Fmask metadata; this mode is only valid for color + /// msaa Image. On GPUs with GFX12-style distributed compression (see supportDistributedCompression + /// flag in @ref DeviceProperties), metadataMode only controls UMD metadata (Hi-Z and Hi-S). On such + /// GPUs, the FmaskOnly enum is ignored and treated like Default. + Count, +}; + +/// Image metadata TC compat modes. +enum class MetadataTcCompatMode : uint16 +{ + Default = 0, ///< Default behavior. PAL chooses if TC compat should be enabled (if compressed). + ForceEnabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is TC compat. + Disabled, ///< Optimization Hint: Tells PAL that the client would prefer Metadata is not TC compat. 
+ Count, +}; + +/// Image shared metadata support level +enum class MetadataSharingLevel : uint32 +{ + FullExpand = 0, ///< The metadata need to be fully expanded at ownership transition time. + ReadOnly = 1, ///< The metadata are expected to have read-only usage after the ownership is transitioned. + FullOptimal = 2, ///< The metadata can remain as-is if possible at ownership transition time. +}; + +/// Specifies the type of PRT map image being created. +enum class PrtMapType : uint32 +{ + None = 0, ///< This is not an auxillary image used for PRT plus functionality. + Residency = 1, ///< Image data is really a low-resolution map containing the finest populated LOD + /// for a particular UV space region. + SamplingStatus = 2, ///< Indicates the validity of a given tile on a per-mip level basis. + Count, +}; + +/// Specifies how to interpret a clear color. +enum class ClearColorType : uint32 +{ + Uint = 0, ///< The color is stored as an unsigned integer in RGBA order in u32Color. It will be swizzled and + /// compacted before it is written to memory. + Sint = 1, ///< The color is stored as a signed integer in RGBA order in i32Color. It will be swizzled and + /// compacted before it is written to memory. + Float = 2, ///< The color is stored as floating point in RGBA order. It will be swizzled and converted to the + /// appropriate numeric format before it is written to memory. + Yuv = 3, ///< The color is stored as an unsigned integer in YUVA order in u32Color. It will be swizzled and + /// compacted before it is written to memory. The client must clamp the clear color within the + /// valid range, e.g. [0, 255] for 8-bit. + Count +}; + +/// Contains everything necessary to store and interpret a clear color. +struct ClearColor +{ + ClearColorType type; ///< How to interpret this clear color. + uint8 disabledChannelMask; ///< This 4 bits are used to selectively disable the A,B,G,R channels + /// from being written. 0 means write ABRG. 0xF means write nothing. 
+ /// 0x8 means write Blue, Green, Red. 0x7 means write Alpha. etc... + + union + { + uint32 u32Color[4]; ///< The clear color, interpreted as four unsigned integers. + float f32Color[4]; ///< The clear color, interpreted as four floating point values. + }; +}; + +/// Specifies a set of image creation flags. +union ImageCreateFlags +{ + struct + { + uint32 invariant : 1; ///< Images with this flag set and all other creation identical are + /// guaranteed to have a consistent data layout. + uint32 cloneable : 1; ///< Image is valid as a source or destination of a clone operation. + /// See @ref IDevice::ImagePrefersCloneCopy() for more details. + uint32 shareable : 1; ///< Image can be shared between compatible devices. + uint32 presentable : 1; ///< Indicates this image can be used in presents. + uint32 flippable : 1; ///< Image can be used for flip presents. + uint32 stereo : 1; ///< Indicates AMD quad buffer stereo extension (AQBS extension) image + uint32 dxgiStereo : 1; ///< Indicates DXGI stereo (Win8 stereo) image + uint32 cubemap : 1; ///< Image will be used as a cubemap. + uint32 prt : 1; ///< Image is a partially resident texture (aka, sparse image or tiled + /// resource) + uint32 needSwizzleEqs : 1; ///< Image requires valid swizzle equations. + uint32 perSubresInit : 1; ///< The image may have its subresources initialized independently using + /// barrier calls out of the uninitialized layout. + uint32 separateDepthPlaneInit : 1; ///< If set, the caller may transition the stencil and depth planes from + /// "Uninitialized" state at any time. Otherwise, both planes must be + /// transitioned in the same barrier call. Only meaningful if + /// "perSubresInit" is set. + uint32 repetitiveResolve : 1; ///< Optimization: Is this image resolved multiple times to an image which + /// is mostly similar to this image? + uint32 preferSwizzleEqs : 1; ///< Image prefers valid swizzle equations, but an invalid swizzle + /// equation is also acceptable. 
+ uint32 fixedTileSwizzle : 1; ///< Fix this image's tile swizzle to ImageCreateInfo::tileSwizzle. This + /// is only supported for single-sampled color images. + uint32 videoReferenceOnly : 1; ///< Image is used by video hardware for reference buffer only. + /// It uses a different tiling format than the decoder output buffer. + uint32 optimalShareable : 1; ///< Indicates metadata information is to be added into private data on + /// creation time and honored on open time. + uint32 sampleLocsAlwaysKnown : 1; ///< Sample pattern is always known in client driver for MSAA depth image. + uint32 fullResolveDstOnly : 1; ///< Indicates any ICmdBuffer::CmdResolveImage using this image as a + /// desination will overwrite the entire image (width and height of + /// resolve region is same as width and height of resolve dst). + uint32 fullCopyDstOnly : 1; ///< Indicates any copy to this image will overwrite the entire image. + /// A perf optimization of using post-copy metadata fixup to replace heavy + /// expand at barrier to LayoutCopyDst. Unsafe to enable it if there is + /// potential partial copy to the image. + uint32 pipSwapChain : 1; ///< Indicates this image is PIP swap-chain. It is only supported on + /// Windows platforms. + uint32 view3dAs2dArray : 1; ///< If set client can view 3D image as 2D with its depth as array slices. + /// Note that not all 3D images supports it. The image creation will + /// return error if we fail to create a compatible image. + + uint32 tmzProtected : 1; ///< Indicate this image is protected or not. + uint32 sharedWithMesa : 1; ///< Indicate this Image was opened from a Mesa shared Image + uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes + uint32 hasModifier : 1; ///< Set if the image uses drm format modifier. + uint32 disableDccStateTracking : 1; ///< Disable a PAL optimization which is commonly broken by app bugs. + /// Setting this flag may increase DCC decompress overhead. 
+#if PAL_CLIENT_EXAMPLE + uint32 useFixedSwizzleMode : 1; ///< If set, require the fixed swizzle mode provided. + /// Fails creation on incompatible swizzles. +#else + uint32 reservedSwMode : 1; ///< Reserved for future use. +#endif + uint32 reserved : 4; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies a set of ways an image might be used by the GPU (color target, shader read, etc.). +union ImageUsageFlags +{ + struct + { + uint32 shaderRead : 1; ///< Image will be read from shader (i.e., texture). + uint32 shaderWrite : 1; ///< Image will be written from a shader (i.e., UAV). + uint32 resolveSrc : 1; ///< Image will be used as resolve source image + uint32 resolveDst : 1; ///< Image will be used as resolve dst image + uint32 colorTarget : 1; ///< Image will be bound as a color target. + uint32 depthStencil : 1; ///< Image will be bound as a depth/stencil target. + uint32 noStencilShaderRead : 1; ///< Image will be neither read as stencil nor resolved on stencil plane. + /// Note that if resolveSrc bit has been set to indicate that the image + /// could be adopted as a resolveSrc image and there could be stencil + /// resolve, noStencilShaderRead must be set to 0, since shader-read + /// based stencil resolve might be performed. + uint32 hiZNeverInvalid : 1; ///< Hint to PAL indicating the client will guarantee that no operations + /// performed on this Image while it is in a decompressed state will cause + /// Hi-Z metadata to become invalid. This allows PAL to avoid an expensive + /// resummarization blit in some resource barriers. + uint32 depthAsZ24 : 1; ///< Use a 24-bit format for HW programming of a native 32-bit surface. + /// If set, border color and Z-reference values are treated as Z-24. + uint32 firstShaderWritableMip : 4; ///< Only relevant if the shaderWrite flag is set. Typically set to 0 so + /// entire image is writable. 
If non0, such as an image where only level0 + /// is used as a color target and compute is used to generate mipmaps,PAL + /// may be able to enable additional compression on the baseLevels which + /// are used exclusively as color target and shader read. + uint32 cornerSampling : 1; ///< Set if this image will use corner sampling in image-read scenarios. + /// With corner sampling, the extent refers to the number of pixel corners + /// which will be one more than the number of pixels. Border color is + /// ignored when corner sampling is enabled. + + uint32 vrsDepth : 1; ///< Set if this depth image will be bound when VRS rendering is enabled. + uint32 disableOptimizedDisplay: 1; ///< Do not create Display Dcc + uint32 useLossy : 1; ///< Set if this image may use lossy compression. + uint32 stencilOnlyTarget : 1; ///< This must be set if a stencil-only IDepthStencilView will be created + ///< for this image. + uint32 vrsRateImage : 1; ///< This image is potentially used with CmdBindSampleRateImage + uint32 videoDecoder : 1; ///< Indicating this Image is video decoder target + uint32 reserved : 12; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies properties for @ref IImage creation. Input structure to IDevice::CreateImage(). +/// +/// Note that by default PAL may instruct the hardware to swizzle the contents of an image in memory; if this occurs +/// two images created with identical properties will not map their texels to the same offsets in GPU memory and may +/// even have different sizes. At the expense of performance this behavior can be limited by setting the invariant flag, +/// which guarantees that images with identical properties will have identical GPU memory layouts. +/// +/// For single-sampled color images, there is a middle ground between these two modes. If the fixedTileSwizzle flag is +/// set, PAL will use the tileSwizzle property instead of generating its own swizzle value. 
The tileSwizzle value must +/// be obtained from the base subresource of a single-sampled color image with identical properties (excluding +/// fixedTileSwizzle and tileSwizzle). This allows the client to force certain similar images to share the same GPU +/// memory layouts without forcing all similar images to a single GPU memory layout. +struct ImageCreateInfo +{ + ImageCreateFlags flags; ///< Image creation flags. + ImageUsageFlags usageFlags; ///< Image usage flags. + ImageType imageType; ///< Dimensionality of image (1D/2D/3D). + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + Extent3d extent; ///< Dimensions in pixels WxHxD. + uint32 mipLevels; ///< Number of mipmap levels. Cannot be 0. + uint32 arraySize; ///< Number of slices. Set to 1 for non-array images. + uint32 samples; ///< Number of coverage samples. Set to 1 for single sample images. Must be + /// greater than or equal to the number of fragments. + uint32 fragments; ///< Number of color/depth fragments. Set to 1 for single sample images. + ImageTiling tiling; ///< Controls layout of pixels in the image. + ImageTilingPattern tilingPreference; ///< Controls preferred tile swizzle organization for this image. + TilingOptMode tilingOptMode; ///< Hints to pal to select the appropriate tiling mode. + uint32 tileSwizzle; ///< If fixedTileSwizzle is set, use this value for the image's base swizzle. +#if PAL_CLIENT_EXAMPLE + SwizzleMode fixedSwizzleMode; ///< For directed image tests, force a particular swizzle mode. +#endif + + /// Metadata behavior mode for this image. On GPUs with GFX12-style distributed compression + /// (see supportDistributedCompression flag in @ref DeviceProperties), metadataMode only controls UMD metadata + /// (Hi-Z and Hi-S). On such GPUs, the FmaskOnly enum is ignored and treated like Default. + MetadataMode metadataMode; + MetadataTcCompatMode metadataTcCompatMode; ///< TC compat mode for this image. 
+ + /// Distributed compression contains GL2/DF DCC compression and RB backend client compression which includes + /// fragment client compression (previous FMASK compression alike) on color MSAA images and Z Plane client + /// compression on depth stencil images. Only relevant if the backing memory pages enable compression, controllable + /// by client with @ref GpuMemoryCreateInfo::compression. + CompressionMode compressionMode; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 876 + /// Client compression is part of distributed compression (aka physical compression); it can only be enabled if + /// physical compression is enabled. + /// + /// On Gfx12, controls (legacy FMask based) color fragment compression and Z plane compression. + ClientCompressionMode clientCompressionMode; ///< Controls client compression behavior for this resource. +#else + TriState clientCompressionMode; ///< Controls client compression behavior for this resource. +#endif + + uint32 maxBaseAlign; ///< Maximum address alignment for this image or zero for an unbounded alignment. + float imageMemoryBudget; ///< The memoryBudget value used in SW addrlib to determine the minSizeBlk for textures. + /// It must be >= 0.0. When in [0.0, 1.0) addrlib uses legacy logic to decide minSizeBlk. + /// When == 1.0, addrlib uses minimizeAlign. When > 1.0, addrlib applies memory budget + /// algorithm. Despite 1.5 in tests show significant texture allocation size reduction, + /// default value 0.0 (legacy behavior) is recommended if not specified by client. + + struct + { + PrtMapType mapType; ///< Indicates what sort of PRT metadata is stored in this image. If this image is PRT + /// metadata, then it can only be associated with an image that is a power-of-two + /// multiple bigger (or the same size). Image properties needs to include + /// "PrtFeaturePrtPlus" to create PRT map images. Format must be set to X8_Unorm for + /// residency map and sampling-status map types. 
+ Extent3d lodRegion; ///< Useful only if mapType is not "none". Defines the region size of the parent image + /// that one pixel of this image matches with. The map image can only be paired with a + /// parent image of matching dimensions. This parameter can be left at zero. + } prtPlus; + + /// The following "pitch" members must be zeroed unless the client is creating a @ref ImageTiling::Linear image and + /// wishes to directly specify the image's row and depth pitches. In that case, they must be integer multiples of + /// the alignments given by @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. + uint32 rowPitch; ///< The image must have this row pitch for the first mip level (in bytes). + uint32 depthPitch; ///< The image must have this depth pitch for the first mip level (in bytes). + + Rational refreshRate; ///< The expected refresh rate when presenting this flippable or stereo image. + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. + /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. + +#if defined(__unix__) + uint64 modifier; ///< Drm format modifier. 
Ignored if flags.hasModifier unset. + uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. + gpusize modifierMemoryPlaneOffset[3]; ///< Offset of main surface, display Dcc surface and gfx Dcc surface. +#endif +}; + +inline constexpr bool operator==(const ImageCreateInfo& lhs, const ImageCreateInfo& rhs) +{ + bool same = (lhs.flags.u32All == rhs.flags.u32All) && + (lhs.usageFlags.u32All == rhs.usageFlags.u32All) && + (lhs.imageType == rhs.imageType) && + (lhs.swizzledFormat == rhs.swizzledFormat) && + (lhs.extent == rhs.extent) && + (lhs.mipLevels == rhs.mipLevels) && + (lhs.arraySize == rhs.arraySize) && + (lhs.samples == rhs.samples) && + (lhs.fragments == rhs.fragments) && + (lhs.tiling == rhs.tiling) && + (lhs.tilingPreference == rhs.tilingPreference) && + (lhs.tilingOptMode == rhs.tilingOptMode) && + (lhs.tileSwizzle == rhs.tileSwizzle) && +#if PAL_CLIENT_EXAMPLE + (lhs.fixedSwizzleMode == rhs.fixedSwizzleMode) && +#endif + (lhs.metadataMode == rhs.metadataMode) && + (lhs.metadataTcCompatMode == rhs.metadataTcCompatMode) && + (lhs.compressionMode == rhs.compressionMode) && + (lhs.clientCompressionMode == rhs.clientCompressionMode) && + (lhs.maxBaseAlign == rhs.maxBaseAlign) && + (lhs.imageMemoryBudget == rhs.imageMemoryBudget) && + (lhs.prtPlus.mapType == rhs.prtPlus.mapType) && + (lhs.prtPlus.lodRegion == rhs.prtPlus.lodRegion) && + (lhs.rowPitch == rhs.rowPitch) && + (lhs.depthPitch == rhs.depthPitch) && + (lhs.refreshRate.numerator == rhs.refreshRate.numerator) && + (lhs.refreshRate.denominator == rhs.refreshRate.denominator) && + (lhs.viewFormatCount == rhs.viewFormatCount) && + true; + +#if defined(__unix__) + if (same && (lhs.flags.hasModifier != 0)) + { + same = (lhs.modifier == rhs.modifier) && + (lhs.modifierPlaneCount == rhs.modifierPlaneCount) && + (memcmp(&lhs.modifierMemoryPlaneOffset[0], + &rhs.modifierMemoryPlaneOffset[0], + sizeof(gpusize) * lhs.modifierPlaneCount) == 0); + } +#endif + + if (same && 
(lhs.viewFormatCount > 0) && (lhs.viewFormatCount != AllCompatibleFormats)) + { + same = (memcmp(lhs.pViewFormats, rhs.pViewFormats, lhs.viewFormatCount * sizeof(SwizzledFormat)) == 0); + } + + return same; +} + +/// Specifies properties for presentable @ref IImage creation. Input structure to IDevice::CreatePresentableImage(). +struct PresentableImageCreateInfo +{ + union + { + struct + { + uint32 fullscreen : 1; ///< Image supports fullscreen presentation. + uint32 stereo : 1; ///< Image supports stereoscopic rendering and display. + /// Implies an array size of 2. Fullscreen must be set. + uint32 turbosync : 1; ///< Image supports turbosync flip + uint32 peerWritable : 1; ///< Indicates if the memory allocated will be writable by other devices + uint32 tmzProtected : 1; ///< Indicates this presentable image's memory is TMZ protected. +#if PAL_AMDGPU_BUILD + uint32 initializeToZero : 1; ///< If set, PAL will request that the host OS zero-initializes + /// the allocation upon creation. Currently, only GpuHeapLocal and + /// GpuHeapInvisible are supported. +#else + uint32 placeholder0 : 1; ///< Placeholder. +#endif + uint32 enable256KBSwizzleModes : 1; ///< Enable 256 KiB swizzle modes. + uint32 reserved : 25; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Presentable image creation flags. + + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + ImageUsageFlags usage; ///< Image usage flags. + Extent2d extent; ///< Width/height of the image. + CompressionMode compressionMode; ///< Specify GFX12-style distributed compression behavior for this resource. + /// Only relevant if the backing memory pages enable compression (controllable by + /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). + const IScreen* pScreen; ///< Target screen for fullscreen presentable images. Can be null if the fullscreen + /// flag is 0.
+ OsDisplayHandle hDisplay; ///< Display handle of the local display system only for WSI. + OsWindowHandle hWindow; ///< Window handle only for WSI. + ISwapChain* pSwapChain; ///< SwapChain object which the presentable image belongs to. + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. + /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. +}; + +/// Specifies properties for private screen @ref IImage image creation. Input structure to +/// IDevice::CreatePrivateScreenImage(). +struct PrivateScreenImageCreateInfo +{ + union + { + struct + { + uint32 invariant : 1; ///< Images with this flag set and all other creation identical are guaranteed + /// to have a consistent data layout. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Private screen image creation flags. + + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. + ImageUsageFlags usage; ///< Image usage flags. + Extent2d extent; ///< Width/height of the image. 
+ IPrivateScreen* pScreen; ///< Private screen this image is created on (then this image can be used to be + /// presented on this private screen). + CompressionMode compressionMode;///< Specify GFX12-style distributed compression behavior for this resource. + /// Only relevant if the backing memory pages enable compression (controllable by + /// client with the distributedCompression field in @ref GpuMemoryCreateInfo). + + /// By default an image can only be used with image views that exactly match @ref swizzledFormat (the base format). + /// If the client wishes to create image views with other formats they must fill out the following fields. + /// + /// Valid combinations of these fields include: + /// - pViewFormats = nullptr, viewFormatCount = 0: all image views must use the base format. + /// - pViewFormats = nullptr, viewFormatCount = AllCompatibleFormats: image views can use any compatible formats. + /// - Otherwise pViewFormats is an array of additional image view formats (excluding the base format) and + /// viewFormatCount is the length of that array. + /// + /// @warning pViewFormats is consumed at image creation time and should not be accessed afterwards through + /// @ref GetImageCreateInfo. + uint32 viewFormatCount; ///< Must be 0, AllCompatibleFormats, or the length of pViewFormats. + const SwizzledFormat* pViewFormats; ///< See the block comment above for a full description. +}; + +/// Specifies parameters for opening another device's image for peer access from this device. Input structure to +/// IDevice::OpenPeerImage(). +struct PeerImageOpenInfo +{ + const IImage* pOriginalImage; ///< Other device's image to be opened for peer access. +}; + +/// Specifies parameters for opening another non-PAL device's image for access from this device. Input structure to +/// IDevice::OpenExternalSharedImage(). +struct ExternalImageOpenInfo +{ + ExternalResourceOpenInfo resourceInfo; ///< Information describing the external image. 
+ Extent3d extent; ///< Expected extent for the external image. This reference value would be + /// ignored and use extents from shared metadata if any dimension of the + /// reference extent is zero. + SwizzledFormat swizzledFormat; ///< Pixel format and channel swizzle. Or UndefinedFormat to infer the + /// format internally. + ImageCreateFlags flags; ///< Image Creation flags. + ImageUsageFlags usage; ///< Image usage flags. + IPrivateScreen* pScreen; ///< Private screen this image is created on, or null. + gpusize gpuMemOffset; ///< GpuMemory offset +#if defined(__unix__) + gpusize dccOffset; ///< Offset of gfx Dcc surface if nonzero. + gpusize displayDccOffset; ///< Offset of display Dcc surface if nonzero. + uint64 modifier; ///< Drm format modifier, if flags.hasModifier is set. + uint32 modifierPlaneCount; ///< Number of memory planes of drm format modifier. +#endif + /// The following members must be set to zero unless the client is opening a @ref ImageTiling::Linear image with + /// specified row and depth pitches. In that case, they must be integer multiples of the alignments given by + /// @ref IDevice::GetLinearImageAlignments, called with an appropriate maxElementSize. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines + /// of the subresource. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive + /// slices. +}; + +/// Reports the overall GPU memory layout of the entire image. Output structure for IImage::GetMemoryLayout(). Unused +/// sections will have a size of zero, an offset of zero, and an alignment of one. The layout is split into: +/// + Image Data: The raw texel values for all subresources of the image. +/// + Image Metadata: Additional data that will be used to optimize GPU operations that access the image. +/// + Image Metadata Header: A special subsection of the metadata for small bits of data with weaker alignment. 
+struct ImageMemoryLayout +{ + gpusize dataSize; ///< The size, in bytes, of the image's core data section. + gpusize dataAlignment; ///< The alignment, in bytes, of the image's core data section. + + gpusize metadataOffset; ///< The offset, in bytes, of the image's metadata section. + gpusize metadataSize; ///< The size, in bytes, of the image's metadata section. + gpusize metadataAlignment; ///< The alignment, in bytes, of the image's metadata section. + + gpusize metadataHeaderOffset; ///< The offset, in bytes, of the image's metadata header. + gpusize metadataHeaderSize; ///< The size, in bytes, of the image's metadata header. + gpusize metadataHeaderAlignment; ///< The alignment, in bytes, of the image's metadata header. + + uint8 swizzleEqIndices[2]; ///< Which swizzle equations this image uses or InvalidSwizzleEqIndex if + /// there are no swizzle equations for this image's layout. + uint8 swizzleEqTransitionMip; ///< Before this mip level, the image uses swizzleEqIndices[0]; from this + /// mip level onwards, the image uses swizzleEqIndices[1]. + uint8 swizzleEqTransitionPlane; ///< Before this mip plane, the image uses swizzleEqIndices[0]; from this + /// plane onward, the image uses swizzleEqIndices[1]. + + uint32 prtTileWidth; ///< Width, in texels, of a PRT tile + uint32 prtTileHeight; ///< Height, in texels, of a PRT tile + uint32 prtTileDepth; ///< Depth, in texels, of a PRT tile + uint32 prtMinPackedLod; ///< First mip level that is packed into the PRT mip tail. + uint32 prtMipTailTileCount; ///< Number of tiles in the packed mip tail. This may either indicate the + /// size per slice or per image depending on the support for + /// PrtFeaturePerLayerMipTail (@see PrtFeatureFlags) + uint32 stereoLineOffset; ///< Y offset to the right eye data, in texels +}; + +/// Collection of bitmasks specifying which operations are currently allowed on an image, and which queues are allowed +/// to perform those operations. 
Based on this information, PAL can determine the best compression state of the image. +struct ImageLayout +{ + uint32 usages : 24; ///< Bitmask of @ref ImageLayoutUsageFlags values. + uint32 engines : 8; ///< Bitmask of @ref ImageLayoutEngineFlags values. +}; + +/** +**************************************************************************************************** +* @brief +* Enumerates swizzle modes useable on any supported GPU. +* @note +* For details please check _AddrSwizzleMode +* +**************************************************************************************************** +*/ +enum SwizzleMode : uint32 +{ + SwizzleModeLinear = 0, + SwizzleMode256BS, + SwizzleMode256BD, + SwizzleMode256BR, + SwizzleMode4KbZ, + SwizzleMode4KbS, + SwizzleMode4KbD, + SwizzleMode4KbR, + SwizzleMode64KbZ, + SwizzleMode64KbS, + SwizzleMode64KbD, + SwizzleMode64KbR, + SwizzleMode64KbZT, + SwizzleMode64KbST, + SwizzleMode64KbDT, + SwizzleMode64KbRT, + SwizzleMode4KbZX, + SwizzleMode4KbSX, + SwizzleMode4KbDX, + SwizzleMode4KbRX, + SwizzleMode64KbZX, + SwizzleMode64KbSX, + SwizzleMode64KbDX, + SwizzleMode64KbRX, + SwizzleMode256KbVarZX, + SwizzleMode256KbVarSX, + SwizzleMode256KbVarDX, + SwizzleMode256KbVarRX, + + /// NOTE: the meaning of swizzle mode varies by generation; + /// do not compare directly with the _Z / _S / _D / _R types. + SwizzleMode256B2D, + SwizzleMode4Kb2D, + SwizzleMode4Kb3D, + SwizzleMode64Kb2D, + SwizzleMode64Kb3D, + SwizzleMode256Kb2D, + SwizzleMode256Kb3D, + SwizzleMode64Kb2Dz, + SwizzleMode256Kb2Dz, + SwizzleModeCount, +}; + +/// Reports position and memory layout information for a specific subresource in an image. Output structure for +/// IImage::GetSubresourceLayout(). +struct SubresLayout +{ + uint32 elementBytes; ///< Size of each element in bytes. + gpusize offset; ///< Offset in bytes from the base of the image's GPU memory where the subresource starts.
+ gpusize swizzleOffset; ///< Offset in bytes used for supporting parameterized swizzle + gpusize size; ///< Size of the subresource in bytes. + gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines of the subresource. + gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. + uint32 tileToken; ///< Token representing various tiling information necessary for determining compatible + /// optimally tiled copies. + uint32 tileSwizzle; ///< Bank/Pipe swizzle bits for macro-tiling modes. + Extent3d blockSize; ///< Size of a tile block in texels - micro tile for 1D tiling and macro tile for 2D tiling. + Offset3d mipTailCoord; ///< coords of the subresource within the mip tail + + Extent3d extentTexels; ///< Unpadded extent of the subresource in texels. + Extent3d extentElements; ///< Unpadded extent of the subresource in elements. + Extent3d paddedExtent; ///< Extent of the subresource in elements, including all internal padding for this subresource. + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 912 + /// Reports supported engines and usages for this subresource while it can remain in its optimal compression state. + /// Clients using CmdRelease()/CmdAcquire() without complete knowledge of the application's next usage during + /// CmdRelease() or its previous usage at CmdAcquire() can treat this layout as a performant target for an + /// intermediate state that will avoid unnecessary decompressions. + /// + /// This value is only valid if supportSplitReleaseAcquire is set in @ref DeviceProperties. + ImageLayout defaultGfxLayout; +#endif + + SwizzledFormat planeFormat; ///< Swizzled format for plane. Planar resource like D32-S8 + /// will have different swizzled format per plane. 
+ SwizzleMode swizzleMode; ///< Swizzle mode for plane, based on AddrSwizzleMode + uint32 hwSwizzleMode; ///< Hardware Swizzle enum, enum type dependent on gfx version +}; + +/// Selects a specific subresource of an image resource. +/// +/// Most images only have a single data plane but in some cases conceptually related data will be stored in physically +/// separate locations which we call planes. If an image only has a single plane it will always be plane 0. +/// We define the following fixed mappings for all multi-plane formats. +/// + Depth-stencil: if the image format contains depth and stencil data, plane 0 is depth and plane 1 is stencil. +/// + YUV-planar: if the image format is @ref YuvPlanar it has either two or three planes. The luma plane +/// is always plane 0. If the format is @ref ChNumFormat::YV12 it has three planes where plane 1 is the +/// red-difference chrominance plane and plane 2 is the blue-difference chrominance plane. Otherwise, plane 1 +/// interleaves blue-difference and red-difference chrominance values. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 886 +struct SubresId +{ + uint8 plane; ///< Selects a data plane. + uint8 mipLevel; ///< Selects a mip level. + uint16 arraySlice; ///< Selects an array slice. +}; + +/// Defines a range of subresources. +struct SubresRange +{ + SubresId startSubres; ///< First subresource in the range. + uint8 numPlanes; ///< Number of planes in the range. + uint8 numMips; ///< Number of mip levels in the range. + uint16 numSlices; ///< Number of slices in the range. +}; + +#else +struct SubresId +{ + uint32 plane; ///< Selects a data plane. + uint32 mipLevel; ///< Selects a mip level. + uint32 arraySlice; ///< Selects an array slice. +}; + +/// Defines a range of subresources. +struct SubresRange +{ + SubresId startSubres; ///< First subresource in the range. + uint32 numPlanes; ///< Number of planes in the range. + uint32 numMips; ///< Number of mip levels in the range. 
+ uint32 numSlices; ///< Number of slices in the range. +}; + +#endif + +/// A variant struct of MemoryImageCopyRegion +/// Specifies parameters for a copy from CPU memory to Image. +/// An input for Image::CopyMemoryToImage(). +struct MemoryToImageCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in elements. + uint32 numSlices; ///< Number of slices the copy will span. + const void* pHostPtr; ///< Pointer to the host memory copy from. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. +}; + +/// A variant struct of MemoryImageCopyRegion +/// Specifies parameters for a copy from Image to CPU memory. +/// An input for Image::CopyImageToMemory(). +struct ImageToMemoryCopyRegion +{ + SubresId imageSubres; ///< Selects the image subresource. + Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. + Extent3d imageExtent; ///< Size of the image region in elements. + uint32 numSlices; ///< Number of slices the copy will span. + void* pHostPtr; ///< Pointer to the host memory copy to. + gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. + gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. 
+}; + +inline constexpr bool operator==(const SubresId& lhs, const SubresId& rhs) +{ + return (lhs.plane == rhs.plane) && + (lhs.mipLevel == rhs.mipLevel) && + (lhs.arraySlice == rhs.arraySlice); +} + +inline constexpr bool operator!=(const SubresId& lhs, const SubresId& rhs) +{ + return ((lhs == rhs) == false); +} + +inline constexpr bool operator==(const SubresRange& lhs, const SubresRange& rhs) +{ + return (lhs.startSubres == rhs.startSubres) && + (lhs.numPlanes == rhs.numPlanes) && + (lhs.numMips == rhs.numMips) && + (lhs.numSlices == rhs.numSlices); +} + +/// Determines if two subresource ranges are overlapped. +/// +/// @returns True if two subresource ranges are overlapped, false otherwise. +inline constexpr bool OverlappedSubresRanges( + const SubresRange& a, + const SubresRange& b) +{ + const SubresId aStart = a.startSubres; + const SubresId bStart = b.startSubres; + + return (aStart.plane < (bStart.plane + b.numPlanes)) && + (bStart.plane < (aStart.plane + a.numPlanes)) && + (aStart.mipLevel < (bStart.mipLevel + b.numMips)) && + (bStart.mipLevel < (aStart.mipLevel + a.numMips)) && + (aStart.arraySlice < (bStart.arraySlice + b.numSlices)) && + (bStart.arraySlice < (aStart.arraySlice + a.numSlices)); +} + +/** + *********************************************************************************************************************** + * @interface IImage + * @brief Represents an image resource that can be accessed by the GPU. + * + * @see IDevice::CreateImage() + * @see IDevice::OpenPeerImage() + *********************************************************************************************************************** + */ +class IImage : public IGpuMemoryBindable +{ +public: + /// Reports information on the layout of the image in memory such as core data size and metadata alignment. 
+ /// + /// @returns the reference to ImageMemoryLayout + virtual const ImageMemoryLayout& GetMemoryLayout() const = 0; + + /// Reports information on the full range of the image's subresources. + /// + /// @param [out] pRange Reports info on the full range of the image's subresources such as number of mips and + /// planes. + /// + /// @returns Success if the range was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidPointer if pRange is null. + virtual Result GetFullSubresourceRange(SubresRange* pRange) const = 0; + + /// Reports information on the layout of the specified subresource in memory. + /// + /// @param [in] subresId Selects a subresource from the image (plane/mip/slice). + /// @param [out] pLayout Reports info on the subresource layout such as size and pitch. + /// + /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidPointer if pLayout is null. + /// + ErrorInvalidValue if the subresId is out of range for this image. + virtual Result GetSubresourceLayout( + SubresId subresId, + SubresLayout* pLayout) const = 0; + +#if defined(__unix__) + /// Reports information on the memory plane layout of the specified subresource in memory for image with modifier. + /// + /// @param [in] memoryPlane Selects a memory plane from the image. + /// @param [out] pLayout Reports info on the subresource layout such as size and pitch. + /// + /// @returns Success if the layout was successfully reported. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidValue if the memory plane is out of range for this image. + virtual Result GetModifierSubresourceLayout( + uint32 memoryPlane, + SubresLayout* pLayout) const = 0; +#endif + + /// Reports the create info of image.
+ /// + /// @returns the reference to ImageCreateInfo + const ImageCreateInfo& GetImageCreateInfo() const { return m_createInfo; } + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +#if defined(_WIN32) + /// This method checks if the image is an opened cross-adapter shared image on MS hybrid graphics system. + /// + /// @returns True if the image is an opened cross-adapter shared image. False otherwise. + virtual bool IsCrossAdapter() const = 0; + + /// Returns a special resource ID. Can be used to associate an opened resource with its original resource. + /// + /// @returns The optimal sharing ID. + virtual uint64 GetOptimalSharingId() const = 0; +#endif + + /// Sets level of optimal sharing by opening APIs using this optimal sharable image and pass this information to the + /// creator. This function is supposed to be called by openers only. The call by creator is ignored. + /// + /// @param [in] level Level to be set to specified client API. + virtual void SetOptimalSharingLevel( + MetadataSharingLevel level) = 0; + + /// Returns support level set by all possible opening APIs. + /// + /// @returns A summarized supporting level. + virtual MetadataSharingLevel GetOptimalSharingLevel() const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. 
+ virtual const void* GetResourceId() const = 0; + + /// Copies data directly from CPU memory to an Image. + /// + /// @param [in] pRegions Pointer to an array of Image regions to copy from. + /// @param [in] regionCount Number of regions to copy. + /// @param [in] useMemcpy Indicates that it could be copied more efficiently from host memory to image with + /// image data that is already swizzled in host memory. + /// + /// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the + /// following error codes may be returned: + /// + ErrorInvalidPointer if the memory object bound to image is null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorInvalidValue if copy between image and memory failed. + virtual Result CopyMemoryToImage( + const MemoryToImageCopyRegion* pRegions, + const uint32 regionCount, + bool useMemcpy) const = 0; + + /// Copies data directly from an Image to CPU memory. + /// + /// @param [in] pRegions Pointer to an array of Image regions to copy to. + /// @param [in] regionCount Number of regions to copy. + /// @param [in] useMemcpy Indicates that it could be copied more efficiently from image to host memory and the + /// image data will be obtained while retaining the physical layout of the image. + /// + /// @returns Success if the copy between the image and memory was successfully performed. Otherwise, one of the + /// following error codes may be returned: + /// + ErrorInvalidPointer if the memory object bound to image is null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if out of system memory. + /// + ErrorInvalidValue if copy between image and memory failed. + virtual Result CopyImageToMemory( + const ImageToMemoryCopyRegion* pRegions, + const uint32 regionCount, + bool useMemcpy) const = 0; + + /// Copies data between two images with specified regions.
+ /// + /// @param [in] pDstImage Pointer to the destination image where the data will be copied. + /// @param [in] pImgRegions Pointer to an array of regions specifying the area of image to be copied. + /// @param [in] regionCount Number of regions to copy between the source and destination images. + /// + /// @returns Success if the copy operation was successfully performed. Otherwise, one of the following error codes + /// may be returned: + /// + ErrorInvalidPointer if any of the input pointers are null. + /// + ErrorGpuMemoryMapFailed if the memory object is busy and cannot be mapped by the OS. + /// + ErrorOutOfMemory if there is insufficient memory to perform the operation. + /// + ErrorInvalidValue if copy between images failed. + virtual Result CopyBetweenImages( + IImage* pDstImage, + const ImageCopyRegion* pImgRegions, + const uint32 regionCount) const = 0; + +protected: + /// @internal Constructor. + /// + /// @param [in] createInfo App-specified parameters describing the desired image properties. + IImage(const ImageCreateInfo& createInfo) : m_createInfo(createInfo), m_pClientData(nullptr) { } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IImage() { } + + /// Retained Image create info + const ImageCreateInfo m_createInfo; + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. 
+ void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palLib.h b/shared/amdgpu-windows-interop/pal/inc/core/palLib.h index ec35c5092d..8f42570262 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palLib.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palLib.h @@ -1,510 +1,508 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palLib.h - * @brief Defines the Platform Abstraction Library (PAL) initialization and destruction functions. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palSysMemory.h" -#include "palDbgPrint.h" - -/// Major interface version. Note that the interface version is distinct from the PAL version itself, which is returned -/// in @ref Pal::PlatformProperties. -/// -/// @attention Updates to the major version indicate an interface change that is not backward compatible and may require -/// action from each client during their next integration. When determining if a change is backward -/// compatible, it is assumed that the client will default-initialize all structs. -/// -/// @ingroup LibInit -#define PAL_INTERFACE_MAJOR_VERSION 960 - -/// Minimum major interface version. This is the minimum interface version PAL supports in order to support backward -/// compatibility. When it is equal to PAL_INTERFACE_MAJOR_VERSION, only the latest interface version is supported. -/// -/// @ingroup LibInit -#define PAL_MINIMUM_INTERFACE_MAJOR_VERSION 872 - -/// Minimum supported major interface version for devdriver library. This is the minimum interface version of the -/// devdriver library that PAL is backwards compatible to. -/// -/// @ingroup LibInit -#define PAL_MINIMUM_GPUOPEN_INTERFACE_MAJOR_VERSION 38 - -/** - *********************************************************************************************************************** - * @def PAL_INTERFACE_VERSION - * @ingroup LibInit - * @brief Current PAL interface version packed into a 32-bit unsigned integer. The low 16 bits are always zero. 
- * They used to contain the interface minor version and remain as a placeholder in case we add it back. - * - * @see PAL_INTERFACE_MAJOR_VERSION - * - * @hideinitializer - *********************************************************************************************************************** - */ -#define PAL_INTERFACE_VERSION (PAL_INTERFACE_MAJOR_VERSION << 16) - -namespace Pal -{ - -// Forward declarations -class IPlatform; - -/// This is a list of GPUs that the NULL OS layer can compile shaders to in offline mode. -enum class NullGpuId : uint32 -{ - Default = 0, ///< PAL gives the client an arbitrary supported null device. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - Polaris10, ///< 8.0.3 - Polaris11, ///< 8.0.3 - Polaris12, ///< 8.0.3 - Vega10, ///< 9.0.0 - Raven, ///< 9.0.2 - Vega12, ///< 9.0.4 - Vega20, ///< 9.0.6 - Raven2, ///< 9.0.9 - Renoir, ///< 9.0.9 -#endif - Navi10, ///< 10.1.0 - Navi12, ///< 10.1.1 - Navi14, ///< 10.1.2 - Navi21, ///< 10.3.0 - Navi22, ///< 10.3.1 - Navi23, ///< 10.3.2 - Navi24, ///< 10.3.4 - Rembrandt, ///< 10.3.5 - Raphael, ///< 10.3.6 - Navi31, ///< 11.0.0 - Navi32, ///< 11.0.1 - Navi33, ///< 11.0.2 - Phoenix1, ///< 11.0.3 - Phoenix2, ///< 11.0.3 - Strix1, ///< 11.5.0 - StrixHalo, ///< 11.5.1 - Krackan1, ///< 11.5.2 - Navi44, ///< 12.0.0 - Navi48, ///< 12.0.1 -#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION>= 888) -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 958 -#endif -#endif - Max, ///< The maximum count of null devices. - All, ///< If you want to enumerate all null devices. -}; - -/// Specifies which graphics IP level (GFXIP) this device has. -enum class GfxIpLevel : uint32 -{ - _None = 0, ///< @internal The device does not have an GFXIP block, or its level cannot be determined - - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. 
-#ifndef None - None = _None, ///< The device does not have an GFXIP block, or its level cannot be determined -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - GfxIp10_1, ///< GFXIP 10.1 (Navi1x) - GfxIp10_3, ///< GFXIP 10.3 (Navi2x, Rembrandt, Raphael, Mendocino) - GfxIp11_0, ///< GFXIP 11.0 (Navi3x, Phoenix) - GfxIp11_5, ///< GFXIP 11.5 (Strix) - GfxIp12, ///< GFXIP 12.0 (Navi4x) -#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 - GfxIp6 = 0x1, - GfxIp7 = 0x2, - GfxIp8 = 0x3, - GfxIp8_1 = 0x4, - GfxIp9 = 0x5, - GfxIp10_1 = 0x7, - GfxIp10_3 = 0x9, - GfxIp11_0 = 0xC, - GfxIp11_5 = 0xF, - GfxIp12 = 0x11, -#endif -}; - -/// Specifies the hardware revision. Some AMD tools hard-code these values so we cannot change them. New ASICs should -/// be added at the end of the list and be given the next highest value. -enum class AsicRevision : uint32 -{ - Unknown = 0x00, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 - Tahiti = 0x01, - Pitcairn = 0x02, - Capeverde = 0x03, - Oland = 0x04, - Hainan = 0x05, - Bonaire = 0x06, - Hawaii = 0x07, - HawaiiPro = 0x08, - Kalindi = 0x0A, - Godavari = 0x0B, - Spectre = 0x0C, - Spooky = 0x0D, - Carrizo = 0x0E, - Bristol = 0x0F, - Stoney = 0x10, - Iceland = 0x11, - Tonga = 0x12, - TongaPro = Tonga, - Fiji = 0x13, - Polaris10 = 0x14, - Polaris11 = 0x15, - Polaris12 = 0x16, - Vega10 = 0x18, - Vega12 = 0x19, - Vega20 = 0x1A, - Raven = 0x1B, - Raven2 = 0x1C, - Renoir = 0x1D, -#endif - Navi10 = 0x1F, ///< 10.1.0 - Navi12 = 0x21, ///< 10.1.1 - Navi14 = 0x23, ///< 10.1.2 - Navi21 = 0x24, ///< 10.3.0 - Navi22 = 0x25, ///< 10.3.1 - Navi23 = 0x26, ///< 10.3.2 - Navi24 = 0x27, ///< 10.3.4 - Navi31 = 0x2C, ///< 11.0.0 - Navi32 = 0x2D, ///< 11.0.1 - Navi33 = 0x2E, ///< 11.0.2 - Rembrandt = 0x2F, ///< 10.3.5 - Strix1 = 0x33, ///< 11.5.0 - Raphael = 0x34, ///< 10.3.6 - Phoenix1 = 0x35, ///< 11.0.3 - Phoenix2 = 0x38, ///< 11.0.3 - HawkPoint1 = 0x39, ///< 11.0.3 - HawkPoint2 = 0x3A, ///< 11.0.3 - Krackan1 = 0x3B, ///< 11.5.2 - StrixHalo = 0x3C, 
///< 11.5.1 - Navi44 = 0x3D, ///< 12.0.0 - Navi48 = 0x3E, ///< 12.0.1 -}; - -/// Maps a null GPU ID to its associated text name. -struct NullGpuInfo -{ - NullGpuId nullGpuId; ///< ID of an ASIC that PAL supports for override purposes - const char* pGpuName; ///< Text name of the ASIC specified by nullGpuId -}; - -/// Various IDs and info associated with a particular GPU. -struct GpuInfo -{ - AsicRevision asicRev; ///< PAL specific ASIC revision identifier. - NullGpuId nullId; ///< PAL specific GPU ID supported by the NULL OS layer. - GfxIpLevel gfxIpLevel; ///< PAL specific identifier for the device's graphics IP level (GFXIP). - uint32 familyId; ///< Hardware family ID. Driver-defined identifier for a particular family of devices. - uint32 eRevId; ///< GPU emulation/internal revision ID. - uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different SKUs or revisions. - uint32 gfxEngineId; ///< Coarse-grain GFX engine ID (R800, SI, etc.). - uint32 deviceId; ///< PCI device ID (e.g., Hawaii XT = 0x67B0). - const char* pGpuName; ///< ASIC name and AMDGPU target name (e.g., "NAVI31:gfx1100"). -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 915 -/// PAL client APIs. -enum class ClientApi : uint32 -{ - Pal = 0, - Dx9 = 1, - Dx12 = 3, - Vulkan = 4, - OpenCl = 7, - Hip = 8, - Amf = 9, -}; -#else -/// The client UMD must identify its API using this enum. Some UMD builds may implement multiple APIs so they must -/// specify which API they're implementing at runtime. Note that the PAL_CLIENT macros are the preferred way to -/// implement client-specific behavior; runtime ClientApi checks should only be used when necessary. -enum class ClientApi : uint32 -{ - OpenCl, - Hip -}; -#endif - -/// Specifies properties for @ref IPlatform creation. Input structure to Pal::CreatePlatform(). -struct PlatformCreateInfo -{ - const Util::AllocCallbacks* pAllocCb; ///< Optional client-provided callbacks. 
If non-null, PAL will call the - /// specified callbacks to allocate and free all internal system - /// memory. If null, PAL will manage memory on its own through the C - /// runtime library. - const Util::LogCallbackInfo* pLogInfo; ///< Optional client-provided callback info. If non-null, Pal will - /// call the callback to pass debug prints to the client. - - const char* pSettingsPath; ///< A null-terminated string describing the path to where settings are - /// located on the system. For example, on Windows, this will refer to - /// which UMD subkey to look in under a device's key. For Linux, this - /// is the path to the settings file. - - union - { - struct - { - uint32 disableGpuTimeout : 1; ///< Disables GPU timeout detection (Windows only) - uint32 force32BitVaSpace : 1; ///< Forces 32bit VA space for the flat address with 32bit ISA - uint32 createNullDevice : 1; ///< Set to create a null device, so "nullGpuId" below for the - /// ID of the GPU the created device will be based on. Null - /// devices operate in IFH mode; useful for off-line shader - /// compilations. - uint32 enableSvmMode : 1; ///< Enable SVM mode. When this bit is set, PAL will reserve - /// cpu va range with size "maxSvmSize", and allow client to - /// to create gpu or pinned memory for use of Svm. - /// For detail of SVM, please refer to CreateSvmGpuMemory - uint32 requestShadowDescriptorVaRange : 1; ///< Requests that PAL provides support for the client to use - /// the @ref VaRange::ShadowDescriptorTable virtual-address - /// range. Some GPU's may not be capable of supporting this, - /// even when requested by the client. - uint32 disableInternalResidencyOpts : 1; ///< Disables residency optimizations for internal GPU memory - /// allocations. Some clients may wish to have them turned - /// off to save on system resources. - uint32 supportRgpTraces : 1; ///< Indicates that the client supports RGP tracing. 
PAL will - /// use this flag and the hardware support flag to setup the - /// DevDriver RgpServer. - uint32 dontOpenPrimaryNode : 1; ///< No primary node is needed (Linux only) - uint32 disableDevDriver : 1; ///< If no DevDriverMgr should be created with this Platform. - uint32 reserved : 23; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Platform-wide creation flags. - - ClientApi clientApiId; ///< Client API ID. - NullGpuId nullGpuId; ///< ID for the null device. Ignored unless the above flags.createNullDevice bit is set. - uint16 apiMajorVer; ///< Major API version number to be used by RGP. Should be set by client based on their - /// contract with RGP. - uint16 apiMinorVer; ///< Minor API version number to be used by RGP. Should be set by client based on their - /// contract with RGP. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 916 - uint32 instrApiVer; /// Instrumentation specification version for API-specific SQTT instrumentation fields. - /// Should be set by client based on the SQTT instrumentation spec version being targeted. -#endif - gpusize maxSvmSize; ///< Maximum amount of virtual address space that will be reserved for SVM -}; - -/** -************************************************************************************************************************ -* @brief Determines the amount of system memory required for a Platform object. -* -* This function must be called before any other interaction with PAL. An allocation of this amount of memory must be -* provided in the pPlacementAddr parameter of Pal::CreatePlatform. -* -* @ingroup LibInit -* -* @returns Size, in bytes, of system memory required for an IPlatform object. 
-************************************************************************************************************************ -*/ -size_t PAL_STDCALL GetPlatformSize(); - -/** - *********************************************************************************************************************** - * @brief Creates the Platform Abstraction Library. - * - * On execution of CreatePlatform(), PAL will establish a connection for OS and KMD communication, install the specified - * system memory allocation callbacks, and initialize any global internal services. Finally, the client will be - * returned an object pointer to the instantiated platform object, which is used to query the capabilities of the - * system. - * - * @ingroup LibInit - * - * @param [in] createInfo Parameters indicating the client requirements for the platform such as allocation - callbacks or the settings path. - * @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. There must be as - * much size available here as reported by calling GetPlatformSize(). - * @param [out] ppPlatform Platform object pointer to the instantiated platform. Must not be null. - * - * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be - * returned: - * + ErrorInvalidPointer will be returned if: - * - pPlatform is null. - * - pPlacementAddr is null. - * - createInfo.pAllocCb is non-null but pfnAlloc and/or pfnFree is null. - * - createInfo.pSettingsPath is null. - * + ErrorInitializationFailed will be returned if PAL is unable to open a connection to the OS. 
- *********************************************************************************************************************** - */ -Result PAL_STDCALL CreatePlatform( - const PlatformCreateInfo& createInfo, - void* pPlacementAddr, - IPlatform** ppPlatform); - -/** - *********************************************************************************************************************** - * @brief Provides an association of NULL devices and their associated text name. NULL devices operate in IFH mode - * and are primarily intended for off-line shader compilation mode. The text name is provided for end-user - * identification of the GPU device being created. - * - * @param [in,out] pNullDeviceCount On input, this is the size of the "pNullDevices" array. On output, this - * reflects the number of valid entries in the "pNullDevices" array. - * @param [out] pNullDevices Includes information on the valid NULL devices supported by the system. If - * this is NULL, then pNullDeviceCount reflects the maximum possible size of the - * null-devices array. - * - * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be - * returned: - * + ErrorInvalidPointer will be returned if either input is NULL. - *********************************************************************************************************************** - */ -Result PAL_STDCALL EnumerateNullDevices( - uint32* pNullDeviceCount, - NullGpuInfo* pNullDevices); - -/** - *********************************************************************************************************************** - * @brief Provides the NULL device GpuInfo data for the specified NullGpuId. - * - * @param [in] nullGpuId Null GPU ID to lookup. - * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. - * - * @returns Success if the lookup completed successfully. 
Otherwise, one of the following error codes may be returned: - * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. - * + NotFound will be returned if the Null GPU ID was not found. - *********************************************************************************************************************** - */ -Result PAL_STDCALL GetNullGpuInfoForNullGpuId( - NullGpuId nullGpuId, - GpuInfo* pGpuInfo); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 -inline Result PAL_STDCALL GetGpuInfoForNullGpuId( - NullGpuId nullGpuId, - GpuInfo* pGpuInfo) -{ - return GetNullGpuInfoForNullGpuId(nullGpuId, pGpuInfo); -} -#endif - -/** - *********************************************************************************************************************** - * @brief Provides the NULL device GpuInfo data for the specified GPU name string. - * - * @param [in] pGpuName Name string of the GPU to lookup (e.g., "NAVI10"). - * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. - * - * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: - * + ErrorInvalidPointer will be returned if pGpuName or pGpuInfo are NULL. - * + NotFound will be returned if the Name string was not found. - *********************************************************************************************************************** - */ -Result PAL_STDCALL GetNullGpuInfoForName( - const char* pGpuName, - GpuInfo* pGpuInfo); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 -inline Result PAL_STDCALL GetGpuInfoForName( - const char* pGpuName, - GpuInfo* pGpuInfo) -{ - return GetNullGpuInfoForName(pGpuName, pGpuInfo); -} -#endif - -/** - *********************************************************************************************************************** - * @brief Provides the NULL device GpuInfo data for the specified hardware revision. - * - * @param [in] asicRevision Hardware revision to lookup. 
- * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. - * - * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: - * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. - * + NotFound will be returned if the hardware revision was not found. - *********************************************************************************************************************** - */ -Result PAL_STDCALL GetNullGpuInfoForAsicRevision( - AsicRevision asicRevision, - GpuInfo* pGpuInfo); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 -inline Result PAL_STDCALL GetGpuInfoForAsicRevision( - AsicRevision asicRevision, - GpuInfo* pGpuInfo) -{ - return GetNullGpuInfoForAsicRevision(asicRevision, pGpuInfo); -} -#endif - -/** - *********************************************************************************************************************** - * @defgroup LibInit Library Initialization and Destruction - * - * Before initializing PAL, it is important to make sure that the interface version is consistent with the client's - * expectations. The client should check @ref PAL_INTERFACE_MAJOR_VERSION to ensure the major interface version has not - * changed since the last PAL integration. Ideally, this should be performed with a compile-time assert comparing - * @ref PAL_INTERFACE_MAJOR_VERSION against a client-maintained expected major version. Minor interface version - * changes should be backward compatible, and do not require a client change to maintain previous levels of - * functionality. - * - * On startup, the client's first call to PAL must be GetPlatformSize() followed by CreatePlatform(). This function - * gives an opportunity for PAL to perform any necessary platform-wide initialization such as opening a connection for - * communication with the operating system and kernel mode driver or initializing tracking facilities for system memory - * management. 
CreatePlatform() returns a created IPlatform object for future interaction with PAL. - * - * PAL optionally allows the client to specify a set of memory management callbacks during initialization. If - * specified, PAL will not allocate or free any memory directly from the runtime, instead calling back to the client. - * The client (or application, if the client forwards on the requests) may be able to implement a more efficient - * allocation scheme. - * - * After a successful call to CreatePlatform(), the client should call @ref IPlatform::EnumerateDevices() in order to - * get a list of supported devices attached to the system. This function returns an array of @ref IDevice objects - * which are used by the client to query properties of the devicess and eventually execute work on those devices. - * IPlatform::EnumerateDevices() is not available to util-only clients (PAL_BUILD_CORE=0). - * - * The client may re-enumerate devices at any time by calling IPlatform::EnumerateDevices(). The client must make sure - * there is no active work on any device and that all objects associated with those devices have been destroyed. - * IPlatform::EnumerateDevices() will destroy all previously reported @ref IDevice objects and return a fresh set. - * The client is required to re-enumerate devices when it receives a ErrorDeviceLost error from PAL. - * - * After enumerating devices, either during start-up or when recovering from an ErrorDeviceLost error, the client must - * setup and finalize PAL's per-device settings. See IDevice::GetPublicSettings(), IDevice::SetDxRuntimeData(), - * IDevice::CommitSettingsAndInit(), and IDevice::Finalize() for details. - * - * After enumerating devices and finalizing them, the client may query the set of available screens. This is done by - * calling the @ref IPlatform::GetScreens() function. Note that screens are not available for DX clients. Each screen - * is accessible by zero or more of the enumerated devices. 
Most screens are accessible from a "main" device as well as - * several other devices which can perform cross-display Flip presents to the screen. In some configurations, screens - * may not be directly to any of PAL's devices, in which case fullscreen presents are unavailable to that screen. (This - * typically only occurs in PowerExpress configurations.) Note that when IPlatform::EnumerateDevices() is called, any - * enumerated @ref IScreen objects which existed prior to that call are invalidated for the specified platform and - * IPlatform::GetScreens() needs to be called again to get the updated list of screens. - * - * On shutdown, the client should call @ref IPlatform::Destroy() to allow PAL to cleanup and free any remaining - * platform-wide resources. The client must ensure this call is not made until all other created objects are idle and - * destroyed (if destroyable). - * - * When the client is asked to destroy a device it may call IDevice::Cleanup() to explicitly clean up the device. Some - * clients will find it necessary to call Cleanup(), for example, if their devices have OS handles that become invalid. - * Note that Cleanup() doesn't destroy the device; it will return to its initial state, as if it was newly enumerated. - *********************************************************************************************************************** - */ - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palLib.h + * @brief Defines the Platform Abstraction Library (PAL) initialization and destruction functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palSysMemory.h" +#include "palDbgPrint.h" + +/// Major interface version. Note that the interface version is distinct from the PAL version itself, which is returned +/// in @ref Pal::PlatformProperties. 
+/// +/// @attention Updates to the major version indicate an interface change that is not backward compatible and may require +/// action from each client during their next integration. When determining if a change is backward +/// compatible, it is assumed that the client will default-initialize all structs. +/// +/// @ingroup LibInit +#define PAL_INTERFACE_MAJOR_VERSION 942 + +/// Minimum major interface version. This is the minimum interface version PAL supports in order to support backward +/// compatibility. When it is equal to PAL_INTERFACE_MAJOR_VERSION, only the latest interface version is supported. +/// +/// @ingroup LibInit +#define PAL_MINIMUM_INTERFACE_MAJOR_VERSION 872 + +/// Minimum supported major interface version for devdriver library. This is the minimum interface version of the +/// devdriver library that PAL is backwards compatible to. +/// +/// @ingroup LibInit +#define PAL_MINIMUM_GPUOPEN_INTERFACE_MAJOR_VERSION 38 + +/** + *********************************************************************************************************************** + * @def PAL_INTERFACE_VERSION + * @ingroup LibInit + * @brief Current PAL interface version packed into a 32-bit unsigned integer. The low 16 bits are always zero. + * They used to contain the interface minor version and remain as a placeholder in case we add it back. + * + * @see PAL_INTERFACE_MAJOR_VERSION + * + * @hideinitializer + *********************************************************************************************************************** + */ +#define PAL_INTERFACE_VERSION (PAL_INTERFACE_MAJOR_VERSION << 16) + +namespace Pal +{ + +// Forward declarations +class IPlatform; + +/// This is a list of GPUs that the NULL OS layer can compile shaders to in offline mode. +enum class NullGpuId : uint32 +{ + Default = 0, ///< PAL gives the client an arbitrary supported null device. 
+#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + Polaris10, ///< 8.0.3 + Polaris11, ///< 8.0.3 + Polaris12, ///< 8.0.3 + Vega10, ///< 9.0.0 + Raven, ///< 9.0.2 + Vega12, ///< 9.0.4 + Vega20, ///< 9.0.6 + Raven2, ///< 9.0.9 + Renoir, ///< 9.0.9 +#endif + Navi10, ///< 10.1.0 + Navi12, ///< 10.1.1 + Navi14, ///< 10.1.2 + Navi21, ///< 10.3.0 + Navi22, ///< 10.3.1 + Navi23, ///< 10.3.2 + Navi24, ///< 10.3.4 + Rembrandt, ///< 10.3.5 + Raphael, ///< 10.3.6 + Navi31, ///< 11.0.0 + Navi32, ///< 11.0.1 + Navi33, ///< 11.0.2 + Phoenix1, ///< 11.0.3 + Phoenix2, ///< 11.0.3 + Strix1, ///< 11.5.0 + StrixHalo, ///< 11.5.1 + Krackan1, ///< 11.5.2 + Navi44, ///< 12.0.0 + Navi48, ///< 12.0.1 +#if (PAL_CLIENT_INTERFACE_MAJOR_VERSION>= 888) +#endif + Max, ///< The maximum count of null devices. + All, ///< If you want to enumerate all null devices. +}; + +/// Specifies which graphics IP level (GFXIP) this device has. +enum class GfxIpLevel : uint32 +{ + _None = 0, ///< @internal The device does not have an GFXIP block, or its level cannot be determined + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. +#ifndef None + None = _None, ///< The device does not have an GFXIP block, or its level cannot be determined +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + GfxIp10_1, ///< GFXIP 10.1 (Navi1x) + GfxIp10_3, ///< GFXIP 10.3 (Navi2x, Rembrandt, Raphael, Mendocino) + GfxIp11_0, ///< GFXIP 11.0 (Navi3x, Phoenix) + GfxIp11_5, ///< GFXIP 11.5 (Strix) + GfxIp12, ///< GFXIP 12.0 (Navi4x) +#else // PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 888 + GfxIp6 = 0x1, + GfxIp7 = 0x2, + GfxIp8 = 0x3, + GfxIp8_1 = 0x4, + GfxIp9 = 0x5, + GfxIp10_1 = 0x7, + GfxIp10_3 = 0x9, + GfxIp11_0 = 0xC, + GfxIp11_5 = 0xF, + GfxIp12 = 0x11, +#endif +}; + +/// Specifies the hardware revision. Some AMD tools hard-code these values so we cannot change them. 
New ASICs should +/// be added at the end of the list and be given the next highest value. +enum class AsicRevision : uint32 +{ + Unknown = 0x00, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 + Tahiti = 0x01, + Pitcairn = 0x02, + Capeverde = 0x03, + Oland = 0x04, + Hainan = 0x05, + Bonaire = 0x06, + Hawaii = 0x07, + HawaiiPro = 0x08, + Kalindi = 0x0A, + Godavari = 0x0B, + Spectre = 0x0C, + Spooky = 0x0D, + Carrizo = 0x0E, + Bristol = 0x0F, + Stoney = 0x10, + Iceland = 0x11, + Tonga = 0x12, + TongaPro = Tonga, + Fiji = 0x13, + Polaris10 = 0x14, + Polaris11 = 0x15, + Polaris12 = 0x16, + Vega10 = 0x18, + Vega12 = 0x19, + Vega20 = 0x1A, + Raven = 0x1B, + Raven2 = 0x1C, + Renoir = 0x1D, +#endif + Navi10 = 0x1F, ///< 10.1.0 + Navi12 = 0x21, ///< 10.1.1 + Navi14 = 0x23, ///< 10.1.2 + Navi21 = 0x24, ///< 10.3.0 + Navi22 = 0x25, ///< 10.3.1 + Navi23 = 0x26, ///< 10.3.2 + Navi24 = 0x27, ///< 10.3.4 + Navi31 = 0x2C, ///< 11.0.0 + Navi32 = 0x2D, ///< 11.0.1 + Navi33 = 0x2E, ///< 11.0.2 + Rembrandt = 0x2F, ///< 10.3.5 + Strix1 = 0x33, ///< 11.5.0 + Raphael = 0x34, ///< 10.3.6 + Phoenix1 = 0x35, ///< 11.0.3 + Phoenix2 = 0x38, ///< 11.0.3 + HawkPoint1 = 0x39, ///< 11.0.3 + HawkPoint2 = 0x3A, ///< 11.0.3 + Krackan1 = 0x3B, ///< 11.5.2 + StrixHalo = 0x3C, ///< 11.5.1 + Navi44 = 0x3D, ///< 12.0.0 + Navi48 = 0x3E, ///< 12.0.1 +}; + +/// Maps a null GPU ID to its associated text name. +struct NullGpuInfo +{ + NullGpuId nullGpuId; ///< ID of an ASIC that PAL supports for override purposes + const char* pGpuName; ///< Text name of the ASIC specified by nullGpuId +}; + +/// Various IDs and info associated with a particular GPU. +struct GpuInfo +{ + AsicRevision asicRev; ///< PAL specific ASIC revision identifier. + NullGpuId nullId; ///< PAL specific GPU ID supported by the NULL OS layer. + GfxIpLevel gfxIpLevel; ///< PAL specific identifier for the device's graphics IP level (GFXIP). + uint32 familyId; ///< Hardware family ID. 
Driver-defined identifier for a particular family of devices. + uint32 eRevId; ///< GPU emulation/internal revision ID. + uint32 revisionId; ///< GPU revision. HW-specific value differentiating between different SKUs or revisions. + uint32 gfxEngineId; ///< Coarse-grain GFX engine ID (R800, SI, etc.). + uint32 deviceId; ///< PCI device ID (e.g., Hawaii XT = 0x67B0). + const char* pGpuName; ///< ASIC name and AMDGPU target name (e.g., "NAVI31:gfx1100"). +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 915 +/// PAL client APIs. +enum class ClientApi : uint32 +{ + Pal = 0, + Dx9 = 1, + Dx12 = 3, + Vulkan = 4, + OpenCl = 7, + Hip = 8, + Amf = 9, +}; +#else +/// The client UMD must identify its API using this enum. Some UMD builds may implement multiple APIs so they must +/// specify which API they're implementing at runtime. Note that the PAL_CLIENT macros are the preferred way to +/// implement client-specific behavior; runtime ClientApi checks should only be used when necessary. +enum class ClientApi : uint32 +{ + OpenCl, + Hip +}; +#endif + +/// Specifies properties for @ref IPlatform creation. Input structure to Pal::CreatePlatform(). +struct PlatformCreateInfo +{ + const Util::AllocCallbacks* pAllocCb; ///< Optional client-provided callbacks. If non-null, PAL will call the + /// specified callbacks to allocate and free all internal system + /// memory. If null, PAL will manage memory on its own through the C + /// runtime library. + const Util::LogCallbackInfo* pLogInfo; ///< Optional client-provided callback info. If non-null, Pal will + /// call the callback to pass debug prints to the client. + + const char* pSettingsPath; ///< A null-terminated string describing the path to where settings are + /// located on the system. For example, on Windows, this will refer to + /// which UMD subkey to look in under a device's key. For Linux, this + /// is the path to the settings file. 
+ + union + { + struct + { + uint32 disableGpuTimeout : 1; ///< Disables GPU timeout detection (Windows only) + uint32 force32BitVaSpace : 1; ///< Forces 32bit VA space for the flat address with 32bit ISA + uint32 createNullDevice : 1; ///< Set to create a null device, so "nullGpuId" below for the + /// ID of the GPU the created device will be based on. Null + /// devices operate in IFH mode; useful for off-line shader + /// compilations. + uint32 enableSvmMode : 1; ///< Enable SVM mode. When this bit is set, PAL will reserve + /// cpu va range with size "maxSvmSize", and allow client to + /// to create gpu or pinned memory for use of Svm. + /// For detail of SVM, please refer to CreateSvmGpuMemory + uint32 requestShadowDescriptorVaRange : 1; ///< Requests that PAL provides support for the client to use + /// the @ref VaRange::ShadowDescriptorTable virtual-address + /// range. Some GPU's may not be capable of supporting this, + /// even when requested by the client. + uint32 disableInternalResidencyOpts : 1; ///< Disables residency optimizations for internal GPU memory + /// allocations. Some clients may wish to have them turned + /// off to save on system resources. + uint32 supportRgpTraces : 1; ///< Indicates that the client supports RGP tracing. PAL will + /// use this flag and the hardware support flag to setup the + /// DevDriver RgpServer. + uint32 dontOpenPrimaryNode : 1; ///< No primary node is needed (Linux only) + uint32 disableDevDriver : 1; ///< If no DevDriverMgr should be created with this Platform. + uint32 reserved : 23; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Platform-wide creation flags. + + ClientApi clientApiId; ///< Client API ID. + NullGpuId nullGpuId; ///< ID for the null device. Ignored unless the above flags.createNullDevice bit is set. + uint16 apiMajorVer; ///< Major API version number to be used by RGP. Should be set by client based on their + /// contract with RGP. 
+ uint16 apiMinorVer; ///< Minor API version number to be used by RGP. Should be set by client based on their + /// contract with RGP. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 916 + uint32 instrApiVer; /// Instrumentation specification version for API-specific SQTT instrumentation fields. + /// Should be set by client based on the SQTT instrumentation spec version being targeted. +#endif + gpusize maxSvmSize; ///< Maximum amount of virtual address space that will be reserved for SVM +}; + +/** +************************************************************************************************************************ +* @brief Determines the amount of system memory required for a Platform object. +* +* This function must be called before any other interaction with PAL. An allocation of this amount of memory must be +* provided in the pPlacementAddr parameter of Pal::CreatePlatform. +* +* @ingroup LibInit +* +* @returns Size, in bytes, of system memory required for an IPlatform object. +************************************************************************************************************************ +*/ +size_t PAL_STDCALL GetPlatformSize(); + +/** + *********************************************************************************************************************** + * @brief Creates the Platform Abstraction Library. + * + * On execution of CreatePlatform(), PAL will establish a connection for OS and KMD communication, install the specified + * system memory allocation callbacks, and initialize any global internal services. Finally, the client will be + * returned an object pointer to the instantiated platform object, which is used to query the capabilities of the + * system. + * + * @ingroup LibInit + * + * @param [in] createInfo Parameters indicating the client requirements for the platform such as allocation + callbacks or the settings path. + * @param [in] pPlacementAddr Pointer to the location where PAL should construct this object. 
There must be as + * much size available here as reported by calling GetPlatformSize(). + * @param [out] ppPlatform Platform object pointer to the instantiated platform. Must not be null. + * + * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be + * returned: + * + ErrorInvalidPointer will be returned if: + * - pPlatform is null. + * - pPlacementAddr is null. + * - createInfo.pAllocCb is non-null but pfnAlloc and/or pfnFree is null. + * - createInfo.pSettingsPath is null. + * + ErrorInitializationFailed will be returned if PAL is unable to open a connection to the OS. + *********************************************************************************************************************** + */ +Result PAL_STDCALL CreatePlatform( + const PlatformCreateInfo& createInfo, + void* pPlacementAddr, + IPlatform** ppPlatform); + +/** + *********************************************************************************************************************** + * @brief Provides an association of NULL devices and their associated text name. NULL devices operate in IFH mode + * and are primarily intended for off-line shader compilation mode. The text name is provided for end-user + * identification of the GPU device being created. + * + * @param [in,out] pNullDeviceCount On input, this is the size of the "pNullDevices" array. On output, this + * reflects the number of valid entries in the "pNullDevices" array. + * @param [out] pNullDevices Includes information on the valid NULL devices supported by the system. If + * this is NULL, then pNullDeviceCount reflects the maximum possible size of the + * null-devices array. + * + * @returns Success if the initialization completed successfully. Otherwise, one of the following error codes may be + * returned: + * + ErrorInvalidPointer will be returned if either input is NULL. 
+ *********************************************************************************************************************** + */ +Result PAL_STDCALL EnumerateNullDevices( + uint32* pNullDeviceCount, + NullGpuInfo* pNullDevices); + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified NullGpuId. + * + * @param [in] nullGpuId Null GPU ID to lookup. + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. + * + NotFound will be returned if the Null GPU ID was not found. + *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForNullGpuId( + NullGpuId nullGpuId, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForNullGpuId( + NullGpuId nullGpuId, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForNullGpuId(nullGpuId, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified GPU name string. + * + * @param [in] pGpuName Name string of the GPU to lookup (e.g., "NAVI10"). + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuName or pGpuInfo are NULL. + * + NotFound will be returned if the Name string was not found. 
+ *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForName( + const char* pGpuName, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForName( + const char* pGpuName, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForName(pGpuName, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @brief Provides the NULL device GpuInfo data for the specified hardware revision. + * + * @param [in] asicRevision Hardware revision to lookup. + * @param [out] pGpuInfo GpuInfo data on successful lookup. Must not be null. + * + * @returns Success if the lookup completed successfully. Otherwise, one of the following error codes may be returned: + * + ErrorInvalidPointer will be returned if pGpuInfo is NULL. + * + NotFound will be returned if the hardware revision was not found. + *********************************************************************************************************************** + */ +Result PAL_STDCALL GetNullGpuInfoForAsicRevision( + AsicRevision asicRevision, + GpuInfo* pGpuInfo); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 933 +inline Result PAL_STDCALL GetGpuInfoForAsicRevision( + AsicRevision asicRevision, + GpuInfo* pGpuInfo) +{ + return GetNullGpuInfoForAsicRevision(asicRevision, pGpuInfo); +} +#endif + +/** + *********************************************************************************************************************** + * @defgroup LibInit Library Initialization and Destruction + * + * Before initializing PAL, it is important to make sure that the interface version is consistent with the client's + * expectations. The client should check @ref PAL_INTERFACE_MAJOR_VERSION to ensure the major interface version has not + * changed since the last PAL integration. 
Ideally, this should be performed with a compile-time assert comparing + * @ref PAL_INTERFACE_MAJOR_VERSION against a client-maintained expected major version. Minor interface version + * changes should be backward compatible, and do not require a client change to maintain previous levels of + * functionality. + * + * On startup, the client's first call to PAL must be GetPlatformSize() followed by CreatePlatform(). This function + * gives an opportunity for PAL to perform any necessary platform-wide initialization such as opening a connection for + * communication with the operating system and kernel mode driver or initializing tracking facilities for system memory + * management. CreatePlatform() returns a created IPlatform object for future interaction with PAL. + * + * PAL optionally allows the client to specify a set of memory management callbacks during initialization. If + * specified, PAL will not allocate or free any memory directly from the runtime, instead calling back to the client. + * The client (or application, if the client forwards on the requests) may be able to implement a more efficient + * allocation scheme. + * + * After a successful call to CreatePlatform(), the client should call @ref IPlatform::EnumerateDevices() in order to + * get a list of supported devices attached to the system. This function returns an array of @ref IDevice objects + * which are used by the client to query properties of the devicess and eventually execute work on those devices. + * IPlatform::EnumerateDevices() is not available to util-only clients (PAL_BUILD_CORE=0). + * + * The client may re-enumerate devices at any time by calling IPlatform::EnumerateDevices(). The client must make sure + * there is no active work on any device and that all objects associated with those devices have been destroyed. + * IPlatform::EnumerateDevices() will destroy all previously reported @ref IDevice objects and return a fresh set. 
+ * The client is required to re-enumerate devices when it receives a ErrorDeviceLost error from PAL. + * + * After enumerating devices, either during start-up or when recovering from an ErrorDeviceLost error, the client must + * setup and finalize PAL's per-device settings. See IDevice::GetPublicSettings(), IDevice::SetDxRuntimeData(), + * IDevice::CommitSettingsAndInit(), and IDevice::Finalize() for details. + * + * After enumerating devices and finalizing them, the client may query the set of available screens. This is done by + * calling the @ref IPlatform::GetScreens() function. Note that screens are not available for DX clients. Each screen + * is accessible by zero or more of the enumerated devices. Most screens are accessible from a "main" device as well as + * several other devices which can perform cross-display Flip presents to the screen. In some configurations, screens + * may not be directly to any of PAL's devices, in which case fullscreen presents are unavailable to that screen. (This + * typically only occurs in PowerExpress configurations.) Note that when IPlatform::EnumerateDevices() is called, any + * enumerated @ref IScreen objects which existed prior to that call are invalidated for the specified platform and + * IPlatform::GetScreens() needs to be called again to get the updated list of screens. + * + * On shutdown, the client should call @ref IPlatform::Destroy() to allow PAL to cleanup and free any remaining + * platform-wide resources. The client must ensure this call is not made until all other created objects are idle and + * destroyed (if destroyable). + * + * When the client is asked to destroy a device it may call IDevice::Cleanup() to explicitly clean up the device. Some + * clients will find it necessary to call Cleanup(), for example, if their devices have OS handles that become invalid. + * Note that Cleanup() doesn't destroy the device; it will return to its initial state, as if it was newly enumerated. 
+ *********************************************************************************************************************** + */ + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h b/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h index ffcffa12ef..0441f723b0 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palMsaaState.h @@ -1,187 +1,187 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palMsaaState.h - * @brief Defines the Platform Abstraction Library (PAL) IMsaaState interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" - -namespace Pal -{ - -/// Specifies conservative rasterization mode -enum class ConservativeRasterizationMode : uint8 -{ - Overestimate = 0x0, ///< Fragments will be generated if the primitive area covers any portion of the pixel. - Underestimate = 0x1, ///< Fragments will be generated if all of the pixel is covered by the primitive. - Count -}; - -/// Maximum supported number of MSAA color samples. -constexpr uint32 MaxMsaaColorSamples = 16; - -/// Maximum supported number of MSAA depth samples. -constexpr uint32 MaxMsaaDepthSamples = 8; - -/// Maximum supported number of MSAA fragments. -constexpr uint32 MaxMsaaFragments = 8; - -/// Sampling pattern grid size. This is a quad of pixels, i.e. 2x2 grid of pixels. -constexpr Extent2d MaxGridSize = { 2, 2 }; - -/// The positions are rounded to 1/Pow2(SubPixelBits) -constexpr uint32 SubPixelBits = 4; - -/// Each pixel is subdivided into Pow2(SubPixelBits) x Pow2(SubPixelBits) grid of possible sample locations. -constexpr Extent2d SubPixelGridSize = { 16, 16 }; - -/// Represents a 2D coordinate with each component in [-8/16, 7/16] -struct SampleLocation -{ - int8 x; ///< X offset. - int8 y; ///< Y offset. - - /// Conversion operator that does sign-extension. - operator Offset2d() const { return { x, y }; } -}; - -/// Specifies a custom multisample pattern for a pixel quad. 
-struct MsaaQuadSamplePattern -{ - SampleLocation topLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for TL pixel of quad. - SampleLocation topRight[MaxMsaaRasterizerSamples]; ///< Sample locations for TR pixel of quad. - SampleLocation bottomLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for BL pixel of quad. - SampleLocation bottomRight[MaxMsaaRasterizerSamples]; ///< Sample locations for BR pixel of quad. -}; - -/// Specifies properties for creation of an @ref IMsaaState object. Input structure to IDevice::CreateMsaaState(). -struct MsaaStateCreateInfo -{ - uint8 coverageSamples; ///< Number of rasterizer samples. Must be greater than or equal to all sample - /// rates in the pipeline. Valid values are 1, 2, 4, 8, and 16. - uint8 exposedSamples; ///< Number of samples exposed in the pixel shader coverage mask. Must be less - /// than or equal to coverageSamples. Valid values are 1, 2, 4, and 8. - uint8 pixelShaderSamples; ///< Controls the pixel shader execution rate. Must be less than or equal to - /// coverageSamples. Valid values are 1, 2, 4, and 8. Note that value with - /// greater than 1 doesn't mean sample rate shading is enabled. Sample rate - /// shading is enabled by either @ref forceSampleRateShading or pixel shader. - uint8 depthStencilSamples; ///< Number of samples in the bound depth target. Must be less than or equal to - /// coverageSamples. Valid values are 1, 2, 4, and 8. - uint8 shaderExportMaskSamples; ///< Number of samples to use in the shader export mask. Should match the number - /// of color target fragments clamped to - /// @ref DeviceProperties imageProperties.maxMsaaFragments. - uint8 sampleClusters; ///< Number of sample clusters to control over-rasterization (all samples in a - /// cluster are rasterized if any are hit). Must be less than or equal to - /// coverageSamples. Valid values are 1, 2, 4, and 8. - uint8 alphaToCoverageSamples; ///< How many samples of quality to generate with alpha-to-coverage. 
Must be - /// less than or equal to coverageSamples. Valid values are 1, 2, 4, 8, and 16. - uint8 occlusionQuerySamples; ///< Controls the number of samples to use for occlusion queries. - /// This value must never exceed the MSAA rate. - uint16 sampleMask; ///< Bitmask of which color target and depth/stencil samples should be updated. - /// The lowest bit corresponds to sample 0. - - /// Selects overestimate or underestimate conservative rasterization mode. Used only if - /// @ref MsaaStateCreateInfo::flags::enableConservativeRasterization is set to true. - ConservativeRasterizationMode conservativeRasterizationMode; - - union - { - struct - { - uint8 enableConservativeRasterization : 1; ///< Set to true to enable conservative rasterization - uint8 enable1xMsaaSampleLocations : 1; ///< Set to true to enable 1xMSAA quad sample pattern - uint8 disableAlphaToCoverageDither : 1; ///< Disables coverage dithering. - uint8 enableLineStipple : 1; ///< Set to true to enable line stippling - uint8 forceSampleRateShading : 1; ///< Sample rate shading can be enabled by either the pixel - /// shader, or forced here with forceSampleRateShading = 1. - /// Value 0 means sample rate shading is decided by pixel shader - /// and value 1 means sample rate shading is forced enabled. - /// This bit is for openGL glMinSampleShading, where sample rate - /// shading can be enabled by glEnable(GL_SAMPLE_SHADING) - /// instead of by the pixel shader. - uint8 reserved : 3; ///< Reserved for future use - }; - uint8 u8All; - } flags; -}; - -/** - *********************************************************************************************************************** - * @interface IMsaaState - * @brief Dynamic state object controlling fixed function MSAA state. - * - * Configures sample counts of various portions of the pipeline, specifies sample positions, etc. The full range of - * EQAA hardware features are exposed. 
- * - * @see IDevice::CreateMsaaState - *********************************************************************************************************************** - */ -class IMsaaState : public IDestroyable -{ -public: - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IMsaaState() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IMsaaState() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMsaaState.h + * @brief Defines the Platform Abstraction Library (PAL) IMsaaState interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" + +namespace Pal +{ + +/// Specifies conservative rasterization mode +enum class ConservativeRasterizationMode : uint8 +{ + Overestimate = 0x0, ///< Fragments will be generated if the primitive area covers any portion of the pixel. + Underestimate = 0x1, ///< Fragments will be generated if all of the pixel is covered by the primitive. 
+ Count +}; + +/// Maximum supported number of MSAA color samples. +constexpr uint32 MaxMsaaColorSamples = 16; + +/// Maximum supported number of MSAA depth samples. +constexpr uint32 MaxMsaaDepthSamples = 8; + +/// Maximum supported number of MSAA fragments. +constexpr uint32 MaxMsaaFragments = 8; + +/// Sampling pattern grid size. This is a quad of pixels, i.e. 2x2 grid of pixels. +constexpr Extent2d MaxGridSize = { 2, 2 }; + +/// The positions are rounded to 1/Pow2(SubPixelBits) +constexpr uint32 SubPixelBits = 4; + +/// Each pixel is subdivided into Pow2(SubPixelBits) x Pow2(SubPixelBits) grid of possible sample locations. +constexpr Extent2d SubPixelGridSize = { 16, 16 }; + +/// Represents a 2D coordinate with each component in [-8/16, 7/16] +struct SampleLocation +{ + int8 x; ///< X offset. + int8 y; ///< Y offset. + + /// Conversion operator that does sign-extension. + operator Offset2d() const { return { x, y }; } +}; + +/// Specifies a custom multisample pattern for a pixel quad. +struct MsaaQuadSamplePattern +{ + SampleLocation topLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for TL pixel of quad. + SampleLocation topRight[MaxMsaaRasterizerSamples]; ///< Sample locations for TR pixel of quad. + SampleLocation bottomLeft[MaxMsaaRasterizerSamples]; ///< Sample locations for BL pixel of quad. + SampleLocation bottomRight[MaxMsaaRasterizerSamples]; ///< Sample locations for BR pixel of quad. +}; + +/// Specifies properties for creation of an @ref IMsaaState object. Input structure to IDevice::CreateMsaaState(). +struct MsaaStateCreateInfo +{ + uint8 coverageSamples; ///< Number of rasterizer samples. Must be greater than or equal to all sample + /// rates in the pipeline. Valid values are 1, 2, 4, 8, and 16. + uint8 exposedSamples; ///< Number of samples exposed in the pixel shader coverage mask. Must be less + /// than or equal to coverageSamples. Valid values are 1, 2, 4, and 8. 
+ uint8 pixelShaderSamples; ///< Controls the pixel shader execution rate. Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. Note that value with + /// greater than 1 doesn't mean sample rate shading is enabled. Sample rate + /// shading is enabled by either @ref forceSampleRateShading or pixel shader. + uint8 depthStencilSamples; ///< Number of samples in the bound depth target. Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. + uint8 shaderExportMaskSamples; ///< Number of samples to use in the shader export mask. Should match the number + /// of color target fragments clamped to + /// @ref DeviceProperties imageProperties.maxMsaaFragments. + uint8 sampleClusters; ///< Number of sample clusters to control over-rasterization (all samples in a + /// cluster are rasterized if any are hit). Must be less than or equal to + /// coverageSamples. Valid values are 1, 2, 4, and 8. + uint8 alphaToCoverageSamples; ///< How many samples of quality to generate with alpha-to-coverage. Must be + /// less than or equal to coverageSamples. Valid values are 1, 2, 4, 8, and 16. + uint8 occlusionQuerySamples; ///< Controls the number of samples to use for occlusion queries. + /// This value must never exceed the MSAA rate. + uint16 sampleMask; ///< Bitmask of which color target and depth/stencil samples should be updated. + /// The lowest bit corresponds to sample 0. + + /// Selects overestimate or underestimate conservative rasterization mode. Used only if + /// @ref MsaaStateCreateInfo::flags::enableConservativeRasterization is set to true. + ConservativeRasterizationMode conservativeRasterizationMode; + + union + { + struct + { + uint8 enableConservativeRasterization : 1; ///< Set to true to enable conservative rasterization + uint8 enable1xMsaaSampleLocations : 1; ///< Set to true to enable 1xMSAA quad sample pattern + uint8 disableAlphaToCoverageDither : 1; ///< Disables coverage dithering. 
+ uint8 enableLineStipple : 1; ///< Set to true to enable line stippling + uint8 forceSampleRateShading : 1; ///< Sample rate shading can be enabled by either the pixel + /// shader, or forced here with forceSampleRateShading = 1. + /// Value 0 means sample rate shading is decided by pixel shader + /// and value 1 means sample rate shading is forced enabled. + /// This bit is for openGL glMinSampleShading, where sample rate + /// shading can be enabled by glEnable(GL_SAMPLE_SHADING) + /// instead of by the pixel shader. + uint8 reserved : 3; ///< Reserved for future use + }; + uint8 u8All; + } flags; +}; + +/** + *********************************************************************************************************************** + * @interface IMsaaState + * @brief Dynamic state object controlling fixed function MSAA state. + * + * Configures sample counts of various portions of the pipeline, specifies sample positions, etc. The full range of + * EQAA hardware features are exposed. + * + * @see IDevice::CreateMsaaState + *********************************************************************************************************************** + */ +class IMsaaState : public IDestroyable +{ +public: + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. 
+ IMsaaState() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IMsaaState() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h b/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h index cd56422e20..eea8b32fd6 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPerfExperiment.h @@ -1,614 +1,619 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palPerfExperiment.h - * @brief Defines the Platform Abstraction Library (PAL) IPerfExperiment interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palGpuMemoryBindable.h" - -namespace Pal -{ - -/// Specifies a particular block on the GPU to gather counters for. -enum class GpuBlock : uint32 -{ - Cpf = 0x0, - Ia = 0x1, - Vgt = 0x2, - Pa = 0x3, - Sc = 0x4, - Spi = 0x5, - Sq = 0x6, - Sx = 0x7, - Ta = 0x8, - Td = 0x9, - Tcp = 0xA, - Tcc = 0xB, - Tca = 0xC, - Db = 0xD, - Cb = 0xE, - Gds = 0xF, - Srbm = 0x10, - Grbm = 0x11, - GrbmSe = 0x12, - Rlc = 0x13, - Dma = 0x14, - Mc = 0x15, - Cpg = 0x16, - Cpc = 0x17, - Wd = 0x18, - Tcs = 0x19, - Atc = 0x1A, - AtcL2 = 0x1B, - McVmL2 = 0x1C, - Ea = 0x1D, - Rpb = 0x1E, - Rmi = 0x1F, - Umcch = 0x20, - Ge = 0x21, - Gl1a = 0x22, - Gl1c = 0x23, - Gl1cg = 0x24, - Gl2a = 0x25, // TCA is used in Gfx9, and changed to GL2A in Gfx10 - Gl2c = 0x26, // TCC is used in Gfx9, and changed to GL2C in Gfx10 - Cha = 0x27, - Chc = 0x28, - Chcg = 0x29, - Gus = 0x2A, - Gcr = 0x2B, - Ph = 0x2C, - UtcL1 = 0x2D, - Ge1 = Ge, - GeDist = 0x2E, - GeSe = 0x2F, - DfMall = 0x30, // The DF subblocks have unique instances and event IDs but they all share the DF's perf counters. - SqWgp = 0x31, // SQ counters that can be sampled at WGP granularity. 
- Pc = 0x32, - Gl1xa = 0x33, - Gl1xc = 0x34, - Wgs = 0x35, - EaCpwd = 0x36, - EaSe = 0x37, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 926 - RlcUser = 0x38, -#else - RlcLocal = 0x38, -#endif - Count -}; - -/// Distinguishes between global and streaming performance monitor (SPM) counters. -enum class PerfCounterType : uint32 -{ - Global = 0x0, ///< Represents the traditional summary perf counters. - Spm = 0x1, ///< Represents streaming performance counters. - Spm32 = 0x2, ///< Represents 32bit streaming performance counters - Count -}; - -/// Reports the type of data the hardware writes for a particular counter. -enum class PerfCounterDataType : uint32 -{ - Uint32 = 0x0, - Uint64 = 0x1, - Count -}; - -/// Distinguishes between normal thread traces and streaming performance monitor (SPM) traces. -enum class PerfTraceType : uint32 -{ - ThreadTrace = 0x0, - SpmTrace = 0x1, - Count -}; - -/// Mask values ORed together to choose which shader stages a performance experiment should sample. -enum PerfExperimentShaderFlags -{ - PerfShaderMaskPs = 0x01, - PerfShaderMaskVs = 0x02, - PerfShaderMaskGs = 0x04, - PerfShaderMaskEs = 0x08, - PerfShaderMaskHs = 0x10, - PerfShaderMaskLs = 0x20, - PerfShaderMaskCs = 0x40, - PerfShaderMaskAll = 0x7f, -}; - -/// Selects one of generic performance trace markers, which the client can use to track data of its own choosing. -enum class PerfTraceMarkerType : uint32 -{ - SqttA = 0x0, - SqttB = 0x1, -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 874 - A = SqttA, - B = SqttB, -#endif - SpmA = 0x2, - SpmB = 0x3, - SpmC = 0x4, - SpmD = 0x5, - Count -}; - -/// Specifies available features in device for supporting performance measurements. -union PerfExperimentDeviceFeatureFlags -{ - struct - { - uint32 counters : 1; ///< Device supports performance counters. - uint32 threadTrace : 1; ///< Device supports thread traces. - uint32 spmTrace : 1; ///< Device supports streaming perf monitor traces. 
- uint32 dfSpmTrace : 1; ///< Device supports streaming df perf monitor traces. - uint32 supportPs1Events : 1; ///< The thread trace HW of this Device is capable of producing event tokens - /// from the second PS backend of SC. - uint32 sqttBadScPackerId : 1; ///< Hardware is affected by bug causing the packer ID specified in new PS waves - /// to be incorrect in SQ thread trace data. - uint32 reserved : 26; ///< Reserved for future use. - }; - uint32 u32All; ///< Feature flags packed as 32-bit uint. -}; - -/// Specifies properties for a perf counter being added to a perf experiment. Input structure to -/// IPerfExperiment::AddCounter(). -/// -/// A note for GpuBlock::SqWgp -/// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity -/// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock gating (CGCG) -/// and power gating. PAL expose this feature to clients. -/// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same -/// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. -/// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features -/// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. -/// The counter data is still reported per WGP (not aggregated for the whole SE). -/// -struct PerfCounterInfo -{ - PerfCounterType counterType; ///< Type of counter to add. - GpuBlock block; ///< Which block to reference. - uint32 instance; ///< Instance of that block in the device. - uint32 eventId; ///< Which event ID to track. - - // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for - // the relevant blocks. It's recommended to zero them out when not in use. 
- union - { - struct - { - uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. - } df; - - struct - { - uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold. - uint8 eventThresholdEn; ///< Threshold enable (0 for disabled,1 for threshold). - uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). - } umc; - - uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events - - uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds - } subConfig; -}; - -/// Specifies properties for setting up a streaming performance counter trace. Input structure to -/// IPerfExperiment::AddSpmTrace(). -struct SpmTraceCreateInfo -{ - uint32 spmInterval; ///< Interval between each sample in terms of GPU sclks. Minimum of 32. - gpusize ringSize; ///< Suggested size of the SPM output ring buffer in bytes. PAL may use - /// a smaller ring in practice but it cannot exceed this size. - uint32 numPerfCounters; ///< Number of performance counters to be collected in this trace. - const PerfCounterInfo* pPerfCounterInfos; ///< Array of size numPerfCounters of PerfCounterInfo(s). -}; - -/// Reports layout of a single global perf counter sample. -struct GlobalSampleLayout -{ - GpuBlock block; ///< Type of GPU block. - uint32 instance; ///< Which instance of that type of GPU block. - uint32 slot; ///< Slot varies in meaning per block. - uint32 eventId; ///< Sampled event ID. - PerfCounterDataType dataType; ///< What type of data is written (e.g., 32-bit uint). - gpusize beginValueOffset; ///< Offset in bytes where the sample data begins. - gpusize endValueOffset; ///< Offset in bytes where the sample data ends. -}; - -/// Describes the layout of global perf counter data in memory. -struct GlobalCounterLayout -{ - uint32 sampleCount; ///< Number of samples described in samples[]. 
- GlobalSampleLayout samples[1]; ///< Describes the layout of each sample. This structure is repeated (sampleCount - /// - 1) additional times. -}; - -/// Enumeration of SQ Thread trace token types. All versions of Thread Trace (TT) are represented. If an unsupported -/// token is enabled, no error is reported. -enum ThreadTraceTokenTypeFlags : Pal::uint32 -{ - Misc = 0x00000001, ///< A miscellaneous event has been sent. TT 2.3 - Timestamp = 0x00000002, ///< Timestamp tokens. TT 2.3 - Reg = 0x00000004, ///< Register activity token. TT 2.3 - WaveStart = 0x00000008, ///< A wavefront has started. TT 2.3 - WaveAlloc = 0x00000010, ///< Output space has been allocated for vertex position or color/Z. TT 2.3. - RegCsPriv = 0x00000020, ///< There has been a compute pipeline private data, state or threadgroup update. TT 2.3. - WaveEnd = 0x00000040, ///< Wavefront completion. TT 2.3 - Event = 0x00000080, ///< An event has reached the top of a shader stage. TT 2.3 - EventCs = 0x00000100, ///< An event has reached the top of a compute shader stage. TT 2.3 - EventGfx1 = 0x00000200, ///< An event has reached the top of a shader stage for the second GFX pipe. TT 2.3 - Inst = 0x00000400, ///< The shader has executed an instruction. TT 2.3 - InstPc = 0x00000800, ///< The shader has explicitly written the PC value. TT 2.3 - InstUserData = 0x00001000, ///< The shader has written user data into the thread trace buffer. TT 2.3 - Issue = 0x00002000, ///< Provides information about instruction scheduling. TT 2.3 - Perf = 0x00004000, ///< The performance counter delta has been updated. TT 2.3 and below only. - RegCs = 0x00008000, ///< A compute state update packet has been received by the SPI. TT 2.3 - VmemExec = 0x00010000, ///< A previously issued VMEM instruction is now being sent to LDS/TA. TT 3.0 - AluExec = 0x00020000, ///< A previously issued VALU instruction is now being executed. TT 3.0 - ValuInst = 0x00040000, ///< A VALU instruction has been issued. TT 3.0. 
- WaveRdy = 0x00080000, ///< Mask of which waves became ready this cycle but did not issue an instruction. TT 3.0 - Immed1 = 0x00100000, ///< One wave issued an immediate instruction this cycle. TT 3.0. - Immediate = 0x00200000, ///< One or more waves have issued an immediate instruction this cycle. TT 3.0. - UtilCounter = 0x00400000, ///< A new set of utilization counter values. TT 3.0. - RealTime = 0x00800000, ///< Output realtime. TT 3.3. - All = 0xFFFFFFFF ///< Enable all the above tokens. -}; - -/// Enumeration of register types whose reads/writes can be traced. Register reads are disabled by default as it can -/// generate a lot of traffic and cause the GPU to hang. -enum ThreadTraceRegTypeFlags : Pal::uint32 -{ - EventRegs = 0x00000001, ///< Event registers. TT 2.3. - DrawRegs = 0x00000002, ///< Draw registers. TT 2.3. - DispatchRegs = 0x00000004, ///< Dispatch registers. TT 2.3. - UserdataRegs = 0x00000008, ///< UserData Registers. Must be explicitly requested in TT 2.3. - MarkerRegs = 0x00000010, ///< Thread trace marker data regs. TT 2.3. - ShaderConfigRegs = 0x00000020, ///< Shader configuration state. TT 3.0. - ShaderLaunchStateRegs = 0x00000040, ///< Shader program launch state. TT 3.0. - GraphicsPipeStateRegs = 0x00000080, ///< Graphics pipeline state. TT 3.0. - AsyncComputeRegs = 0x00000100, ///< Async compute registers. TT 3.0. - GraphicsContextRegs = 0x00000200, ///< Graphics context registers. TT 3.0. - OtherConfigRegs = 0x00000400, ///< Other regs. TT 2.3. - AllRegWrites = 0x000007FF, ///< All reg writes other than OtherBusRegs. - OtherBusRegs = 0x00000800, ///< All write activity over gfx and compute buses. Debug only. TT 3.0. - AllRegReads = 0x00001000, ///< Not encouraged to be enabled. This can cause a GPU hang. - AllReadsAndWrites = 0xFFFFFFFF ///< All reads and writes. Not encouraged. This can cause a GPU hang. -}; - -/// Represents thread trace token types and register types that can be enabled to be reported in the trace data. 
If -/// a particular token type or reg type is unsupported, no error is returned and the thread trace is configured with -/// the minimum supported tokens in the user provided config. -struct ThreadTraceTokenConfig -{ - /// Mask of ThreadTraceTokenTypeFlags - uint32 tokenMask; - - /// Mask of ThreadTraceRegTypeFlags - uint32 regMask; -}; - -/// Specifies properties for a perf trace being added to a perf experiment. Input structure to -/// IPerfExperiment::AddThreadTrace(). -struct ThreadTraceInfo -{ - PerfTraceType traceType; ///< Type of trace to add. - uint32 instance; ///< Selected trace instance. - - union - { - struct - { - // Options common to all traces - uint32 bufferSize : 1; - - // Thread trace only options - uint32 threadTraceTargetSh : 1; - uint32 threadTraceTargetCu : 1; - uint32 threadTraceSh0CounterMask : 1; - uint32 threadTraceSh1CounterMask : 1; - uint32 threadTraceSimdMask : 1; - uint32 threadTraceVmIdMask : 1; - uint32 threadTraceRandomSeed : 1; - uint32 threadTraceShaderTypeMask : 1; - uint32 threadTraceIssueMask : 1; - uint32 threadTraceWrapBuffer : 1; - uint32 threadTraceStallBehavior : 1; - uint32 threadTraceTokenConfig : 1; - uint32 threadTraceStallAllSimds : 1; - uint32 threadTraceExcludeNonDetailShaderData : 1; - uint32 threadTraceEnableExecPop : 1; - uint32 placeholder3 : 1; - uint32 reserved : 15; - }; - uint32 u32All; - } optionFlags; - - struct - { - // Options common to all traces - size_t bufferSize; - - // Thread trace only options - ThreadTraceTokenConfig threadTraceTokenConfig; - uint32 threadTraceTargetSh; - uint32 threadTraceTargetCu; - uint32 threadTraceSh0CounterMask; - uint32 threadTraceSh1CounterMask; - uint32 threadTraceSimdMask; - uint32 threadTraceVmIdMask; - uint32 threadTraceRandomSeed; - PerfExperimentShaderFlags threadTraceShaderTypeMask; - uint32 threadTraceIssueMask; - bool threadTraceWrapBuffer; - uint32 threadTraceStallBehavior; - bool threadTraceStallAllSimds; - bool threadTraceExcludeNonDetailShaderData; - bool 
threadTraceEnableExecPop; - } optionValues; -}; - -/// Reports thread trace data written when the trace is stopped (copied from internal SQ registers). -struct ThreadTraceInfoData -{ - uint32 curOffset; ///< Contents of SQ_THREAD_TRACE_WPTR register. - uint32 traceStatus; ///< Contents of SQ_THREAD_TRACE_STATUS register. - uint32 writeCounter; ///< Contents of SQ_THREAD_TRACE_CNTR register. -}; - -/// Describes the layout of a single shader engine's thread trace data. -struct ThreadTraceSeLayout -{ - uint32 shaderEngine; ///< Shader engine index. - uint32 computeUnit; ///< Compute unit index. - gpusize infoOffset; ///< Offset to ThreadTraceInfoData in memory. - gpusize infoSize; ///< Size in bytes reserved for ThreadTraceInfoData. - gpusize dataOffset; ///< Offset in bytes to the actual trace data. - gpusize dataSize; ///< Amount of trace data, in bytes. -}; - -/// Describes how the thread trace data is laid out. -struct ThreadTraceLayout -{ - uint32 traceCount; ///< Number of entries in traces[]. - ThreadTraceSeLayout traces[1]; ///< ThreadTraceSeLayout repeated (traceCount - 1) times. -}; - -/// Describes a single SPM counter instance. -struct SpmCounterData -{ - GpuBlock gpuBlock; ///< The kind of GPU block this counter measured. - uint32 instance; ///< Which specific global block instance this counter measured. - uint32 eventId; ///< The event that was measured by this counter. - uint32 offsetLo; ///< Byte offset within each sample to the lower 16-bit half of the counter data. - uint32 offsetHi; ///< Byte offset within each sample to the upper 16-bit half of the counter data. - bool is32Bit; ///< If the client must combine the independent 16-bit halves into a single 32-bit value. - /// If this is false offsetLo points to the full 16-bit data value and offsetHi is ignored. -}; - -/// All information required to parse the counter data out of a SpmTrace results buffer. 
-/// -/// Note that the hardware will continue to write samples to the SPM ring buffer even if it runs out of unused space. -/// The hardware will simply wrap the ring's write pointer back around to the first sample's location. Each subsequent -/// sample will overwrite the oldest sample in the ring. When the trace is finished we will have at most @ref -/// maxNumSamples valid samples. -/// -/// PAL doesn't zero out the ring memory so it's generally hard for the client to distinguish valid samples from random -/// data present in unused sample locations. PAL does guarantee that the final sample location in the ring has its -/// timestamp zeroed out before the SPM trace starts. This means this last timestamp will only be non-zero if the ring -/// has completely filled up and the WrPtr has wrapped one or more times. The client must inspect this timestamp when -/// parsing the sample data: -/// 1. The last timestamp is zero. The ring did not wrap. The oldest sample is at @ref sampleOffset. The ring's write -/// pointer tells us how many samples were written. From the write pointer onwards the ring contains invalid data. -/// 2. The last timestamp is non-zero. The ring did wrap. The ring's write pointer points to the oldest sample, -/// effectively a random sample offset into the ring. The full ring contains valid sample data but it's not in -/// oldest-to-newest order, it's shifted. The client can walk the ring from the write pointer's location (wrapping -/// as they go) to parse all @ref maxNumSamples samples out in oldest-to-newest order. -struct SpmTraceLayout -{ - gpusize offset; ///< Byte offset into the bound GPU memory where the spm trace data begins. - /// The @ref wrPtrOffset and @ref sampleOffset are relative to this value. - uint32 wrPtrOffset; ///< Byte offset within SPM trace data to the HW's write pointer (WrPtr) DWORD. - /// The WrPtr's value is an offset relative to @ref sampleOffset. 
Don't assume this is - /// a byte offset (see @ref wrPtrGranularity). The WrPtr's value shows where the HW's - /// theoretical next sample would go. This value may wrap back to zero if the HW runs of - /// space in the SPM ring buffer. - uint32 wrPtrGranularity; ///< The WrPtr's granularity. Multiply WrPtr's value by this value to get a byte offset. - uint32 sampleOffset; ///< Byte offset within the SPM trace data to the array of samples. The HW will write the - /// first sample here but it will be overwritten if the ring wraps (see the top comment). - uint32 sampleStride; ///< The distance between consecutive samples in bytes. May include empty padding. - uint32 maxNumSamples; ///< The maximum number of samples the HW can write before wrapping. The SPM ring buffer - /// ends at sampleOffset + sampleStride * maxNumSamples. - uint32 numCounters; ///< The true length of counterData. The client must allocate extra memory for the array. - - SpmCounterData counterData[1]; ///< The layout and identity of the counters in the samples. -}; - -/// Represents the information that is stored in the DF SPM trace metadata buffer. -struct DfSpmTraceMetadataLayout -{ - uint32 numRecordPairs; ///< The number of 64-byte blocks written by this trace. There are two time segments - ///< per 64-byte block so we have to check the lastSpmPkt bit to see which half of - ///< the last 64-byte block is the last packet. - uint32 padding; ///< Padding to match what the compiler does by default. - uint64 beginTimestamp; ///< The DF timestamp at the start of the DF SPM trace. - uint64 endTimestamp; ///< The DF timestamp at the finish of the DF SPM trace. -}; - -/// Specifies properties for creation of an @ref IPerfExperiment object. Input structure to -/// IDevice::CreatePerfExperiment(). 
-struct PerfExperimentCreateInfo -{ - union - { - struct - { - uint32 cacheFlushOnCounterCollection : 1; - uint32 sampleInternalOperations : 1; - uint32 sqShaderMask : 1; - uint32 sqWgpShaderMask : 1; - uint32 reserved : 28; - }; - uint32 u32All; - } optionFlags; - - struct - { - bool cacheFlushOnCounterCollection; - bool sampleInternalOperations; - PerfExperimentShaderFlags sqShaderMask; ///< GpuBlock::Sq counters only look at these shader types. - PerfExperimentShaderFlags sqWgpShaderMask; ///< GpuBlock::SqWgp counters only look at these shader types. - } optionValues; -}; - -/** - *********************************************************************************************************************** - * @interface IPerfExperiment - * @brief Set of performance profiling activities to be performed over a specific range of commands in a command - * buffer. - * - * @warning The details of building a performance experiment are not very well documented here. Please see your local - * hardware performance expert for more details until this documentation can be fully fleshed out. - * - * @see IDevice::CreatePerfExperiment - *********************************************************************************************************************** - */ -class IPerfExperiment : public IGpuMemoryBindable -{ -public: - /// Adds the specified performance counter to be tracked as part of this perf experiment. - /// - /// @param [in] counterInfo Specifies which counter to add: which hardware block, instance, any options, etc. - /// - /// @returns Success if the counter was successfully added to the experiment, otherwise an appropriate error code. - virtual Result AddCounter( - const PerfCounterInfo& counterInfo) = 0; - - /// Queries the layout of counter results in memory for this perf experiment. - /// - /// @param [out] pLayout Layout describing the begin and end offset of each counter in the resulting GPU memory once - /// this perf experiment is executed. 
Should correspond with counters added via AddCounter(). - /// - /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. - virtual Result GetGlobalCounterLayout( - GlobalCounterLayout* pLayout) const = 0; - - /// Addes the specified thread trace to be recorded as part of this perf experiment. - /// - /// @param [in] traceInfo Specifies what type of trace to record, which block instance to trace, and options, etc. - /// - /// @returns Success if the trace was successfully added to the experiment, otherwise an appropriate error code. - virtual Result AddThreadTrace( - const ThreadTraceInfo& traceInfo) = 0; - - /// Adds the specified DfSpmTrace to be recorded as part of this perf experiment. - /// - /// @param [in] dfSpmCreateInfo Specifies the parameters of the df spm trace and - /// provides the list of perf counters. - /// - /// @returns Success if the df spm trace was successfully added to the experiment, - /// otherwise and appropriate error code. - virtual Result AddDfSpmTrace( - const SpmTraceCreateInfo& dfSpmCreateInfo) = 0; - - /// Adds the specified SpmTrace to be recorded as part of this perf experiment. - /// - /// @param [in] spmCreateInfo Specifies the parameters of the spm trace and provides the list of perf counters. - /// - /// @returns Success if the spm trace was successfully added to the experiment, otherwise an appropriate error code. - virtual Result AddSpmTrace( - const SpmTraceCreateInfo& spmCreateInfo) = 0; - - /// Queries the layout of thread trace results in memory for this perf experiment. - /// - /// @param [out] pLayout Layout describing how the results of each thread trace will be written to GPU memory when - /// this perf experiment is executed. Should correspond with counters added via AddTrace(). - /// - /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. 
- virtual Result GetThreadTraceLayout( - ThreadTraceLayout* pLayout) const = 0; - - /// Queries the layout of streaming counter trace results in memory for this perf experiment. - /// - /// The caller is expected to call this function twice. The first time with pLayout->numCounters = 0 which prompts - /// PAL to only set numCounters to the correct number of SPM counters and return. The second call with a non-zero - /// numCounters prompts PAL to fill out the full structure and counterData array. - /// - /// Note that @ref SpmTraceLayout contains a variable length array. The caller must allocate enough memory for - /// an additional "numCounters - 1" copies of @ref SpmCounterData. - /// - /// @param [out] pLayout Layout describing the layout of the streaming counter trace results in the resulting - /// GPU memory once this perf experiment is executed. - /// - /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. - virtual Result GetSpmTraceLayout( - SpmTraceLayout* pLayout) const = 0; - - /// Finalizes the performance experiment preparing it for execution. - /// - /// @returns Success if the operation executed successfully, otherwise an appropriate error code. - virtual Result Finalize() = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. 
- IPerfExperiment() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IPerfExperiment() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPerfExperiment.h + * @brief Defines the Platform Abstraction Library (PAL) IPerfExperiment interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +/// Specifies a particular block on the GPU to gather counters for. +enum class GpuBlock : uint32 +{ + Cpf = 0x0, + Ia = 0x1, + Vgt = 0x2, + Pa = 0x3, + Sc = 0x4, + Spi = 0x5, + Sq = 0x6, + Sx = 0x7, + Ta = 0x8, + Td = 0x9, + Tcp = 0xA, + Tcc = 0xB, + Tca = 0xC, + Db = 0xD, + Cb = 0xE, + Gds = 0xF, + Srbm = 0x10, + Grbm = 0x11, + GrbmSe = 0x12, + Rlc = 0x13, + Dma = 0x14, + Mc = 0x15, + Cpg = 0x16, + Cpc = 0x17, + Wd = 0x18, + Tcs = 0x19, + Atc = 0x1A, + AtcL2 = 0x1B, + McVmL2 = 0x1C, + Ea = 0x1D, + Rpb = 0x1E, + Rmi = 0x1F, + Umcch = 0x20, + Ge = 0x21, + Gl1a = 0x22, + Gl1c = 0x23, + Gl1cg = 0x24, + Gl2a = 0x25, // TCA is used in Gfx9, and changed to GL2A in Gfx10 + Gl2c = 0x26, // TCC is used in Gfx9, and changed to GL2C in Gfx10 + Cha = 0x27, + Chc = 0x28, + Chcg = 0x29, + Gus = 0x2A, + Gcr = 0x2B, + Ph = 0x2C, + UtcL1 = 0x2D, + Ge1 = Ge, + GeDist = 0x2E, + GeSe = 0x2F, + DfMall = 0x30, // The DF subblocks have unique instances and event IDs but they all share the DF's perf counters. + SqWgp = 0x31, // SQ counters that can be sampled at WGP granularity. + Pc = 0x32, + Gl1xa = 0x33, + Gl1xc = 0x34, + Wgs = 0x35, + EaCpwd = 0x36, + EaSe = 0x37, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 926 + RlcUser = 0x38, +#else + RlcLocal = 0x38, +#endif + Count +}; + +/// Distinguishes between global and streaming performance monitor (SPM) counters. 
+enum class PerfCounterType : uint32 +{ + Global = 0x0, ///< Represents the traditional summary perf counters. + Spm = 0x1, ///< Represents streaming performance counters. + Spm32 = 0x2, ///< Represents 32bit streaming performance counters + Count +}; + +/// Reports the type of data the hardware writes for a particular counter. +enum class PerfCounterDataType : uint32 +{ + Uint32 = 0x0, + Uint64 = 0x1, + Count +}; + +/// Distinguishes between normal thread traces and streaming performance monitor (SPM) traces. +enum class PerfTraceType : uint32 +{ + ThreadTrace = 0x0, + SpmTrace = 0x1, + Count +}; + +/// Mask values ORed together to choose which shader stages a performance experiment should sample. +enum PerfExperimentShaderFlags +{ + PerfShaderMaskPs = 0x01, + PerfShaderMaskVs = 0x02, + PerfShaderMaskGs = 0x04, + PerfShaderMaskEs = 0x08, + PerfShaderMaskHs = 0x10, + PerfShaderMaskLs = 0x20, + PerfShaderMaskCs = 0x40, + PerfShaderMaskAll = 0x7f, +}; + +/// Selects one of generic performance trace markers, which the client can use to track data of its own choosing. +enum class PerfTraceMarkerType : uint32 +{ + SqttA = 0x0, + SqttB = 0x1, +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 874 + A = SqttA, + B = SqttB, +#endif + SpmA = 0x2, + SpmB = 0x3, + SpmC = 0x4, + SpmD = 0x5, + Count +}; + +/// Specifies available features in device for supporting performance measurements. +union PerfExperimentDeviceFeatureFlags +{ + struct + { + uint32 counters : 1; ///< Device supports performance counters. + uint32 threadTrace : 1; ///< Device supports thread traces. + uint32 spmTrace : 1; ///< Device supports streaming perf monitor traces. + uint32 dfSpmTrace : 1; ///< Device supports streaming df perf monitor traces. + uint32 supportPs1Events : 1; ///< The thread trace HW of this Device is capable of producing event tokens + /// from the second PS backend of SC. 
+ uint32 sqttBadScPackerId : 1; ///< Hardware is affected by bug causing the packer ID specified in new PS waves + /// to be incorrect in SQ thread trace data. + uint32 reserved : 26; ///< Reserved for future use. + }; + uint32 u32All; ///< Feature flags packed as 32-bit uint. +}; + +/// Specifies properties for a perf counter being added to a perf experiment. Input structure to +/// IPerfExperiment::AddCounter(). +/// +/// A note for GpuBlock::SqWgp +/// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity +/// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock gating (CGCG) +/// and power gating. PAL exposes this feature to clients. +/// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same +/// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. +/// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features +/// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. +/// The counter data is still reported per WGP (not aggregated for the whole SE). +/// +struct PerfCounterInfo +{ + PerfCounterType counterType; ///< Type of counter to add. + GpuBlock block; ///< Which block to reference. + uint32 instance; ///< Instance of that block in the device. + uint32 eventId; ///< Which event ID to track. + + // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for + // the relevant blocks. It's recommended to zero them out when not in use. + union + { + struct + { + uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. + } df; + + struct + { + uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold.
+ uint8 eventThresholdEn; ///< Threshold enable (0 for disabled,1 for threshold). + uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). + } umc; + + uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events + + uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds + } subConfig; +}; + +/// Specifies properties for setting up a streaming performance counter trace. Input structure to +/// IPerfExperiment::AddSpmTrace(). +struct SpmTraceCreateInfo +{ + uint32 spmInterval; ///< Interval between each sample in terms of GPU sclks. Minimum of 32. + gpusize ringSize; ///< Suggested size of the SPM output ring buffer in bytes. PAL may use + /// a smaller ring in practice but it cannot exceed this size. + uint32 numPerfCounters; ///< Number of performance counters to be collected in this trace. + const PerfCounterInfo* pPerfCounterInfos; ///< Array of size numPerfCounters of PerfCounterInfo(s). +}; + +/// Reports layout of a single global perf counter sample. +struct GlobalSampleLayout +{ + GpuBlock block; ///< Type of GPU block. + uint32 instance; ///< Which instance of that type of GPU block. + uint32 slot; ///< Slot varies in meaning per block. + uint32 eventId; ///< Sampled event ID. + PerfCounterDataType dataType; ///< What type of data is written (e.g., 32-bit uint). + gpusize beginValueOffset; ///< Offset in bytes where the sample data begins. + gpusize endValueOffset; ///< Offset in bytes where the sample data ends. +}; + +/// Describes the layout of global perf counter data in memory. +struct GlobalCounterLayout +{ + uint32 sampleCount; ///< Number of samples described in samples[]. + GlobalSampleLayout samples[1]; ///< Describes the layout of each sample. This structure is repeated (sampleCount + /// - 1) additional times. +}; + +/// Enumeration of SQ Thread trace token types. All versions of Thread Trace (TT) are represented. 
If an unsupported +/// token is enabled, no error is reported. +enum ThreadTraceTokenTypeFlags : Pal::uint32 +{ + Misc = 0x00000001, ///< A miscellaneous event has been sent. TT 2.3 + Timestamp = 0x00000002, ///< Timestamp tokens. TT 2.3 + Reg = 0x00000004, ///< Register activity token. TT 2.3 + WaveStart = 0x00000008, ///< A wavefront has started. TT 2.3 + WaveAlloc = 0x00000010, ///< Output space has been allocated for vertex position or color/Z. TT 2.3. + RegCsPriv = 0x00000020, ///< There has been a compute pipeline private data, state or threadgroup update. TT 2.3. + WaveEnd = 0x00000040, ///< Wavefront completion. TT 2.3 + Event = 0x00000080, ///< An event has reached the top of a shader stage. TT 2.3 + EventCs = 0x00000100, ///< An event has reached the top of a compute shader stage. TT 2.3 + EventGfx1 = 0x00000200, ///< An event has reached the top of a shader stage for the second GFX pipe. TT 2.3 + Inst = 0x00000400, ///< The shader has executed an instruction. TT 2.3 + InstPc = 0x00000800, ///< The shader has explicitly written the PC value. TT 2.3 + InstUserData = 0x00001000, ///< The shader has written user data into the thread trace buffer. TT 2.3 + Issue = 0x00002000, ///< Provides information about instruction scheduling. TT 2.3 + Perf = 0x00004000, ///< The performance counter delta has been updated. TT 2.3 and below only. + RegCs = 0x00008000, ///< A compute state update packet has been received by the SPI. TT 2.3 + VmemExec = 0x00010000, ///< A previously issued VMEM instruction is now being sent to LDS/TA. TT 3.0 + AluExec = 0x00020000, ///< A previously issued VALU instruction is now being executed. TT 3.0 + ValuInst = 0x00040000, ///< A VALU instruction has been issued. TT 3.0. + WaveRdy = 0x00080000, ///< Mask of which waves became ready this cycle but did not issue an instruction. TT 3.0 + Immed1 = 0x00100000, ///< One wave issued an immediate instruction this cycle. TT 3.0. 
+ Immediate = 0x00200000, ///< One or more waves have issued an immediate instruction this cycle. TT 3.0. + UtilCounter = 0x00400000, ///< A new set of utilization counter values. TT 3.0. + RealTime = 0x00800000, ///< Output realtime. TT 3.3. + All = 0xFFFFFFFF ///< Enable all the above tokens. +}; + +/// Enumeration of register types whose reads/writes can be traced. Register reads are disabled by default as it can +/// generate a lot of traffic and cause the GPU to hang. +enum ThreadTraceRegTypeFlags : Pal::uint32 +{ + EventRegs = 0x00000001, ///< Event registers. TT 2.3. + DrawRegs = 0x00000002, ///< Draw registers. TT 2.3. + DispatchRegs = 0x00000004, ///< Dispatch registers. TT 2.3. + UserdataRegs = 0x00000008, ///< UserData Registers. Must be explicitly requested in TT 2.3. + MarkerRegs = 0x00000010, ///< Thread trace marker data regs. TT 2.3. + ShaderConfigRegs = 0x00000020, ///< Shader configuration state. TT 3.0. + ShaderLaunchStateRegs = 0x00000040, ///< Shader program launch state. TT 3.0. + GraphicsPipeStateRegs = 0x00000080, ///< Graphics pipeline state. TT 3.0. + AsyncComputeRegs = 0x00000100, ///< Async compute registers. TT 3.0. + GraphicsContextRegs = 0x00000200, ///< Graphics context registers. TT 3.0. + OtherConfigRegs = 0x00000400, ///< Other regs. TT 2.3. + AllRegWrites = 0x000007FF, ///< All reg writes other than OtherBusRegs. + OtherBusRegs = 0x00000800, ///< All write activity over gfx and compute buses. Debug only. TT 3.0. + AllRegReads = 0x00001000, ///< Not encouraged to be enabled. This can cause a GPU hang. + AllReadsAndWrites = 0xFFFFFFFF ///< All reads and writes. Not encouraged. This can cause a GPU hang. +}; + +/// Represents thread trace token types and register types that can be enabled to be reported in the trace data. If +/// a particular token type or reg type is unsupported, no error is returned and the thread trace is configured with +/// the minimum supported tokens in the user provided config. 
+struct ThreadTraceTokenConfig +{ + /// Mask of ThreadTraceTokenTypeFlags + uint32 tokenMask; + + /// Mask of ThreadTraceRegTypeFlags + uint32 regMask; +}; + +/// Specifies properties for a perf trace being added to a perf experiment. Input structure to +/// IPerfExperiment::AddThreadTrace(). +struct ThreadTraceInfo +{ + PerfTraceType traceType; ///< Type of trace to add. + uint32 instance; ///< Selected trace instance. + + union + { + struct + { + // Options common to all traces + uint32 bufferSize : 1; + + // Thread trace only options + uint32 threadTraceTargetSh : 1; + uint32 threadTraceTargetCu : 1; + uint32 threadTraceSh0CounterMask : 1; + uint32 threadTraceSh1CounterMask : 1; + uint32 threadTraceSimdMask : 1; + uint32 threadTraceVmIdMask : 1; + uint32 threadTraceRandomSeed : 1; + uint32 threadTraceShaderTypeMask : 1; + uint32 threadTraceIssueMask : 1; + uint32 threadTraceWrapBuffer : 1; + uint32 threadTraceStallBehavior : 1; + uint32 threadTraceTokenConfig : 1; + uint32 threadTraceStallAllSimds : 1; + uint32 threadTraceExcludeNonDetailShaderData : 1; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + uint32 threadTraceEnableExecPop : 1; +#else + uint32 placeholder2 : 1; +#endif + uint32 reserved : 16; + }; + uint32 u32All; + } optionFlags; + + struct + { + // Options common to all traces + size_t bufferSize; + + // Thread trace only options + ThreadTraceTokenConfig threadTraceTokenConfig; + uint32 threadTraceTargetSh; + uint32 threadTraceTargetCu; + uint32 threadTraceSh0CounterMask; + uint32 threadTraceSh1CounterMask; + uint32 threadTraceSimdMask; + uint32 threadTraceVmIdMask; + uint32 threadTraceRandomSeed; + PerfExperimentShaderFlags threadTraceShaderTypeMask; + uint32 threadTraceIssueMask; + bool threadTraceWrapBuffer; + uint32 threadTraceStallBehavior; + bool threadTraceStallAllSimds; + bool threadTraceExcludeNonDetailShaderData; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + bool threadTraceEnableExecPop; +#endif + } optionValues; +}; + +/// Reports 
thread trace data written when the trace is stopped (copied from internal SQ registers). +struct ThreadTraceInfoData +{ + uint32 curOffset; ///< Contents of SQ_THREAD_TRACE_WPTR register. + uint32 traceStatus; ///< Contents of SQ_THREAD_TRACE_STATUS register. + uint32 writeCounter; ///< Contents of SQ_THREAD_TRACE_CNTR register. +}; + +/// Describes the layout of a single shader engine's thread trace data. +struct ThreadTraceSeLayout +{ + uint32 shaderEngine; ///< Shader engine index. + uint32 computeUnit; ///< Compute unit index. + gpusize infoOffset; ///< Offset to ThreadTraceInfoData in memory. + gpusize infoSize; ///< Size in bytes reserved for ThreadTraceInfoData. + gpusize dataOffset; ///< Offset in bytes to the actual trace data. + gpusize dataSize; ///< Amount of trace data, in bytes. +}; + +/// Describes how the thread trace data is laid out. +struct ThreadTraceLayout +{ + uint32 traceCount; ///< Number of entries in traces[]. + ThreadTraceSeLayout traces[1]; ///< ThreadTraceSeLayout repeated (traceCount - 1) times. +}; + +/// Describes a single SPM counter instance. +struct SpmCounterData +{ + GpuBlock gpuBlock; ///< The kind of GPU block this counter measured. + uint32 instance; ///< Which specific global block instance this counter measured. + uint32 eventId; ///< The event that was measured by this counter. + uint32 offsetLo; ///< Byte offset within each sample to the lower 16-bit half of the counter data. + uint32 offsetHi; ///< Byte offset within each sample to the upper 16-bit half of the counter data. + bool is32Bit; ///< If the client must combine the independent 16-bit halves into a single 32-bit value. + /// If this is false offsetLo points to the full 16-bit data value and offsetHi is ignored. +}; + +/// All information required to parse the counter data out of a SpmTrace results buffer. +/// +/// Note that the hardware will continue to write samples to the SPM ring buffer even if it runs out of unused space. 
+/// The hardware will simply wrap the ring's write pointer back around to the first sample's location. Each subsequent +/// sample will overwrite the oldest sample in the ring. When the trace is finished we will have at most @ref +/// maxNumSamples valid samples. +/// +/// PAL doesn't zero out the ring memory so it's generally hard for the client to distinguish valid samples from random +/// data present in unused sample locations. PAL does guarantee that the final sample location in the ring has its +/// timestamp zeroed out before the SPM trace starts. This means this last timestamp will only be non-zero if the ring +/// has completely filled up and the WrPtr has wrapped one or more times. The client must inspect this timestamp when +/// parsing the sample data: +/// 1. The last timestamp is zero. The ring did not wrap. The oldest sample is at @ref sampleOffset. The ring's write +/// pointer tells us how many samples were written. From the write pointer onwards the ring contains invalid data. +/// 2. The last timestamp is non-zero. The ring did wrap. The ring's write pointer points to the oldest sample, +/// effectively a random sample offset into the ring. The full ring contains valid sample data but it's not in +/// oldest-to-newest order, it's shifted. The client can walk the ring from the write pointer's location (wrapping +/// as they go) to parse all @ref maxNumSamples samples out in oldest-to-newest order. +struct SpmTraceLayout +{ + gpusize offset; ///< Byte offset into the bound GPU memory where the spm trace data begins. + /// The @ref wrPtrOffset and @ref sampleOffset are relative to this value. + uint32 wrPtrOffset; ///< Byte offset within SPM trace data to the HW's write pointer (WrPtr) DWORD. + /// The WrPtr's value is an offset relative to @ref sampleOffset. Don't assume this is + /// a byte offset (see @ref wrPtrGranularity). The WrPtr's value shows where the HW's + /// theoretical next sample would go. 
This value may wrap back to zero if the HW runs out of + /// space in the SPM ring buffer. + uint32 wrPtrGranularity; ///< The WrPtr's granularity. Multiply WrPtr's value by this value to get a byte offset. + uint32 sampleOffset; ///< Byte offset within the SPM trace data to the array of samples. The HW will write the + /// first sample here but it will be overwritten if the ring wraps (see the top comment). + uint32 sampleStride; ///< The distance between consecutive samples in bytes. May include empty padding. + uint32 maxNumSamples; ///< The maximum number of samples the HW can write before wrapping. The SPM ring buffer + /// ends at sampleOffset + sampleStride * maxNumSamples. + uint32 numCounters; ///< The true length of counterData. The client must allocate extra memory for the array. + + SpmCounterData counterData[1]; ///< The layout and identity of the counters in the samples. +}; + +/// Represents the information that is stored in the DF SPM trace metadata buffer. +struct DfSpmTraceMetadataLayout +{ + uint32 numRecordPairs; ///< The number of 64-byte blocks written by this trace. There are two time segments + ///< per 64-byte block so we have to check the lastSpmPkt bit to see which half of + ///< the last 64-byte block is the last packet. + uint32 padding; ///< Padding to match what the compiler does by default. + uint64 beginTimestamp; ///< The DF timestamp at the start of the DF SPM trace. + uint64 endTimestamp; ///< The DF timestamp at the finish of the DF SPM trace. +}; + +/// Specifies properties for creation of an @ref IPerfExperiment object. Input structure to +/// IDevice::CreatePerfExperiment().
+struct PerfExperimentCreateInfo +{ + union + { + struct + { + uint32 cacheFlushOnCounterCollection : 1; + uint32 sampleInternalOperations : 1; + uint32 sqShaderMask : 1; + uint32 sqWgpShaderMask : 1; + uint32 reserved : 28; + }; + uint32 u32All; + } optionFlags; + + struct + { + bool cacheFlushOnCounterCollection; + bool sampleInternalOperations; + PerfExperimentShaderFlags sqShaderMask; ///< GpuBlock::Sq counters only look at these shader types. + PerfExperimentShaderFlags sqWgpShaderMask; ///< GpuBlock::SqWgp counters only look at these shader types. + } optionValues; +}; + +/** + *********************************************************************************************************************** + * @interface IPerfExperiment + * @brief Set of performance profiling activities to be performed over a specific range of commands in a command + * buffer. + * + * @warning The details of building a performance experiment are not very well documented here. Please see your local + * hardware performance expert for more details until this documentation can be fully fleshed out. + * + * @see IDevice::CreatePerfExperiment + *********************************************************************************************************************** + */ +class IPerfExperiment : public IGpuMemoryBindable +{ +public: + /// Adds the specified performance counter to be tracked as part of this perf experiment. + /// + /// @param [in] counterInfo Specifies which counter to add: which hardware block, instance, any options, etc. + /// + /// @returns Success if the counter was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddCounter( + const PerfCounterInfo& counterInfo) = 0; + + /// Queries the layout of counter results in memory for this perf experiment. + /// + /// @param [out] pLayout Layout describing the begin and end offset of each counter in the resulting GPU memory once + /// this perf experiment is executed. 
Should correspond with counters added via AddCounter(). + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. + virtual Result GetGlobalCounterLayout( + GlobalCounterLayout* pLayout) const = 0; + + /// Adds the specified thread trace to be recorded as part of this perf experiment. + /// + /// @param [in] traceInfo Specifies what type of trace to record, which block instance to trace, and options, etc. + /// + /// @returns Success if the trace was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddThreadTrace( + const ThreadTraceInfo& traceInfo) = 0; + + /// Adds the specified DfSpmTrace to be recorded as part of this perf experiment. + /// + /// @param [in] dfSpmCreateInfo Specifies the parameters of the df spm trace and + /// provides the list of perf counters. + /// + /// @returns Success if the df spm trace was successfully added to the experiment, + /// otherwise an appropriate error code. + virtual Result AddDfSpmTrace( + const SpmTraceCreateInfo& dfSpmCreateInfo) = 0; + + /// Adds the specified SpmTrace to be recorded as part of this perf experiment. + /// + /// @param [in] spmCreateInfo Specifies the parameters of the spm trace and provides the list of perf counters. + /// + /// @returns Success if the spm trace was successfully added to the experiment, otherwise an appropriate error code. + virtual Result AddSpmTrace( + const SpmTraceCreateInfo& spmCreateInfo) = 0; + + /// Queries the layout of thread trace results in memory for this perf experiment. + /// + /// @param [out] pLayout Layout describing how the results of each thread trace will be written to GPU memory when + /// this perf experiment is executed. Should correspond with thread traces added via AddThreadTrace(). + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code.
+ virtual Result GetThreadTraceLayout( + ThreadTraceLayout* pLayout) const = 0; + + /// Queries the layout of streaming counter trace results in memory for this perf experiment. + /// + /// The caller is expected to call this function twice. The first time with pLayout->numCounters = 0 which prompts + /// PAL to only set numCounters to the correct number of SPM counters and return. The second call with a non-zero + /// numCounters prompts PAL to fill out the full structure and counterData array. + /// + /// Note that @ref SpmTraceLayout contains a variable length array. The caller must allocate enough memory for + /// an additional "numCounters - 1" copies of @ref SpmCounterData. + /// + /// @param [out] pLayout Layout describing the layout of the streaming counter trace results in the resulting + /// GPU memory once this perf experiment is executed. + /// + /// @returns Success if the layout was successfully returned in pLayout, otherwise an appropriate error code. + virtual Result GetSpmTraceLayout( + SpmTraceLayout* pLayout) const = 0; + + /// Finalizes the performance experiment preparing it for execution. + /// + /// @returns Success if the operation executed successfully, otherwise an appropriate error code. + virtual Result Finalize() = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// calling the proper create method.
+ IPerfExperiment() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IPerfExperiment() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h b/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h index 3a1e027170..54c32b8b80 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPipeline.h @@ -1,909 +1,896 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palPipeline.h - * @brief Defines the Platform Abstraction Library (PAL) IPipeline interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palGpuMemoryBindable.h" -#include "palDestroyable.h" -#include "palImage.h" -#include "palShaderLibrary.h" -#include "palSpan.h" -#include - -namespace Util -{ -namespace Abi -{ -union ApiHwShaderMapping; -enum class HardwareStage : uint32; -} - -namespace HsaAbi -{ -struct KernelArgument; -} -} - -namespace Pal -{ -struct GpuMemSubAllocInfo; -enum class PrimitiveTopology : uint8; - -/// Specifies a shader type (i.e., what stage of the pipeline this shader was written for). -enum class ShaderType : uint32 -{ - Compute = 0, - Task, - Vertex, - Hull, - Domain, - Geometry, - Mesh, - Pixel, - - Count -}; - -/// Number of shader program types supported by PAL. -constexpr uint32 NumShaderTypes = static_cast(ShaderType::Count); - -/// Maximum number of viewports. -constexpr uint32 MaxViewports = 16; - -/// Maximum number of supported stream-output declaration entries by any PAL device. -constexpr uint32 MaxStreamOutEntries = 512; - -/// Specifies a general primitive category without differentiating between a strip or list and without specifying -/// whether a the primitive will include adjacency info or not. 
-enum class PrimitiveType : uint32 -{ - Point = 0x0, - Line = 0x1, - Triangle = 0x2, - Rect = 0x3, - Quad = 0x4, - Patch = 0x5, - Count -}; - -/// Specifies the target range of Z values after viewport transform. -enum class DepthRange : uint32 -{ - ZeroToOne = 0x0, - NegativeOneToOne = 0x1, -}; - -/// Specifies whether the v/t texture coordinates of a point sprite map 0 to 1 from top to bottom or bottom to top. -enum class PointOrigin : uint32 -{ - UpperLeft = 0x0, - LowerLeft = 0x1, - Count -}; - -/// Specifies primitive's shade mode. -enum class ShadeMode : uint32 -{ - Gouraud = 0x0, ///< Gouraud shading mode, pixel shader input is interpolation of vertex - Flat = 0x1, ///< Flat shading mode, pixel shader input from provoking vertex - Count -}; - -/// Defines a logical operation applied between the color coming from the pixel shader and the current value in the -/// target image. -enum class LogicOp : uint32 -{ - Copy = 0x0, - Clear = 0x1, - And = 0x2, - AndReverse = 0x3, - AndInverted = 0x4, - Noop = 0x5, - Xor = 0x6, - Or = 0x7, - Nor = 0x8, - Equiv = 0x9, - Invert = 0xA, - OrReverse = 0xB, - CopyInverted = 0xC, - OrInverted = 0xD, - Nand = 0xE, - Set = 0xF, -}; - -/// Shader Engine Dispatch Interleave Size -/// -/// This determines how many Threads or Threadgroups are sent to one SE before switching to the next SE. -/// Work is always distributed in Threadgroups though. -/// -/// The 1D values are specified in Threads and the Threadgroups are walked in a 1D typewriter fashion. -/// The 2D values are specified in Threadgroups and also walked in typewriter fashion (in groups of the 2D pattern). -/// -/// Clients should check for 1D and 2D support separately in: -/// - DeviceProperties::gfxipProperties::flags::support1dDispatchInterleave -/// - DeviceProperties::gfxipProperties::flags::support2dDispatchInterleave -/// -/// Default will result in "Disable" for chips which do not support 1D or 2D. -/// Disable means that every Threadgroup is issued to the next SE. 
-enum class DispatchInterleaveSize : uint32 -{ - Default, - Disable, - - _1D_64_Threads, - _1D_128_Threads, - _1D_256_Threads, - _1D_512_Threads, - - _2D_1x1_ThreadGroups, - _2D_1x2_ThreadGroups, - _2D_1x4_ThreadGroups, - _2D_1x8_ThreadGroups, - _2D_1x16_ThreadGroups, - - _2D_2x1_ThreadGroups, - _2D_2x2_ThreadGroups, - _2D_2x4_ThreadGroups, - _2D_2x8_ThreadGroups, - - _2D_4x1_ThreadGroups, - _2D_4x2_ThreadGroups, - _2D_4x4_ThreadGroups, - - _2D_8x1_ThreadGroups, - _2D_8x2_ThreadGroups, - - _2D_16x1_ThreadGroups, - - Count, -}; - -/// Specifies whether to override binning setting for pipeline. Enum value of Default follows the PBB global setting. -/// Enable or Disable value overrides PBB global setting for the pipeline and sets binning accordingly. -enum class BinningOverride : uint32 -{ - Default = 0x0, - Disable = 0x1, - Enable = 0x2, - Count -}; - -/// GPU behavior is controlled by LDS_GROUP_SIZE. -enum class LdsPsGroupSizeOverride : uint32 -{ - Default = 0x0, - SingleWave = 0x1, - DoubleWaves = 0x2 -}; - -/// Tri-state enum which controls enabling or disabling a feature or behavior, or letting PAL select a sensible default -enum class OverrideMode : int32 -{ - Default = -1, ///< PAL selects the default behavior, which could be either enabled or disabled. - Disabled = 0, ///< Force to disabled. Equal to set to False. - Enabled = 1, ///< Force to enabled. Equal to set to True. -}; - -/// Enumerates the depth clamping modes a pipeline can use. -enum class DepthClampMode : uint32 -{ - Viewport = 0x0, ///< Clamps to the viewport min/max depth bounds - _None = 0x1, ///< Disables depth clamping -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 950 - ZeroToOne = 0x2, ///< Clamps between 0.0 and 1.0. - UserDefined = 0x3, ///< Clamps based on ViewportParams::userDepthClamp. -#else - UserDefined = 0x2, ///< Clamps based on ViewportParams::userDepthClamp. -#endif - - /// @note Do not add entries 0x4 or higher. DynamicGraphicsState::depthClampMode is a 2-bit field. 
- - // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either - // undefing None before including this header or using _None when dealing with PAL. -#ifndef None - None = _None, ///< Disables depth clamping -#endif -}; - -/// Common flags controlling creation of both compute and graphics pipeline. -union PipelineCreateFlags -{ - struct - { - uint32 clientInternal : 1; ///< Internal pipeline not created by the application. - uint32 reverseWorkgroupOrder : 1; ///< Indicates that any Dispatch using this pipeline should execute in - /// reverse workgroup order. This superceeds the flag on the CommandBuffer - /// (dispatchPingPongWalk) - always forcing reverse workgroup order! This - /// is a best effort as not all implementations or Queues may support this. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Constant definining the max number of view instance count that is supported. -constexpr uint32 MaxViewInstanceCount = 6; - -/// Specifies graphic pipeline view instancing state. -struct ViewInstancingDescriptor -{ - uint32 viewInstanceCount; ///< The view instance count of the graphic pipeline - uint32 viewId[MaxViewInstanceCount]; ///< The view instance ids. - uint32 renderTargetArrayIdx[MaxViewInstanceCount]; ///< The instance render target array index, can be - /// used in hardware accelerated stereo rendering. - uint16 viewportArrayIdx[MaxViewInstanceCount]; ///< The instance viewport array index, can be - /// used in hardware accelerated stereo rendering. - bool enableMasking; ///< Indicate whether instance masking is enabled. -}; - -// Specifies the input parameters for the MSAA coverage out feature. MSAA coverage out is used in conjunction with a -// single sampled color image. This feature exports a mask indicating which samples would have been used if the -// image had been multi-sampled. 
The mask is exported to the specified channel of the MRT pointing to the rendered -// image. That is, the MRT must be an active bound render target. This MSAA mask data can then be post-processed. -struct MsaaCoverageOutDescriptor -{ - union - { - struct - { - uint32 enable : 1; ///< Set to true to enable render target channel output - uint32 numSamples : 4; ///< Number of samples to export - uint32 mrt : 3; ///< Which MRT to export to. - uint32 channel : 2; ///< Which channel to export to (x = 0, y = 1, z = 2, w = 3) - uint32 reserved : 22; - }; - - uint32 u32All; - } flags; -}; - -/// Specifies properties about an indirect function belonging to a compute @ref IPipelne object. Part of the input -/// structure to IDevice::CreateComputePipeline(). -struct ComputePipelineIndirectFuncInfo -{ - const char* pSymbolName; ///< ELF Symbol name for the associated function. Must not be null. - gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during - /// pipeline creation. -}; - -/// Specifies properties for creation of a compute @ref IPipeline object. Input structure to -/// IDevice::CreateComputePipeline(). -struct ComputePipelineCreateInfo -{ - PipelineCreateFlags flags; ///< Flags controlling pipeline creation. - - const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI - /// interface. The Pipeline ELF contains pre-compiled shaders, - /// register values, and additional metadata. - size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes. - uint32 maxFunctionCallDepth; ///< Maximum depth for indirect function calls. Not used for a new - /// path ray-tracing pipeline as the compiler has pre-calculated - /// stack requirements. - bool disablePartialDispatchPreemption; ///< Prevents scenarios where a subset of the dispatched thread groups are - /// preempted and the remaining thread groups run to completion. 
This - /// can occur when thread group granularity preemption is available and - /// instruction level (CWSR) is not. This setting is useful for allowing - /// dispatches with interdependent thread groups. - DispatchInterleaveSize interleaveSize; ///< Controls how many thread groups are sent to one SE before switching to - /// the next one. - - /// PAL expects a fixed 3D thread group size for each compute pipeline but the HSA ABI supports dynamic group sizes. - /// If this pipeline's ELF binary metadata doesn't specify a fixed thread group size, this should be used to force - /// a particular thread group size. If this extent is set to all zeros PAL will use the metadata's group size. - /// This field is not supported on PAL ABI ELFs, it should be set to all zeros. - Extent3d threadsPerGroup; - TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle - /// issuing of low priority waves when it detects too many higher priority waves are - /// failing to schedule due to resource contraints. - - const char* pKernelName; ///< When create pipeline with hsa ELF binary of multiple kernels, need to set one - /// kernel to create the pipeline. null means only one kernel in ELF binary. - -}; - -/// Specifies information about the viewport behavior of an assembled graphics pipeline. Part of the input -/// structure @ref GraphicsPipelineCreateInfo. -struct ViewportInfo -{ - bool depthClipNearEnable; ///< Enable clipping based on Near Z coordinate. - bool depthClipFarEnable; ///< Enable clipping based on Far Z coordinate. - DepthRange depthRange; ///< Specifies Z dimensions of screen space (i.e., post viewport transform: - /// 0 to 1 or -1 to 1). -}; - -/// Specifies edgeRule for rasterization -enum class EdgeRuleMode : uint32 -{ - D3dCompliant = 0x0, ///< Use rasterization edge-rules which comply with the D3D spec. - OpenGlDefault = 0x1, ///< Use rasterization edge-rules compatible with the default OpenGL driver. 
-}; - -/// Specifies Rasterizer state in properties for creation of a graphics -struct RasterizerState -{ - PointOrigin pointCoordOrigin; ///< Controls texture coordinate orientation for point sprites. - bool expandLineWidth; ///< If true, line primitives will have their width expanded by 1/cos(a) - /// where a is the minimum angle from horizontal or vertical. - /// This can be used in conjunction with PS patching for a client to - /// implement line antialiasing. - ShadeMode shadeMode; ///< Specifies shading mode, Gouraud or Flat - bool rasterizeLastLinePixel; ///< Specifies whether to draw last pixel in a line. - bool outOfOrderPrimsEnable; ///< Enables out-of-order primitive rasterization. PAL silently - /// ignores this if it is unsupported in hardware. - bool perpLineEndCapsEnable; ///< Forces the use of perpendicular line end caps as opposed to - /// axis-aligned line end caps during line rasterization. - BinningOverride binningOverride; ///< Binning setting for this pipeline. - - DepthClampMode depthClampMode; ///< Depth clamping behavior - - union - { - struct - { - uint8 clipDistMaskValid : 1; ///< Whether or not @ref clipDiskMask, below, is valid. - uint8 cullDistMaskValid : 1; ///< Whether or not @ref cullDistMask, below, is valid. - uint8 reserved : 6; - }; - uint8 u8All; ///< All the flags as a single value. - } flags; - - uint8 cullDistMask; ///< Mask of which cullDistance exports to leave enabled. - uint8 clipDistMask; ///< Mask of which clipDistance exports to leave enabled. - bool dx10DiamondTestDisable; ///< Disable DX10 diamond test during line rasterization. - EdgeRuleMode edgeRule; -}; - -/// Specifies Per-MRT color target info in olor target state -struct ColorTargetInfo -{ - SwizzledFormat swizzledFormat; ///< Color target format and channel swizzle. Set the format to invalid - /// if no color target will be bound at this slot. - uint8 channelWriteMask; ///< Color target write mask. 
Bit 0 controls the red channel, bit 1 is - /// green, bit 2 is blue, and bit 3 is alpha. - bool forceAlphaToOne; ///< Treat alpha as one regardless of the shader output. Ignored unless - /// supportAlphaToOne is set in DeviceProperties. -}; - -/// Specifies color target state in properties for creation of a graphics -struct ColorTargetState -{ - bool alphaToCoverageEnable; ///< Enable alpha to coverage. - bool dualSourceBlendEnable; ///< Blend state bound at draw time will use a dual source blend mode. - LogicOp logicOp; ///< Logic operation to perform. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 904 - bool uavExportSingleDraw; ///< When UAV export is enabled, acts as a hint that only a single draw - /// is done on a color target with this or subsequent pipelines before - /// a barrier. Improves performance by allowing pipelines to overlap. -#endif - - ColorTargetInfo target[MaxColorTargets]; ///< Per-MRT color target info. -}; - -/// Specifies properties for creation of a graphics @ref IPipeline object. Input structure to -/// IDevice::CreateGraphicsPipeline(). -struct GraphicsPipelineCreateInfo -{ - PipelineCreateFlags flags; ///< Flags controlling pipeline creation. - - const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI - /// interface. The Pipeline ELF contains pre-compiled shaders, - /// register values, and additional metadata. - size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes. - const IShaderLibrary** ppShaderLibraries; ///< An array of graphics @ref IShaderLibrary object. pPipelineBinary - /// and ppShaderLibraries can't be valid at the same time. - /// If the client does not know whether the pipeline is complete, - /// it can add the shader library for a "dummy partial pipeline" to - /// the end of the array to ensure the pipeline is complete. 
- /// In practice, "complete" means "has a PS on hardware that requires - /// it", although that is an implementation detail that the client - /// does not need to know. - size_t numShaderLibraries; ///< Number of graphics shaderLibrary object in ppShaderLibraries. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 959 - bool useLateAllocVsLimit; ///< If set, use the specified lateAllocVsLimit instead of PAL internally - /// determining the limit. - uint32 lateAllocVsLimit; ///< The number of VS waves that can be in flight without having param - /// cache and position buffer space. If useLateAllocVsLimit flag is set, - /// PAL will use this limit instead of the PAL-specified limit. -#endif - bool useLateAllocGsLimit; ///< If set, use the specified lateAllocVsLimit instead of PAL internally - /// determining the limit. - uint32 lateAllocGsLimit; ///< Controls GS LateAlloc val (for pos/prim allocations NOT param cache) - /// on NGG pipelines. Can be no more than 127. - struct - { - struct - { - PrimitiveType primitiveType; ///< Basic primitive category: points, line, triangles, patches. - bool topologyIsPolygon; ///< Indicates that triangle primitives are combined to represent more - /// complex polygons. Only valid for triangle primitive types. - uint32 patchControlPoints; ///< Number of control points per patch. Only required if primitiveType - /// is PrimitiveType::Patch. - } topologyInfo; ///< Various information about the primitive topology that will be used with this pipeline. - /// All of this info must be consistent with the full topology specified by - /// ICmdBuffer::SetPrimitiveTopology() when drawing with this pipeline bound. - - /// Number of vertex buffer slots which are accessed by this pipeline. Behavior is undefined if the pipeline - /// tries to access a vertex buffer slot outside the range [0, vertexBufferCount). 
It is generally advisable - /// to make this the minimum value possible because that reduces the number of vertex buffer slots PAL has to - /// maintain for this pipeline when recording command buffers. - uint32 vertexBufferCount; - } iaState; ///< Input assembler state. - - RasterizerState rsState; ///< Rasterizer state. - ColorTargetState cbState; ///< Color target state. - - ViewInstancingDescriptor viewInstancingDesc; ///< Descriptor describes view instancing state - /// of the graphics pipeline - MsaaCoverageOutDescriptor coverageOutDesc; ///< Descriptor describes input parameters for MSAA coverage out. - ViewportInfo viewportInfo; ///< Viewport info. - DispatchInterleaveSize taskInterleaveSize; ///< Ignored for pipelines without a task shader. For pipelines with - /// a task shader, controls how many thread groups are sent to one - /// SE before switching to the next one. - LdsPsGroupSizeOverride ldsPsGroupSizeOverride; ///< Whether to override ldsPsGroupSize setting for pipeline. - - TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle - /// issuing of low priority waves when it detects too many higher priority waves are - /// failing to schedule due to resource contraints. - bool noForceReZ; ///< Disables the ability for PAL to force ReZ modes outside of what was chosen by - /// the compiler for this pipeline. -}; - -/// The graphic pipeline view instancing information. This is used to determine if hardware accelerated stereo rendering -/// can be enabled for a graphic pipeline. -struct GraphicPipelineViewInstancingInfo -{ - union - { - struct - { - uint32 shaderUseViewId : 1; ///< If any shader in pipeline uses view id. - uint32 gsExportRendertargetArrayIndex : 1; ///< If gs exports render target array index, - /// must be 0 if there is no gs. - uint32 gsExportViewportArrayIndex : 1; ///< If gs exports viewport array index, - /// must be 0 if there is no gs. 
- uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 apiShaderFlags; - }; - - const ViewInstancingDescriptor* pViewInstancingDesc; ///< View Instancing descriptor -}; - -/// Reports properties of a compiled pipeline. This includes hashes for the pipeline and shaders that the client can -/// use to correlate PAL pipeline/shader dumps with corresponding API-level pipelines/shaders. -struct PipelineInfo -{ - const char* pName; ///< Non-owning pointer to a null-terminated string containing human-readable - ///< name of this pipeline. nullptr indicates no name was provided. - - PipelineHash internalPipelineHash; ///< 128-bit identifier extracted from this pipeline's ELF binary, composed of - /// the state the compiler decided was appropriate to identify the compiled - /// shaders. The lower 64 bits are "stable"; the upper 64 bits are "unique". - - struct - { - ShaderHash hash; ///< Unique 128-bit identifier for this shader. 0 indicates there is no shader bound for - /// the corresponding shader stage. - } shader[NumShaderTypes]; ///< Array of per-shader pipeline properties. - - union - { - struct - { - uint32 hsaAbi : 1; ///< This pipeline uses the HSA ABI (i.e. bind arguments not user-data) - uint32 usesCps : 1; ///< This pipeline uses continuations passing shaders (CPS). In an archive pipeline, - /// this bit is set iff it is set in any constituent pipeline. - uint32 cpsGlobal : 1; ///< If using continuations passing shaders (CPS), stack is in global rather than - /// scratch. In an archive pipeline, this bit is set iff it is set in any - /// constituent pipeline. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< All flags combined as a single uint32. - } flags; ///< Pipeline properties. - - struct - { - union - { - struct - { - uint32 perSampleShading : 1; ///< Shader instructions want per-sample execution. - uint32 usesSampleMask : 1; ///< Shader is using sample mask. 
- uint32 enablePops : 1; ///< Primitive order pixel shader is enabled. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< All flags combined as a single uint32. - } flags; - } ps; ///< Pixel shader properties. - - uint64 resourceMappingHash; ///< 64-bit hash of the resource mapping used when compiling the pipeline, - /// if available (0 otherwise). - - /// The GPU address and size in bytes of the traceRays table, or { 0, 0 } if none. - /// The format of the table is private to the compiler. - /// Compiled code requires a buffer descriptor for the traceRays table to be passed in to the launch kernel - /// in a particular place. - gpusize traceRaysTable; - gpusize traceRaysTableSize; - - /// Pointer to the the traceRays table for debugging purpose, or nullptr if none. - const char* pTraceRaysTable; - - /// Pointer to and size in bytes of the shader identifiers table, or { nullptr, 0 } if none. - /// The table has an entry for each export and then each hit group specified when the pipeline was compiled. - /// Each entry has a pointer to the shader or hit group name in the same block of data (always 64 bits even - /// on a 32-bit host), then the 32 byte shader identifier whose format is agreed between the compiler and - /// GPURT's library code, for a total of 40 bytes per entry. - const void* pShaderIdentifiers; - size_t shaderIdentifiersSize; - - uint32 unifiedRgsNameHash; ///< 32-bit hash of unified RGS name, 0 otherwise - -}; - -/// A structure that represents any 3D arrangement of threads or thread groups as part of a compute shader dispatch. -/// -/// This structure is halfway between Extent3d and Offset3d, depending on the context it may represent an offset or -/// an extent. Essentially it's meaning is tied to the concept of 3D thread or thread group grids rather than generic -/// contexts like "extent" or "offset". Whether it represents threads or thread groups is also context specific. 
-struct DispatchDims -{ - uint32 x; ///< Threads or thread groups in the X dimension. - uint32 y; ///< Threads or thread groups in the Y dimension. - uint32 z; ///< Threads or thread groups in the Z dimension. - - /// Computes the volume of this 3D arrangement of threads or thread groups. - /// - /// @returns the total number of threads or threads groups this struct represents. - uint32 Flatten() const { return x * y * z; } -}; - -// There are some places where we'd like to directly cast DispatchDims to an array of three uint32s. -static_assert(sizeof(DispatchDims) == sizeof(uint32) * 3, "DispatchDims not castable to uint32*"); - -/// Component-wise addition of two DispatchDims. -/// -/// @param [in] l The left-hand argument. -/// @param [in] r The right-hand argument. -/// -/// @returns A new DispatchDims which contains the sum of 'l' and 'r' along each dimension. -inline DispatchDims operator+(DispatchDims l, DispatchDims r) { return {l.x + r.x, l.y + r.y, l.z + r.z}; } - -/// Component-wise addition of one DispatchDims into another. -/// -/// @param [in] l The left-hand argument. -/// @param [in] r The right-hand argument. -/// -/// @returns A reference to 'l' after it is updated to the sum of 'l' and 'r'. -inline DispatchDims& operator+=(DispatchDims& l, DispatchDims r) { return l = (l + r); } - -/// Component-wise multiplication of two DispatchDims. -/// -/// @param [in] l The left-hand argument. -/// @param [in] r The right-hand argument. -/// -/// @returns A new DispatchDims which contains the product of 'l' and 'r' along each dimension. -inline DispatchDims operator*(DispatchDims l, DispatchDims r) { return {l.x * r.x, l.y * r.y, l.z * r.z}; } - -/// Component-wise multiplication of one DispatchDims into another. -/// -/// @param [in] l The left-hand argument. -/// @param [in] r The right-hand argument. -/// -/// @returns A reference to 'l' after it is updated to the product of 'l' and 'r'. 
-inline DispatchDims& operator*=(DispatchDims& l, DispatchDims r) { return l = (l * r); } - -/// Used to represent API level shader stage. -enum ShaderStageFlagBits : uint32 -{ - ApiShaderStageCompute = (1u << static_cast(ShaderType::Compute)), - ApiShaderStageTask = (1u << static_cast(ShaderType::Task)), - ApiShaderStageVertex = (1u << static_cast(ShaderType::Vertex)), - ApiShaderStageHull = (1u << static_cast(ShaderType::Hull)), - ApiShaderStageDomain = (1u << static_cast(ShaderType::Domain)), - ApiShaderStageGeometry = (1u << static_cast(ShaderType::Geometry)), - ApiShaderStageMesh = (1u << static_cast(ShaderType::Mesh)), - ApiShaderStagePixel = (1u << static_cast(ShaderType::Pixel)), -}; - -/// Reports shader stats. Multiple bits set in the shader stage mask indicates that multiple shaders have been combined -/// due to HW support. The same information will be repeated for both the constituent shaders in this case. -struct ShaderStats -{ - uint32 shaderStageMask; ///< Indicates the stages of the pipeline this shader is - /// used for. If multiple bits are set, it implies - /// shaders were merged. See @ref ShaderStageFlagBits. - CommonShaderStats common; ///< The shader compilation parameters for this shader. - /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum - /// of any architectural restriction and any client-requested limit intended to increase the number of waves in - /// flight. - uint32 numAvailableVgprs; - /// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum - /// of any architectural restriction and any client-requested limit intended to increase the number of waves in - /// flight. - uint32 numAvailableSgprs; - size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. - ShaderHash palShaderHash; ///< Internal hash of the shader compilation data used by PAL. 
- - union - { - struct - { - uint32 writesUAV : 1; ///< This shader performs writes to UAVs. - uint32 writesDepth : 1; ///< Indicates explicit depth writes performed by the shader stage. - uint32 streamOut : 1; ///< The shader performs stream out of shader generated data. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< All flags combined as a single uint32. - - } shaderOperations; ///< Flags depicting shader operations. - - struct - { - DispatchDims numThreadsPerGroup; ///< Number of compute threads per thread group in X, Y, and Z dimensions. - } cs; ///< Parameters specific to compute shader only. - - union - { - struct - { - uint8 copyShaderPresent : 1; ///< Indicates that the copy shader data is valid. - uint8 reserved : 7; ///< Reserved for future use. - }; - uint8 u8All; ///< All the flags as a single value. - } flags; ///< Flags related to this shader data. - - CommonShaderStats copyShader; ///< This data is valid only when the copyShaderPresent flag above is set. -}; - - /** - *********************************************************************************************************************** - * @interface IPipeline - * @brief Monolithic object containing all shaders and a large amount of "shader adjacent" state. Separate concrete - * implementations will support compute or graphics pipelines. - * - * @see IDevice::CreateComputePipeline() - * @see IDevice::CreateGraphicsPipeline() - * @see IDevice::LoadPipeline() - *********************************************************************************************************************** - */ -class IPipeline : public IDestroyable -{ -public: - /// Returns PAL-computed properties of this pipeline and its corresponding shaders. - /// - /// @returns Property structure describing this pipeline. - virtual const PipelineInfo& GetInfo() const = 0; - - /// Returns a list of GPU memory allocations used by this pipeline. 
- /// - /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value - /// reports the number of GPU memory allocations. - /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it - /// will reflect the number of allocations that make up this pipeline. If - /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number - /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the - /// number of entries in pAllocInfoList that are valid. - /// @returns Success if the allocation info was successfully written to the buffer. - /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. - /// + ErrorInvalidPointer if pNumEntries is nullptr. - virtual Result QueryAllocationInfo( - size_t* pNumEntries, - GpuMemSubAllocInfo* const pAllocInfoList) const = 0; - - /// Gives the client access to the resource ID used for internal Pal events. - /// EX: Resource Create, Resource Bind, Resource Destroy. - /// - /// @returns The Resource ID. - virtual const void* GetResourceId() const = 0; - - /// Obtains the binary code object for this pipeline. - /// - /// @param [in, out] pSize Represents the size of the shader ISA code. - /// - /// @param [out] pBuffer If non-null, the pipeline ELF is written in the buffer. If null, the size required - /// for the pipeline ELF is given out in the location pSize. - /// - /// @returns Success if the pipeline binary was fetched successfully. - /// +ErrorUnavailable if the pipeline binary was not fetched successfully. - virtual Result GetCodeObject( - uint32* pSize, - void* pBuffer) const = 0; - - /// Obtains the pointer of code object with ELF format according to the shader type. Returned ELF object is not - /// guaranteed to be unique with different shader type, because a single code object can contain multiple shaders. 
- /// - /// @param [in] shaderType The shader stage for which the code object are requested. - /// @param [out] pSize The size of the ELF binary. - /// - /// @returns The pointer of ELF binary which contains requested shader stage. - virtual const void* GetCodeObjectWithShaderType( - ShaderType shaderType, - size_t* pSize) const = 0; - - /// Obtains the shader pre and post compilation stats/params for the specified shader stage. - /// - /// @param [in] shaderType The shader stage for which the stats are requested. - /// - /// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for - /// the shader stage mentioned in shaderType. This cannot be nullptr. - /// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the - /// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0. - /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. - /// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error - /// occured. - virtual Result GetShaderStats( - ShaderType shaderType, - ShaderStats* pShaderStats, - bool getDisassemblySize) const = 0; - - /// Obtains the compiled shader ISA code for the shader stage specified. - /// - /// @param [in] shaderType The shader stage for which the shader cache entry is requested. - /// - /// @param [in, out] pSize Represents the size of the shader ISA code. - /// - /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required - /// for the shader ISA is given out in the location pSize. - /// - /// @returns Success if the shader ISA code was fetched successfully. - /// +ErrorUnavailable if the shader ISA code was not fetched successfully. 
- virtual Result GetShaderCode( - ShaderType shaderType, - size_t* pSize, - void* pBuffer) const = 0; - - /// Obtains the generated performance data for the shader stage specified. - /// - /// @param [in] hardwareStage The hardware stage of the shader which the performance data is requested. - /// @param [in, out] pSize Represents the size of the performance data. - /// @param [out] pBuffer If non-null, the performance data is written in the buffer. If null, the size - /// required for the performance data is given out in the location pSize. - /// - /// @returns Success if the performance data was fetched successfully. - /// +ErrorUnavailable if the performance data was not fetched successfully. - virtual Result GetPerformanceData( - Util::Abi::HardwareStage hardwareStage, - size_t* pSize, - void* pBuffer) = 0; - - /// Notifies PAL that this pipeline may make indirect function calls to any function contained within any of the - /// specified @ref IShaderLibrary objects. This gives PAL a chance to perform any late linking steps required to - /// valid execution of the possible function calls (this could include adjusting hardware resources such as GPRs - /// or LDS space for the pipeline). - /// - /// This may be called multiple times on the same pipeline object. Subsequent calls do not invalidate the result - /// of previous calls. - /// - /// This must be called prior to binding this pipeline to a command buffer which will make function calls into any - /// shader function contained within any of the specified libraries. Failure to comply is an error and will result - /// in undefined behavior. - /// - /// Currently only supported on compute pipelines. - /// - /// @param [in] ppLibraryList List of @ref IShaderLibrary object to link with. - /// @param [in] libraryCount Number of valid library objects in the ppLibraryList array. - /// - /// @returns Success if the operation is successful. 
Other return codes may include: - /// + ErrorUnavailable if called on a graphics pipeline. - /// + ErrorBadPipelineData if any of the libraries in ppLibraryList are not compatible with this pipeline. - /// Reasons for incompatibility include (but are not limited to) different user-data mappings, different - /// wavefront sizes, and other reasons. - virtual Result LinkWithLibraries( - const IShaderLibrary*const* ppLibraryList, - uint32 libraryCount) = 0; - - /// Sets the stack size for indirect function calls made by this pipeline. This may be smaller than or equal to the - /// stack size already determined during pipeline creation or during an earlier call to LinkWithLibraries() because - /// the client has access to more information about which functions contained in those libraries (or in the pipeline - /// itself) are actually going to be called. - /// - /// Note that a future call to LinkWithLibraries() will invalidate this value and this should - /// be called again. - /// - /// @param [in] stackSizeInBytes Client-specified stack size, in bytes. - virtual void SetStackSizeInBytes( - uint32 stackSizeInBytes) = 0; - - /// Retrieve the stack sizes managed by compiler, including the frontend stack and the backend stack. - /// - /// @param [out] pSizes To be filled with both the frontend stack size and the backend stack size, in bytes. - /// - /// @returns SUCCESS - virtual Result GetStackSizes( - CompilerStackSizes* pSizes) const = 0; - - /// Returns the API shader type to hardware stage mapping for the pipeline. - /// - /// @returns The appropriate mapping for this pipeline. - virtual Util::Abi::ApiHwShaderMapping ApiHwShaderMapping() const = 0; - - /// Given the zero-based position of a kernel argument, return a pointer to that argument's metadata. - /// - /// @note Only compute pipelines using the HSA ABI have kernel arguments. - /// - /// @param [in] index The zero-based position of the kernel argument to query. 
- /// - /// @returns A pointer to the kernel argument's metadata, or null if this pipeline doesn't have this argument. - virtual const Util::HsaAbi::KernelArgument* GetKernelArgument(uint32 index) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const { return m_pClientData; } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - - /// Get the array of underlying pipelines that this pipeline contains. For a normal non-multi-pipeline, - /// this returns a single-entry array pointing to the same IPipeline. For a multi-pipeline compiled in - /// dynamic launch mode, this returns an empty array. The contents of the returned array remain valid - /// until the IPipeline is destroyed. - /// - /// @returns The array of underlying pipelines. - virtual Util::Span GetPipelines() const = 0; - - /// Get the array of underlying shader libraries that this pipeline contains. For a normal non-multi-pipeline, - /// this returns the empty array. The contents of the returned array remain valid until the IPipeline is - /// destroyed. - /// - /// @returns The array of underlying shader libraries. - virtual Util::Span GetLibraries() const { return {}; } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IPipeline() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. 
Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IPipeline() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; - - IPipeline(const IPipeline&) = delete; - IPipeline& operator=(const IPipeline&) = delete; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPipeline.h + * @brief Defines the Platform Abstraction Library (PAL) IPipeline interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" +#include "palDestroyable.h" +#include "palImage.h" +#include "palShaderLibrary.h" +#include "palSpan.h" +#include + +namespace Util +{ +namespace Abi +{ +union ApiHwShaderMapping; +enum class HardwareStage : uint32; +} + +namespace HsaAbi +{ +struct KernelArgument; +} +} + +namespace Pal +{ +struct GpuMemSubAllocInfo; +enum class PrimitiveTopology : uint8; + +/// Specifies a shader type (i.e., what stage of the pipeline this shader was written for). +enum class ShaderType : uint32 +{ + Compute = 0, + Task, + Vertex, + Hull, + Domain, + Geometry, + Mesh, + Pixel, + + Count +}; + +/// Number of shader program types supported by PAL. +constexpr uint32 NumShaderTypes = static_cast(ShaderType::Count); + +/// Maximum number of viewports. +constexpr uint32 MaxViewports = 16; + +/// Maximum number of supported stream-output declaration entries by any PAL device. +constexpr uint32 MaxStreamOutEntries = 512; + +/// Specifies a general primitive category without differentiating between a strip or list and without specifying +/// whether a the primitive will include adjacency info or not. +enum class PrimitiveType : uint32 +{ + Point = 0x0, + Line = 0x1, + Triangle = 0x2, + Rect = 0x3, + Quad = 0x4, + Patch = 0x5, + Count +}; + +/// Specifies the target range of Z values after viewport transform. 
+enum class DepthRange : uint32 +{ + ZeroToOne = 0x0, + NegativeOneToOne = 0x1, +}; + +/// Specifies whether the v/t texture coordinates of a point sprite map 0 to 1 from top to bottom or bottom to top. +enum class PointOrigin : uint32 +{ + UpperLeft = 0x0, + LowerLeft = 0x1, + Count +}; + +/// Specifies primitive's shade mode. +enum class ShadeMode : uint32 +{ + Gouraud = 0x0, ///< Gouraud shading mode, pixel shader input is interpolation of vertex + Flat = 0x1, ///< Flat shading mode, pixel shader input from provoking vertex + Count +}; + +/// Defines a logical operation applied between the color coming from the pixel shader and the current value in the +/// target image. +enum class LogicOp : uint32 +{ + Copy = 0x0, + Clear = 0x1, + And = 0x2, + AndReverse = 0x3, + AndInverted = 0x4, + Noop = 0x5, + Xor = 0x6, + Or = 0x7, + Nor = 0x8, + Equiv = 0x9, + Invert = 0xA, + OrReverse = 0xB, + CopyInverted = 0xC, + OrInverted = 0xD, + Nand = 0xE, + Set = 0xF, +}; + +/// Shader Engine Dispatch Interleave Size +/// +/// This determines how many Threads or Threadgroups are sent to one SE before switching to the next SE. +/// Work is always distributed in Threadgroups though. +/// +/// The 1D values are specified in Threads and the Threadgroups are walked in a 1D typewriter fashion. +/// The 2D values are specified in Threadgroups and also walked in typewriter fashion (in groups of the 2D pattern). +/// +/// Clients should check for 1D and 2D support separately in: +/// - DeviceProperties::gfxipProperties::flags::support1dDispatchInterleave +/// - DeviceProperties::gfxipProperties::flags::support2dDispatchInterleave +/// +/// Default will result in "Disable" for chips which do not support 1D or 2D. +/// Disable means that every Threadgroup is issued to the next SE. 
+enum class DispatchInterleaveSize : uint32 +{ + Default, + Disable, + + _1D_64_Threads, + _1D_128_Threads, + _1D_256_Threads, + _1D_512_Threads, + + _2D_1x1_ThreadGroups, + _2D_1x2_ThreadGroups, + _2D_1x4_ThreadGroups, + _2D_1x8_ThreadGroups, + _2D_1x16_ThreadGroups, + + _2D_2x1_ThreadGroups, + _2D_2x2_ThreadGroups, + _2D_2x4_ThreadGroups, + _2D_2x8_ThreadGroups, + + _2D_4x1_ThreadGroups, + _2D_4x2_ThreadGroups, + _2D_4x4_ThreadGroups, + + _2D_8x1_ThreadGroups, + _2D_8x2_ThreadGroups, + + _2D_16x1_ThreadGroups, + + Count, +}; + +/// Specifies whether to override binning setting for pipeline. Enum value of Default follows the PBB global setting. +/// Enable or Disable value overrides PBB global setting for the pipeline and sets binning accordingly. +enum class BinningOverride : uint32 +{ + Default = 0x0, + Disable = 0x1, + Enable = 0x2, + Count +}; + +/// GPU behavior is controlled by LDS_GROUP_SIZE. +enum class LdsPsGroupSizeOverride : uint32 +{ + Default = 0x0, + SingleWave = 0x1, + DoubleWaves = 0x2 +}; + +/// Tri-state enum which controls enabling or disabling a feature or behavior, or letting PAL select a sensible default +enum class OverrideMode : int32 +{ + Default = -1, ///< PAL selects the default behavior, which could be either enabled or disabled. + Disabled = 0, ///< Force to disabled. Equal to set to False. + Enabled = 1, ///< Force to enabled. Equal to set to True. +}; + +/// Enumerates the depth clamping modes a pipeline can use. +enum class DepthClampMode : uint32 +{ + Viewport = 0x0, ///< Clamps to the viewport min/max depth bounds + _None = 0x1, ///< Disables depth clamping +#if PAL_BUILD_SUPPORT_DEPTHCLAMPMODE_ZERO_TO_ONE + ZeroToOne = 0x2, ///< Clamps between 0.0 and 1.0. +#endif + + // Unfortunately for Linux clients, X.h includes a "#define None 0" macro. Clients have their choice of either + // undefing None before including this header or using _None when dealing with PAL. 
+#ifndef None + None = _None, ///< Disables depth clamping +#endif +}; + +/// Common flags controlling creation of both compute and graphics pipeline. +union PipelineCreateFlags +{ + struct + { + uint32 clientInternal : 1; ///< Internal pipeline not created by the application. + uint32 reverseWorkgroupOrder : 1; ///< Indicates that any Dispatch using this pipeline should execute in + /// reverse workgroup order. This superceeds the flag on the CommandBuffer + /// (dispatchPingPongWalk) - always forcing reverse workgroup order! This + /// is a best effort as not all implementations or Queues may support this. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Constant definining the max number of view instance count that is supported. +constexpr uint32 MaxViewInstanceCount = 6; + +/// Specifies graphic pipeline view instancing state. +struct ViewInstancingDescriptor +{ + uint32 viewInstanceCount; ///< The view instance count of the graphic pipeline + uint32 viewId[MaxViewInstanceCount]; ///< The view instance ids. + uint32 renderTargetArrayIdx[MaxViewInstanceCount]; ///< The instance render target array index, can be + /// used in hardware accelerated stereo rendering. + uint16 viewportArrayIdx[MaxViewInstanceCount]; ///< The instance viewport array index, can be + /// used in hardware accelerated stereo rendering. + bool enableMasking; ///< Indicate whether instance masking is enabled. +}; + +// Specifies the input parameters for the MSAA coverage out feature. MSAA coverage out is used in conjunction with a +// single sampled color image. This feature exports a mask indicating which samples would have been used if the +// image had been multi-sampled. The mask is exported to the specified channel of the MRT pointing to the rendered +// image. That is, the MRT must be an active bound render target. This MSAA mask data can then be post-processed. 
+struct MsaaCoverageOutDescriptor +{ + union + { + struct + { + uint32 enable : 1; ///< Set to true to enable render target channel output + uint32 numSamples : 4; ///< Number of samples to export + uint32 mrt : 3; ///< Which MRT to export to. + uint32 channel : 2; ///< Which channel to export to (x = 0, y = 1, z = 2, w = 3) + uint32 reserved : 22; + }; + + uint32 u32All; + } flags; +}; + +/// Specifies properties about an indirect function belonging to a compute @ref IPipelne object. Part of the input +/// structure to IDevice::CreateComputePipeline(). +struct ComputePipelineIndirectFuncInfo +{ + const char* pSymbolName; ///< ELF Symbol name for the associated function. Must not be null. + gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during + /// pipeline creation. +}; + +/// Specifies properties for creation of a compute @ref IPipeline object. Input structure to +/// IDevice::CreateComputePipeline(). +struct ComputePipelineCreateInfo +{ + PipelineCreateFlags flags; ///< Flags controlling pipeline creation. + + const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI + /// interface. The Pipeline ELF contains pre-compiled shaders, + /// register values, and additional metadata. + size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes. + uint32 maxFunctionCallDepth; ///< Maximum depth for indirect function calls. Not used for a new + /// path ray-tracing pipeline as the compiler has pre-calculated + /// stack requirements. + bool disablePartialDispatchPreemption; ///< Prevents scenarios where a subset of the dispatched thread groups are + /// preempted and the remaining thread groups run to completion. This + /// can occur when thread group granularity preemption is available and + /// instruction level (CWSR) is not. This setting is useful for allowing + /// dispatches with interdependent thread groups. 
+ DispatchInterleaveSize interleaveSize; ///< Controls how many thread groups are sent to one SE before switching to + /// the next one. + + /// PAL expects a fixed 3D thread group size for each compute pipeline but the HSA ABI supports dynamic group sizes. + /// If this pipeline's ELF binary metadata doesn't specify a fixed thread group size, this should be used to force + /// a particular thread group size. If this extent is set to all zeros PAL will use the metadata's group size. + /// This field is not supported on PAL ABI ELFs, it should be set to all zeros. + Extent3d threadsPerGroup; + TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle + /// issuing of low priority waves when it detects too many higher priority waves are + /// failing to schedule due to resource contraints. + + const char* pKernelName; ///< When create pipeline with hsa ELF binary of multiple kernels, need to set one + /// kernel to create the pipeline. null means only one kernel in ELF binary. + +}; + +/// Specifies information about the viewport behavior of an assembled graphics pipeline. Part of the input +/// structure @ref GraphicsPipelineCreateInfo. +struct ViewportInfo +{ + bool depthClipNearEnable; ///< Enable clipping based on Near Z coordinate. + bool depthClipFarEnable; ///< Enable clipping based on Far Z coordinate. + DepthRange depthRange; ///< Specifies Z dimensions of screen space (i.e., post viewport transform: + /// 0 to 1 or -1 to 1). +}; + +/// Specifies edgeRule for rasterization +enum class EdgeRuleMode : uint32 +{ + D3dCompliant = 0x0, ///< Use rasterization edge-rules which comply with the D3D spec. + OpenGlDefault = 0x1, ///< Use rasterization edge-rules compatible with the default OpenGL driver. +}; + +/// Specifies Rasterizer state in properties for creation of a graphics +struct RasterizerState +{ + PointOrigin pointCoordOrigin; ///< Controls texture coordinate orientation for point sprites. 
+ bool expandLineWidth; ///< If true, line primitives will have their width expanded by 1/cos(a) + /// where a is the minimum angle from horizontal or vertical. + /// This can be used in conjunction with PS patching for a client to + /// implement line antialiasing. + ShadeMode shadeMode; ///< Specifies shading mode, Gouraud or Flat + bool rasterizeLastLinePixel; ///< Specifies whether to draw last pixel in a line. + bool outOfOrderPrimsEnable; ///< Enables out-of-order primitive rasterization. PAL silently + /// ignores this if it is unsupported in hardware. + bool perpLineEndCapsEnable; ///< Forces the use of perpendicular line end caps as opposed to + /// axis-aligned line end caps during line rasterization. + BinningOverride binningOverride; ///< Binning setting for this pipeline. + + DepthClampMode depthClampMode; ///< Depth clamping behavior + + union + { + struct + { + uint8 clipDistMaskValid : 1; ///< Whether or not @ref clipDiskMask, below, is valid. + uint8 cullDistMaskValid : 1; ///< Whether or not @ref cullDistMask, below, is valid. + uint8 reserved : 6; + }; + uint8 u8All; ///< All the flags as a single value. + } flags; + + uint8 cullDistMask; ///< Mask of which cullDistance exports to leave enabled. + uint8 clipDistMask; ///< Mask of which clipDistance exports to leave enabled. + bool dx10DiamondTestDisable; ///< Disable DX10 diamond test during line rasterization. + EdgeRuleMode edgeRule; +}; + +/// Specifies Per-MRT color target info in olor target state +struct ColorTargetInfo +{ + SwizzledFormat swizzledFormat; ///< Color target format and channel swizzle. Set the format to invalid + /// if no color target will be bound at this slot. + uint8 channelWriteMask; ///< Color target write mask. Bit 0 controls the red channel, bit 1 is + /// green, bit 2 is blue, and bit 3 is alpha. + bool forceAlphaToOne; ///< Treat alpha as one regardless of the shader output. Ignored unless + /// supportAlphaToOne is set in DeviceProperties. 
+}; + +/// Specifies color target state in properties for creation of a graphics +struct ColorTargetState +{ + bool alphaToCoverageEnable; ///< Enable alpha to coverage. + bool dualSourceBlendEnable; ///< Blend state bound at draw time will use a dual source blend mode. + LogicOp logicOp; ///< Logic operation to perform. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 904 + bool uavExportSingleDraw; ///< When UAV export is enabled, acts as a hint that only a single draw + /// is done on a color target with this or subsequent pipelines before + /// a barrier. Improves performance by allowing pipelines to overlap. +#endif + + ColorTargetInfo target[MaxColorTargets]; ///< Per-MRT color target info. +}; + +/// Specifies properties for creation of a graphics @ref IPipeline object. Input structure to +/// IDevice::CreateGraphicsPipeline(). +struct GraphicsPipelineCreateInfo +{ + PipelineCreateFlags flags; ///< Flags controlling pipeline creation. + + const void* pPipelineBinary; ///< Pointer to Pipeline ELF binary implementing the Pipeline ABI + /// interface. The Pipeline ELF contains pre-compiled shaders, + /// register values, and additional metadata. + size_t pipelineBinarySize; ///< Size of Pipeline ELF binary in bytes. + const IShaderLibrary** ppShaderLibraries; ///< An array of graphics @ref IShaderLibrary object. pPipelineBinary + /// and ppShaderLibraries can't be valid at the same time. + size_t numShaderLibraries; ///< Number of graphics shaderLibrary object in ppShaderLibraries. + bool useLateAllocVsLimit; ///< If set, use the specified lateAllocVsLimit instead of PAL internally + /// determining the limit. + uint32 lateAllocVsLimit; ///< The number of VS waves that can be in flight without having param + /// cache and position buffer space. If useLateAllocVsLimit flag is set, + /// PAL will use this limit instead of the PAL-specified limit. 
+ bool useLateAllocGsLimit; ///< If set, use the specified lateAllocVsLimit instead of PAL internally + /// determining the limit. + uint32 lateAllocGsLimit; ///< Controls GS LateAlloc val (for pos/prim allocations NOT param cache) + /// on NGG pipelines. Can be no more than 127. + struct + { + struct + { + PrimitiveType primitiveType; ///< Basic primitive category: points, line, triangles, patches. + bool topologyIsPolygon; ///< Indicates that triangle primitives are combined to represent more + /// complex polygons. Only valid for triangle primitive types. + uint32 patchControlPoints; ///< Number of control points per patch. Only required if primitiveType + /// is PrimitiveType::Patch. + } topologyInfo; ///< Various information about the primitive topology that will be used with this pipeline. + /// All of this info must be consistent with the full topology specified by + /// ICmdBuffer::SetPrimitiveTopology() when drawing with this pipeline bound. + + /// Number of vertex buffer slots which are accessed by this pipeline. Behavior is undefined if the pipeline + /// tries to access a vertex buffer slot outside the range [0, vertexBufferCount). It is generally advisable + /// to make this the minimum value possible because that reduces the number of vertex buffer slots PAL has to + /// maintain for this pipeline when recording command buffers. + uint32 vertexBufferCount; + } iaState; ///< Input assembler state. + + RasterizerState rsState; ///< Rasterizer state. + ColorTargetState cbState; ///< Color target state. + + ViewInstancingDescriptor viewInstancingDesc; ///< Descriptor describes view instancing state + /// of the graphics pipeline + MsaaCoverageOutDescriptor coverageOutDesc; ///< Descriptor describes input parameters for MSAA coverage out. + ViewportInfo viewportInfo; ///< Viewport info. + DispatchInterleaveSize taskInterleaveSize; ///< Ignored for pipelines without a task shader. 
For pipelines with + /// a task shader, controls how many thread groups are sent to one + /// SE before switching to the next one. + LdsPsGroupSizeOverride ldsPsGroupSizeOverride; ///< Whether to override ldsPsGroupSize setting for pipeline. + + TriState groupLaunchGuarantee; ///< Force the group launch guarantee mechanism on or off. This feature will throttle + /// issuing of low priority waves when it detects too many higher priority waves are + /// failing to schedule due to resource contraints. + bool noForceReZ; ///< Disables the ability for PAL to force ReZ modes outside of what was chosen by + /// the compiler for this pipeline. +}; + +/// The graphic pipeline view instancing information. This is used to determine if hardware accelerated stereo rendering +/// can be enabled for a graphic pipeline. +struct GraphicPipelineViewInstancingInfo +{ + union + { + struct + { + uint32 shaderUseViewId : 1; ///< If any shader in pipeline uses view id. + uint32 gsExportRendertargetArrayIndex : 1; ///< If gs exports render target array index, + /// must be 0 if there is no gs. + uint32 gsExportViewportArrayIndex : 1; ///< If gs exports viewport array index, + /// must be 0 if there is no gs. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 apiShaderFlags; + }; + + const ViewInstancingDescriptor* pViewInstancingDesc; ///< View Instancing descriptor +}; + +/// Reports properties of a compiled pipeline. This includes hashes for the pipeline and shaders that the client can +/// use to correlate PAL pipeline/shader dumps with corresponding API-level pipelines/shaders. +struct PipelineInfo +{ + const char* pName; ///< Non-owning pointer to a null-terminated string containing human-readable + ///< name of this pipeline. nullptr indicates no name was provided. 
+ + PipelineHash internalPipelineHash; ///< 128-bit identifier extracted from this pipeline's ELF binary, composed of + /// the state the compiler decided was appropriate to identify the compiled + /// shaders. The lower 64 bits are "stable"; the upper 64 bits are "unique". + + struct + { + ShaderHash hash; ///< Unique 128-bit identifier for this shader. 0 indicates there is no shader bound for + /// the corresponding shader stage. + } shader[NumShaderTypes]; ///< Array of per-shader pipeline properties. + + union + { + struct + { + uint32 hsaAbi : 1; ///< This pipeline uses the HSA ABI (i.e. bind arguments not user-data) + uint32 usesCps : 1; ///< This pipeline uses continuations passing shaders (CPS). In an archive pipeline, + /// this bit is set iff it is set in any constituent pipeline. + uint32 cpsGlobal : 1; ///< If using continuations passing shaders (CPS), stack is in global rather than + /// scratch. In an archive pipeline, this bit is set iff it is set in any + /// constituent pipeline. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< All flags combined as a single uint32. + } flags; ///< Pipeline properties. + + struct + { + union + { + struct + { + uint32 perSampleShading : 1; ///< Shader instructions want per-sample execution. + uint32 usesSampleMask : 1; ///< Shader is using sample mask. + uint32 enablePops : 1; ///< Primitive order pixel shader is enabled. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< All flags combined as a single uint32. + } flags; + } ps; ///< Pixel shader properties. + + uint64 resourceMappingHash; ///< 64-bit hash of the resource mapping used when compiling the pipeline, + /// if available (0 otherwise). + + /// The GPU address and size in bytes of the traceRays table, or { 0, 0 } if none. + /// The format of the table is private to the compiler. 
+ /// Compiled code requires a buffer descriptor for the traceRays table to be passed in to the launch kernel + /// in a particular place. + gpusize traceRaysTable; + gpusize traceRaysTableSize; + + /// Pointer to the the traceRays table for debugging purpose, or nullptr if none. + const char* pTraceRaysTable; + + /// Pointer to and size in bytes of the shader identifiers table, or { nullptr, 0 } if none. + /// The table has an entry for each export and then each hit group specified when the pipeline was compiled. + /// Each entry has a pointer to the shader or hit group name in the same block of data (always 64 bits even + /// on a 32-bit host), then the 32 byte shader identifier whose format is agreed between the compiler and + /// GPURT's library code, for a total of 40 bytes per entry. + const void* pShaderIdentifiers; + size_t shaderIdentifiersSize; + + uint32 unifiedRgsNameHash; ///< 32-bit hash of unified RGS name, 0 otherwise + +}; + +/// A structure that represents any 3D arrangement of threads or thread groups as part of a compute shader dispatch. +/// +/// This structure is halfway between Extent3d and Offset3d, depending on the context it may represent an offset or +/// an extent. Essentially it's meaning is tied to the concept of 3D thread or thread group grids rather than generic +/// contexts like "extent" or "offset". Whether it represents threads or thread groups is also context specific. +struct DispatchDims +{ + uint32 x; ///< Threads or thread groups in the X dimension. + uint32 y; ///< Threads or thread groups in the Y dimension. + uint32 z; ///< Threads or thread groups in the Z dimension. + + /// Computes the volume of this 3D arrangement of threads or thread groups. + /// + /// @returns the total number of threads or threads groups this struct represents. + uint32 Flatten() const { return x * y * z; } +}; + +// There are some places where we'd like to directly cast DispatchDims to an array of three uint32s. 
+static_assert(sizeof(DispatchDims) == sizeof(uint32) * 3, "DispatchDims not castable to uint32*"); + +/// Component-wise addition of two DispatchDims. +/// +/// @param [in] l The left-hand argument. +/// @param [in] r The right-hand argument. +/// +/// @returns A new DispatchDims which contains the sum of 'l' and 'r' along each dimension. +inline DispatchDims operator+(DispatchDims l, DispatchDims r) { return {l.x + r.x, l.y + r.y, l.z + r.z}; } + +/// Component-wise addition of one DispatchDims into another. +/// +/// @param [in] l The left-hand argument. +/// @param [in] r The right-hand argument. +/// +/// @returns A reference to 'l' after it is updated to the sum of 'l' and 'r'. +inline DispatchDims& operator+=(DispatchDims& l, DispatchDims r) { return l = (l + r); } + +/// Component-wise multiplication of two DispatchDims. +/// +/// @param [in] l The left-hand argument. +/// @param [in] r The right-hand argument. +/// +/// @returns A new DispatchDims which contains the product of 'l' and 'r' along each dimension. +inline DispatchDims operator*(DispatchDims l, DispatchDims r) { return {l.x * r.x, l.y * r.y, l.z * r.z}; } + +/// Component-wise multiplication of one DispatchDims into another. +/// +/// @param [in] l The left-hand argument. +/// @param [in] r The right-hand argument. +/// +/// @returns A reference to 'l' after it is updated to the product of 'l' and 'r'. +inline DispatchDims& operator*=(DispatchDims& l, DispatchDims r) { return l = (l * r); } + +/// Used to represent API level shader stage. 
+enum ShaderStageFlagBits : uint32 +{ + ApiShaderStageCompute = (1u << static_cast(ShaderType::Compute)), + ApiShaderStageTask = (1u << static_cast(ShaderType::Task)), + ApiShaderStageVertex = (1u << static_cast(ShaderType::Vertex)), + ApiShaderStageHull = (1u << static_cast(ShaderType::Hull)), + ApiShaderStageDomain = (1u << static_cast(ShaderType::Domain)), + ApiShaderStageGeometry = (1u << static_cast(ShaderType::Geometry)), + ApiShaderStageMesh = (1u << static_cast(ShaderType::Mesh)), + ApiShaderStagePixel = (1u << static_cast(ShaderType::Pixel)), +}; + +/// Reports shader stats. Multiple bits set in the shader stage mask indicates that multiple shaders have been combined +/// due to HW support. The same information will be repeated for both the constituent shaders in this case. +struct ShaderStats +{ + uint32 shaderStageMask; ///< Indicates the stages of the pipeline this shader is + /// used for. If multiple bits are set, it implies + /// shaders were merged. See @ref ShaderStageFlagBits. + CommonShaderStats common; ///< The shader compilation parameters for this shader. + /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableVgprs; + /// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableSgprs; + size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. + ShaderHash palShaderHash; ///< Internal hash of the shader compilation data used by PAL. + + union + { + struct + { + uint32 writesUAV : 1; ///< This shader performs writes to UAVs. + uint32 writesDepth : 1; ///< Indicates explicit depth writes performed by the shader stage. 
+ uint32 streamOut : 1; ///< The shader performs stream out of shader generated data. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< All flags combined as a single uint32. + + } shaderOperations; ///< Flags depicting shader operations. + + struct + { + DispatchDims numThreadsPerGroup; ///< Number of compute threads per thread group in X, Y, and Z dimensions. + } cs; ///< Parameters specific to compute shader only. + + union + { + struct + { + uint8 copyShaderPresent : 1; ///< Indicates that the copy shader data is valid. + uint8 reserved : 7; ///< Reserved for future use. + }; + uint8 u8All; ///< All the flags as a single value. + } flags; ///< Flags related to this shader data. + + CommonShaderStats copyShader; ///< This data is valid only when the copyShaderPresent flag above is set. +}; + + /** + *********************************************************************************************************************** + * @interface IPipeline + * @brief Monolithic object containing all shaders and a large amount of "shader adjacent" state. Separate concrete + * implementations will support compute or graphics pipelines. + * + * @see IDevice::CreateComputePipeline() + * @see IDevice::CreateGraphicsPipeline() + * @see IDevice::LoadPipeline() + *********************************************************************************************************************** + */ +class IPipeline : public IDestroyable +{ +public: + /// Returns PAL-computed properties of this pipeline and its corresponding shaders. + /// + /// @returns Property structure describing this pipeline. + virtual const PipelineInfo& GetInfo() const = 0; + + /// Returns a list of GPU memory allocations used by this pipeline. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. 
+ /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this pipeline. If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. + /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. + virtual const void* GetResourceId() const = 0; + + /// Obtains the binary code object for this pipeline. + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the pipeline ELF is written in the buffer. If null, the size required + /// for the pipeline ELF is given out in the location pSize. + /// + /// @returns Success if the pipeline binary was fetched successfully. + /// +ErrorUnavailable if the pipeline binary was not fetched successfully. + virtual Result GetCodeObject( + uint32* pSize, + void* pBuffer) const = 0; + + /// Obtains the pointer of code object with ELF format according to the shader type. Returned ELF object is not + /// guaranteed to be unique with different shader type, because a single code object can contain multiple shaders. + /// + /// @param [in] shaderType The shader stage for which the code object are requested. + /// @param [out] pSize The size of the ELF binary. 
+ /// + /// @returns The pointer of ELF binary which contains requested shader stage. + virtual const void* GetCodeObjectWithShaderType( + ShaderType shaderType, + size_t* pSize) const = 0; + + /// Obtains the shader pre and post compilation stats/params for the specified shader stage. + /// + /// @param [in] shaderType The shader stage for which the stats are requested. + /// + /// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for + /// the shader stage mentioned in shaderType. This cannot be nullptr. + /// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the + /// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0. + /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. + /// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error + /// occured. + virtual Result GetShaderStats( + ShaderType shaderType, + ShaderStats* pShaderStats, + bool getDisassemblySize) const = 0; + + /// Obtains the compiled shader ISA code for the shader stage specified. + /// + /// @param [in] shaderType The shader stage for which the shader cache entry is requested. + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required + /// for the shader ISA is given out in the location pSize. + /// + /// @returns Success if the shader ISA code was fetched successfully. + /// +ErrorUnavailable if the shader ISA code was not fetched successfully. + virtual Result GetShaderCode( + ShaderType shaderType, + size_t* pSize, + void* pBuffer) const = 0; + + /// Obtains the generated performance data for the shader stage specified. 
+ /// + /// @param [in] hardwareStage The hardware stage of the shader which the performance data is requested. + /// @param [in, out] pSize Represents the size of the performance data. + /// @param [out] pBuffer If non-null, the performance data is written in the buffer. If null, the size + /// required for the performance data is given out in the location pSize. + /// + /// @returns Success if the performance data was fetched successfully. + /// +ErrorUnavailable if the performance data was not fetched successfully. + virtual Result GetPerformanceData( + Util::Abi::HardwareStage hardwareStage, + size_t* pSize, + void* pBuffer) = 0; + + /// Notifies PAL that this pipeline may make indirect function calls to any function contained within any of the + /// specified @ref IShaderLibrary objects. This gives PAL a chance to perform any late linking steps required to + /// valid execution of the possible function calls (this could include adjusting hardware resources such as GPRs + /// or LDS space for the pipeline). + /// + /// This may be called multiple times on the same pipeline object. Subsequent calls do not invalidate the result + /// of previous calls. + /// + /// This must be called prior to binding this pipeline to a command buffer which will make function calls into any + /// shader function contained within any of the specified libraries. Failure to comply is an error and will result + /// in undefined behavior. + /// + /// Currently only supported on compute pipelines. + /// + /// @param [in] ppLibraryList List of @ref IShaderLibrary object to link with. + /// @param [in] libraryCount Number of valid library objects in the ppLibraryList array. + /// + /// @returns Success if the operation is successful. Other return codes may include: + /// + ErrorUnavailable if called on a graphics pipeline. + /// + ErrorBadPipelineData if any of the libraries in ppLibraryList are not compatible with this pipeline. 
+ /// Reasons for incompatibility include (but are not limited to) different user-data mappings, different + /// wavefront sizes, and other reasons. + virtual Result LinkWithLibraries( + const IShaderLibrary*const* ppLibraryList, + uint32 libraryCount) = 0; + + /// Sets the stack size for indirect function calls made by this pipeline. This may be smaller than or equal to the + /// stack size already determined during pipeline creation or during an earlier call to LinkWithLibraries() because + /// the client has access to more information about which functions contained in those libraries (or in the pipeline + /// itself) are actually going to be called. + /// + /// Note that a future call to LinkWithLibraries() will invalidate this value and this should + /// be called again. + /// + /// @param [in] stackSizeInBytes Client-specified stack size, in bytes. + virtual void SetStackSizeInBytes( + uint32 stackSizeInBytes) = 0; + + /// Retrieve the stack sizes managed by compiler, including the frontend stack and the backend stack. + /// + /// @param [out] pSizes To be filled with both the frontend stack size and the backend stack size, in bytes. + /// + /// @returns SUCCESS + virtual Result GetStackSizes( + CompilerStackSizes* pSizes) const = 0; + + /// Returns the API shader type to hardware stage mapping for the pipeline. + /// + /// @returns The appropriate mapping for this pipeline. + virtual Util::Abi::ApiHwShaderMapping ApiHwShaderMapping() const = 0; + + /// Given the zero-based position of a kernel argument, return a pointer to that argument's metadata. + /// + /// @note Only compute pipelines using the HSA ABI have kernel arguments. + /// + /// @param [in] index The zero-based position of the kernel argument to query. + /// + /// @returns A pointer to the kernel argument's metadata, or null if this pipeline doesn't have this argument. 
+ virtual const Util::HsaAbi::KernelArgument* GetKernelArgument(uint32 index) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const { return m_pClientData; } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Get the array of underlying pipelines that this pipeline contains. For a normal non-multi-pipeline, + /// this returns a single-entry array pointing to the same IPipeline. For a multi-pipeline compiled in + /// dynamic launch mode, this returns an empty array. The contents of the returned array remain valid + /// until the IPipeline is destroyed. + /// + /// @returns The array of underlying pipelines. + virtual Util::Span GetPipelines() const = 0; + + /// Get the array of underlying shader libraries that this pipeline contains. For a normal non-multi-pipeline, + /// this returns the empty array. The contents of the returned array remain valid until the IPipeline is + /// destroyed. + /// + /// @returns The array of underlying shader libraries. + virtual Util::Span GetLibraries() const { return {}; } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IPipeline() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. 
+ virtual ~IPipeline() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; + + IPipeline(const IPipeline&) = delete; + IPipeline& operator=(const IPipeline&) = delete; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h b/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h index 3bc91ec772..50ae295d29 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palPlatform.h @@ -1,649 +1,645 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palPlatform.h - * @brief Defines the Platform Abstraction Library (PAL) IPlatform interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palSysMemory.h" -#include "palMemTrackerImpl.h" -#include "palDestroyable.h" -#include "palDeveloperHooks.h" - -// DevDriver forward declarations. -namespace DevDriver -{ -class DevDriverServer; -namespace EventProtocol -{ -class EventServer; -} -class SettingsRpcService; -} - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 -namespace SettingsRpcService -{ -class SettingsService; -} -#endif - -#if PAL_BUILD_RDF -// GpuUtil forward declarations. -namespace GpuUtil -{ -class TraceSession; -} -#endif - -#if PAL_ENABLE_LOGGING -namespace Util -{ -struct DbgLoggerFileSettings; -} -#endif - -namespace Pal -{ - -// Forward declarations. -class IDevice; -class IScreen; -struct PalPlatformSettings; -enum class PalEvent : uint32; - -/// Maximum number of Devices possibly attached to a system. -constexpr uint32 MaxDevices = 16; -/// Maximum number of Screens possibly attached to a Device. -constexpr uint32 MaxScreensPerDevice = 6; -/// Maximum number of Screens possibly attached to a system. -constexpr uint32 MaxScreens = (MaxScreensPerDevice * MaxDevices); -constexpr uint32 MaxModePerScreen = 64; - -/// 32-bit PAL version identifier. 
-/// -/// Version number of the PAL library. Major version is bumped on every promotion from stg/pal to stg/pal_prm. Minor -/// version is bumped when a change is cherry-picked to stg/pal_prm. -/// -/// @see PlatformProperties::palVersion -struct Version -{ - uint16 major; ///< Major version number. - uint16 minor; ///< Minor version number. -}; - -/// Union defining the DevDriver GPU ID layout. -/// This is specifically used by DevDriver across multiple tools/driver and should not be changed. -union PciId -{ - struct - { - uint32 functionId : 8; ///< PCI function number in the system for this GPU. - uint32 deviceId : 8; ///< PCI device number in the system for this GPU. - uint32 busId : 8; ///< PCI bus number in the system for this GPU. - uint32 reserved : 8; ///< Reserved for future use. - }; - uint32 u32All; ///< Fields packed as 32-bit uint. -}; - -/// Reports capabilities and general properties of this instantiation of the PAL library. -/// -/// This covers any property that it platform-wide as opposed to being tied to a particular device in the system. -/// -/// @see IPlatform::GetProperties -struct PlatformProperties -{ - Version palVersion; ///< Version number of the PAL library. Note that this is distinct from the interface version. - /// It will be regularly updated as described in @ref Version. - union - { - struct - { - uint32 supportNonSwapChainPresents : 1; ///< If set, non-swapchain presents are supported. - uint32 supportBlockIfFlipping : 1; ///< If set, IQueue::Submit can protect against command buffers - /// that write to GPU memory queued for a flip present. - uint32 explicitPresentModes : 1; ///< If set, the PresentMode enums specified during direct and swap - /// chain presents explicitly determine the presentation method. - /// Additionally, the client must enumerate IScreens and use them - /// to explicitly manage fullscreen ownership. 
Otherwise, the - /// present modes are suggestions and fullscreen ownership is - /// managed internally by PAL. - uint32 reserved : 29; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - }; -}; - -/// The client that Pal may query profile for. the order is the same as SHARED_AP_AREA in KMD escape interface -enum class ApplicationProfileClient : uint32 -{ - Uninitialized = 0, - Dxx, - Udx, - Cfx, - Ogl, - User3D, - Ocl, - Mmd, - Pplib, - Dal, - Chill, - Spp, - Boost, - DeLag, - Ris, - TurboSync, - DutyCycleScaling, - ProBoost, - RisWindowed, - FreeMux, - FsrOvr, - Count -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 -/// Describes a primary surface view -/// -/// @see IPlatform::GetPrimaryLayout() -struct PrimaryViewInfo -{ - Rect rect; ///< Rectangle defining one portion of a primary surface layout. - uint32 numIndices; ///< The size of the gpuIndex array. - uint32 gpuIndex[MaxDevices]; ///< The devices in a linked adapter chain that can use this view. -}; - -/// Specifies output arguments for IPlatform::GetPrimaryLayout(), returning information about the layout of the primary -/// surface. -/// -/// @see IPlatform::GetPrimaryLayout() -struct GetPrimaryLayoutOutput -{ - uint32 numViews; ///< The number of views in the pViewInfoList array. - PrimaryViewInfo* pViewInfoList; ///< The primary surface is composed of these views. - union - { - struct - { - uint32 disablePartialCopy : 1; ///< If this flag is not set, the client can transfer the specific views of - /// primary surface to peer GPUs. Otherwise, the client must transfer the - /// whole primary surface to peer GPUs. - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< specifies primary surface layout flags. 
-}; -#endif - -/// Specifies TurboSync control mode -enum class TurboSyncControlMode : uint32 -{ - Disable = 0, ///< Disable TurboSync - Enable = 1, ///< Enable TurboSync - UpdateAllocations = 2, ///< Update allocations only, without disable or enable TurboSync - Register = 3, ///< Register the current platform as TurboSync requested platform, doesn't actually - /// activate TurboSync. - Count -}; - -constexpr uint32 TurboSyncMaxSurfaces = 2; ///< Specifies maximum number of surfaces in a private TurboSync swapchain - -/// Input argument for IPlatform::TurboSyncControl. TurboSync is a feature that enables app to render at higher than -/// V-Sync frame rates while still being tearing-free. It creates a private swapchain and copy application's back -/// buffer to the primary in this private swapchain when application is flipping. KMD controls the flipping of the -/// private swapchain to screen. -struct TurboSyncControlInput -{ - TurboSyncControlMode mode; ///< Specifies the TurboSync control mode - uint32 vidPnSourceId; ///< The vidPnSourceId the call is targeted - - /// GpuMemory of the primaries in private swapchain, per-gpu. This is indexed by the device indices enumerated by - /// the platform. Pal forwards the allocation handles (if IGpuMemory ptr is not null) to Kmd without validation. - const IGpuMemory* pPrimaryMemoryArray[MaxDevices][TurboSyncMaxSurfaces]; -}; - -/** -************************************************************************************************************************ -* @interface IPlatform -* @brief Interface representing an client-configurable context of the PAL platform. -* -* This is the root of all client interaction with PAL. Each IPlatform contains a set of the IDevice's and IScreens -* found in the system. -* -* + Creation of IDevice and IScreen objects. -* + Installation of memory management callbacks. -* + Query application profiles from the system. 
-************************************************************************************************************************ -*/ -class IPlatform : public IDestroyable -{ -public: - /// Enumerates a list of available Devices. - /// - /// This function creates a set of @ref IDevice objects corresponding to the devices attached to the system. - /// CreatePlatform() must be called before this function is called. - /// - /// This function may be called multiple times during the lifetime of the PAL lib, in which case all previous - /// @ref IDevice and @ref IScreen objects are automatically destroyed. The client is responsible for - /// destroying all objects attached to the existing @ref IDevice objects before re-calling this function. - /// Re-enumerating Devices is required if ErrorDeviceLost is ever returned by PAL, as this may indicate a device - /// has been physically removed from the system. - /// - /// @note Before IPlatform::Destroy can be called, all devices returned by IPlatform::EnumerateDevices() must be - /// destroyed. - /// - /// @param [out] pDeviceCount Specifies the number of devices available in the system. This is the number of valid - /// entries in pDevices[]. Must not be null. - /// @param [out] pDevices Array to be populated with a device object pointer for each device available in the - /// system. The first *pDeviceCount entries are valid. Must not be null. - /// - /// @returns Success if all Devices were successfully enumerated in pDevices[]. Otherwise, one of the following - /// error codes may be returned: - /// + ErrorInitializationFailed will be returned if PAL is unable to query the available Devices. - virtual Result EnumerateDevices( - uint32* pDeviceCount, - IDevice* pDevices[MaxDevices]) = 0; - - /// Returns the storage size of the object implementing IScreen. - /// - /// Use this to determine the size of each pStorage pointer passed to GetScreens. - /// - /// @returns the storage size in bytes of the object implementing IScreen. 
- virtual size_t GetScreenObjectSize() const = 0; - - /// Retrieves the list of available screens. - /// - /// This function queries a set of @ref IScreen objects corresponding to the screens attached to the system. - /// CreatePlatform() and IPlatform::EnumerateDevices() must be called before this function is called. - /// - /// This function may be called multiple times during the lifetime of the PAL lib. Each call returns a new - /// set of screen objects. - /// - /// @ingroup LibInit - /// - /// @param [out] pScreenCount Specifies the number of screens available in the system. This is the number of valid - /// entries in pScreens[] and pStorage[]. Must not be null. - /// @param [in] pStorage Array of caller-allocated storage for the screen objects. Each must be the size - /// returned by GetScreenObjectSize. Must always pre-allocate MaxScreens worth, must - /// not be NULL nor may any entry be NULL. - /// @param [out] pScreens Array to be populated with a screen pointer for each screen available in the system. - /// The first *pScreenCount entries are valid. Must not be null. - /// - /// @note pScreens[i] uses the storage from pStorage[i]. pStorage[i] is unused for i >= *pScreenCount. - /// - /// @returns Success if all screens were successfully retrieved in pScreens[]. Otherwise, one of the following - /// error codes may be returned: - /// + ErrorUnavailable if this was called prior to IPlatform::EnumerateDevices(). - virtual Result GetScreens( - uint32* pScreenCount, - void* pStorage[MaxScreens], - IScreen* pScreens[MaxScreens]) = 0; - - /// Queries a client specified application profile in raw format. - /// - /// This function queries the kernel-mode driver to determine if there is a platform-wide profile for a specific - /// application that the client would like to honor. It is optional, and doesn't need to be called if the client - /// does not wish to support application profiles. 
- /// - /// As the format of profile is client specified, the profile will be returned in raw format and client has the - /// responsibility to parse the profile. @see GpuUtil::ProfileIterator provides a basic capability to iterate all - /// properties in the raw data packet. The memory storing the raw data is managed by Pal. - /// - /// The pFilename string can be the EXE name, like "doom.exe", or the "Content Distribution Network" (CDN) ID, - /// like "SteamAppId:570". You can use the function GpuUtil::QueryAppContentDistributionId() to get the CDN ID. - /// - /// @ingroup LibInit - /// - /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. - /// See GpuUtil::QueryAppContentDistributionId(). - /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. - /// @param [in] client Client name that KMD will query the profile for - /// @param [out] pOut Will be filled with the application profile string if the profile exists and was - /// successfully queried. - /// - /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully - /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. - /// Otherwise, one of the following error codes may be returned: - /// + ErrorInvalidPointer will be returned if pFilename or pOut is null. - /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices - /// discovered. - virtual Result QueryRawApplicationProfile( - const wchar_t* pFilename, - const wchar_t* pPathname, - ApplicationProfileClient client, - const char** pOut) = 0; - - /// Enable UMD side support for the SPP feature (Shader Profiling for Power). 
The gist of the initial version - /// of this feature is that we will profile important applications to determine which shaders are heavily memory - /// bound, then use these profiles to program the RLC to dynamically reduce engine clocks when running such shaders. - /// This should result in power savings with a limited perf impact. - - /// Each of these app profiles will include one or more tables specifying how to program the RLC. - /// There may be multiple tables in cases where we need different RLC programming based on user controlled factors - /// that affect memory boundedness: resolution, MSAA rate, etc. - - /// @ingroup LibInit - /// - /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. - /// See GpuUtil::QueryAppContentDistributionId(). - /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. - - /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully - /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. - /// Otherwise, one of the following error codes may be returned: - /// + ErrorInvalidPointer will be returned if pFilename is null. - /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices - /// discovered. - virtual Result EnableSppProfile( - const wchar_t* pFilename, - const wchar_t* pPathname) = 0; - - /// Reports the properties of the platform. - /// - /// Returns the capabilities and general properties of this platform instantiation. - /// - /// @param [out] pProperties Capabilities and general properties of this platform instantiation (not tied to a - /// particular device). - /// - /// @returns Success if the properties were successfully queried and returned in pProperties. 
Otherwise, one of the - /// following errors may be returned: - /// + ErrorInvalidPointer if pProperties is null. - virtual Result GetProperties( - PlatformProperties* pProperties) = 0; - - /// Installs the callback into the specified platform. - /// - /// @param [in] pPlatform The platform to install the callback into. - /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. - /// @param [in] pPrivateData Private data that is installed with the callback for use by the installer. - static void InstallDeveloperCb( - IPlatform* pPlatform, - Developer::Callback pfnDeveloperCb, - void* pPrivateData) - { pPlatform->InstallDeveloperCb(pfnDeveloperCb, pPrivateData); }; - - /// Returns the currently enabled developer callback types. - /// - /// @returns The bitmask of currently enabled developer callback types - virtual uint32 GetEnabledCallbackTypes() const = 0; - - /// Sets the currently enabled developer callback types. - /// - /// @param [in] callbackTypeMask A bitmask of client requested developer callback types to enable - virtual void SetEnabledCallbackTypes( - uint32 enabledCallbackTypesMask) = 0; - - /// Returns a pointer to the developer driver server object if developer mode is enabled on the system. - /// - /// @returns A valid DevDriver::DevDriverServer pointer if developer mode is enabled. If developer mode is not - /// enabled, nullptr will be returned. - virtual DevDriver::DevDriverServer* GetDevDriverServer() = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 - /// Will be replaced by GetSettingsRpcService(). - virtual SettingsRpcService::SettingsService* GetSettingsService() = 0; -#endif - - /// Client drivers can register their DevDriver based settings components via SettingsRpcService. - /// - /// @returns A pointer to a SettingsRpcService object. Could be nullptr if developer driver mode is not enabled. 
- virtual DevDriver::SettingsRpcService* GetSettingsRpcService() = 0; - - /// Returns a pointer to the event server object. The event server will soon move out of the DevDriver - /// server. Hence the need to provide a separate interface to access the event server. - /// - /// @returns A valid EventServer pointer or nullptr if not valid. - virtual DevDriver::EventProtocol::EventServer* GetEventServer() = 0; - -#if PAL_BUILD_RDF - /// Returns a pointer to the current trace session if one was created during startup - /// - /// @returns A valid TraceSession pointer if a session currently exists. If a trace session was not created during - /// startup, nullptr will be returned. - virtual GpuUtil::TraceSession* GetTraceSession() = 0; - - /// Indicate frame count increment to frame trace controller. - /// Client driver is responsible for calling this method once per frame. - /// - /// @param [in] pQueue The queue on which a new frame has been detected - virtual void UpdateFrameTraceController( - IQueue *pQueue) = 0; -#endif - - /// Gets the GPU ID for a given pal device index. - /// - /// The GPU ID is determined from (BusID << 16) | (DeviceID << 8) | FunctionID - /// - /// @param [in] gpuIndex The index of the GPU - /// - /// @returns A GPU ID described above or UnknownGPUID if the device doesn't exist - virtual PciId GetPciId(uint32 gpuIndex) = 0; - - /// Indicates whether tracing has been enabled. - /// - /// @returns True if tracing is enabled, false otherwise. - virtual bool IsTracingEnabled() const = 0; - - /// Indicates whether the driver has been signaled to enable crash analysis mode. - /// - /// @returns True if crash analysis is enabled, false otherwise. - virtual bool IsCrashAnalysisModeEnabled() const = 0; - - /// Indicates whether the driver has been signaled to enable Raytracing Shader Data Tokens. - /// - /// @returns True if Raytracing Shader Data Tokens is enabled, false otherwise. 
- virtual bool IsRaytracingShaderDataTokenRequested() const = 0; - - /// Returns a pointer to the Platform settings structure - /// - /// @returns A reference to a PalPlatformSettings structure. - virtual const PalPlatformSettings& PlatformSettings() const = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 948 - /// Get primary surface layout based upon VidPnSource provided by client. - /// - /// This function is used by client to query the layout of the primary surface. The layout describes how primary - /// surface is composed with a set of views. Each view provides the rectangle of the surface area and the GPUs - /// this surface area will be displayed on. - /// Client should make first call pass in pPrimaryLayoutOutput->pViewInfoList as NULL to query the number of views - /// this primary surface has. - /// Client then based on pPrimaryLayoutOutput->numViews, allocates the buffer for pViewInfoList. And client then - /// makes the escape call again to query the actual view information. - /// - /// @param [in] vidPnSourceId VidPnSource ID that's associated to a primary surface. - /// @param [in, out] pPrimaryLayoutOutput Primary surface layout output arguments. - /// - /// @returns Success if the display layout on given vidPnSourceId was successfully queried. - /// Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if pPrimaryLayoutOutput is invalid. - /// + ErrorUnavailable if no implementation on current platform. - /// + ErrorOutOfMemory if there is not enough system memory. - inline Result GetPrimaryLayout( - uint32 vidPnSourceId, - GetPrimaryLayoutOutput* pPrimaryLayoutOutput) { return Result::ErrorUnavailable; } -#endif - - /// Calls TurboSyncControl escape to control TurboSync on specific vidPnSourceId. - /// - /// The function is called when clients intend to toggle TurboSync on a vidPnSourceId. The client should allocate - /// private swapchain primary surfaces that's compatible with the application swapchain primaries. 
When used to - /// activate TurboSync, the private primaries' handles needs to be passed in the TurboSyncControlInput data. - /// - /// @param [in] turboSyncControlInput TurboSyncControl input arguments. See TurboSyncControlInput. - /// - /// @returns Success if the TurboSyncControl request is handled successfully. - virtual Result TurboSyncControl( - const TurboSyncControlInput& turboSyncControlInput) = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - - /// Allocates memory using the platform's ForwardAllocator. - /// - /// @param [in] allocInfo @see Util::AllocInfo - /// - /// @returns Pointer to the allocated memory on success, nullptr on failure. - void* Alloc(const Util::AllocInfo& allocInfo) - { -#if PAL_MEMTRACK - return m_memTracker.Alloc(allocInfo); -#else - return m_allocator.Alloc(allocInfo); -#endif - } - - /// Frees memory using the platform's ForwardAllocator. - /// - /// @param [in] freeInfo @see Util::FreeInfo - void Free(const Util::FreeInfo& freeInfo) - { -#if PAL_MEMTRACK - m_memTracker.Free(freeInfo); -#else - m_allocator.Free(freeInfo); -#endif - } - - /// Logs a text string via the developer driver bus if it is currently connected. - /// - /// @param [in] level Log priority level associated with the message. - /// @param [in] categoryMask Log category mask that represents what category fields the message relates to. - /// @param [in] pFormat Format string for the log message. 
- /// @param [in] args Variable arguments that correspond to the format string. - virtual void LogMessage(LogLevel level, - LogCategoryMask categoryMask, - const char* pFormat, - va_list args) = 0; - - /// Logs a text string via the developer driver bus if it is currently connected. - /// - /// @param [in] level Log priority level associated with the message. - /// @param [in] categoryMask Log category mask that represents what category fields the message relates to. - /// @param [in] pFormat Format string for the log message. - /// @param [in] ... Variable arguments that correspond to the format string. - void LogMessage(LogLevel level, - LogCategoryMask categoryMask, - const char* pFormat, - ...) - { - va_list args; - va_start(args, pFormat); - LogMessage(level, categoryMask, pFormat, args); - va_end(args); - } - - /// Logs an event using the DevDriver protocol. - /// - /// @param [in] eventId The type of event you want to log. - /// @param [in] pEventData A pointer to the struct corresponding to the event id. - /// @param [in] eventDataSize The size of the event data struct. - virtual void LogEvent( - PalEvent eventId, - const void* pEventData, - uint32 eventDataSize) {} - -#if PAL_ENABLE_LOGGING - /// Function to access the current settings of file logger. - /// Clients can call this function to get file logger settings in order to configure - /// this logger at the time of its creation. - /// - /// @param [in] pSettings A struct in which file logger settings are copied. - virtual void GetDbgLoggerFileSettings( - Util::DbgLoggerFileSettings* pSettings) = 0; -#endif - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IPlatform( - const Util::AllocCallbacks& allocCb) - : -#if PAL_MEMTRACK - m_memTracker(&m_allocator), -#endif - m_allocator(allocCb), - m_pClientData(nullptr) { } - - /// @internal Destructor. 
Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IPlatform() { } - - /// @internal Initialization common to all platforms; must be called in subclass overrides of this function. - /// Currently only handles initialization of the memory leak tracker. - virtual Result Init() - { -#if PAL_MEMTRACK - return m_memTracker.Init(); -#else - return Result::Success; -#endif - } - - /// Used by the InstallDeveloperCb to install the event handler according to the derived platform. - /// - /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. - /// @param [in] pPrivateData Private data that is installed with the event handler for use by the installer. - virtual void InstallDeveloperCb( - Developer::Callback pfnDeveloperCb, - void* pPrivateData) = 0; - -#if PAL_MEMTRACK - /// @internal Memory leak tracker. Requires an allocator in order to perform the actual allocations. We can't - /// provide this platform because that would result in a stack overflow. We must give it our forward allocator. - Util::MemTracker m_memTracker; -#endif - - /// @internal Memory allocator. Calls to Alloc() and Free() are chained down to the allocator's counterparts. - Util::ForwardAllocator m_allocator; - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palPlatform.h + * @brief Defines the Platform Abstraction Library (PAL) IPlatform interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palSysMemory.h" +#include "palMemTrackerImpl.h" +#include "palDestroyable.h" +#include "palDeveloperHooks.h" + +// DevDriver forward declarations. 
+namespace DevDriver +{ +class DevDriverServer; +namespace EventProtocol +{ +class EventServer; +} +class SettingsRpcService; +} + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 +namespace SettingsRpcService +{ +class SettingsService; +} +#endif + +#if PAL_BUILD_RDF +// GpuUtil forward declarations. +namespace GpuUtil +{ +class TraceSession; +} +#endif + +#if PAL_ENABLE_LOGGING +namespace Util +{ +struct DbgLoggerFileSettings; +} +#endif + +namespace Pal +{ + +// Forward declarations. +class IDevice; +class IScreen; +struct PalPlatformSettings; +enum class PalEvent : uint32; + +/// Maximum number of Devices possibly attached to a system. +constexpr uint32 MaxDevices = 16; +/// Maximum number of Screens possibly attached to a Device. +constexpr uint32 MaxScreensPerDevice = 6; +/// Maximum number of Screens possibly attached to a system. +constexpr uint32 MaxScreens = (MaxScreensPerDevice * MaxDevices); +constexpr uint32 MaxModePerScreen = 64; + +/// 32-bit PAL version identifier. +/// +/// Version number of the PAL library. Major version is bumped on every promotion from stg/pal to stg/pal_prm. Minor +/// version is bumped when a change is cherry-picked to stg/pal_prm. +/// +/// @see PlatformProperties::palVersion +struct Version +{ + uint16 major; ///< Major version number. + uint16 minor; ///< Minor version number. +}; + +/// Union defining the DevDriver GPU ID layout. +/// This is specifically used by DevDriver across multiple tools/driver and should not be changed. +union PciId +{ + struct + { + uint32 functionId : 8; ///< PCI function number in the system for this GPU. + uint32 deviceId : 8; ///< PCI device number in the system for this GPU. + uint32 busId : 8; ///< PCI bus number in the system for this GPU. + uint32 reserved : 8; ///< Reserved for future use. + }; + uint32 u32All; ///< Fields packed as 32-bit uint. +}; + +/// Reports capabilities and general properties of this instantiation of the PAL library. 
+/// +/// This covers any property that it platform-wide as opposed to being tied to a particular device in the system. +/// +/// @see IPlatform::GetProperties +struct PlatformProperties +{ + Version palVersion; ///< Version number of the PAL library. Note that this is distinct from the interface version. + /// It will be regularly updated as described in @ref Version. + union + { + struct + { + uint32 supportNonSwapChainPresents : 1; ///< If set, non-swapchain presents are supported. + uint32 supportBlockIfFlipping : 1; ///< If set, IQueue::Submit can protect against command buffers + /// that write to GPU memory queued for a flip present. + uint32 explicitPresentModes : 1; ///< If set, the PresentMode enums specified during direct and swap + /// chain presents explicitly determine the presentation method. + /// Additionally, the client must enumerate IScreens and use them + /// to explicitly manage fullscreen ownership. Otherwise, the + /// present modes are suggestions and fullscreen ownership is + /// managed internally by PAL. + uint32 reserved : 29; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + }; +}; + +/// The client that Pal may query profile for. the order is the same as SHARED_AP_AREA in KMD escape interface +enum class ApplicationProfileClient : uint32 +{ + Uninitialized = 0, + Dxx, + Udx, + Cfx, + Ogl, + User3D, + Ocl, + Mmd, + Pplib, + Dal, + Chill, + Spp, + Boost, + DeLag, + Ris, + TurboSync, + DutyCycleScaling, + ProBoost, + RisWindowed, + FreeMux, + FsrOvr, + Count +}; + +/// Describes a primary surface view +/// +/// @see IPlatform::GetPrimaryLayout() +struct PrimaryViewInfo +{ + Rect rect; ///< Rectangle defining one portion of a primary surface layout. + uint32 numIndices; ///< The size of the gpuIndex array. + uint32 gpuIndex[MaxDevices]; ///< The devices in a linked adapter chain that can use this view. 
+}; + +/// Specifies output arguments for IPlatform::GetPrimaryLayout(), returning information about the layout of the primary +/// surface. +/// +/// @see IPlatform::GetPrimaryLayout() +struct GetPrimaryLayoutOutput +{ + uint32 numViews; ///< The number of views in the pViewInfoList array. + PrimaryViewInfo* pViewInfoList; ///< The primary surface is composed of these views. + union + { + struct + { + uint32 disablePartialCopy : 1; ///< If this flag is not set, the client can transfer the specific views of + /// primary surface to peer GPUs. Otherwise, the client must transfer the + /// whole primary surface to peer GPUs. + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< specifies primary surface layout flags. +}; + +/// Specifies TurboSync control mode +enum class TurboSyncControlMode : uint32 +{ + Disable = 0, ///< Disable TurboSync + Enable = 1, ///< Enable TurboSync + UpdateAllocations = 2, ///< Update allocations only, without disable or enable TurboSync + Register = 3, ///< Register the current platform as TurboSync requested platform, doesn't actually + /// activate TurboSync. + Count +}; + +constexpr uint32 TurboSyncMaxSurfaces = 2; ///< Specifies maximum number of surfaces in a private TurboSync swapchain + +/// Input argument for IPlatform::TurboSyncControl. TurboSync is a feature that enables app to render at higher than +/// V-Sync frame rates while still being tearing-free. It creates a private swapchain and copy application's back +/// buffer to the primary in this private swapchain when application is flipping. KMD controls the flipping of the +/// private swapchain to screen. +struct TurboSyncControlInput +{ + TurboSyncControlMode mode; ///< Specifies the TurboSync control mode + uint32 vidPnSourceId; ///< The vidPnSourceId the call is targeted + + /// GpuMemory of the primaries in private swapchain, per-gpu. 
This is indexed by the device indices enumerated by + /// the platform. Pal forwards the allocation handles (if IGpuMemory ptr is not null) to Kmd without validation. + const IGpuMemory* pPrimaryMemoryArray[MaxDevices][TurboSyncMaxSurfaces]; +}; + +/** +************************************************************************************************************************ +* @interface IPlatform +* @brief Interface representing an client-configurable context of the PAL platform. +* +* This is the root of all client interaction with PAL. Each IPlatform contains a set of the IDevice's and IScreens +* found in the system. +* +* + Creation of IDevice and IScreen objects. +* + Installation of memory management callbacks. +* + Query application profiles from the system. +************************************************************************************************************************ +*/ +class IPlatform : public IDestroyable +{ +public: + /// Enumerates a list of available Devices. + /// + /// This function creates a set of @ref IDevice objects corresponding to the devices attached to the system. + /// CreatePlatform() must be called before this function is called. + /// + /// This function may be called multiple times during the lifetime of the PAL lib, in which case all previous + /// @ref IDevice and @ref IScreen objects are automatically destroyed. The client is responsible for + /// destroying all objects attached to the existing @ref IDevice objects before re-calling this function. + /// Re-enumerating Devices is required if ErrorDeviceLost is ever returned by PAL, as this may indicate a device + /// has been physically removed from the system. + /// + /// @note Before IPlatform::Destroy can be called, all devices returned by IPlatform::EnumerateDevices() must be + /// destroyed. + /// + /// @param [out] pDeviceCount Specifies the number of devices available in the system. This is the number of valid + /// entries in pDevices[]. Must not be null. 
+ /// @param [out] pDevices Array to be populated with a device object pointer for each device available in the + /// system. The first *pDeviceCount entries are valid. Must not be null. + /// + /// @returns Success if all Devices were successfully enumerated in pDevices[]. Otherwise, one of the following + /// error codes may be returned: + /// + ErrorInitializationFailed will be returned if PAL is unable to query the available Devices. + virtual Result EnumerateDevices( + uint32* pDeviceCount, + IDevice* pDevices[MaxDevices]) = 0; + + /// Returns the storage size of the object implementing IScreen. + /// + /// Use this to determine the size of each pStorage pointer passed to GetScreens. + /// + /// @returns the storage size in bytes of the object implementing IScreen. + virtual size_t GetScreenObjectSize() const = 0; + + /// Retrieves the list of available screens. + /// + /// This function queries a set of @ref IScreen objects corresponding to the screens attached to the system. + /// CreatePlatform() and IPlatform::EnumerateDevices() must be called before this function is called. + /// + /// This function may be called multiple times during the lifetime of the PAL lib. Each call returns a new + /// set of screen objects. + /// + /// @ingroup LibInit + /// + /// @param [out] pScreenCount Specifies the number of screens available in the system. This is the number of valid + /// entries in pScreens[] and pStorage[]. Must not be null. + /// @param [in] pStorage Array of caller-allocated storage for the screen objects. Each must be the size + /// returned by GetScreenObjectSize. Must always pre-allocate MaxScreens worth, must + /// not be NULL nor may any entry be NULL. + /// @param [out] pScreens Array to be populated with a screen pointer for each screen available in the system. + /// The first *pScreenCount entries are valid. Must not be null. + /// + /// @note pScreens[i] uses the storage from pStorage[i]. pStorage[i] is unused for i >= *pScreenCount. 
+ /// + /// @returns Success if all screens were successfully retrieved in pScreens[]. Otherwise, one of the following + /// error codes may be returned: + /// + ErrorUnavailable if this was called prior to IPlatform::EnumerateDevices(). + virtual Result GetScreens( + uint32* pScreenCount, + void* pStorage[MaxScreens], + IScreen* pScreens[MaxScreens]) = 0; + + /// Queries a client specified application profile in raw format. + /// + /// This function queries the kernel-mode driver to determine if there is a platform-wide profile for a specific + /// application that the client would like to honor. It is optional, and doesn't need to be called if the client + /// does not wish to support application profiles. + /// + /// As the format of profile is client specified, the profile will be returned in raw format and client has the + /// responsibility to parse the profile. @see GpuUtil::ProfileIterator provides a basic capability to iterate all + /// properties in the raw data packet. The memory storing the raw data is managed by Pal. + /// + /// The pFilename string can be the EXE name, like "doom.exe", or the "Content Distribution Network" (CDN) ID, + /// like "SteamAppId:570". You can use the function GpuUtil::QueryAppContentDistributionId() to get the CDN ID. + /// + /// @ingroup LibInit + /// + /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. + /// See GpuUtil::QueryAppContentDistributionId(). + /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. + /// @param [in] client Client name that KMD will query the profile for + /// @param [out] pOut Will be filled with the application profile string if the profile exists and was + /// successfully queried. 
+ /// + /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully + /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. + /// Otherwise, one of the following error codes may be returned: + /// + ErrorInvalidPointer will be returned if pFilename or pOut is null. + /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices + /// discovered. + virtual Result QueryRawApplicationProfile( + const wchar_t* pFilename, + const wchar_t* pPathname, + ApplicationProfileClient client, + const char** pOut) = 0; + + /// Enable UMD side support for the SPP feature (Shader Profiling for Power). The gist of the initial version + /// of this feature is that we will profile important applications to determine which shaders are heavily memory + /// bound, then use these profiles to program the RLC to dynamically reduce engine clocks when running such shaders. + /// This should result in power savings with a limited perf impact. + + /// Each of these app profiles will include one or more tables specifying how to program the RLC. + /// There may be multiple tables in cases where we need different RLC programming based on user controlled factors + /// that affect memory boundedness: resolution, MSAA rate, etc. + + /// @ingroup LibInit + /// + /// @param [in] pFilename Filename of the application or the Steam/EA/UPlay game ID to query for its profile. + /// See GpuUtil::QueryAppContentDistributionId(). + /// @param [in] pPathname Optional. Allows the caller to specify a pathname in addition to a filename if they wish. + + /// @returns Success if the application profile exists for the specified string(s) and the profile was successfully + /// retrieved, or Unsupported if the profile does not exist and the query was successfully performed. 
+ /// Otherwise, one of the following error codes may be returned: + /// + ErrorInvalidPointer will be returned if pFilename is null. + /// + ErrorUnavailable if this is called before IPlatform::EnumerateDevices(), or if there were no Devices + /// discovered. + virtual Result EnableSppProfile( + const wchar_t* pFilename, + const wchar_t* pPathname) = 0; + + /// Reports the properties of the platform. + /// + /// Returns the capabilities and general properties of this platform instantiation. + /// + /// @param [out] pProperties Capabilities and general properties of this platform instantiation (not tied to a + /// particular device). + /// + /// @returns Success if the properties were successfully queried and returned in pProperties. Otherwise, one of the + /// following errors may be returned: + /// + ErrorInvalidPointer if pProperties is null. + virtual Result GetProperties( + PlatformProperties* pProperties) = 0; + + /// Installs the callback into the specified platform. + /// + /// @param [in] pPlatform The platform to install the callback into. + /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. + /// @param [in] pPrivateData Private data that is installed with the callback for use by the installer. + static void InstallDeveloperCb( + IPlatform* pPlatform, + Developer::Callback pfnDeveloperCb, + void* pPrivateData) + { pPlatform->InstallDeveloperCb(pfnDeveloperCb, pPrivateData); }; + + /// Returns the currently enabled developer callback types. + /// + /// @returns The bitmask of currently enabled developer callback types + virtual uint32 GetEnabledCallbackTypes() const = 0; + + /// Sets the currently enabled developer callback types. 
+ /// + /// @param [in] callbackTypeMask A bitmask of client requested developer callback types to enable + virtual void SetEnabledCallbackTypes( + uint32 enabledCallbackTypesMask) = 0; + + /// Returns a pointer to the developer driver server object if developer mode is enabled on the system. + /// + /// @returns A valid DevDriver::DevDriverServer pointer if developer mode is enabled. If developer mode is not + /// enabled, nullptr will be returned. + virtual DevDriver::DevDriverServer* GetDevDriverServer() = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 890 + /// Will be replaced by GetSettingsRpcService(). + virtual SettingsRpcService::SettingsService* GetSettingsService() = 0; +#endif + + /// Client drivers can register their DevDriver based settings components via SettingsRpcService. + /// + /// @returns A pointer to a SettingsRpcService object. Could be nullptr if developer driver mode is not enabled. + virtual DevDriver::SettingsRpcService* GetSettingsRpcService() = 0; + + /// Returns a pointer to the event server object. The event server will soon move out of the DevDriver + /// server. Hence the need to provide a separate interface to access the event server. + /// + /// @returns A valid EventServer pointer or nullptr if not valid. + virtual DevDriver::EventProtocol::EventServer* GetEventServer() = 0; + +#if PAL_BUILD_RDF + /// Returns a pointer to the current trace session if one was created during startup + /// + /// @returns A valid TraceSession pointer if a session currently exists. If a trace session was not created during + /// startup, nullptr will be returned. + virtual GpuUtil::TraceSession* GetTraceSession() = 0; + + /// Indicate frame count increment to frame trace controller. + /// Client driver is responsible for calling this method once per frame. 
+ /// + /// @param [in] pQueue The queue on which a new frame has been detected + virtual void UpdateFrameTraceController( + IQueue *pQueue) = 0; +#endif + + /// Gets the GPU ID for a given pal device index. + /// + /// The GPU ID is determined from (BusID << 16) | (DeviceID << 8) | FunctionID + /// + /// @param [in] gpuIndex The index of the GPU + /// + /// @returns A GPU ID described above or UnknownGPUID if the device doesn't exist + virtual PciId GetPciId(uint32 gpuIndex) = 0; + + /// Indicates whether tracing has been enabled. + /// + /// @returns True if tracing is enabled, false otherwise. + virtual bool IsTracingEnabled() const = 0; + + /// Indicates whether the driver has been signaled to enable crash analysis mode. + /// + /// @returns True if crash analysis is enabled, false otherwise. + virtual bool IsCrashAnalysisModeEnabled() const = 0; + + /// Indicates whether the driver has been signaled to enable Raytracing Shader Data Tokens. + /// + /// @returns True if Raytracing Shader Data Tokens is enabled, false otherwise. + virtual bool IsRaytracingShaderDataTokenRequested() const = 0; + + /// Returns a pointer to the Platform settings structure + /// + /// @returns A reference to a PalPlatformSettings structure. + virtual const PalPlatformSettings& PlatformSettings() const = 0; + + /// Get primary surface layout based upon VidPnSource provided by client. + /// + /// This function is used by client to query the layout of the primary surface. The layout describes how primary + /// surface is composed with a set of views. Each view provides the rectangle of the surface area and the GPUs + /// this surface area will be displayed on. + /// Client should make first call pass in pPrimaryLayoutOutput->pViewInfoList as NULL to query the number of views + /// this primary surface has. + /// Client then based on pPrimaryLayoutOutput->numViews, allocates the buffer for pViewInfoList. 
And client then + /// makes the escape call again to query the actual view information. + /// + /// @param [in] vidPnSourceId VidPnSource ID that's associated to a primary surface. + /// @param [in, out] pPrimaryLayoutOutput Primary surface layout output arguments. + /// + /// @returns Success if the display layout on given vidPnSourceId was successfully queried. + /// Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if pPrimaryLayoutOutput is invalid. + /// + ErrorUnavailable if no implementation on current platform. + /// + ErrorOutOfMemory if there is not enough system memory. + virtual Result GetPrimaryLayout( + uint32 vidPnSourceId, + GetPrimaryLayoutOutput* pPrimaryLayoutOutput) = 0; + + /// Calls TurboSyncControl escape to control TurboSync on specific vidPnSourceId. + /// + /// The function is called when clients intend to toggle TurboSync on a vidPnSourceId. The client should allocate + /// private swapchain primary surfaces that's compatible with the application swapchain primaries. When used to + /// activate TurboSync, the private primaries' handles needs to be passed in the TurboSyncControlInput data. + /// + /// @param [in] turboSyncControlInput TurboSyncControl input arguments. See TurboSyncControlInput. + /// + /// @returns Success if the TurboSyncControl request is handled successfully. + virtual Result TurboSyncControl( + const TurboSyncControlInput& turboSyncControlInput) = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. 
+ void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Allocates memory using the platform's ForwardAllocator. + /// + /// @param [in] allocInfo @see Util::AllocInfo + /// + /// @returns Pointer to the allocated memory on success, nullptr on failure. + void* Alloc(const Util::AllocInfo& allocInfo) + { +#if PAL_MEMTRACK + return m_memTracker.Alloc(allocInfo); +#else + return m_allocator.Alloc(allocInfo); +#endif + } + + /// Frees memory using the platform's ForwardAllocator. + /// + /// @param [in] freeInfo @see Util::FreeInfo + void Free(const Util::FreeInfo& freeInfo) + { +#if PAL_MEMTRACK + m_memTracker.Free(freeInfo); +#else + m_allocator.Free(freeInfo); +#endif + } + + /// Logs a text string via the developer driver bus if it is currently connected. + /// + /// @param [in] level Log priority level associated with the message. + /// @param [in] categoryMask Log category mask that represents what category fields the message relates to. + /// @param [in] pFormat Format string for the log message. + /// @param [in] args Variable arguments that correspond to the format string. + virtual void LogMessage(LogLevel level, + LogCategoryMask categoryMask, + const char* pFormat, + va_list args) = 0; + + /// Logs a text string via the developer driver bus if it is currently connected. + /// + /// @param [in] level Log priority level associated with the message. + /// @param [in] categoryMask Log category mask that represents what category fields the message relates to. + /// @param [in] pFormat Format string for the log message. + /// @param [in] ... Variable arguments that correspond to the format string. + void LogMessage(LogLevel level, + LogCategoryMask categoryMask, + const char* pFormat, + ...) + { + va_list args; + va_start(args, pFormat); + LogMessage(level, categoryMask, pFormat, args); + va_end(args); + } + + /// Logs an event using the DevDriver protocol. + /// + /// @param [in] eventId The type of event you want to log. 
+ /// @param [in] pEventData A pointer to the struct corresponding to the event id. + /// @param [in] eventDataSize The size of the event data struct. + virtual void LogEvent( + PalEvent eventId, + const void* pEventData, + uint32 eventDataSize) {} + +#if PAL_ENABLE_LOGGING + /// Function to access the current settings of file logger. + /// Clients can call this function to get file logger settings in order to configure + /// this logger at the time of its creation. + /// + /// @param [in] pSettings A struct in which file logger settings are copied. + virtual void GetDbgLoggerFileSettings( + Util::DbgLoggerFileSettings* pSettings) = 0; +#endif + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IPlatform( + const Util::AllocCallbacks& allocCb) + : +#if PAL_MEMTRACK + m_memTracker(&m_allocator), +#endif + m_allocator(allocCb), + m_pClientData(nullptr) { } + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IPlatform() { } + + /// @internal Initialization common to all platforms; must be called in subclass overrides of this function. + /// Currently only handles initialization of the memory leak tracker. + virtual Result Init() + { +#if PAL_MEMTRACK + return m_memTracker.Init(); +#else + return Result::Success; +#endif + } + + /// Used by the InstallDeveloperCb to install the event handler according to the derived platform. + /// + /// @param [in] pfnDeveloperCb The developer callback function pointer to be executed by the pPlatform. + /// @param [in] pPrivateData Private data that is installed with the event handler for use by the installer. 
+ virtual void InstallDeveloperCb( + Developer::Callback pfnDeveloperCb, + void* pPrivateData) = 0; + +#if PAL_MEMTRACK + /// @internal Memory leak tracker. Requires an allocator in order to perform the actual allocations. We can't + /// provide this platform because that would result in a stack overflow. We must give it our forward allocator. + Util::MemTracker m_memTracker; +#endif + + /// @internal Memory allocator. Calls to Alloc() and Free() are chained down to the allocator's counterparts. + Util::ForwardAllocator m_allocator; + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h index 5fe269663a..69e4de70c1 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueryPool.h @@ -1,234 +1,234 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palQueryPool.h - * @brief Defines the Platform Abstraction Library (PAL) IQueryPool interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palGpuMemoryBindable.h" - -namespace Pal -{ - -/// Specifies a category of GPU query pool. -enum class QueryPoolType : uint32 -{ - Occlusion = 0x0, ///< Occlusion query pool. Supports queries based on the Z test. - PipelineStats = 0x1, ///< Pipeline stats query pool. Supports queries based on statistics from the GPU's execution - /// such as a count of prims generated, shader invocations, etc. - StreamoutStats = 0x2, ///< Streamout query pool. Supports queries based on statistics from the GPU's execution - /// such as number of primitives written to SO buffer and storage needed. - Count, -}; - -/// Specifies what data a query slot must produce. Some query pool types support multiple query types. -enum class QueryType : uint32 -{ - Occlusion = 0x0, ///< The total passes recorded by the Z test. - BinaryOcclusion = 0x1, ///< One if there were one or more Z test passes, zero otherwise. 
- PipelineStats = 0x2, ///< The total statistics selected by the given pipeline stats query pool. - StreamoutStats = 0x3, ///< SO statistics tracked by CP/VGT including primitives written and storage needed. - StreamoutStats1 = 0x4, ///< SO1 statistics tracked by CP/VGT including primitives written and storage needed. - StreamoutStats2 = 0x5, ///< SO2 statistics tracked by CP/VGT including primitives written and storage needed. - StreamoutStats3 = 0x6, ///< SO3 statistics tracked by CP/VGT including primitives written and storage needed. - Count, -}; - -/// Specifies which pipeline stats should be tracked by a pipeline stats query pool. -enum QueryPipelineStatsFlags : uint32 -{ - QueryPipelineStatsIaVertices = 0x1, ///< Input vertices. - QueryPipelineStatsIaPrimitives = 0x2, ///< Input primitives. - QueryPipelineStatsVsInvocations = 0x4, ///< Vertex shader invocations. - QueryPipelineStatsGsInvocations = 0x8, ///< Geometry shader invocations. - QueryPipelineStatsGsPrimitives = 0x10, ///< Geometry shader primitives. - QueryPipelineStatsCInvocations = 0x20, ///< Clipper invocations. - QueryPipelineStatsCPrimitives = 0x40, ///< Clipper primitives. - QueryPipelineStatsPsInvocations = 0x80, ///< Pixel shader invocations. - QueryPipelineStatsHsInvocations = 0x100, ///< Hull shader invocations. - QueryPipelineStatsDsInvocations = 0x200, ///< Domain shader invocations. - QueryPipelineStatsCsInvocations = 0x400, ///< Compute shader invocations. - QueryPipelineStatsTsInvocations = 0x800, ///< Task shader invocations. - QueryPipelineStatsMsInvocations = 0x1000, ///< Mesh shader invocations. - QueryPipelineStatsMsPrimitives = 0x2000, ///< Mesh shader primitives. - QueryPipelineStatsAll = 0x3FFF ///< All of the above stats. -}; - -/// Specifies properties for @ref IQueryPool creation. Input structure to IDevice::CreateQueryPool(). -struct QueryPoolCreateInfo -{ - QueryPoolType queryPoolType; ///< Type of query pool to create (i.e., occlusion vs. pipeline stats). 
- uint32 numSlots; ///< Number of slots in the query pool. - uint32 enabledStats; ///< An ORed mask of stats flags specific to the query pool type. - /// @see QueryPipelineStatsFlags for PipelineStats query pools. - union - { - struct - { - /// If true, this query pool can have results retrieved using the CPU (using @ref IQueryPool::GetResults) - /// and can be reset using the CPU (using @ref IQueryPool::Reset). Otherwise, the client must use command - /// buffers to perform these operations (using @ref ICmdBuffer::CmdResetQueryPool and - /// @ref ICmdBuffer::CmdResolveQuery). - uint32 enableCpuAccess : 1; - uint32 reserved : 31; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed together as a uint32. - } flags; ///< Flags controlling QueryPool behavior. -}; - -/// Controls operations that compute query results. -enum QueryResultFlags : uint32 -{ - QueryResultDefault = 0x0, ///< Default to 32-bit results with no waiting. - QueryResult64Bit = 0x1, ///< Store all results as 64-bit values. - QueryResultWait = 0x2, ///< Wait for the queries to finish when computing the results. - QueryResultAvailability = 0x4, ///< If the results of a query are available at computation time a one will be - /// written as a separate value after the result value, if the results were not - /// available a zero will be written. - QueryResultPartial = 0x8, ///< If the final result of a query would be unavailable, then return a - /// result for that query between 0 and what the final result would be. - QueryResultAccumulate = 0x10, ///< Results are added to the values present in the destination, if availability - /// data is enabled it will be ANDed with the present availability data. - QueryResultPreferShaderPath = 0x20, ///< Prefer a shader resolve path over a command processor path. 
- QueryResultOnlyPrimNeeded = 0x40, ///< Select only primitives storage needed in Streamout query results - QueryResultAll = 0x7F ///< Clients should NOT use it, for internal static_assert purpose only. -}; - -/** - *********************************************************************************************************************** - * @interface IQueryPool - * @brief Represents a set of queries that can be used to retrieve detailed info about the GPU's execution of a - * particular range of a command buffer. - * - * Currently, only occlusion queries and pipeline statistic queries are supported. All queries in a pool are the same - * type. - * - * @see IDevice::CreateQueryPool() - *********************************************************************************************************************** - */ -class IQueryPool : public IGpuMemoryBindable -{ -public: - /// Retrieves query results from a query pool. - /// - /// Multiple consecutive query results can be retrieved with one call. - /// - /// @param [in] flags Flags that control the result data layout and how the results are retrieved. - /// @param [in] queryType Specifies what data the query slots must produce. - /// @param [in] startQuery First query pool slot to retrieve data for. - /// @param [in] queryCount Number of query pool slots to retrieve data for. - /// @param [in] pMappedGpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, - // this method will use Map\UnMap to access the data. - /// @param [in,out] pDataSize Input value specifies the available size in pData in bytes; output value reports the - /// number of bytes required to hold all result data. - /// @param [out] pData Location where the query results should be written. Can be null in order to query the - /// required size. The data returned depends on the query pool type and flags. All data - /// entries are either uint32 or uint64 integers. 
One or more type-specific entries will - /// be optionally followed by one entry for availability. The type-specific data is:
- /// + QueryOcclusion: One entry to store the zPass count. - /// + QueryPipelineStats: One entry per statistic enabled in the create info. The stats - /// will be written in the appropriate order for each PAL client. - /// @param [in] stride Stride in bytes between subsequent query result data or zero to request tightly - /// packed result data. - /// - /// @returns Success if query results were successfully returned in pData, or NotReady if any of the requested query - /// slots does not yet have results available. Otherwise, one of the following error codes may be - /// returned: - /// + ErrorInvalidValue if the range defined by startQuery and queryCount is not valid for this query pool. - /// + ErrorGpuMemoryNotBound if the query pool requires GPU memory but none is bound. - /// + ErrorInvalidMemorySize if pData is non-null and the value stored in pDataSize is too small. - virtual Result GetResults( - QueryResultFlags flags, - QueryType queryType, - uint32 startQuery, - uint32 queryCount, - const void* pMappedGpuAddr, - size_t* pDataSize, - void* pData, - size_t stride) = 0; - - /// Use CPU to reset the query pool slots. - /// - /// Supported for occlusion and video decode statistics query pools. - /// - /// @param [in] startQuery First query pool slot to reset. - /// @param [in] queryCount Number of query pool slots to reset. - /// @param [in] pMappedCpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, - // this method will use Map/UnMap to access the data. - /// - /// @returns Success if the reset was successfully performed. - virtual Result Reset( - uint32 startQuery, - uint32 queryCount, - void* pMappedCpuAddr) = 0; - - /// Returns the distance, in bytes, between successive query slots in the bound GPU memory. - /// This method is only supported for @ref QueryPoolType::VideoDecodeStats - /// - /// @returns the distance, in bytes, between successive query slots in the bound GPU memory. 
- virtual gpusize GetQuerySlotStride() const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IQueryPool() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IQueryPool() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueryPool.h + * @brief Defines the Platform Abstraction Library (PAL) IQueryPool interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palGpuMemoryBindable.h" + +namespace Pal +{ + +/// Specifies a category of GPU query pool. +enum class QueryPoolType : uint32 +{ + Occlusion = 0x0, ///< Occlusion query pool. Supports queries based on the Z test. + PipelineStats = 0x1, ///< Pipeline stats query pool. 
Supports queries based on statistics from the GPU's execution + /// such as a count of prims generated, shader invocations, etc. + StreamoutStats = 0x2, ///< Streamout query pool. Supports queries based on statistics from the GPU's execution + /// such as number of primitives written to SO buffer and storage needed. + Count, +}; + +/// Specifies what data a query slot must produce. Some query pool types support multiple query types. +enum class QueryType : uint32 +{ + Occlusion = 0x0, ///< The total passes recorded by the Z test. + BinaryOcclusion = 0x1, ///< One if there were one or more Z test passes, zero otherwise. + PipelineStats = 0x2, ///< The total statistics selected by the given pipeline stats query pool. + StreamoutStats = 0x3, ///< SO statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats1 = 0x4, ///< SO1 statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats2 = 0x5, ///< SO2 statistics tracked by CP/VGT including primitives written and storage needed. + StreamoutStats3 = 0x6, ///< SO3 statistics tracked by CP/VGT including primitives written and storage needed. + Count, +}; + +/// Specifies which pipeline stats should be tracked by a pipeline stats query pool. +enum QueryPipelineStatsFlags : uint32 +{ + QueryPipelineStatsIaVertices = 0x1, ///< Input vertices. + QueryPipelineStatsIaPrimitives = 0x2, ///< Input primitives. + QueryPipelineStatsVsInvocations = 0x4, ///< Vertex shader invocations. + QueryPipelineStatsGsInvocations = 0x8, ///< Geometry shader invocations. + QueryPipelineStatsGsPrimitives = 0x10, ///< Geometry shader primitives. + QueryPipelineStatsCInvocations = 0x20, ///< Clipper invocations. + QueryPipelineStatsCPrimitives = 0x40, ///< Clipper primitives. + QueryPipelineStatsPsInvocations = 0x80, ///< Pixel shader invocations. + QueryPipelineStatsHsInvocations = 0x100, ///< Hull shader invocations. 
+ QueryPipelineStatsDsInvocations = 0x200, ///< Domain shader invocations. + QueryPipelineStatsCsInvocations = 0x400, ///< Compute shader invocations. + QueryPipelineStatsTsInvocations = 0x800, ///< Task shader invocations. + QueryPipelineStatsMsInvocations = 0x1000, ///< Mesh shader invocations. + QueryPipelineStatsMsPrimitives = 0x2000, ///< Mesh shader primitives. + QueryPipelineStatsAll = 0x3FFF ///< All of the above stats. +}; + +/// Specifies properties for @ref IQueryPool creation. Input structure to IDevice::CreateQueryPool(). +struct QueryPoolCreateInfo +{ + QueryPoolType queryPoolType; ///< Type of query pool to create (i.e., occlusion vs. pipeline stats). + uint32 numSlots; ///< Number of slots in the query pool. + uint32 enabledStats; ///< An ORed mask of stats flags specific to the query pool type. + /// @see QueryPipelineStatsFlags for PipelineStats query pools. + union + { + struct + { + /// If true, this query pool can have results retrieved using the CPU (using @ref IQueryPool::GetResults) + /// and can be reset using the CPU (using @ref IQueryPool::Reset). Otherwise, the client must use command + /// buffers to perform these operations (using @ref ICmdBuffer::CmdResetQueryPool and + /// @ref ICmdBuffer::CmdResolveQuery). + uint32 enableCpuAccess : 1; + uint32 reserved : 31; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed together as a uint32. + } flags; ///< Flags controlling QueryPool behavior. +}; + +/// Controls operations that compute query results. +enum QueryResultFlags : uint32 +{ + QueryResultDefault = 0x0, ///< Default to 32-bit results with no waiting. + QueryResult64Bit = 0x1, ///< Store all results as 64-bit values. + QueryResultWait = 0x2, ///< Wait for the queries to finish when computing the results. 
+ QueryResultAvailability = 0x4, ///< If the results of a query are available at computation time a one will be + /// written as a separate value after the result value, if the results were not + /// available a zero will be written. + QueryResultPartial = 0x8, ///< If the final result of a query would be unavailable, then return a + /// result for that query between 0 and what the final result would be. + QueryResultAccumulate = 0x10, ///< Results are added to the values present in the destination, if availability + /// data is enabled it will be ANDed with the present availability data. + QueryResultPreferShaderPath = 0x20, ///< Prefer a shader resolve path over a command processor path. + QueryResultOnlyPrimNeeded = 0x40, ///< Select only primitives storage needed in Streamout query results + QueryResultAll = 0x7F ///< Clients should NOT use it, for internal static_assert purpose only. +}; + +/** + *********************************************************************************************************************** + * @interface IQueryPool + * @brief Represents a set of queries that can be used to retrieve detailed info about the GPU's execution of a + * particular range of a command buffer. + * + * Supported query pool types are enumerated by @ref QueryPoolType. All queries in a pool are the same + * type. + * + * @see IDevice::CreateQueryPool() + *********************************************************************************************************************** + */ +class IQueryPool : public IGpuMemoryBindable +{ +public: + /// Retrieves query results from a query pool. + /// + /// Multiple consecutive query results can be retrieved with one call. + /// + /// @param [in] flags Flags that control the result data layout and how the results are retrieved. + /// @param [in] queryType Specifies what data the query slots must produce. + /// @param [in] startQuery First query pool slot to retrieve data for.
+ /// @param [in] queryCount Number of query pool slots to retrieve data for. + /// @param [in] pMappedGpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, + /// this method will use Map/UnMap to access the data. + /// @param [in,out] pDataSize Input value specifies the available size in pData in bytes; output value reports the + /// number of bytes required to hold all result data. + /// @param [out] pData Location where the query results should be written. Can be null in order to query the + /// required size. The data returned depends on the query pool type and flags. All data + /// entries are either uint32 or uint64 integers. One or more type-specific entries will + /// be optionally followed by one entry for availability. The type-specific data is:
+ /// + QueryOcclusion: One entry to store the zPass count. + /// + QueryPipelineStats: One entry per statistic enabled in the create info. The stats + /// will be written in the appropriate order for each PAL client. + /// @param [in] stride Stride in bytes between subsequent query result data or zero to request tightly + /// packed result data. + /// + /// @returns Success if query results were successfully returned in pData, or NotReady if any of the requested query + /// slots does not yet have results available. Otherwise, one of the following error codes may be + /// returned: + /// + ErrorInvalidValue if the range defined by startQuery and queryCount is not valid for this query pool. + /// + ErrorGpuMemoryNotBound if the query pool requires GPU memory but none is bound. + /// + ErrorInvalidMemorySize if pData is non-null and the value stored in pDataSize is too small. + virtual Result GetResults( + QueryResultFlags flags, + QueryType queryType, + uint32 startQuery, + uint32 queryCount, + const void* pMappedGpuAddr, + size_t* pDataSize, + void* pData, + size_t stride) = 0; + + /// Use CPU to reset the query pool slots. + /// + /// Supported for occlusion and video decode statistics query pools. + /// + /// @param [in] startQuery First query pool slot to reset. + /// @param [in] queryCount Number of query pool slots to reset. + /// @param [in] pMappedCpuAddr Specify the query buffer mapped address. If the parameter equals nullptr, + /// this method will use Map/UnMap to access the data. + /// + /// @returns Success if the reset was successfully performed. + virtual Result Reset( + uint32 startQuery, + uint32 queryCount, + void* pMappedCpuAddr) = 0; + + /// Returns the distance, in bytes, between successive query slots in the bound GPU memory. + /// This method is only supported for @ref QueryPoolType::VideoDecodeStats + /// + /// @returns the distance, in bytes, between successive query slots in the bound GPU memory.
+ virtual gpusize GetQuerySlotStride() const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IQueryPool() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IQueryPool() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h index afbfa7ac00..55467e7e6d 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueue.h @@ -1,765 +1,765 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palQueue.h - * @brief Defines the Platform Abstraction Library (PAL) IQueue interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" -#include "palEvent.h" - -namespace Pal -{ - -// Forward declarations. 
-class ICmdBuffer; -class IFence; -class IGpuMemory; -class IImage; -class IPrivateScreen; -class IQueueSemaphore; -class IScreen; -class ISwapChain; -struct CmdBufInfo; -struct GpuMemSubAllocInfo; -struct GpuMemoryRef; -struct DoppRef; - -enum class VirtualGpuMemAccessMode : uint32; - -/// Specifies whether presents are windowed or fullscreen. This will determine whether the present is performed via a -/// BLT or flip. -enum class PresentMode : uint32 -{ - Unknown, - Windowed, - Fullscreen, - Count -}; - -/// Enumerates the possible overrides for the flip interval. -enum class FlipIntervalOverride : uint32 -{ - _None = 0, ///< No override. - Immediate = 1, ///< Zero frames of flip latency. - ImmediateAllowTearing = 2, ///< Same as Immediate, but allows tearing (no vsync). - One = 3, ///< One frame of flip latency. - Two = 4, ///< Two frames of flip latency. - Three = 5, ///< Three frames of flip latency. - Four = 6, ///< Four frames of flip latency. -}; - -/// Defines flags for describing which types of present modes are supported on a given queue. -enum PresentModeSupport : uint32 -{ - SupportWindowedPresent = 0x1, - SupportWindowedPriorBlitPresent = 0x2, - SupportFullscreenPresent = 0x4, -}; - -/// Defines submit-time bottlenecks which PAL can potentially optimize. -enum class SubmitOptMode : uint32 -{ - Default = 0, ///< PAL will enable optimizations when generally efficient. - Disabled = 1, ///< Disable all optimizations that could be detrimental in special cases. - MinKernelSubmits = 2, ///< Minimize the overhead of launching command buffers on the CPU and GPU. - MinGpuCmdOverhead = 3, ///< Minimize the overhead of reading command buffer commands on the GPU. - Count -}; - -/// Enumerates vcn instance affinity statuses -enum MmAffinityStatus : uint32 -{ - MmAffinityNotAllowed = 0, ///< The specific vcn instance can't be used. - MmAffinityAllowed = 1 ///< The specific vcn instance can be used. -}; - -/// Union describes all vcn instance affinity status. 
-union MmAffinity -{ - struct - { - uint32 vcn0Affinity : 2; ///< Affinity for instance vcn0 - uint32 vcn1Affinity : 2; ///< Affinity for instance vcn1 - uint32 reserved : 28; ///< Reserved (all 0) - }; - uint32 u32All; -}; - -/// Structure describing dump information for a command buffer. -struct CmdBufferDumpDesc -{ - EngineType engineType; ///< The engine type that this buffer is targeted for. - QueueType queueType; ///< The type of queue that this buffer is being created on. - SubEngineType subEngineType; ///< The ID of which sub-engine that this buffer is made for. - - uint32 cmdBufferIdx; ///< The index into the SubmitInfo ppCmdBuffers array that this - /// command buffer dump came from. - union - { - struct - { - uint8 isPreamble : 1; ///< Set if the buffer is an internal preamble command buffer. - uint8 isPostamble : 1; ///< Set if the buffer is an internal postamble command buffer. - uint8 reserved : 6; ///< Reserved for future use. - }; - uint8 u32All; ///< Flags packed as 8-bit uint. - } flags; - -}; - -/// Structure describing a command buffer chunk for use while dumping command buffers. -struct CmdBufferChunkDumpDesc -{ - uint32 id; ///< ID (number) of this command chunk within the command buffer. - const void* pCommands; ///< Pointer to the command data. - size_t size; ///< Size of valid data in bytes pointed to in pCommands. -}; - -/// Definition for command buffer dumping callback. -/// -/// @param [in] cmdBufferDesc Description of the command buffer. -/// @param [in] pChunks Pointer to an array of command buffer chunk descriptions. -/// @param [in] numChunks The number of chunks pointed to in pChunks. -typedef void (PAL_STDCALL* CmdDumpCallback)( - const CmdBufferDumpDesc& cmdBufferDesc, - const CmdBufferChunkDumpDesc* pChunks, - uint32 numChunks, - void* pUserData); - -/// Specifies properties for @ref IQueue creation. Input structure to IDevice::CreateQueue(). 
-struct QueueCreateInfo -{ - QueueType queueType; ///< Selects which type of queue to create. - EngineType engineType; ///< Selects which type of engine to create. - uint32 engineIndex; ///< Which instance of the specified engine type to query. For example, there - /// can be multiple compute queues, so this parameter distinguished between them. - SubmitOptMode submitOptMode; ///< A hint telling PAL which submit-time bottlenecks should be optimized, if any. - QueuePriority priority; ///< A hint telling PAL to create queue with proper priority. - /// It is only supported if supportQueuePriority is set in DeviceProperties. - /// In Linux, if we don't have root privilege, the creation with above-Medium - /// priority will fail. Client should take the corresponding action like retry - /// with lower priority, if necessary. - struct - { - uint32 aqlQueue : 1; ///< Compute queue will process AQL packets and kernels - uint32 windowedPriorBlit : 1; ///< All windowed presents on this queue are notifications - /// that the client has manually done a blit present - uint32 tmzOnly : 1; ///< This queue allows only TMZ submissions. Required for - /// compute TMZ submits. - -#if PAL_AMDGPU_BUILD - uint32 enableGpuMemoryPriorities : 1; ///< Enables support for GPU memory priorities on this Queue. - /// This is optional because enabling the feature requires - /// a small amount of memory overhead per-Queue for - /// bookkeeping purposes. -#else - uint32 placeholder2 : 1; ///< Reserved field. Set to 0. -#endif - uint32 dispatchTunneling : 1; ///< This queue uses compute dispatch tunneling. - - uint32 forceWaitIdleOnRingResize : 1; ///< This queue need to wait for idle before resize RingSet. - /// This is intended as a workaround for misbehaving applications. -#if defined(_WIN32) - uint32 nullRendering : 1; ///< Setting this bit makes this queue behave like IfhModeKmd. -#else - uint32 placeholder3 : 1; ///< Reserved field. Set to 0. 
-#endif - uint32 reserved : 25; ///< Reserved for future use. - }; - - uint32 numReservedCu; ///< The number of reserved compute units for RT CU queue - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 - uint32 persistentCeRamOffset; ///< Byte offset to the beginning of the region of CE RAM which this Queue should - /// preserve across consecutive submissions. Must be a multiple of 32. It is an - /// error to specify a nonzero value here if the the Device does not support - /// @ref supportPersistentCeRam for the Engine this Queue will attach to. - uint32 persistentCeRamSize; ///< Amount of CE RAM space which this Queue should preserve across consecutive - /// submissions. Units are in DWORDs, and this must be a multiple of 8. It is an - /// error to specify a nonzero value here if the the Device does not support - /// @ref supportPersistentCeRam for the Engine this Queue will attach to. -#endif - - uintptr_t aqlPacketList; ///< Location of the HIP runtime's info about this queue -}; - -/// Specifies the portion of @ref SubmitInfo that is specific to each sub-queue in a multi-queue object (@see -/// IDevice::CreateMultiQueue). Effectively, this enables specifying a different set of command buffers for each -/// queue that makes up a gang submission to a multi-queue object. -struct PerSubQueueSubmitInfo -{ - uint32 cmdBufferCount; ///< Number of command buffers to be submitted (can be 0 if this submit doesn't - /// involve work for the relevant queue). - ICmdBuffer*const* ppCmdBuffers; ///< Array of cmdBufferCount command buffers to be submitted. Command buffers - /// that are part of a ganged submit must guarantee the conditions required - /// for the optimizeExclusiveSubmit flag. - const CmdBufInfo* pCmdBufInfoList; ///< Null, or an array of cmdBufferCount structs providing additional - /// info about the command buffers being submitted. If non-null, - /// elements are ignored if their isValid flag is false. 
-}; - -/// Specifies all information needed to execute a set of command buffers. Input structure to IQueue::Submit(). -/// -/// Some members of this structure are not supported on all platforms. The client must check the appropriate properties -/// structures to determine if the corresponding features are supported: -/// + pGpuMemoryRefs: Support is indicated by supportPerSubmitMemRefs in @ref DeviceProperties. -/// + ppBlockIfFlipping: Support is indicated by supportBlockIfFlipping in @ref PlatformProperties. If it is supported, -/// the client must not specify a blockIfFlippingCount greater than MaxBlockIfFlippingCount. -/// -/// @note If this queue is running in physical submission mode (due to hardware restrictions), the gpuMemRefCount and -/// pGpuMemoryRefs arguments to this method are ignored because the command buffers themselves contain their own -/// GPU memory reference lists. -struct MultiSubmitInfo -{ - const PerSubQueueSubmitInfo* pPerSubQueueInfo;///< Specifies per-subqueue information for the submit. Typically - /// this is a pointer to a single entry specifying the command - /// buffers to be submitted on this queue. For gang submission on - /// a multi-queue, this should be an array with one entry per - /// sub-queue. The array size must be less than or equal to the - /// queueCount specified when the multi-queue was created and - /// the workload specified in each entry will be assigned to the - /// corresponding sub-queue. It is valid to have a cmdBufferCount - /// of 0 for sub-queues without work. Can be null if perSubQueueInfo- - /// Count is 0. - uint32 perSubQueueInfoCount; ///< Number of PerSubqueueSubmitInfo to be submitted. Can be zero if - /// there is no work to submit. - uint32 gpuMemRefCount; ///< Number of GPU memory references for this submit. - const GpuMemoryRef* pGpuMemoryRefs; ///< Array of gpuMemRefCount GPU memory references. Can be null if - /// gpuMemRefCount is zero. 
The GPU memory objects will be made - /// resident for the duration of this submit. - uint32 doppRefCount; ///< Number of DOPP desktop texture references for this submit. - const DoppRef* pDoppRefs; ///< Array of doppRefCount DOPP texture references. Can be null if - /// doppRefCount is zero. - uint32 externPhysMemCount; ///< Number of entries in ppExternPhysMem. - const IGpuMemory** ppExternPhysMem; ///< Array of external physical memory allocations to be initialized - /// as part of this submit. The first submit that references a - /// particular external physical memory allocation must include - /// that allocation in this list. Subsequent submits that reference - /// the same allocation should not include it in this list, as it - /// would trigger redundant GPU page table initialization. - uint32 blockIfFlippingCount; ///< Number of GPU memory objects to protect when flipped. - const IGpuMemory*const* ppBlockIfFlipping; ///< Array of blockIfFlippingCount GPU memory objects. Can be null if - /// blockIfFlippingCount is zero. The command buffers will not be - /// scheduled to the GPU while a fullscreen (flip) present is queued - /// for any of these GPU memory allocations. - uint32 fenceCount; ///< Number of fence objects to be signaled once the last command buffer - /// in this submission completes execution. - IFence** ppFences; ///< Array of fence objects. Can be null if fenceCount is zero. - CmdDumpCallback pfnCmdDumpCb; ///< Null, or a callback function to handle the dumping of the - /// command buffers used in this submit. - void* pUserData; ///< Client provided data to be passed to callback. - - uint32 stackSizeInDwords; ///< 0, or the max of stack frame size for indirect shaders of the - /// pipelines referenced in the command buffers of this submission. - /// The size is per native thread. So that the client will have to - /// multiply by 2 if a Wave64 shader that needs scratch is used. 
- /// Note that the size will not shrink for the lifetime of the queue - /// once it is grown and only affects compute scratch ring. - const IGpuMemory* pFreeMuxMemory; ///< The gpu memory object of the private flip primary surface for the - /// FreeMux feature. -}; - -typedef MultiSubmitInfo SubmitInfo; - -/// The value of blockIfFlippingCount in @ref SubmitInfo cannot be greater than this value. -constexpr uint32 MaxBlockIfFlippingCount = 16; - -/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentDirect(). -struct PresentDirectInfo -{ - union - { - struct - { - uint32 fullscreenDoNotWait : 1; ///< Fail the present immediately if the present queue is full. - uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. - uint32 dstIsTypedBuffer : 1; ///< True if the destination is a typed buffer instead of an image. - uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not - /// execute a present if this is true but may update internal - /// tracking state. - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Present flags. - - OsWindowHandle hWindow; ///< Native OS window handle that this image should be presented to. - PresentMode presentMode; ///< Chooses between windowed and fullscreen present. - uint32 presentInterval; ///< Must be an integer from 0 to 4. 0 indicates that the present should - /// occur immediately (may tear), and 1-4 indicates the present should - /// occur after 1 to 4 vertical syncs. Only valid for fullscreen presents. - union - { - IImage* pSrcImage; ///< Optional: The image to be presented. If null, the present will not - /// occur but PAL may still call into the OS on certain platforms that - /// expect it. - IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to be presented. 
If null, the present will not occur - /// but PAL may still call into the OS on certain platforms that expect it. - }; - union - { - IImage* pDstImage; ///< Optional: copy from the source image to this image. If null, PAL will - /// automatically copy into the appropriate platform-specific destination. - /// This is only supported for windowed mode presents. - IGpuMemory* pDstTypedBuffer; ///< The typed buffer to be presented. If null, the present will not occur - /// but PAL may still call into the OS on certain platforms that expect it. - }; - -}; - -/// Media stream counter information. -struct MscInfo -{ - uint64 targetMsc; ///< if the current MSC is less than , the buffer swap - ///< will occur when the MSC value becomes equal to - uint64 divisor; ///< Divisor - ///< the buffer swap will occur the next time the MSC value is - ///< incremented to a value such that MSC % = - ///< if the current MSC is greater than or equal to - uint64 remainder; ///< Remainder -}; - -/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentSwapChain(). -struct PresentSwapChainInfo -{ - PresentMode presentMode; ///< Chooses between windowed and fullscreen present. - IImage* pSrcImage; ///< The image to be presented. - ISwapChain* pSwapChain; ///< The swap chain associated with the source image. - uint32 imageIndex; ///< The index of the source image within the swap chain. Owership of this image - /// index will be released back to the swap chain if this call succeeds. - uint32 rectangleCount; ///< Number of valid rectangles in the pRectangles array. - uint32 syncInterval; ///< Applicable only when syncIntervalOverride is set - /// 0 - The presentation occurs immediately, there is no synchronization. - /// 1 through 4 - Synchronize presentation after the nth vertical blank. - const Rect* pRectangles; ///< Array of rectangles defining the regions which will be updated. 
- uint64 presentId; ///< PresentId functions as an identifier for present operations on a swapchain. - /// If this PresentId is non-zero, then the application can later use this value - /// to refer to that image presentation. A value of zero indicates that this - /// presentation has no associated presentId. A non-zero presentId must be greater - /// than any non-zero presentId passed previously by the application for the same - /// swapchain. - union - { - struct - { -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 941 - uint32 notifyOnly : 1; ///< True if it is a notify-only present -#else - uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not - /// execute a present if this is true but may update internal - /// tracking state. -#endif - uint32 isTemporaryMono : 1; ///< True if WS Stereo is enabled, but 3D display mode turned off. - uint32 turboSyncEnabled : 1; ///< Whether TurboSync is enabled. - uint32 syncIntervalOverride : 1; ///< Override default syncInterval with the value in syncInterval - /// Supported only on Windows wsiPlatforms. - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< PresentSwapChainInfo flags. -#if PAL_AMDGPU_BUILD - MscInfo mscInfo; ///< Media stream counter information -#endif -}; - -/// Specifies a mapping from a range of pages in a virtual GPU memory object to a range of pages in a real GPU memory -/// object. Input to IQueue::RemapVirtualMemoryPages(). -/// -/// When mapping pages of a virtual GPU memory object to a range of pages in a real GPU memory object on a remote GPU, -/// the client must point pRealGpuMem at a peer GPU memory object created on the input queue's device instead of the -/// actual real GPU memory object created on the remote device. This is required for two reasons: -/// 1. PAL can only view remote GPU memory using peer objects. -/// 2. 
PAL enforces a separation of state between different IDevice object families. -/// -/// virtualStartOffset and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. -/// realStartOffset must be aligned to the realMemAllocGranularity member of @ref DeviceProperties. -struct VirtualMemoryRemapRange -{ - IGpuMemory* pVirtualGpuMem; ///< Virtual GPU memory object whose mapping is being updated. - gpusize virtualStartOffset; ///< Start of the page range to be updated, in bytes. - IGpuMemory* pRealGpuMem; ///< Real GPU memory object the virtual range should point at. - gpusize realStartOffset; ///< Start of the page range in the real GPU memory object, in bytes. - gpusize size; ///< Size of the mapping range, in bytes. - VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages. - /// This parameter is ignored on some platforms. -}; - -/// Specifies a set of page mappings to copy between virtual GPU memory objects. The source and destination can be the -/// same memory object and the source and destination regions may overlap. Input to IQueue::CopyVirtualMemoryPageMappings(). -/// -/// srcStartOffset, dstStartOffset, and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. -struct VirtualMemoryCopyPageMappingsRange -{ - IGpuMemory* pSrcGpuMem; ///< Virtual GPU memory object whose mapping is being copied from. - gpusize srcStartOffset; ///< Start of the copy source range, in bytes. - IGpuMemory* pDstGpuMem; ///< Virtual GPU memory object whose mapping is being copied to. - gpusize dstStartOffset; ///< Start of the copy destination range, in bytes. - gpusize size; ///< Size of the mapping range, in bytes. -}; - -/// Specifies kernel level information about a context. -struct KernelContextInfo -{ - union - { - struct - { - uint32 hasDebugVmid : 1; ///< True if the context has acquired the debug vmid. 
- uint32 hasHighPriorityVmid : 1; ///< True if the context has acquired the high priority vmid. - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Context flags. - - uint64 contextIdentifier; ///< Kernel scheduler context identifier. -}; - -/** - *********************************************************************************************************************** - * @interface IQueue - * @brief Represents a queue of work for a particular GPU engine on a device. - * - * An IQueue object is a virtual representation of a hardware engine on the device. Multiple IQueue objects can be - * created and have work submitted on them in parallel. Work is submitted to a queue through @ref ICmdBuffer objects, - * and work can be synchronized between multiple queues using @ref IQueueSemaphore objects. - * - * @see IDevice::GetQueue() - *********************************************************************************************************************** - */ -class IQueue : public IDestroyable -{ -public: - /// Submits a group of root command buffers for execution on this queue. - /// - /// @param [in] submitInfo Specifies all command buffers to execute along with other residency and synchronization - /// information. See @ref SubmitInfo for additional, important documentation. - /// - /// @returns Success if the command buffer was successfully submitted. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidPointer if: - /// - any of the array inputs are null when their counts are non-zero. - /// - any members of non-null point arrays are null. - /// + ErrorTooManyMemoryReferences if the total number of memory references (device/queue global and - /// per-command buffer) is too large. - /// + ErrorInvalidValue if blockIfFlippingCount is too large. - /// + ErrorIncompleteCommandBuffer if any of the submitted command buffers are not properly constructed. 
- /// + ErrorIncompatibleQueue if any submitted command buffer does not match this queue's type (e.g., - /// universal, graphics, DMA). - virtual Result Submit( - const MultiSubmitInfo& submitInfo) = 0; - - /// Waits for all previous submission on this queue to complete before control is returned to the caller. - /// - /// @returns Success if wait for submissions completed. Otherwise an error indicates reason for unsuccessful wait, - /// for example due to lost device. - virtual Result WaitIdle() = 0; - - /// Inserts a semaphore signal into the GPU queue. The semaphore will be signaled once all previously submitted - /// work on this queue has completed. - /// - /// @param [in] pQueueSemaphore Semaphore to signal. - /// @param [in] value timeline Semaphore point value to signal, ignored for non-timeline semaphores. - /// - /// @returns Success if the semaphore signal was successfully queued. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if the OS scheduler rejects the signal for unknown reasons. - virtual Result SignalQueueSemaphore( - IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; - - /// Inserts a semaphore wait into the GPU queue. The queue will be stalled until the specified semaphore is - /// signaled. - /// - /// @param [in] pQueueSemaphore Semaphore to wait on. - /// @param [in] value timeline semaphore point value to wait on, ignored for non-timeline semaphores. - /// - /// @returns Success if the semaphore wait was successfully queued. Otherwise, one of the following errors may be - /// returned: - /// + ErrorUnknown if the OS scheduler rejects the wait for unknown reasons. - virtual Result WaitQueueSemaphore( - IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; - -#if PAL_KMT_BUILD - /// Acquire the keyed mutex of shared GPU memory object (CPU sync) and then wait for the synchronization object of - /// the shared GPU memory object (GPU sync based on fence). 
Note that the shared GPU memory object has to be - /// a D3d11 resource created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) - /// misc flag. - /// - /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are bound. - /// [in] key Key of keyed mutex to be acquired. - /// [in] timeout Timeout interval for keyed mutex acquiring, in milliseconds. - /// - /// @ returns Success if the keyed mutex has been successfully acquired and wait for the synchronization object - /// has been successfully scheduled. Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if either the keyed mutex has not been successfully acuiqred or wait for the - /// synchronization object has not been successfully scheduled. - virtual Result KeyedMutexAcquireSync( - IGpuMemory* pGpuMemory, - uint64 key, - std::chrono::milliseconds timeout) = 0; - - /// Signal the synchronization object of shared GPU memory object with bumped fence value and then release the - /// keyed mutex of shared GPU memory object. Note that the shared GPU memory object has to be a D3d11 resource - /// created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) misc flag. - /// - /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are - /// bound. - /// [in] key Key of keyed mutex to be released. - /// - /// @ returns Success if signal of the synchronization object has been successfully scheduled and the keyed mutex - /// has been successfully released. Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if either signal of the synchronization object has not been successfully scheduled - /// or keyed mutex has not been successfully released. 
- virtual Result KeyedMutexReleaseSync( - IGpuMemory* pGpuMemory, - uint64 key) = 0; -#endif - - /// This function passes application information to KMD for application specific power optimizations. - /// Power configuration are restored to default when all application queues are destroyed. - /// - /// @param [in] pFileName Application executable name - /// @param [in] pPathName Path to the application - /// - /// @returns Success if the information is passed successfully. Otherwise, one of the following errors may be - /// returned: - /// + Unsupported if this function is not available on this OS or if the queue context is null. - /// + ErrorUnknown if an unexpected internal error occurs. - virtual Result UpdateAppPowerProfile( - const wchar_t* pFileName, - const wchar_t* pPathName) = 0; - - /// Queues the specified image for presentation on the screen. This function directly queues the presentation - /// request based on the input parameters without special synchronization considerations like a swap chain present. - /// All previous work done on this queue will complete before the image is displayed. - /// - /// This function should never be called with a swap chain presentable image because it won't release ownership of - /// the presentable image index, eventually deadlocking the swap chain. - /// - /// Overall support for direct presents can be queried at platform creation time via supportNonSwapChainPresents - /// in @ref PlatformProperties. Support for particular present modes is specifed via supportedDirectPresentModes - /// in @ref DeviceProperties. - /// - /// @note Any images specified in presentInfo must be made resident before calling this function. - /// - /// @param [in] presentInfo Specifies the source image and destination window for the present as well as other - /// properties. - /// - /// @returns Success if the present was successfully queued. 
Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidValue if the flip interval is invalid. - /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image. - /// + ErrorInvalidFlags if the present flags don't match the capabilities of the image. - virtual Result PresentDirect( - const PresentDirectInfo& presentInfo) = 0; - - /// Queues the specified image for presentation on the screen. This function uses the provided swap chain to - /// determine exactly how the image should be presented (e.g., can the user see tearing). See @ref ISwapChain for - /// more information on swap chain presentation. All previous work done on this queue will complete before the - /// image is displayed, but future work may execute before the present is completed because swap chain present - /// execution may be asynchronous to the queue that initiated present. - /// - /// Assuming the presentInfo is valid, this function will always release ownership of the presentable image index - /// even if PAL encounters an error while executing the present. - /// - /// Queue support for swap chain presents is specified via supportsSwapChainPresents in @ref DeviceProperties. - /// Support for particular PresentModes is queried per SwapChainMode via IDevice::GetSwapChainInfo(). - /// - /// @note The source image specified in presentInfo must be made resident before calling this function. - /// - /// @param [in] presentInfo Specifies the source image, swap chain, and basic presentation information. - /// - /// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be - /// returned: - /// + ErrorInvalidPointer if the source image or swap chain are null. - /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image or if the image - /// index isn't valid within the swap chain. 
- virtual Result PresentSwapChain( - const PresentSwapChainInfo& presentInfo) = 0; - - /// Inserts a delay of a specified amount of time before processing more commands on this queue. - /// - /// Only available on timer queues. Useful in conjunction with queue semaphores to implement frame pacing. - /// - /// @param [in] delay Time, in milliseconds, to delay before processing more commands on this queue. - /// - /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if delay is less than 0. - virtual Result Delay( - Util::fmilliseconds delay) = 0; - - /// Inserts a delay of a specified amount of time on this queue after a vsync on a private display object. - /// - /// Only available on timer queues. Useful in conjunction with queue semaphores to implement pacing of GPU and CPU - /// operations for rendering and presentation in VR as this allows GPU commands of next frame to be sent early but - /// blocks GPU execution until after vsync. - /// - /// @param [in] delay Time, in microseconds, to delay before processing more commands on this queue. - /// @param [in] pScreen The private screen object that the vsync is occurring and the delay is waiting on. - /// - /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue if delay is less than 0. - virtual Result DelayAfterVsync( - Util::fmicroseconds delay, - const IPrivateScreen* pScreen) = 0; - - /// Updates page mappings for virtual GPU memory allocations. - /// - /// @param [in] rangeCount Number of ranges to remap (i.e., size of the pRanges array). - /// @param [in] pRanges Defines the set of remappings from virtual GPU memory object pages to real GPU - /// memory object pages. - /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without - /// waiting for any previous rendering to finish first. 
On platforms that don't support - /// this, the flag will be ignored. - /// @param [in] pFence Optional. Pointer to an IFence, which will be signaled after the VA remapping. - /// - /// @returns Success if the remappings were executed successfully. It is assumed that the following conditions are - /// met for the input to this function: - /// + rangeCount is not 0. - /// + The page range for all members of pRanges are valid. - /// + pRanges is not null. - /// + pVirtualGpuMem is not null for any member of pRanges. - /// + pRanges does not specify a real GPU memory object as a virtual GPU memory object or vice versa. - virtual Result RemapVirtualMemoryPages( - uint32 rangeCount, - const VirtualMemoryRemapRange* pRanges, - bool doNotWait, - IFence* pFence) = 0; - - /// Copies page mappings from one virtual GPU memory object to another. - /// - /// @param [in] rangeCount Number of ranges to copy (i.e., size of the pRanges array). - /// @param [in] pRanges Defines the set of page mappings to copy between virtual GPU memory objects. - /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without - /// waiting for any previous rendering to finish first. On platforms that don't support - /// this, the flag will be ignored. - /// - /// @returns Success if the mappings were copied successfully. It is assumed that the following conditions are - /// met for the input to this function: - /// + rangeCount is not 0. - /// + The page range for all members of pRanges are valid. - /// + pRanges is not null. - /// + pSrcGpuMem or pDstGpuMem is not null for any member of pRanges. - /// + pRanges does not specify a real GPU memory object as source or destination - virtual Result CopyVirtualMemoryPageMappings( - uint32 rangeCount, - const VirtualMemoryCopyPageMappingsRange* pRanges, - bool doNotWait) = 0; - - /// Associates the provided Fence object with the last submission on this queue object. 
The Fence can be used via - /// GetStatus() to get the status of the last Submit, however no event will be created/set for the Fence so - /// WaitForFences() should NOT be called on the fence after this association. - /// - /// @see IFence::GetStatus() - /// @see IFence::WaitForFences() - /// - /// @param [in] pFence Fence object to be associated with the last Submit on this queue - /// - /// @returns Success if the association was successful. ErrorUnavailable will be returned in there has not yet been - /// a Submit on this queue. - virtual Result AssociateFenceWithLastSubmit( - IFence* pFence) = 0; - - /// Set execution priority for the current queue, it allows to elevate execution priority of submitted command - /// buffers, but it has no effect on command buffers that have already been submitted for execution. Elevating - /// the queue priority to medium or high would allow to temporary stall a low priority queue execution and execute - /// its work as soon as the low priority queue starts draining. - /// - /// @param [in] priority The priority level of the queue. - virtual void SetExecutionPriority( - QueuePriority priority) = 0; - - /// Returns a list of GPU memory allocations used by this queue. - /// - /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value - /// reports the number of GPU memory allocations. - /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it - /// will reflect the number of allocations that make up this queue. If - /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number - /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the - /// number of entries in pAllocInfoList that are valid. - /// @returns Success if the allocation info was successfully written to the buffer. - /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. 
- /// + ErrorInvalidPointer if pNumEntries is nullptr. - virtual Result QueryAllocationInfo( - size_t* pNumEntries, - GpuMemSubAllocInfo* const pAllocInfoList) = 0; - - /// Returns the QueueType for the queue - virtual QueueType Type() const = 0; - - /// Returns the EngineType for the queue - virtual EngineType GetEngineType() const = 0; - - /// Queries the kernel context info associated with this queue and copies it into pKernelContextInfo. - /// - /// Only supported on Windows platforms. - /// - /// @param [out] pKernelContextInfo Pointer to a KernelContextInfo struct to copy the information into. - /// @returns Success if the information is successfully copied into the output struct. - /// + ErrorInvalidPointer if pKernelContextInfo is nullptr. - /// + ErrorUnavailable if kernel context information is not available on the current platform. - virtual Result QueryKernelContextInfo(KernelContextInfo* pKernelContextInfo) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IQueue() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Queues will be destroyed when the - /// associated device is destroyed. - virtual ~IQueue() { } - -private: - /// @internal Client data pointer. 
This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueue.h + * @brief Defines the Platform Abstraction Library (PAL) IQueue interface and related types. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include "palEvent.h" + +namespace Pal +{ + +// Forward declarations. +class ICmdBuffer; +class IFence; +class IGpuMemory; +class IImage; +class IPrivateScreen; +class IQueueSemaphore; +class IScreen; +class ISwapChain; +struct CmdBufInfo; +struct GpuMemSubAllocInfo; +struct GpuMemoryRef; +struct DoppRef; + +enum class VirtualGpuMemAccessMode : uint32; + +/// Specifies whether presents are windowed or fullscreen. This will determine whether the present is performed via a +/// BLT or flip. +enum class PresentMode : uint32 +{ + Unknown, + Windowed, + Fullscreen, + Count +}; + +/// Enumerates the possible overrides for the flip interval. +enum class FlipIntervalOverride : uint32 +{ + _None = 0, ///< No override. + Immediate = 1, ///< Zero frames of flip latency. + ImmediateAllowTearing = 2, ///< Same as Immediate, but allows tearing (no vsync). + One = 3, ///< One frame of flip latency. + Two = 4, ///< Two frames of flip latency. + Three = 5, ///< Three frames of flip latency. + Four = 6, ///< Four frames of flip latency. +}; + +/// Defines flags for describing which types of present modes are supported on a given queue. +enum PresentModeSupport : uint32 +{ + SupportWindowedPresent = 0x1, + SupportWindowedPriorBlitPresent = 0x2, + SupportFullscreenPresent = 0x4, +}; + +/// Defines submit-time bottlenecks which PAL can potentially optimize. +enum class SubmitOptMode : uint32 +{ + Default = 0, ///< PAL will enable optimizations when generally efficient. + Disabled = 1, ///< Disable all optimizations that could be detrimental in special cases. + MinKernelSubmits = 2, ///< Minimize the overhead of launching command buffers on the CPU and GPU. + MinGpuCmdOverhead = 3, ///< Minimize the overhead of reading command buffer commands on the GPU. 
+ Count +}; + +/// Enumerates vcn instance affinity statuses +enum MmAffinityStatus : uint32 +{ + MmAffinityNotAllowed = 0, ///< The specific vcn instance can't be used. + MmAffinityAllowed = 1 ///< The specific vcn instance can be used. +}; + +/// Union describes all vcn instance affinity status. +union MmAffinity +{ + struct + { + uint32 vcn0Affinity : 2; ///< Affinity for instance vcn0 + uint32 vcn1Affinity : 2; ///< Affinity for instance vcn1 + uint32 reserved : 28; ///< Reserved (all 0) + }; + uint32 u32All; +}; + +/// Structure describing dump information for a command buffer. +struct CmdBufferDumpDesc +{ + EngineType engineType; ///< The engine type that this buffer is targeted for. + QueueType queueType; ///< The type of queue that this buffer is being created on. + SubEngineType subEngineType; ///< The ID of which sub-engine that this buffer is made for. + + uint32 cmdBufferIdx; ///< The index into the SubmitInfo ppCmdBuffers array that this + /// command buffer dump came from. + union + { + struct + { + uint8 isPreamble : 1; ///< Set if the buffer is an internal preamble command buffer. + uint8 isPostamble : 1; ///< Set if the buffer is an internal postamble command buffer. + uint8 reserved : 6; ///< Reserved for future use. + }; + uint8 u32All; ///< Flags packed as 8-bit uint. + } flags; + +}; + +/// Structure describing a command buffer chunk for use while dumping command buffers. +struct CmdBufferChunkDumpDesc +{ + uint32 id; ///< ID (number) of this command chunk within the command buffer. + const void* pCommands; ///< Pointer to the command data. + size_t size; ///< Size of valid data in bytes pointed to in pCommands. +}; + +/// Definition for command buffer dumping callback. +/// +/// @param [in] cmdBufferDesc Description of the command buffer. +/// @param [in] pChunks Pointer to an array of command buffer chunk descriptions. +/// @param [in] numChunks The number of chunks pointed to in pChunks. 
+typedef void (PAL_STDCALL* CmdDumpCallback)( + const CmdBufferDumpDesc& cmdBufferDesc, + const CmdBufferChunkDumpDesc* pChunks, + uint32 numChunks, + void* pUserData); + +/// Specifies properties for @ref IQueue creation. Input structure to IDevice::CreateQueue(). +struct QueueCreateInfo +{ + QueueType queueType; ///< Selects which type of queue to create. + EngineType engineType; ///< Selects which type of engine to create. + uint32 engineIndex; ///< Which instance of the specified engine type to query. For example, there + /// can be multiple compute queues, so this parameter distinguished between them. + SubmitOptMode submitOptMode; ///< A hint telling PAL which submit-time bottlenecks should be optimized, if any. + QueuePriority priority; ///< A hint telling PAL to create queue with proper priority. + /// It is only supported if supportQueuePriority is set in DeviceProperties. + /// In Linux, if we don't have root privilege, the creation with above-Medium + /// priority will fail. Client should take the corresponding action like retry + /// with lower priority, if necessary. + struct + { + uint32 aqlQueue : 1; ///< Compute queue will process AQL packets and kernels + uint32 windowedPriorBlit : 1; ///< All windowed presents on this queue are notifications + /// that the client has manually done a blit present + uint32 tmzOnly : 1; ///< This queue allows only TMZ submissions. Required for + /// compute TMZ submits. + +#if PAL_AMDGPU_BUILD + uint32 enableGpuMemoryPriorities : 1; ///< Enables support for GPU memory priorities on this Queue. + /// This is optional because enabling the feature requires + /// a small amount of memory overhead per-Queue for + /// bookkeeping purposes. +#else + uint32 placeholder2 : 1; ///< Reserved field. Set to 0. +#endif + uint32 dispatchTunneling : 1; ///< This queue uses compute dispatch tunneling. + + uint32 forceWaitIdleOnRingResize : 1; ///< This queue need to wait for idle before resize RingSet. 
+ /// This is intended as a workaround for misbehaving applications. +#if defined(_WIN32) + uint32 nullRendering : 1; ///< Setting this bit makes this queue behave like IfhModeKmd. +#else + uint32 placeholder3 : 1; ///< Reserved field. Set to 0. +#endif + uint32 reserved : 25; ///< Reserved for future use. + }; + + uint32 numReservedCu; ///< The number of reserved compute units for RT CU queue + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 + uint32 persistentCeRamOffset; ///< Byte offset to the beginning of the region of CE RAM which this Queue should + /// preserve across consecutive submissions. Must be a multiple of 32. It is an + /// error to specify a nonzero value here if the Device does not support + /// @ref supportPersistentCeRam for the Engine this Queue will attach to. + uint32 persistentCeRamSize; ///< Amount of CE RAM space which this Queue should preserve across consecutive + /// submissions. Units are in DWORDs, and this must be a multiple of 8. It is an + /// error to specify a nonzero value here if the Device does not support + /// @ref supportPersistentCeRam for the Engine this Queue will attach to. +#endif + + uintptr_t aqlPacketList; ///< Location of the HIP runtime's info about this queue +}; + +/// Specifies the portion of @ref SubmitInfo that is specific to each sub-queue in a multi-queue object (@see +/// IDevice::CreateMultiQueue). Effectively, this enables specifying a different set of command buffers for each +/// queue that makes up a gang submission to a multi-queue object. +struct PerSubQueueSubmitInfo +{ + uint32 cmdBufferCount; ///< Number of command buffers to be submitted (can be 0 if this submit doesn't + /// involve work for the relevant queue). + ICmdBuffer*const* ppCmdBuffers; ///< Array of cmdBufferCount command buffers to be submitted. Command buffers + /// that are part of a ganged submit must guarantee the conditions required + /// for the optimizeExclusiveSubmit flag.
+ const CmdBufInfo* pCmdBufInfoList; ///< Null, or an array of cmdBufferCount structs providing additional + /// info about the command buffers being submitted. If non-null, + /// elements are ignored if their isValid flag is false. +}; + +/// Specifies all information needed to execute a set of command buffers. Input structure to IQueue::Submit(). +/// +/// Some members of this structure are not supported on all platforms. The client must check the appropriate properties +/// structures to determine if the corresponding features are supported: +/// + pGpuMemoryRefs: Support is indicated by supportPerSubmitMemRefs in @ref DeviceProperties. +/// + ppBlockIfFlipping: Support is indicated by supportBlockIfFlipping in @ref PlatformProperties. If it is supported, +/// the client must not specify a blockIfFlippingCount greater than MaxBlockIfFlippingCount. +/// +/// @note If this queue is running in physical submission mode (due to hardware restrictions), the gpuMemRefCount and +/// pGpuMemoryRefs arguments to this method are ignored because the command buffers themselves contain their own +/// GPU memory reference lists. +struct MultiSubmitInfo +{ + const PerSubQueueSubmitInfo* pPerSubQueueInfo;///< Specifies per-subqueue information for the submit. Typically + /// this is a pointer to a single entry specifying the command + /// buffers to be submitted on this queue. For gang submission on + /// a multi-queue, this should be an array with one entry per + /// sub-queue. The array size must be less than or equal to the + /// queueCount specified when the multi-queue was created and + /// the workload specified in each entry will be assigned to the + /// corresponding sub-queue. It is valid to have a cmdBufferCount + /// of 0 for sub-queues without work. Can be null if perSubQueueInfo- + /// Count is 0. + uint32 perSubQueueInfoCount; ///< Number of PerSubqueueSubmitInfo to be submitted. Can be zero if + /// there is no work to submit. 
+ uint32 gpuMemRefCount; ///< Number of GPU memory references for this submit. + const GpuMemoryRef* pGpuMemoryRefs; ///< Array of gpuMemRefCount GPU memory references. Can be null if + /// gpuMemRefCount is zero. The GPU memory objects will be made + /// resident for the duration of this submit. + uint32 doppRefCount; ///< Number of DOPP desktop texture references for this submit. + const DoppRef* pDoppRefs; ///< Array of doppRefCount DOPP texture references. Can be null if + /// doppRefCount is zero. + uint32 externPhysMemCount; ///< Number of entries in ppExternPhysMem. + const IGpuMemory** ppExternPhysMem; ///< Array of external physical memory allocations to be initialized + /// as part of this submit. The first submit that references a + /// particular external physical memory allocation must include + /// that allocation in this list. Subsequent submits that reference + /// the same allocation should not include it in this list, as it + /// would trigger redundant GPU page table initialization. + uint32 blockIfFlippingCount; ///< Number of GPU memory objects to protect when flipped. + const IGpuMemory*const* ppBlockIfFlipping; ///< Array of blockIfFlippingCount GPU memory objects. Can be null if + /// blockIfFlippingCount is zero. The command buffers will not be + /// scheduled to the GPU while a fullscreen (flip) present is queued + /// for any of these GPU memory allocations. + uint32 fenceCount; ///< Number of fence objects to be signaled once the last command buffer + /// in this submission completes execution. + IFence** ppFences; ///< Array of fence objects. Can be null if fenceCount is zero. + CmdDumpCallback pfnCmdDumpCb; ///< Null, or a callback function to handle the dumping of the + /// command buffers used in this submit. + void* pUserData; ///< Client provided data to be passed to callback. 
+ + uint32 stackSizeInDwords; ///< 0, or the max of stack frame size for indirect shaders of the + /// pipelines referenced in the command buffers of this submission. + /// The size is per native thread. So that the client will have to + /// multiply by 2 if a Wave64 shader that needs scratch is used. + /// Note that the size will not shrink for the lifetime of the queue + /// once it is grown and only affects compute scratch ring. + const IGpuMemory* pFreeMuxMemory; ///< The gpu memory object of the private flip primary surface for the + /// FreeMux feature. +}; + +typedef MultiSubmitInfo SubmitInfo; + +/// The value of blockIfFlippingCount in @ref SubmitInfo cannot be greater than this value. +constexpr uint32 MaxBlockIfFlippingCount = 16; + +/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentDirect(). +struct PresentDirectInfo +{ + union + { + struct + { + uint32 fullscreenDoNotWait : 1; ///< Fail the present immediately if the present queue is full. + uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. + uint32 dstIsTypedBuffer : 1; ///< True if the destination is a typed buffer instead of an image. + uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not + /// execute a present if this is true but may update internal + /// tracking state. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Present flags. + + OsWindowHandle hWindow; ///< Native OS window handle that this image should be presented to. + PresentMode presentMode; ///< Chooses between windowed and fullscreen present. + uint32 presentInterval; ///< Must be an integer from 0 to 4. 0 indicates that the present should + /// occur immediately (may tear), and 1-4 indicates the present should + /// occur after 1 to 4 vertical syncs. Only valid for fullscreen presents. 
+ union + { + IImage* pSrcImage; ///< Optional: The image to be presented. If null, the present will not + /// occur but PAL may still call into the OS on certain platforms that + /// expect it. + IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to be presented. If null, the present will not occur + /// but PAL may still call into the OS on certain platforms that expect it. + }; + union + { + IImage* pDstImage; ///< Optional: copy from the source image to this image. If null, PAL will + /// automatically copy into the appropriate platform-specific destination. + /// This is only supported for windowed mode presents. + IGpuMemory* pDstTypedBuffer; ///< The typed buffer to be presented. If null, the present will not occur + /// but PAL may still call into the OS on certain platforms that expect it. + }; + +}; + +/// Media stream counter information. +struct MscInfo +{ + uint64 targetMsc; ///< if the current MSC is less than targetMsc, the buffer swap + ///< will occur when the MSC value becomes equal to targetMsc + uint64 divisor; ///< Divisor + ///< the buffer swap will occur the next time the MSC value is + ///< incremented to a value such that MSC % divisor = remainder + ///< if the current MSC is greater than or equal to targetMsc + uint64 remainder; ///< Remainder +}; + +/// Specifies properties for the presentation of an image to the screen. Input structure to IQueue::PresentSwapChain(). +struct PresentSwapChainInfo +{ + PresentMode presentMode; ///< Chooses between windowed and fullscreen present. + IImage* pSrcImage; ///< The image to be presented. + ISwapChain* pSwapChain; ///< The swap chain associated with the source image. + uint32 imageIndex; ///< The index of the source image within the swap chain. Ownership of this image + /// index will be released back to the swap chain if this call succeeds. + uint32 rectangleCount; ///< Number of valid rectangles in the pRectangles array.
+ uint32 syncInterval; ///< Applicable only when syncIntervalOverride is set + /// 0 - The presentation occurs immediately, there is no synchronization. + /// 1 through 4 - Synchronize presentation after the nth vertical blank. + const Rect* pRectangles; ///< Array of rectangles defining the regions which will be updated. + uint64 presentId; ///< PresentId functions as an identifier for present operations on a swapchain. + /// If this PresentId is non-zero, then the application can later use this value + /// to refer to that image presentation. A value of zero indicates that this + /// presentation has no associated presentId. A non-zero presentId must be greater + /// than any non-zero presentId passed previously by the application for the same + /// swapchain. + union + { + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 941 + uint32 notifyOnly : 1; ///< True if it is a notify-only present +#else + uint32 notifyOnly : 1; ///< Indicates that a present occurred outside of PAL. PAL must not + /// execute a present if this is true but may update internal + /// tracking state. +#endif + uint32 isTemporaryMono : 1; ///< True if WS Stereo is enabled, but 3D display mode turned off. + uint32 turboSyncEnabled : 1; ///< Whether TurboSync is enabled. + uint32 syncIntervalOverride : 1; ///< Override default syncInterval with the value in syncInterval + /// Supported only on Windows wsiPlatforms. + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< PresentSwapChainInfo flags. +#if PAL_AMDGPU_BUILD + MscInfo mscInfo; ///< Media stream counter information +#endif +}; + +/// Specifies a mapping from a range of pages in a virtual GPU memory object to a range of pages in a real GPU memory +/// object. Input to IQueue::RemapVirtualMemoryPages(). 
+/// +/// When mapping pages of a virtual GPU memory object to a range of pages in a real GPU memory object on a remote GPU, +/// the client must point pRealGpuMem at a peer GPU memory object created on the input queue's device instead of the +/// actual real GPU memory object created on the remote device. This is required for two reasons: +/// 1. PAL can only view remote GPU memory using peer objects. +/// 2. PAL enforces a separation of state between different IDevice object families. +/// +/// virtualStartOffset and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. +/// realStartOffset must be aligned to the realMemAllocGranularity member of @ref DeviceProperties. +struct VirtualMemoryRemapRange +{ + IGpuMemory* pVirtualGpuMem; ///< Virtual GPU memory object whose mapping is being updated. + gpusize virtualStartOffset; ///< Start of the page range to be updated, in bytes. + IGpuMemory* pRealGpuMem; ///< Real GPU memory object the virtual range should point at. + gpusize realStartOffset; ///< Start of the page range in the real GPU memory object, in bytes. + gpusize size; ///< Size of the mapping range, in bytes. + VirtualGpuMemAccessMode virtualAccessMode; ///< Access mode for virtual GPU memory's unmapped pages. + /// This parameter is ignored on some platforms. +}; + +/// Specifies a set of page mappings to copy between virtual GPU memory objects. The source and destination can be the +/// same memory object and the source and destination regions may overlap. Input to IQueue::CopyVirtualMemoryPageMappings(). +/// +/// srcStartOffset, dstStartOffset, and size must be aligned to the virtualMemPageSize member of @ref DeviceProperties. +struct VirtualMemoryCopyPageMappingsRange +{ + IGpuMemory* pSrcGpuMem; ///< Virtual GPU memory object whose mapping is being copied from. + gpusize srcStartOffset; ///< Start of the copy source range, in bytes. + IGpuMemory* pDstGpuMem; ///< Virtual GPU memory object whose mapping is being copied to. 
+ gpusize dstStartOffset; ///< Start of the copy destination range, in bytes. + gpusize size; ///< Size of the mapping range, in bytes. +}; + +/// Specifies kernel level information about a context. +struct KernelContextInfo +{ + union + { + struct + { + uint32 hasDebugVmid : 1; ///< True if the context has acquired the debug vmid. + uint32 hasHighPriorityVmid : 1; ///< True if the context has acquired the high priority vmid. + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Context flags. + + uint64 contextIdentifier; ///< Kernel scheduler context identifier. +}; + +/** + *********************************************************************************************************************** + * @interface IQueue + * @brief Represents a queue of work for a particular GPU engine on a device. + * + * An IQueue object is a virtual representation of a hardware engine on the device. Multiple IQueue objects can be + * created and have work submitted on them in parallel. Work is submitted to a queue through @ref ICmdBuffer objects, + * and work can be synchronized between multiple queues using @ref IQueueSemaphore objects. + * + * @see IDevice::GetQueue() + *********************************************************************************************************************** + */ +class IQueue : public IDestroyable +{ +public: + /// Submits a group of root command buffers for execution on this queue. + /// + /// @param [in] submitInfo Specifies all command buffers to execute along with other residency and synchronization + /// information. See @ref SubmitInfo for additional, important documentation. + /// + /// @returns Success if the command buffer was successfully submitted. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidPointer if: + /// - any of the array inputs are null when their counts are non-zero. + /// - any members of non-null point arrays are null. 
+ /// + ErrorTooManyMemoryReferences if the total number of memory references (device/queue global and + /// per-command buffer) is too large. + /// + ErrorInvalidValue if blockIfFlippingCount is too large. + /// + ErrorIncompleteCommandBuffer if any of the submitted command buffers are not properly constructed. + /// + ErrorIncompatibleQueue if any submitted command buffer does not match this queue's type (e.g., + /// universal, graphics, DMA). + virtual Result Submit( + const MultiSubmitInfo& submitInfo) = 0; + + /// Waits for all previous submission on this queue to complete before control is returned to the caller. + /// + /// @returns Success if wait for submissions completed. Otherwise an error indicates reason for unsuccessful wait, + /// for example due to lost device. + virtual Result WaitIdle() = 0; + + /// Inserts a semaphore signal into the GPU queue. The semaphore will be signaled once all previously submitted + /// work on this queue has completed. + /// + /// @param [in] pQueueSemaphore Semaphore to signal. + /// @param [in] value timeline Semaphore point value to signal, ignored for non-timeline semaphores. + /// + /// @returns Success if the semaphore signal was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if the OS scheduler rejects the signal for unknown reasons. + virtual Result SignalQueueSemaphore( + IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; + + /// Inserts a semaphore wait into the GPU queue. The queue will be stalled until the specified semaphore is + /// signaled. + /// + /// @param [in] pQueueSemaphore Semaphore to wait on. + /// @param [in] value timeline semaphore point value to wait on, ignored for non-timeline semaphores. + /// + /// @returns Success if the semaphore wait was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorUnknown if the OS scheduler rejects the wait for unknown reasons. 
+ virtual Result WaitQueueSemaphore( + IQueueSemaphore* pQueueSemaphore, uint64 value = 0) = 0; + +#if PAL_KMT_BUILD + /// Acquire the keyed mutex of shared GPU memory object (CPU sync) and then wait for the synchronization object of + /// the shared GPU memory object (GPU sync based on fence). Note that the shared GPU memory object has to be + /// a D3d11 resource created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) + /// misc flag. + /// + /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are bound. + /// @param [in] key Key of keyed mutex to be acquired. + /// @param [in] timeout Timeout interval for keyed mutex acquiring, in milliseconds. + /// + /// @returns Success if the keyed mutex has been successfully acquired and wait for the synchronization object + /// has been successfully scheduled. Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if either the keyed mutex has not been successfully acquired or wait for the + /// synchronization object has not been successfully scheduled. + virtual Result KeyedMutexAcquireSync( + IGpuMemory* pGpuMemory, + uint64 key, + std::chrono::milliseconds timeout) = 0; + + /// Signal the synchronization object of shared GPU memory object with bumped fence value and then release the + /// keyed mutex of shared GPU memory object. Note that the shared GPU memory object has to be a D3d11 resource + /// created with (D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX | D3D11_RESOURCE_MISC_SHARED_NTHANDLE) misc flag. + /// + /// @param [in] pGpuMemory Shared GPU memory object on which keyed mutex and synchronization object are + /// bound. + /// @param [in] key Key of keyed mutex to be released. + /// + /// @returns Success if signal of the synchronization object has been successfully scheduled and the keyed mutex + /// has been successfully released.
Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if either signal of the synchronization object has not been successfully scheduled + /// or keyed mutex has not been successfully released. + virtual Result KeyedMutexReleaseSync( + IGpuMemory* pGpuMemory, + uint64 key) = 0; +#endif + + /// This function passes application information to KMD for application specific power optimizations. + /// Power configurations are restored to default when all application queues are destroyed. + /// + /// @param [in] pFileName Application executable name + /// @param [in] pPathName Path to the application + /// + /// @returns Success if the information is passed successfully. Otherwise, one of the following errors may be + /// returned: + /// + Unsupported if this function is not available on this OS or if the queue context is null. + /// + ErrorUnknown if an unexpected internal error occurs. + virtual Result UpdateAppPowerProfile( + const wchar_t* pFileName, + const wchar_t* pPathName) = 0; + + /// Queues the specified image for presentation on the screen. This function directly queues the presentation + /// request based on the input parameters without special synchronization considerations like a swap chain present. + /// All previous work done on this queue will complete before the image is displayed. + /// + /// This function should never be called with a swap chain presentable image because it won't release ownership of + /// the presentable image index, eventually deadlocking the swap chain. + /// + /// Overall support for direct presents can be queried at platform creation time via supportNonSwapChainPresents + /// in @ref PlatformProperties. Support for particular present modes is specified via supportedDirectPresentModes + /// in @ref DeviceProperties. + /// + /// @note Any images specified in presentInfo must be made resident before calling this function.
+ /// + /// @param [in] presentInfo Specifies the source image and destination window for the present as well as other + /// properties. + /// + /// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidValue if the flip interval is invalid. + /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image. + /// + ErrorInvalidFlags if the present flags don't match the capabilities of the image. + virtual Result PresentDirect( + const PresentDirectInfo& presentInfo) = 0; + + /// Queues the specified image for presentation on the screen. This function uses the provided swap chain to + /// determine exactly how the image should be presented (e.g., can the user see tearing). See @ref ISwapChain for + /// more information on swap chain presentation. All previous work done on this queue will complete before the + /// image is displayed, but future work may execute before the present is completed because swap chain present + /// execution may be asynchronous to the queue that initiated present. + /// + /// Assuming the presentInfo is valid, this function will always release ownership of the presentable image index + /// even if PAL encounters an error while executing the present. + /// + /// Queue support for swap chain presents is specified via supportsSwapChainPresents in @ref DeviceProperties. + /// Support for particular PresentModes is queried per SwapChainMode via IDevice::GetSwapChainInfo(). + /// + /// @note The source image specified in presentInfo must be made resident before calling this function. + /// + /// @param [in] presentInfo Specifies the source image, swap chain, and basic presentation information. + /// + /// @returns Success if the present was successfully queued. Otherwise, one of the following errors may be + /// returned: + /// + ErrorInvalidPointer if the source image or swap chain are null. 
+ /// + ErrorInvalidValue if the present mode doesn't match the capabilities of the image or if the image + /// index isn't valid within the swap chain. + virtual Result PresentSwapChain( + const PresentSwapChainInfo& presentInfo) = 0; + + /// Inserts a delay of a specified amount of time before processing more commands on this queue. + /// + /// Only available on timer queues. Useful in conjunction with queue semaphores to implement frame pacing. + /// + /// @param [in] delay Time, in milliseconds, to delay before processing more commands on this queue. + /// + /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if delay is less than 0. + virtual Result Delay( + Util::fmilliseconds delay) = 0; + + /// Inserts a delay of a specified amount of time on this queue after a vsync on a private display object. + /// + /// Only available on timer queues. Useful in conjunction with queue semaphores to implement pacing of GPU and CPU + /// operations for rendering and presentation in VR as this allows GPU commands of next frame to be sent early but + /// blocks GPU execution until after vsync. + /// + /// @param [in] delay Time, in microseconds, to delay before processing more commands on this queue. + /// @param [in] pScreen The private screen object that the vsync is occurring and the delay is waiting on. + /// + /// @returns Success if the delay was successfully queued. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue if delay is less than 0. + virtual Result DelayAfterVsync( + Util::fmicroseconds delay, + const IPrivateScreen* pScreen) = 0; + + /// Updates page mappings for virtual GPU memory allocations. + /// + /// @param [in] rangeCount Number of ranges to remap (i.e., size of the pRanges array). + /// @param [in] pRanges Defines the set of remappings from virtual GPU memory object pages to real GPU + /// memory object pages. 
+ /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without + /// waiting for any previous rendering to finish first. On platforms that don't support + /// this, the flag will be ignored. + /// @param [in] pFence Optional. Pointer to an IFence, which will be signaled after the VA remapping. + /// + /// @returns Success if the remappings were executed successfully. It is assumed that the following conditions are + /// met for the input to this function: + /// + rangeCount is not 0. + /// + The page range for all members of pRanges are valid. + /// + pRanges is not null. + /// + pVirtualGpuMem is not null for any member of pRanges. + /// + pRanges does not specify a real GPU memory object as a virtual GPU memory object or vice versa. + virtual Result RemapVirtualMemoryPages( + uint32 rangeCount, + const VirtualMemoryRemapRange* pRanges, + bool doNotWait, + IFence* pFence) = 0; + + /// Copies page mappings from one virtual GPU memory object to another. + /// + /// @param [in] rangeCount Number of ranges to copy (i.e., size of the pRanges array). + /// @param [in] pRanges Defines the set of page mappings to copy between virtual GPU memory objects. + /// @param [in] doNotWait If true, then this paging operation will be executed on the Queue immediately, without + /// waiting for any previous rendering to finish first. On platforms that don't support + /// this, the flag will be ignored. + /// + /// @returns Success if the mappings were copied successfully. It is assumed that the following conditions are + /// met for the input to this function: + /// + rangeCount is not 0. + /// + The page range for all members of pRanges are valid. + /// + pRanges is not null. + /// + pSrcGpuMem or pDstGpuMem is not null for any member of pRanges. 
+ /// + pRanges does not specify a real GPU memory object as source or destination + virtual Result CopyVirtualMemoryPageMappings( + uint32 rangeCount, + const VirtualMemoryCopyPageMappingsRange* pRanges, + bool doNotWait) = 0; + + /// Associates the provided Fence object with the last submission on this queue object. The Fence can be used via + /// GetStatus() to get the status of the last Submit, however no event will be created/set for the Fence so + /// WaitForFences() should NOT be called on the fence after this association. + /// + /// @see IFence::GetStatus() + /// @see IFence::WaitForFences() + /// + /// @param [in] pFence Fence object to be associated with the last Submit on this queue + /// + /// @returns Success if the association was successful. ErrorUnavailable will be returned if there has not yet been + /// a Submit on this queue. + virtual Result AssociateFenceWithLastSubmit( + IFence* pFence) = 0; + + /// Set execution priority for the current queue, it allows to elevate execution priority of submitted command + /// buffers, but it has no effect on command buffers that have already been submitted for execution. Elevating + /// the queue priority to medium or high would allow to temporarily stall a low priority queue execution and execute + /// its work as soon as the low priority queue starts draining. + /// + /// @param [in] priority The priority level of the queue. + virtual void SetExecutionPriority( + QueuePriority priority) = 0; + + /// Returns a list of GPU memory allocations used by this queue. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. + /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this queue.
If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. + /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) = 0; + + /// Returns the QueueType for the queue + virtual QueueType Type() const = 0; + + /// Returns the EngineType for the queue + virtual EngineType GetEngineType() const = 0; + + /// Queries the kernel context info associated with this queue and copies it into pKernelContextInfo. + /// + /// Only supported on Windows platforms. + /// + /// @param [out] pKernelContextInfo Pointer to a KernelContextInfo struct to copy the information into. + /// @returns Success if the information is successfully copied into the output struct. + /// + ErrorInvalidPointer if pKernelContextInfo is nullptr. + /// + ErrorUnavailable if kernel context information is not available on the current platform. + virtual Result QueryKernelContextInfo(KernelContextInfo* pKernelContextInfo) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. 
Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IQueue() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Queues will be destroyed when the + /// associated device is destroyed. + virtual ~IQueue() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h b/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h index 55eca6d55e..87fef9f360 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palQueueSemaphore.h @@ -1,275 +1,275 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palQueueSemaphore.h - * @brief Defines the Platform Abstraction Library (PAL) IQueueSemaphore interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" -#include - -#if defined(_WIN32) -struct _SECURITY_ATTRIBUTES; -#endif - -namespace Pal -{ - -// Forward declarations. -class IQueueSemaphore; - -/// Specifies properties for @ref IQueueSemaphore creation. Input structure to IDevice::CreateQueueSemaphore(). -struct QueueSemaphoreCreateInfo -{ - union - { - struct - { - /// This queue semaphore may be opened for use by a different device. - /// For DX12 native fence, the flag needs to be consistent with D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.Shared - /// given by DX runtime. - uint32 shareable : 1; - /// This queue semaphore can only be shared through Nt handle. - /// For DX12 native fence, the flag needs to be consistent with - /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NtSecuritySharing given by DX runtime. - uint32 sharedViaNtHandle : 1; - uint32 externalOpened : 1; ///< Semaphore was created by other APIs - /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload - /// which gets monotonically increased with each Signal operation. 
A wait on a timeline semaphore blocks the - /// waiter until the specified payload value has been signaled. - /// For DX12 native fence, runtime determines initialCount. Therefore, timeline flag has to be set. - uint32 timeline : 1; - /// Do not signal the queue semaphore to max if the device is lost. - /// For DX12 native fence, the flag needs to be consistent with - /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NoSignalMaxValueOnTdr given by DX runtime. - uint32 noSignalOnDeviceLost : 1; - /// For native fence only. If it's 0x0, the native fence type is D3DDDI_NATIVEFENCE_TYPE_DEFAULT. - /// If it's 0x1, native fence type is D3DDDI_NATIVEFENCE_TYPE_INTRA_GPU. - /// For DX12, the value is determined by runtime. DXCP needs to set it by reading D3DDDI_NATIVEFENCEINFO. - uint32 gpuOnly : 1; - /// This queue semaphore will be a monitored fence if this flag set, even if OS supports native fence. - uint32 forceUseMonitoredFence : 1; - uint32 reserved : 25; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< Queue semaphore creation flags. - - uint32 maxCount; ///< The maximum signal count; once reached, further signals are dropped. Must be - /// non-zero and no more than maxSemaphoreCount in @ref DeviceProperties. For - /// example, a value of one would request a binary semaphore. - /// NOTE: maxCount does not apply to timeline semaphores. - - uint64 initialCount; ///< Initial value for timeline semaphores. (or) - /// Initial count value for counting semaphores. - /// Must not be larger than maxCount for counting semaphores. - /// For DX12 native fence, DXCP needs to pass InitialFenceValue from - /// D3DDDI_NATIVEFENCEINFO. - -}; - -/// Specifies parameters for opening a queue semaphore for use on another device. Input structure to -/// IDevice::OpenSharedQueueSemaphore(). -struct QueueSemaphoreOpenInfo -{ - /// Shared queue semaphore object from another device to be opened. 
- IQueueSemaphore* pSharedQueueSemaphore; -}; - -/// Specifies parameters for opening a queue semaphore created by other APIs such as D3D. -struct ExternalQueueSemaphoreOpenInfo -{ - union - { - struct - { - uint32 crossProcess : 1; ///< This semaphore is created in another process. - uint32 sharedViaNtHandle : 1; ///< The shared semaphore handle is NT handle. - uint32 isReference : 1; ///< If set, then the opened semaphore will reference the same sync - ///< object in the kernel. Otherwise, the object is copied to the - ///< new Semaphore. - /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload - /// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the - /// waiter until the specified payload value has been signaled. - uint32 timeline : 1; - uint32 reserved : 28; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< External queue semaphore open flags. - - OsExternalHandle externalSemaphore; ///< External shared semaphore handle. - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 882 -#if defined(__unix__) && PAL_KMT_BUILD - uint64 syncFdSignalValue; ///< Signal timeline value when importing the state of a sync file -#endif -#endif -}; - -/// Specifies parameters for exporting a queue semaphore. Input structure to IQueueSemaphore::ExportExternalHandle(). -struct QueueSemaphoreExportInfo -{ - union - { - struct - { - uint32 isReference : 1; ///< If set, then the semaphore exporting a handle that reference the - ///< same sync object in the kernel. Otherwise, the object is copied - ///< to the new Semaphore. - uint32 reserved : 31; ///< Resevered for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - } flags; ///< External queue semaphore export flags. - -#if PAL_KMT_BUILD - const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable - /// attribute. 
- const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT - /// handle with a name, and then the handle can be acquired - /// via this name. - uint32 accessFlags; ///< Desried access rights of GPU memory. -#if defined(__unix__) - uint64 syncFdWaitValue; ///< Wait timeline value when exporting the state of a sync file -#endif -#endif -}; - -/** - *********************************************************************************************************************** - * @interface IQueueSemaphore - * @brief Semaphore object used to synchronize GPU work performed by multiple, parallel queues. - * - * These semaphores are used by calling IQueue::SignalQueueSemaphore() and IQueue::WaitQueueSemaphore(). - * - * @see IDevice::CreateQueueSemaphore() - * @see IDevice::OpenSharedQueueSemaphore() - *********************************************************************************************************************** - */ -class IQueueSemaphore : public IDestroyable -{ -public: - /// An IQueue::WaitQueueSemaphore operation may need to be sent down to the OS after the corresponding - /// IQueue::SignalQueueSemaphore operation due to GPU scheduler limitations. This method checks if any queues have - /// batched-up commands waiting for a SignalQueueSemaphore operation to appear. - /// - /// @returns True if one or more queues have some number of commands batched-up waiting for other queues to signal - /// this semaphore. False otherwise. - virtual bool HasStalledQueues() = 0; - - /// Query timeline Semaphore payload - /// - /// @param [out] pValue returned payload from querying - /// - /// @returns Success if the timeline semaphore is queried successful. Otherwise, one of the following errors may - /// be returned: - /// + ErrorInvalidValue if an unexpected conversion error occurs. - /// + ErrorInvalidObjectType if semaphore is non-timeline type. 
- virtual Result QuerySemaphoreValue( - uint64* pValue) = 0; - - /// Wait on timeline Semaphore points, to be clarified, this is a CPU wait. - /// - /// @param [in] value Indicate which point to be waited. - /// @param [in] timeout the max waiting time, timeout is the timeout period in units of nanoseconds. - /// - /// @returns Success if the timeline semaphore point is waited successful. Otherwise, one of the following errors - /// may be returned: - /// + ErrorInvalidValue if an unexpected conversion error occurs. - /// + ErrorInvalidObjectType if semaphore is non-timeline type. - virtual Result WaitSemaphoreValue( - uint64 value, - std::chrono::nanoseconds timeout) = 0; - - /// Signal on timeline Semaphore points, to be clarified, this is a CPU signal. - /// - /// @param [in] value Indicate which point to be signaled. - /// - /// @returns Success if the timeline semaphore point is signaled successful. Otherwise, one of the following errors - /// may be returned: - /// + ErrorInvalidValue if an unexpected conversion error occurs. - /// + ErrorInvalidObjectType if semaphore is non-timeline type. - virtual Result SignalSemaphoreValue( - uint64 value) = 0; - -#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD - /// Returns an OS-specific handle which can be used to refer to this semaphore object across processes. This will - /// return a null or invalid handle if the object was not created with the external create flag set. - /// - /// @param [in] exportInfo Information describing how the Semamphore handle should be exported. - /// @note This function is only available for Linux builds. - /// - /// @returns An OS-specific handle which can be used to access the semaphore object across processes. - virtual OsExternalHandle ExportExternalHandle( - const QueueSemaphoreExportInfo& exportInfo) const = 0; -#endif - -#if defined(_WIN32) - /// Returns an OS-specific handle which can be used by another device to access the semaphore object. 
- /// - /// @returns An OS-specific handle which can be used by another device to access the semaphore object. - virtual OsExternalHandle ExportKmtHandle() const = 0; -#endif - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const - { - return m_pClientData; - } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - -protected: - /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IQueueSemaphore() : m_pClientData(nullptr) {} - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IQueueSemaphore() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palQueueSemaphore.h + * @brief Defines the Platform Abstraction Library (PAL) IQueueSemaphore interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include + +#if defined(_WIN32) +struct _SECURITY_ATTRIBUTES; +#endif + +namespace Pal +{ + +// Forward declarations. +class IQueueSemaphore; + +/// Specifies properties for @ref IQueueSemaphore creation. Input structure to IDevice::CreateQueueSemaphore(). 
+struct QueueSemaphoreCreateInfo +{ + union + { + struct + { + /// This queue semaphore may be opened for use by a different device. + /// For DX12 native fence, the flag needs to be consistent with D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.Shared + /// given by DX runtime. + uint32 shareable : 1; + /// This queue semaphore can only be shared through Nt handle. + /// For DX12 native fence, the flag needs to be consistent with + /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NtSecuritySharing given by DX runtime. + uint32 sharedViaNtHandle : 1; + uint32 externalOpened : 1; ///< Semaphore was created by other APIs + /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload + /// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the + /// waiter until the specified payload value has been signaled. + /// For DX12 native fence, runtime determines initialCount. Therefore, timeline flag has to be set. + uint32 timeline : 1; + /// Do not signal the queue semaphore to max if the device is lost. + /// For DX12 native fence, the flag needs to be consistent with + /// D3DDDI_SYNCHRONIZATIONOBJECT_FLAGS.NoSignalMaxValueOnTdr given by DX runtime. + uint32 noSignalOnDeviceLost : 1; + /// For native fence only. If it's 0x0, the native fence type is D3DDDI_NATIVEFENCE_TYPE_DEFAULT. + /// If it's 0x1, native fence type is D3DDDI_NATIVEFENCE_TYPE_INTRA_GPU. + /// For DX12, the value is determined by runtime. DXCP needs to set it by reading D3DDDI_NATIVEFENCEINFO. + uint32 gpuOnly : 1; + /// This queue semaphore will be a monitored fence if this flag set, even if OS supports native fence. + uint32 forceUseMonitoredFence : 1; + uint32 reserved : 25; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< Queue semaphore creation flags. + + uint32 maxCount; ///< The maximum signal count; once reached, further signals are dropped. 
Must be + /// non-zero and no more than maxSemaphoreCount in @ref DeviceProperties. For + /// example, a value of one would request a binary semaphore. + /// NOTE: maxCount does not apply to timeline semaphores. + + uint64 initialCount; ///< Initial value for timeline semaphores. (or) + /// Initial count value for counting semaphores. + /// Must not be larger than maxCount for counting semaphores. + /// For DX12 native fence, DXCP needs to pass InitialFenceValue from + /// D3DDDI_NATIVEFENCEINFO. + +}; + +/// Specifies parameters for opening a queue semaphore for use on another device. Input structure to +/// IDevice::OpenSharedQueueSemaphore(). +struct QueueSemaphoreOpenInfo +{ + /// Shared queue semaphore object from another device to be opened. + IQueueSemaphore* pSharedQueueSemaphore; +}; + +/// Specifies parameters for opening a queue semaphore created by other APIs such as D3D. +struct ExternalQueueSemaphoreOpenInfo +{ + union + { + struct + { + uint32 crossProcess : 1; ///< This semaphore is created in another process. + uint32 sharedViaNtHandle : 1; ///< The shared semaphore handle is NT handle. + uint32 isReference : 1; ///< If set, then the opened semaphore will reference the same sync + ///< object in the kernel. Otherwise, the object is copied to the + ///< new Semaphore. + /// This queue semaphore is a timeline semaphore. Timeline semaphores have a 64-bit unsigned integer payload + /// which gets monotonically increased with each Signal operation. A wait on a timeline semaphore blocks the + /// waiter until the specified payload value has been signaled. + uint32 timeline : 1; + uint32 reserved : 28; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External queue semaphore open flags. + + OsExternalHandle externalSemaphore; ///< External shared semaphore handle. 
+ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 882 +#if defined(__unix__) && PAL_KMT_BUILD + uint64 syncFdSignalValue; ///< Signal timeline value when importing the state of a sync file +#endif +#endif +}; + +/// Specifies parameters for exporting a queue semaphore. Input structure to IQueueSemaphore::ExportExternalHandle(). +struct QueueSemaphoreExportInfo +{ + union + { + struct + { + uint32 isReference : 1; ///< If set, then the semaphore exporting a handle that reference the + ///< same sync object in the kernel. Otherwise, the object is copied + ///< to the new Semaphore. + uint32 reserved : 31; ///< Resevered for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + } flags; ///< External queue semaphore export flags. + +#if PAL_KMT_BUILD + const _SECURITY_ATTRIBUTES* pSecurityAttributes; ///< It specifies the security descriptor and the inheritable + /// attribute. + const wchar_t* pNtObjectName; ///< A name to NT handle, if the object is exported as a NT + /// handle with a name, and then the handle can be acquired + /// via this name. + uint32 accessFlags; ///< Desried access rights of GPU memory. +#if defined(__unix__) + uint64 syncFdWaitValue; ///< Wait timeline value when exporting the state of a sync file +#endif +#endif +}; + +/** + *********************************************************************************************************************** + * @interface IQueueSemaphore + * @brief Semaphore object used to synchronize GPU work performed by multiple, parallel queues. + * + * These semaphores are used by calling IQueue::SignalQueueSemaphore() and IQueue::WaitQueueSemaphore(). 
+ * + * @see IDevice::CreateQueueSemaphore() + * @see IDevice::OpenSharedQueueSemaphore() + *********************************************************************************************************************** + */ +class IQueueSemaphore : public IDestroyable +{ +public: + /// An IQueue::WaitQueueSemaphore operation may need to be sent down to the OS after the corresponding + /// IQueue::SignalQueueSemaphore operation due to GPU scheduler limitations. This method checks if any queues have + /// batched-up commands waiting for a SignalQueueSemaphore operation to appear. + /// + /// @returns True if one or more queues have some number of commands batched-up waiting for other queues to signal + /// this semaphore. False otherwise. + virtual bool HasStalledQueues() = 0; + + /// Query timeline Semaphore payload + /// + /// @param [out] pValue returned payload from querying + /// + /// @returns Success if the timeline semaphore is queried successful. Otherwise, one of the following errors may + /// be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result QuerySemaphoreValue( + uint64* pValue) = 0; + + /// Wait on timeline Semaphore points, to be clarified, this is a CPU wait. + /// + /// @param [in] value Indicate which point to be waited. + /// @param [in] timeout the max waiting time, timeout is the timeout period in units of nanoseconds. + /// + /// @returns Success if the timeline semaphore point is waited successful. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result WaitSemaphoreValue( + uint64 value, + std::chrono::nanoseconds timeout) = 0; + + /// Signal on timeline Semaphore points, to be clarified, this is a CPU signal. + /// + /// @param [in] value Indicate which point to be signaled. 
+ /// + /// @returns Success if the timeline semaphore point is signaled successful. Otherwise, one of the following errors + /// may be returned: + /// + ErrorInvalidValue if an unexpected conversion error occurs. + /// + ErrorInvalidObjectType if semaphore is non-timeline type. + virtual Result SignalSemaphoreValue( + uint64 value) = 0; + +#if PAL_KMT_BUILD || PAL_AMDGPU_BUILD + /// Returns an OS-specific handle which can be used to refer to this semaphore object across processes. This will + /// return a null or invalid handle if the object was not created with the external create flag set. + /// + /// @param [in] exportInfo Information describing how the Semamphore handle should be exported. + /// @note This function is only available for Linux builds. + /// + /// @returns An OS-specific handle which can be used to access the semaphore object across processes. + virtual OsExternalHandle ExportExternalHandle( + const QueueSemaphoreExportInfo& exportInfo) const = 0; +#endif + +#if defined(_WIN32) + /// Returns an OS-specific handle which can be used by another device to access the semaphore object. + /// + /// @returns An OS-specific handle which can be used by another device to access the semaphore object. + virtual OsExternalHandle ExportKmtHandle() const = 0; +#endif + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const + { + return m_pClientData; + } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. 
Client must create objects by explicitly + /// called the proper create method. + IQueueSemaphore() : m_pClientData(nullptr) {} + + /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IQueueSemaphore() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h b/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h index f51b536ff2..1bfd170c35 100644 --- a/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h +++ b/shared/amdgpu-windows-interop/pal/inc/core/palShaderLibrary.h @@ -1,253 +1,251 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palShaderLibrary.h - * @brief Defines the Platform Abstraction Library (PAL) IShaderLibrary interface and related types. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" -#include "palDestroyable.h" -#include "palStringView.h" -#include "palSpan.h" - -namespace Pal -{ - -struct GpuMemSubAllocInfo; - -/// Common flags controlling creation of shader libraries. -union LibraryCreateFlags -{ - struct - { - uint32 clientInternal : 1; ///< Internal library not created by the application. - uint32 isGraphics : 1; ///< Whether it is a graphics library - uint32 reserved : 30; ///< Reserved for future use. - }; - uint32 u32All; ///< Flags packed as 32-bit uint. -}; - -/// Specifies properties about an indirect function belonging to a @ref IShaderLibrary object. Part of the input -/// structure to IDevice::CreateShaderLibrary(). -struct ShaderLibraryFunctionInfo -{ - Util::StringView symbolName; ///< ELF Symbol name for the associated function. - gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during - /// library creation. -}; - -/// Specifies a shader sub type / ShaderKind. 
-enum class ShaderSubType : uint32 -{ - Unknown = 0, - Traversal, - RayGeneration, - Intersection, - AnyHit, - ClosestHit, - Miss, - Callable, - LaunchKernel, ///< Raytracing launch kernel - Count -}; - -/// Specifies properties for creation of a compute @ref IShaderLibrary object. Input structure to -/// IDevice::CreateShaderLibrary(). -struct ShaderLibraryCreateInfo -{ - LibraryCreateFlags flags; ///< Library creation flags - - const void* pCodeObject; ///< Pointer to code-object ELF binary implementing the Pipeline ABI interface. - /// The code-object ELF contains pre-compiled shaders, register values, and - /// additional metadata. - size_t codeObjectSize; ///< Size of code object in bytes. -}; - -/// Reports properties of a compiled library. -struct LibraryInfo -{ - PipelineHash internalLibraryHash; ///< 128-bit identifier extracted from this library's ELF binary, composed of - /// the state the compiler decided was appropriate to identify the compiled - /// library. The lower 64 bits are "stable"; the upper 64 bits are "unique". - Util::StringView colorExports; ///< For a Graphics Partial Pipeline pixel shader, an opaque - /// string to pass to the compiler to build the color export shader. -}; - -/// Reports shader stats. Multiple bits set in the shader stage mask indicates that multiple shaders have been combined -/// due to HW support. The same information will be repeated for both the constituent shaders in this case. -struct ShaderLibStats -{ - ShaderHash shaderHash; ///< Shader hash. - CommonShaderStats common; ///< The shader compilation parameters for this shader. - /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum - /// of any architectural restriction and any client-requested limit intended to increase the number of waves in - /// flight. - uint32 numAvailableVgprs; - /// Maximum number of SGPRs the compiler was allowed to use for this shader. 
This limit will be the minimum - /// of any architectural restriction and any client-requested limit intended to increase the number of waves in - /// flight. - uint32 numAvailableSgprs; - size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. - PipelineHash palInternalLibraryHash; ///< Internal hash of the shader compilation data used by PAL. - uint32 stackFrameSizeInBytes; ///< Shader function stack frame size - ShaderSubType shaderSubType; ///< ShaderSubType / Shader Kind - CompilerStackSizes cpsStackSizes; ///< Stack used in Continuation -}; - -/** - *********************************************************************************************************************** - * @interface IShaderLibrary - * @brief Object containing one or more shader functions stored in GPU memory. These shader functions are callable - * from the shaders contained within IPipeline objects. - * - * Before a pipeline which calls into this library is bound to a command buffer (using @ref ICmdBuffer::BindPipeline), - * the client must call @ref IPipeline::LinkWithLibraries() and specify this library in the list of linked libraries. - * Failure to comply with this requirement is an error and will result in undefined behavior. - * - * @see IDevice::CreateShaderLibrary() - * @see IPipeline::LinkWithLibraries() - *********************************************************************************************************************** - */ -class IShaderLibrary : public IDestroyable -{ -public: - /// Returns properties of this library and its corresponding shader functions. - /// - /// @returns Property structure describing this library. - virtual const LibraryInfo& GetInfo() const = 0; - - /// Returns a list of GPU memory allocations used by this library. - /// - /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value - /// reports the number of GPU memory allocations. 
- /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it - /// will reflect the number of allocations that make up this pipeline. If - /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number - /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the - /// number of entries in pAllocInfoList that are valid. - /// @returns Success if the allocation info was successfully written to the buffer. - /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. - /// + ErrorInvalidPointer if pNumEntries is nullptr. - virtual Result QueryAllocationInfo( - size_t* pNumEntries, - GpuMemSubAllocInfo* const pAllocInfoList) const = 0; - - /// Gives the client access to the resource ID used for internal Pal events. - /// EX: Resource Create, Resource Bind, Resource Destroy. - /// - /// @returns The Resource ID. - virtual const void* GetResourceId() const = 0; - - /// Obtains the binary code object for this library. - /// - /// @param [in, out] pSize Represents the size of the shader ISA code. - /// - /// @param [out] pBuffer If non-null, the library ELF is written in the buffer. If null, the size required - /// for the library ELF is given out in the location pSize. - /// - /// @returns Success if the library binary was fetched successfully. - /// +ErrorUnavailable if the library binary was not fetched successfully. - virtual Result GetCodeObject( - uint32* pSize, - void* pBuffer) const = 0; - - /// Returns the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. - /// - /// @returns Pointer to client data. - void* GetClientData() const { return m_pClientData; } - - /// Sets the value of the associated arbitrary client data pointer. - /// Can be used to associate arbitrary data with a particular PAL object. 
- /// - /// @param [in] pClientData A pointer to arbitrary client data. - void SetClientData( - void* pClientData) - { - m_pClientData = pClientData; - } - - /// Obtains the compiled shader ISA code for the shader function specified. - /// - /// @param [in] pShaderExportName The shader exported name - /// - /// @param [in, out] pSize Represents the size of the shader ISA code. - /// - /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required - /// for the shader ISA is given out in the location pSize. - /// - /// @returns Success if the shader ISA code was fetched successfully. - /// +ErrorUnavailable if the shader ISA code was not fetched successfully. - - virtual Result GetShaderFunctionCode( - Util::StringView shaderExportName, - size_t* pSize, - void* pBuffer) const = 0; - - /// Obtains the shader pre and post compilation stats/params for the specified shader. - /// - /// @param [in] pShaderExportName The shader exported name - /// - /// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for - /// the shader stage mentioned in shaderType. This cannot be nullptr. - /// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the - /// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0. - /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. - /// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error - /// occured. - virtual Result GetShaderFunctionStats( - Util::StringView shaderExportName, - ShaderLibStats* pShaderStats) const = 0; - - /// Returns the function list owned by this shader library - /// - /// @returns A list of ShaderLibraryFunctionInfo. - virtual const Util::Span GetShaderLibFunctionInfos() const = 0; - -protected: - /// @internal Constructor. 
Prevent use of new operator on this interface. Client must create objects by explicitly - /// called the proper create method. - IShaderLibrary() : m_pClientData(nullptr) { } - - /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by - /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the - /// object on their own. - virtual ~IShaderLibrary() { } - -private: - /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() - /// and set via SetClientData(). - /// For non-top-layer objects, this will point to the layer above the current object. - void* m_pClientData; - - IShaderLibrary(const IShaderLibrary&) = delete; - IShaderLibrary& operator=(const IShaderLibrary&) = delete; -}; - -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palShaderLibrary.h + * @brief Defines the Platform Abstraction Library (PAL) IShaderLibrary interface and related types. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" +#include "palDestroyable.h" +#include "palStringView.h" +#include "palSpan.h" + +namespace Pal +{ + +struct GpuMemSubAllocInfo; + +/// Common flags controlling creation of shader libraries. +union LibraryCreateFlags +{ + struct + { + uint32 clientInternal : 1; ///< Internal library not created by the application. + uint32 isGraphics : 1; ///< Whether it is a graphics library + uint32 reserved : 30; ///< Reserved for future use. + }; + uint32 u32All; ///< Flags packed as 32-bit uint. +}; + +/// Specifies properties about an indirect function belonging to a @ref IShaderLibrary object. Part of the input +/// structure to IDevice::CreateShaderLibrary(). +struct ShaderLibraryFunctionInfo +{ + Util::StringView symbolName; ///< ELF Symbol name for the associated function. + gpusize gpuVirtAddr; ///< [out] GPU virtual address of the function. This is computed by PAL during + /// library creation. +}; + +/// Specifies a shader sub type / ShaderKind. 
+enum class ShaderSubType : uint32 +{ + Unknown = 0, + Traversal, + RayGeneration, + Intersection, + AnyHit, + ClosestHit, + Miss, + Callable, + LaunchKernel, ///< Raytracing launch kernel + Count +}; + +/// Specifies properties for creation of a compute @ref IShaderLibrary object. Input structure to +/// IDevice::CreateShaderLibrary(). +struct ShaderLibraryCreateInfo +{ + LibraryCreateFlags flags; ///< Library creation flags + + const void* pCodeObject; ///< Pointer to code-object ELF binary implementing the Pipeline ABI interface. + /// The code-object ELF contains pre-compiled shaders, register values, and + /// additional metadata. + size_t codeObjectSize; ///< Size of code object in bytes. +}; + +/// Reports properties of a compiled library. +struct LibraryInfo +{ + PipelineHash internalLibraryHash; ///< 128-bit identifier extracted from this library's ELF binary, composed of + /// the state the compiler decided was appropriate to identify the compiled + /// library. The lower 64 bits are "stable"; the upper 64 bits are "unique". +}; + +/// Reports shader stats. Multiple bits set in the shader stage mask indicates that multiple shaders have been combined +/// due to HW support. The same information will be repeated for both the constituent shaders in this case. +struct ShaderLibStats +{ + ShaderHash shaderHash; ///< Shader hash. + CommonShaderStats common; ///< The shader compilation parameters for this shader. + /// Maximum number of VGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. + uint32 numAvailableVgprs; + /// Maximum number of SGPRs the compiler was allowed to use for this shader. This limit will be the minimum + /// of any architectural restriction and any client-requested limit intended to increase the number of waves in + /// flight. 
+ uint32 numAvailableSgprs; + size_t isaSizeInBytes; ///< Size of the shader ISA disassembly for this shader. + PipelineHash palInternalLibraryHash; ///< Internal hash of the shader compilation data used by PAL. + uint32 stackFrameSizeInBytes; ///< Shader function stack frame size + ShaderSubType shaderSubType; ///< ShaderSubType / Shader Kind + CompilerStackSizes cpsStackSizes; ///< Stack used in Continuation +}; + +/** + *********************************************************************************************************************** + * @interface IShaderLibrary + * @brief Object containing one or more shader functions stored in GPU memory. These shader functions are callable + * from the shaders contained within IPipeline objects. + * + * Before a pipeline which calls into this library is bound to a command buffer (using @ref ICmdBuffer::BindPipeline), + * the client must call @ref IPipeline::LinkWithLibraries() and specify this library in the list of linked libraries. + * Failure to comply with this requirement is an error and will result in undefined behavior. + * + * @see IDevice::CreateShaderLibrary() + * @see IPipeline::LinkWithLibraries() + *********************************************************************************************************************** + */ +class IShaderLibrary : public IDestroyable +{ +public: + /// Returns properties of this library and its corresponding shader functions. + /// + /// @returns Property structure describing this library. + virtual const LibraryInfo& GetInfo() const = 0; + + /// Returns a list of GPU memory allocations used by this library. + /// + /// @param [in,out] pNumEntries Input value specifies the available size in pAllocInfoList; output value + /// reports the number of GPU memory allocations. + /// @param [out] pAllocInfoList If pAllocInfoList=nullptr, then pNumEntries is ignored on input. On output it + /// will reflect the number of allocations that make up this pipeline. 
If + /// pAllocInfoList!=nullptr, then on input pNumEntries is assumed to be the number + /// of entries in the pAllocInfoList array. On output, pNumEntries reflects the + /// number of entries in pAllocInfoList that are valid. + /// @returns Success if the allocation info was successfully written to the buffer. + /// + ErrorInvalidValue if the caller provides a buffer size that is different from the size needed. + /// + ErrorInvalidPointer if pNumEntries is nullptr. + virtual Result QueryAllocationInfo( + size_t* pNumEntries, + GpuMemSubAllocInfo* const pAllocInfoList) const = 0; + + /// Gives the client access to the resource ID used for internal Pal events. + /// EX: Resource Create, Resource Bind, Resource Destroy. + /// + /// @returns The Resource ID. + virtual const void* GetResourceId() const = 0; + + /// Obtains the binary code object for this library. + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the library ELF is written in the buffer. If null, the size required + /// for the library ELF is given out in the location pSize. + /// + /// @returns Success if the library binary was fetched successfully. + /// +ErrorUnavailable if the library binary was not fetched successfully. + virtual Result GetCodeObject( + uint32* pSize, + void* pBuffer) const = 0; + + /// Returns the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @returns Pointer to client data. + void* GetClientData() const { return m_pClientData; } + + /// Sets the value of the associated arbitrary client data pointer. + /// Can be used to associate arbitrary data with a particular PAL object. + /// + /// @param [in] pClientData A pointer to arbitrary client data. + void SetClientData( + void* pClientData) + { + m_pClientData = pClientData; + } + + /// Obtains the compiled shader ISA code for the shader function specified. 
+ /// + /// @param [in] pShaderExportName The shader exported name + /// + /// @param [in, out] pSize Represents the size of the shader ISA code. + /// + /// @param [out] pBuffer If non-null, the shader ISA code is written in the buffer. If null, the size required + /// for the shader ISA is given out in the location pSize. + /// + /// @returns Success if the shader ISA code was fetched successfully. + /// +ErrorUnavailable if the shader ISA code was not fetched successfully. + + virtual Result GetShaderFunctionCode( + Util::StringView shaderExportName, + size_t* pSize, + void* pBuffer) const = 0; + + /// Obtains the shader pre and post compilation stats/params for the specified shader. + /// + /// @param [in] pShaderExportName The shader exported name + /// + /// @param [out] pShaderStats Pointer to the ShaderStats structure which will be filled with the shader stats for + /// the shader stage mentioned in shaderType. This cannot be nullptr. + /// @param [in] getDisassemblySize If set to true performs disassembly on the shader binary code and reports the + /// size of the disassembly string in ShaderStats::isaSizeInBytes. Else reports 0. + /// @returns Success if the stats were successfully obtained for this shader, including the shader disassembly size. + /// +ErrorUnavailable if a wrong shader stage for this pipeline was specified, or if some internal error + /// occured. + virtual Result GetShaderFunctionStats( + Util::StringView shaderExportName, + ShaderLibStats* pShaderStats) const = 0; + + /// Returns the function list owned by this shader library + /// + /// @returns A list of ShaderLibraryFunctionInfo. + virtual const Util::Span GetShaderLibFunctionInfos() const = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly + /// called the proper create method. + IShaderLibrary() : m_pClientData(nullptr) { } + + /// @internal Destructor. 
Prevent use of delete operator on this interface. Client must destroy objects by + /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the + /// object on their own. + virtual ~IShaderLibrary() { } + +private: + /// @internal Client data pointer. This can have an arbitrary value and can be returned by calling GetClientData() + /// and set via SetClientData(). + /// For non-top-layer objects, this will point to the layer above the current object. + void* m_pClientData; + + IShaderLibrary(const IShaderLibrary&) = delete; + IShaderLibrary& operator=(const IShaderLibrary&) = delete; +}; + +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h index 6e3677e2f5..a3f8e1b9a1 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palCodeObjectTraceSource.h @@ -1,214 +1,212 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "palGpaSession.h" -#include "palGpuUtil.h" -#include "palTraceSession.h" -#include "palVector.h" -#include "palHashSet.h" -#include "palMutex.h" - -namespace Pal -{ -class IPlatform; -class IDevice; -class IShaderLibrary; -} // namespace Pal - -namespace GpuUtil -{ -class GpaSession; -} // namespace GpuUtil - -namespace GpuUtil -{ - -namespace TraceChunk -{ - -/// "CodeObject" RDF chunk identifier & version -constexpr char CodeObjectChunkId[TextIdentifierSize] = "CodeObject"; -constexpr Pal::uint32 CodeObjectChunkVersion = 2; - -/// Header for the "CodeObject" RDF chunk -struct CodeObjectHeader -{ - Pal::uint32 pciId; /// The ID of the GPU the trace was run on - Pal::ShaderHash codeObjectHash; /// Hash of the Code Object binary -}; - -/// "COLoadEvent" RDF chunk identifier & version -constexpr char CodeObjectLoadEventChunkId[TextIdentifierSize] = "COLoadEvent"; -constexpr Pal::uint32 CodeObjectLoadEventChunkVersion = 3; - -struct CodeObjectLoadEventHeader -{ - Pal::uint32 count; /// Number of load events in this chunk -}; - -/// Describes whether a load event was into GPU memory or from. 
-enum class CodeObjectLoadEventType : Pal::uint32 -{ - LoadToGpuMemory = 0, /// Code Object was loaded into GPU memory - UnloadFromGpuMemory = 1 /// Code Object was unloaded from GPU memory -}; - -/// Describes one or more GPU load/unload(s) of a Code Object. Payload for "COLoadEvent" RDF chunk. -struct CodeObjectLoadEvent -{ - Pal::uint32 pciId; /// The ID of the GPU the trace was run on - CodeObjectLoadEventType eventType; /// Type of loader event - Pal::uint64 baseAddress; /// Base address where the Code Object was loaded - Pal::ShaderHash codeObjectHash; /// Hash of the (un)loaded Code Object binary - Pal::uint64 timestamp; /// CPU timestamp of this event being triggered -}; - -/// "PsoCorrelation" RDF chunk identifier & version -constexpr char PsoCorrelationChunkId[TextIdentifierSize] = "PsoCorrelation"; -constexpr Pal::uint32 PsoCorrelationChunkVersion = 3; - -struct PsoCorrelationHeader -{ - Pal::uint32 count; /// Number of PSO correlations in this chunk -}; - -/// Payload for the "PsoCorrelation" RDF chunks -struct PsoCorrelation -{ - Pal::uint32 pciId; /// The ID of the GPU the trace was run on - Pal::uint64 apiPsoHash; /// Hash of the API-level Pipeline State Object - Pal::PipelineHash internalPipelineHash; /// Hash of all inputs to the pipeline compiler - char apiLevelObjectName[64]; /// Debug object name (null-terminated) -}; - -/// "COCorrelation" RDF chunk identifier & version -constexpr char CodeObjectCorrelationChunkId[TextIdentifierSize] = "COCorrelation"; -constexpr uint32_t CodeObjectCorrelationChunkVersion = 4; - -struct CodeObjectCorrelationHeader -{ - Pal::uint32 count; /// Number of Code Object Correlations in this chunk -}; - -/// Payload for the "CodeObjectCorrelation" RDF chunks -struct CodeObjectCorrelation -{ - Pal::PipelineHash internalPipelineHash; /// Hash of all inputs to the pipeline compiler - Pal::ShaderHash codeObjectHash; /// Hash of the Code Object binary in the CO Database - Pal::uint32 containsMetadata : 1; /// 1 if the code 
object contains metadata, 0 otherwise - Pal::uint32 reserved : 31; /// Bitflags reserved for future use -}; - -} // namespace TraceChunk - -/// CodeObject Trace Source name & version -constexpr char CodeObjectTraceSourceName[] = "codeobject"; -constexpr Pal::uint32 CodeObjectTraceSourceVersion = 3; - -// ===================================================================================================================== -class CodeObjectTraceSource : public ITraceSource -{ -public: - CodeObjectTraceSource(Pal::IPlatform* pPlatform); - ~CodeObjectTraceSource(); - - // ==== TraceSource Native Functions ========================================================================== // - Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); - Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); - - Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); - Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); - - Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); - Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); - - // ==== Base Class Overrides =================================================================================== // -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < COMPRESSION_ARG_VERSION - virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { } -#endif - - virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } -#else - virtual void OnTraceAccepted() override { } -#endif - virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } - virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - virtual void OnPostambleEnd( - Pal::uint32 
gpuIndex, - Pal::ICmdBuffer* pCmdBuf) override { } -#endif - virtual void OnTraceFinished() override; - - virtual const char* GetName() const override { return CodeObjectTraceSourceName; } - virtual Pal::uint32 GetVersion() const override { return CodeObjectTraceSourceVersion; } - -private: - Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); - Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); - - Pal::Result AddCodeObjectLoadEvent( - const Pal::IShaderLibrary* pLibrary, - TraceChunk::CodeObjectLoadEventType eventType); - Pal::Result AddCodeObjectLoadEvent( - const Pal::IPipeline* pLibrary, - TraceChunk::CodeObjectLoadEventType eventType); - Pal::Result AddCodeObjectLoadEvent( - const ElfBinaryInfo& elfBinaryInfo, - TraceChunk::CodeObjectLoadEventType eventType); - - Pal::Result WriteCodeObjectChunks(); - Pal::Result WriteLoaderEventsChunk(); - Pal::Result WritePsoCorrelationChunk(); - Pal::Result WriteCoCorrelationChunk(); - - struct CodeObjectDatabaseRecord - { - Pal::uint32 recordSize; - Pal::ShaderHash codeObjectHash; - }; - - Pal::IPlatform* const m_pPlatform; - - Util::RWLock m_registerPipelineLock; - Util::Vector m_codeObjectRecords; - Util::Vector m_loadEventRecords; - Util::Vector m_psoCorrelationRecords; - Util::Vector m_coCorrelationRecords; - - // API hashes -> internal pipeline hash (-> child code object hashes) - Util::HashSet m_registeredApiHashes; - Util::HashSet m_registeredPipelines; - Util::HashSet m_registeredCoHashes; - -}; - -} // namespace GpuUtil - +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "palGpaSession.h" +#include "palGpuUtil.h" +#include "palTraceSession.h" +#include "palVector.h" +#include "palHashSet.h" +#include "palMutex.h" + +namespace Pal +{ +class IPlatform; +class IDevice; +class IShaderLibrary; +} // namespace Pal + +namespace GpuUtil +{ +class GpaSession; +} // namespace GpuUtil + +namespace GpuUtil +{ + +namespace TraceChunk +{ + +/// "CodeObject" RDF chunk identifier & version +constexpr char CodeObjectChunkId[TextIdentifierSize] = "CodeObject"; +constexpr Pal::uint32 CodeObjectChunkVersion = 2; + +/// Header for the "CodeObject" RDF chunk +struct CodeObjectHeader +{ + Pal::uint32 pciId; /// The ID of the GPU the trace was run on + Pal::ShaderHash codeObjectHash; /// Hash of the Code Object binary +}; + +/// "COLoadEvent" RDF chunk identifier & version +constexpr char CodeObjectLoadEventChunkId[TextIdentifierSize] = "COLoadEvent"; +constexpr Pal::uint32 CodeObjectLoadEventChunkVersion = 3; + +struct CodeObjectLoadEventHeader +{ + Pal::uint32 count; /// Number of load events in this chunk +}; + +/// Describes whether a load event was into GPU memory or from. +enum class CodeObjectLoadEventType : Pal::uint32 +{ + LoadToGpuMemory = 0, /// Code Object was loaded into GPU memory + UnloadFromGpuMemory = 1 /// Code Object was unloaded from GPU memory +}; + +/// Describes one or more GPU load/unload(s) of a Code Object. Payload for "COLoadEvent" RDF chunk. 
+struct CodeObjectLoadEvent +{ + Pal::uint32 pciId; /// The ID of the GPU the trace was run on + CodeObjectLoadEventType eventType; /// Type of loader event + Pal::uint64 baseAddress; /// Base address where the Code Object was loaded + Pal::ShaderHash codeObjectHash; /// Hash of the (un)loaded Code Object binary + Pal::uint64 timestamp; /// CPU timestamp of this event being triggered +}; + +/// "PsoCorrelation" RDF chunk identifier & version +constexpr char PsoCorrelationChunkId[TextIdentifierSize] = "PsoCorrelation"; +constexpr Pal::uint32 PsoCorrelationChunkVersion = 3; + +struct PsoCorrelationHeader +{ + Pal::uint32 count; /// Number of PSO correlations in this chunk +}; + +/// Payload for the "PsoCorrelation" RDF chunks +struct PsoCorrelation +{ + Pal::uint32 pciId; /// The ID of the GPU the trace was run on + Pal::uint64 apiPsoHash; /// Hash of the API-level Pipeline State Object + Pal::PipelineHash internalPipelineHash; /// Hash of all inputs to the pipeline compiler + char apiLevelObjectName[64]; /// Debug object name (null-terminated) +}; + +/// "COCorrelation" RDF chunk identifier & version +constexpr char CodeObjectCorrelationChunkId[TextIdentifierSize] = "COCorrelation"; +constexpr uint32_t CodeObjectCorrelationChunkVersion = 4; + +struct CodeObjectCorrelationHeader +{ + Pal::uint32 count; /// Number of Code Object Correlations in this chunk +}; + +/// Payload for the "CodeObjectCorrelation" RDF chunks +struct CodeObjectCorrelation +{ + Pal::PipelineHash internalPipelineHash; /// Hash of all inputs to the pipeline compiler + Pal::ShaderHash codeObjectHash; /// Hash of the Code Object binary in the CO Database + Pal::uint32 containsMetadata : 1; /// 1 if the code object contains metadata, 0 otherwise + Pal::uint32 reserved : 31; /// Bitflags reserved for future use +}; + +} // namespace TraceChunk + +/// CodeObject Trace Source name & version +constexpr char CodeObjectTraceSourceName[] = "codeobject"; +constexpr Pal::uint32 CodeObjectTraceSourceVersion = 
3; + +// ===================================================================================================================== +class CodeObjectTraceSource : public ITraceSource +{ +public: + CodeObjectTraceSource(Pal::IPlatform* pPlatform); + ~CodeObjectTraceSource(); + + // ==== TraceSource Native Functions ========================================================================== // + Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); + + Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); + Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); + + Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + // ==== Base Class Overrides =================================================================================== // + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { } + + virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } +#else + virtual void OnTraceAccepted() override { } +#endif + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } + virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { } +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override { } +#endif + virtual void OnTraceFinished() override; + + virtual const char* GetName() const override { return CodeObjectTraceSourceName; } + virtual Pal::uint32 GetVersion() const override { return CodeObjectTraceSourceVersion; } + +private: + Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const 
RegisterPipelineInfo& clientInfo); + Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); + + Pal::Result AddCodeObjectLoadEvent( + const Pal::IShaderLibrary* pLibrary, + TraceChunk::CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent( + const Pal::IPipeline* pLibrary, + TraceChunk::CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent( + const ElfBinaryInfo& elfBinaryInfo, + TraceChunk::CodeObjectLoadEventType eventType); + + Pal::Result WriteCodeObjectChunks(); + Pal::Result WriteLoaderEventsChunk(); + Pal::Result WritePsoCorrelationChunk(); + Pal::Result WriteCoCorrelationChunk(); + + struct CodeObjectDatabaseRecord + { + Pal::uint32 recordSize; + Pal::ShaderHash codeObjectHash; + }; + + Pal::IPlatform* const m_pPlatform; + + Util::RWLock m_registerPipelineLock; + Util::Vector m_codeObjectRecords; + Util::Vector m_loadEventRecords; + Util::Vector m_psoCorrelationRecords; + Util::Vector m_coCorrelationRecords; + + // API hashes -> internal pipeline hash (-> child code object hashes) + Util::HashSet m_registeredApiHashes; + Util::HashSet m_registeredPipelines; + Util::HashSet m_registeredCoHashes; + +}; + +} // namespace GpuUtil + diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h index b64e2ca282..81aabf1122 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpaSession.h @@ -1,1197 +1,1201 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palGpaSession.h - * @brief PAL GPU utility GpaSession class. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palDeque.h" -#include "palDevice.h" -#include "palGpuUtil.h" -#include "palHashSet.h" -#include "palMutex.h" -#include "palPipeline.h" -#include "palVector.h" -#include "palPlatform.h" -#include "palSysMemory.h" -#include "palGpuMemory.h" -#include "palMemTrackerImpl.h" - -// Forward declarations. 
-namespace Pal -{ - class ICmdAllocator; - class ICmdBuffer; - class IDevice; - class IGpuEvent; - class IGpuMemory; - class IPerfExperiment; - class IQueue; - class IQueueSemaphore; - struct GlobalCounterLayout; - struct MultiSubmitInfo; - struct ThreadTraceLayout; -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 - enum PipelineStageFlag : uint32; -#else - enum HwPipePoint : uint32; -#endif -} -struct SqttFileChunkCpuInfo; -struct SqttFileChunkAsicInfo; -struct SqttCodeObjectDatabaseRecord; - -struct GpuMemoryInfo; - -namespace GpuUtil -{ -// Sample id initialization value. -constexpr Pal::uint32 InvalidSampleId = 0xFFFFFFFF; - -/// The available states of GpaSession -enum class GpaSessionState : Pal::uint32 -{ - Reset = 0, - Building = 1, - Complete = 2, - Ready = 3, -}; - -/// The various ways you can change trace options after it has started. -enum class UpdateSampleTraceMode : Pal::uint32 -{ - MinimalToFullMask = 0, ///< Used to convert a minimal trace (needed for context in compute presents) to a full - /// trace according to the options in the active trace. Requires enableSampleUpdates. - /// Additionally, this must be called between BeginSample() and EndSample() and - /// queue timing must also be enabled on the GpaSession when this function is called. - StartInstructionTrace = 1, ///< Used to enable instruction-level trace globally at any time. Can be run without an - /// active sample. Useful for targeting specific parts of a frame. - StopInstructionTrace = 2, ///< Used to disable instruction-level trace globally at any time. Can be run without an - /// active sample. -}; - -/// Specifies basic type of sample to perfom - either a normal set of "global" perf counters, or a trace consisting -/// of SQ thread trace and/or streaming performance counters. -enum class GpaSampleType : Pal::uint32 -{ - None = 0x0, ///< No profile will be done. 
- Cumulative = 0x1, ///< One 64-bit result will be returned per performance counter representing the cumulative delta - /// for that counter over the sample period. Cumulative samples must begin and end in the same - /// command buffer. - Trace = 0x2, ///< A GPU memory buffer will be filled with hw-specific SQ thread trace and/or streaming - /// performance counter data. Trace samples may span multiple command buffers. - Timing = 0x3, ///< Two 64-bit results will be recorded in beginTs and endTs to gather timestamp data. - Query = 0x4, ///< A set of 11 pipeline stats will be collected. - Count -}; - -/// Specifies a specific performance counter to be sampled with GpaSession::BeginSample() and GpaSession::EndSample(). -/// -/// This identifies a specific counter in a particular HW block instance, e.g., TCC instance 3 counter #19. It is up -/// to the client to know the meaning of a particular counter, e.g., TCC #19 is TCC_PERF_SEL_MISS on Fiji. Eventually, -/// PAL may want to support certain counters without the client needing HW-specific knowledge (i.e., select an enum -/// called L2MissRate from PAL rather than needing to know that counter is TCC #19 on Fiji), but GPA currently works in -/// this low-level mode with other drivers, and wants to keep the flexibility. -struct PerfCounterId -{ - Pal::GpuBlock block; ///< Which GPU block to reference (e.g., CB, DB, TCC). - Pal::uint32 instance; ///< Which instance of the specified GPU block to sample. E.g., Tahiti has 12 TCC blocks - /// (this number is returned per-block in the @ref Pal::GpuBlockPerfProperties structure). - /// There is no shortcut to get results for all instances of block in the whole chip, the - /// client must explicitly sample each instance and sum the results. - Pal::uint32 eventId; ///< Counter ID to sample. Note that the meaning of a particular eventId for a block can - /// change between chips. 
- - union - { - struct - { - Pal::uint32 spm32Bit : 1; ///< For SPM counters, collect in 32bit instead of 16bit - Pal::uint32 reserved : 31; ///< Reserved for future use - }; - - Pal::uint32 u32All; ///< Union value for copying - - } flags; - - // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for - // the relevant blocks. It's recommended to zero them out when not in use. - union - { - struct - { - Pal::uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. - } df; - - struct - { - Pal::uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold. - Pal::uint8 eventThresholdEn; ///< Threshold enable (0 for disabled,1 for threshold) - Pal::uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). - } umc; - - Pal::uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events - - Pal::uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds - } subConfig; -}; - -/// Defines a set of flags for a particular gpa session. -union GpaSessionFlags -{ - struct - { - /// Enables timing of queue operations via Timed* functions. - Pal::uint32 enableQueueTiming : 1; - - /// Enables sample updates via the UpdateSampleTraceParams function. - Pal::uint32 enableSampleUpdates : 1; - - /// Indicates that the client will use the internal Timed*QueueSemaphore() functions for queue semaphore timing - /// data. When not set it indicates the client will provide ETW data via the ExteralTimed* functions. - Pal::uint32 useInternalQueueSemaphoreTiming : 1; - - /// Reserved for future use. - Pal::uint32 reserved : 29; - }; - - /// Flags packed as 32-bit uint. - Pal::uint32 u32All; -}; - -/// Specifies options that direct the gpa session behavior. -struct GpaSessionBeginInfo -{ - /// Gpa Session flags used to control behavior. 
- GpaSessionFlags flags; -}; - -/// Input structure for CmdBeginGpuProfilerSample. -/// -/// Defines a set of global performance counters and/or SQ thread trace data to be sampled. -struct GpaSampleConfig -{ - /// Selects what type of data should be gathered for this sample. This can either be _cumulative_ to gather - /// simple deltas for the specified set of perf counters over the sample period, or it can be _trace_ to generate - /// a blob of RGP-formatted data containing SQ thread trace and/or streaming performance monitor data. - GpaSampleType type; - - union - { - struct - { - Pal::uint32 sampleInternalOperations : 1; ///< Include BLTs and internal driver operations in the - /// results. - Pal::uint32 cacheFlushOnCounterCollection : 1; ///< Insert cache flush and invalidate events before and - /// after every sample. - Pal::uint32 sqShaderMask : 1; ///< If sqShaderMask is valid. - Pal::uint32 sqWgpShaderMask : 1; ///< If sqWgpShaderMask is valid. - Pal::uint32 reserved : 28; ///< Reserved for future use. - }; - Pal::uint32 u32All; ///< Bit flags packed as uint32. - } flags; ///< Bit flags controlling sample operation for all sample - /// types. - - Pal::PerfExperimentShaderFlags sqShaderMask; ///< Which shader stages are sampled by GpuBlock::Sq counters. - ///< Only used if flags.sqShaderMask is set to 1. - Pal::PerfExperimentShaderFlags sqWgpShaderMask; ///< Which shader stages are sampled by GpuBlock::SqWgp counters. - ///< Only used if flags.sqWgpShaderMask is set to 1. - - struct - { - /// Number of entries in pIds. - Pal::uint32 numCounters; - - /// List of performance counters to be gathered for a sample. If the sample type is _cumulative_ this will - /// result in "global" perf counters being sampled at the beginning of the sample period; if the sample type - /// is _trace_ this will result in SPM data being added to the sample's resulting RGP blob. - /// - /// Note that it is up to the client to respect the hardware counter limit per block. 
This can be - /// determined by the maxGlobalOnlyCounters, maxGlobalSharedCounters, maxSpmCounters, and instanceGroupSize - /// fields of @ref Pal::GpuBlockPerfProperties. - const PerfCounterId* pIds; - - /// Period for SPM sample collection in cycles. Only relevant for _trace_ samples. - Pal::uint32 spmTraceSampleInterval; - - /// Maximum amount of GPU memory in bytes this sample can allocate for SPM data. Only relevant for _trace_ - /// samples. - Pal::gpusize gpuMemoryLimit; - } perfCounters; ///< Performance counter selection (valid for both _cumulative_ and _trace_ samples). - - struct - { - /// Number of entries in pIds. - Pal::uint32 numCounters; - - /// Period for DF SPM sample collection in nano seconds. - Pal::uint32 sampleInterval; - - /// Maximum amount of GPU memory in bytes this sample can allocate for DF SPM data. - Pal::gpusize gpuMemoryLimit; - - /// List of performance counters to be gathered for a df sample. This has to be separate from the list - /// list of normal counters because it is a completely different mechanism for gathering data. - /// - /// Note that it is up to the client to respect the hardware counter limit per block. This can be - /// determined by the maxSpmCounters fields of - /// @ref Pal::GpuBlockPerfProperties. - const PerfCounterId* pIds; - } dfSpmPerfCounters; - - struct - { - union - { - struct - { - Pal::uint32 enable : 1; ///< Include SQTT data in the trace. - Pal::uint32 supressInstructionTokens : 1; ///< Prevents capturing instruction-level SQTT tokens, - /// significantly reducing the amount of sample data. - Pal::uint32 stallMode : 2; ///< Describes behavior when buffer full - Pal::uint32 stallAllSimds : 1; ///< Stall all SIMDs for thread trace stall. 
- Pal::uint32 excludeNonDetailShaderData : 1; ///< Only emit shader tokens from the SIMD that have been - /// selected for detail instruction tracing - Pal::uint32 enableExecPopTokens : 1; ///< Output exec tokens - Pal::uint32 reserved : 25; ///< Reserved for future use. - }; - Pal::uint32 u32All; ///< Bit flags packed as uint32. - } flags; ///< Bit flags controlling SQTT samples. - Pal::uint32 seMask; ///< Mask that determines which specific SEs to run Thread trace on. - /// If 0, all SEs are enabled - Pal::uint32 seDetailedMask; ///< Mask that selects which specific SEs to reveal Thread trace detailed info. - /// If 0, all SEs will reveal detailed thread trace - Pal::gpusize gpuMemoryLimit; ///< Maximum amount of GPU memory in bytes this sample can allocate for the SQTT - /// buffer. If 0, allocate maximum size to prevent dropping tokens toward the - /// end of the sample. - Pal::uint32 tokenMask; ///< Mask indicating which SQTT tokens are requested for capture. If a tokenMask is - /// not provided, PAL will default to collecting all tokens or tokens except - /// instruction tokens if the supressInstructionTokens flag is set. Instruction - /// tokens will always be filtered out if supressInstructionTokens = true. - } sqtt; ///< SQ thread trace configuration (only valid for _trace_ samples). - - struct - { -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 - Pal::PipelineStageFlag preSample; ///< The pipeline stage in the GPU pipeline where the begin timestamp should - /// take place. - Pal::PipelineStageFlag postSample; ///< The pipeline stage in the GPU pipeline where the end timestamp should - /// take place. -#else - Pal::HwPipePoint preSample; ///< The point in the GPU pipeline where the begin timestamp should take place. - Pal::HwPipePoint postSample; ///< The point in the GPU pipeline where the end timestamp should take place. -#endif - } timing; ///< Timestamp configuration. 
(only valid for timing samples) -}; - -/// Extra metadata about a command buffer submission -struct TimedSubmitInfo -{ - const Pal::uint64* pApiCmdBufIds; ///< Array of api specific command buffer ids - const Pal::uint32* pSqttCmdBufIds; ///< Array of sqtt command buffer ids - Pal::uint64 frameIndex; ///< The global frame index for the application. -}; - -/// Extra metadata about a queue semaphore operation -struct TimedQueueSemaphoreInfo -{ - Pal::uint64 semaphoreID; ///< Api specific id associated with a semaphore. -}; - -/// Extra metadata about a queue present operation -struct TimedQueuePresentInfo -{ - Pal::uint64 presentID; ///< Api specific id associated with a present. -}; - -/// Struct for storing information about gpu clock speeds. -struct GpuClocksSample -{ - Pal::uint32 gpuEngineClockSpeed; // Current speed of the gpu engine clock in MHz - Pal::uint32 gpuMemoryClockSpeed; // Current speed of the gpu memory clock in MHz -}; - -/// Struct for storing CPU-side allocations of Pal::IPerfExperiment's. -struct PerfExperimentMemory -{ - void* pMemory; // Memory allocated for an IPerfExperiment. - size_t memorySize; // Size of the memory allocated in pMemory. -}; - -/// Struct for supplying API-dependent information about pipelines. -struct RegisterPipelineInfo -{ - Pal::uint64 apiPsoHash; ///< Client-provided PSO hash. -}; - -/// Struct for supplying API-dependent information about libraries. -struct RegisterLibraryInfo -{ - Pal::uint64 apiHash; ///< Client-provided api hash. -}; - -/// Struct for supplying Elf binary. -struct ElfBinaryInfo -{ - const void* pBinary; ///< FAT Elf binary. - Pal::uint32 binarySize; ///< FAT Elf binary size. - Pal::IGpuMemory* pGpuMemory; ///< GPU Memory where the compiled ISA resides. - Pal::gpusize offset; ///< Offset inside GPU memory object - Pal::uint64 originalHash; ///< Original source/binary hash. - Pal::uint64 compiledHash; ///< Compiled binary hash. 
-}; - -/// Enumeration of RGP trace profiling modes -enum class TraceProfilingMode : Pal::uint32 -{ - Present = 0, ///< Present triggered capture - UserMarkers = 1, ///< Capture triggered by user marker - FrameNumber = 2, ///< Capture based on frame number - Tags = 3, ///< Tag based capture -}; - -/// Constant defines the maximum length for a user marker string. -static constexpr Pal::uint32 UserMarkerStringLength = 256; - -/// Defines data specific to each profiling mode used to capture an RGP trace. -union TraceProfilingModeData -{ - struct - { - char start[UserMarkerStringLength]; ///< User marker string used to start trace capture. - char end[UserMarkerStringLength]; ///< User marker string used to end trace capture. - } userMarkerData; - - struct - { - Pal::uint32 start; ///< Frame number used to start the trace. - Pal::uint32 end; ///< Frame number used to end the trace. - } frameNumberData; - - struct - { - Pal::uint64 start; ///< Tag used to start the trace. - Pal::uint64 end; ///< Tag used to end the trace. - } tagData; -}; - -/// Enumerates the different instruction level data modes for an RGP trace -enum class InstructionTraceMode : Pal::uint32 -{ - Disabled = 0, ///< Instruction level data was disabled for trace. - FullFrame = 1, ///< Instruction level data was enabled for the full trace. - ApiPso = 2, ///< Instruction level data was enabled only for a single API PSO. -}; - -/// Defines the data used to control enabling of instruction level data. -struct InstructionTraceModeData -{ - Pal::uint64 apiPsoHash; ///< Hash of the API PSO targeted for instruction level data. -}; - -/// Struct for supplying API specific information about an RGP trace -struct SampleTraceApiInfo -{ - TraceProfilingMode profilingMode; ///< Profiling mode used to trigger the trace. - TraceProfilingModeData profilingModeData; ///< Profiling mode specific data. - InstructionTraceMode instructionTraceMode; ///< Instruction trace mode for the trace. 
- InstructionTraceModeData instructionTraceModeData; ///< Instruction trace mode data. -}; - -/// An enumeration of the API types. -enum class ApiType : Pal::uint32 -{ - DirectX12 = 0, ///< Represents DirectX12 API type. - Vulkan = 1, ///< Represents Vulkan API type. - Generic = 2, ///< Represents Generic API type. - OpenCl = 3, ///< Represents OpenCL API type. - Hip = 5, ///< Represents HIP API type. -}; - -/// Struct used for storing SQTT-specific trace information -struct SqttTraceInfo -{ - Pal::uint32 shaderEngine; ///< Shader engine index - Pal::uint32 computeUnit; ///< Compute unit index - Pal::uint32 sqttVersion; ///< SQTT version - Pal::uint64 bufferSize; ///< SQTT trace buffer size -}; - -/// Struct used for storing SPM-specific trace information -struct SpmTraceInfo -{ - Pal::uint32 numSpmCounters; ///< The number of SPM counters sampled in the trace - Pal::uint32 numTimestamps; ///< The number of timestamps that samples were taken - Pal::uint32 sampleFrequency; ///< The SPM counter sampling frequency -}; - -/// Struct used for storing QueueTimings-specific trace information -struct QueueTimingsTraceInfo -{ - Pal::uint32 numQueueInfoRecords; - Pal::uint32 numQueueEventRecords; - Pal::uint32 queueInfoTableSize; - Pal::uint32 queueEventTableSize; -}; - -/** -*********************************************************************************************************************** -* @class GpaSession -* @brief Helper class providing common driver functionality required by all PAL clients that support the GPUPerfAPI -* (GPA). Abstracts IPerfExperiment creation, memory management, completion confirmation, and results reporting -* at a level convenient for GPA. Each PAL client driver will need to publish an API extension exposing this -* support for use by GPA. -* -* A GpaSession is a container for a set of _samples_ of performance counter and/or SQ thread trace data. 
Its main -* purpose is to manage resources (IPerfExperiments and their backing system/GPU memory) in an efficient manner that is -* consistent with command buffer management in modern APIs. Consider GpaSession as a peer of DX12's command -* allocator or Vulkan's command pool objects. -* -* Basic flow of usage: -* - Newly create sessions are in the _reset_ state. -* - A session is moved from the _reset_ state to the _building_ state by calling Begin(). -* - Samples are added to a session by specifying desired data for each query and marking a begin and end location -* in ICmdBuffers as they are built. Internally required resources, like GPU memory where counters will be -* written, are allocated from internal pools managed by the session. -* - A session is moved from the _building_ state to the _complete_ state by calling End(). -* - The application will submit all command buffers referenced by the session. -* - The session is confirmed as _ready_, either using standard PAL fences to confirm all assocated submission have -* completed, or by polling IsReady() on the session. -* - Results for all samples in the session can be queried via GetResults(). -* - Reset() should be called once results have been gathered and before building a new session. Resources are -* retained by the session object for use in the newly built session. The session object must be destroyed in -* order to fully release all resource back to the system. -* -* Cumulative-type samples may not span multiple command buffers, as other apps could interfere with the counts and -* there the final data doesn't have time-based visibility to detect that happened. -* -* @warning GpaSession is not thread safe. Performing samples in command buffers being built simultaneously by multiple -* threads should use multiple GpaSession objects. 
-*********************************************************************************************************************** -*/ -class GpaSession -{ - typedef Pal::IPlatform GpaAllocator; -public: - typedef Util::Deque PerfExpMemDeque; - - /// Constructor. - GpaSession( - Pal::IPlatform* pPlatform, - Pal::IDevice* pDevice, - Pal::uint16 apiMajorVer, - Pal::uint16 apiMinorVer, - ApiType apiType, - Pal::uint16 rgpInstrumentationSpecVer = 0, - Pal::uint16 rgpInstrumentationApiVer = 0, - PerfExpMemDeque* pAvailablePerfExpMem = nullptr); - - ~GpaSession(); - - /// Copy constructor creates an empty copy of a session. - /// - /// Newly constructed session copies the GPU memory allocations and their layout from the source session, making - /// this a valid destination for a CopyResults command. This new object is effectively in the _complete_ state. - /// - /// The purpose of such objects is to handle sampling data from bundles or nested command buffers where the same - /// set of commands might be executed multiple times from a single root-level command buffer. The client should - /// note such cases, and create a copy of the bundle's session for each invocation, then call CopyResults() from - /// the original session into the copy after the invocation. - /// - /// @param [in] src Session to be copied. Must either be in the _complete_ or _ready_ state. - explicit GpaSession(const GpaSession& src); - - /// Initialize the newly constructed GPA session. - Pal::Result Init(); - - /// Registers a queue with the GpaSession that will be submitted to using TimedSubmit. This must be called on any - /// queues that are submitted to via the Timed* functions. For Timed* signal and wait queue semaphore events, a - /// valid queueContext will be required (queueContext not equal to 0). 
- Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, - Pal::uint64 queueId, - Pal::uint64 queueContext); - - /// Unregisters a queue prior to object destruction, and ensure that associated resources are destroyed. Work can - /// no longer be submitted on the queue after this has been called. - Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); - - Pal::Result TimedSubmit(Pal::IQueue* pQueue, - const Pal::MultiSubmitInfo& submitInfo, - const TimedSubmitInfo& timedSubmitInfo); - - /// Executes a timed queue semaphore signal through the given queue. The HW time is measured when the queue semaphore - /// is signaled. - Pal::Result TimedSignalQueueSemaphore(Pal::IQueue* pQueue, - Pal::IQueueSemaphore* pQueueSemaphore, - const TimedQueueSemaphoreInfo& timedSignalInfo, - Pal::uint64 value = 0); - - /// Executes a timed queue semaphore wait through the given queue. The HW time is measured when the queue semaphore - /// wait finishes. - Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, - Pal::IQueueSemaphore* pQueueSemaphore, - const TimedQueueSemaphoreInfo& timedWaitInfo, - Pal::uint64 value = 0); - - /// Injects a timed queue present event. - Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, - const TimedQueuePresentInfo& timedPresentInfo); - - /// Injects a timed wait queue semaphore event using information supplied by an external source. - /// A valid queueContext (queueContext not equal to 0) is needed for this function. - Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, - Pal::uint64 cpuSubmissionTimestamp, - Pal::uint64 cpuCompletionTimestamp, - const TimedQueueSemaphoreInfo& timedWaitInfo); - - /// Injects a timed signal queue semaphore event using information supplied by an external source. - /// A valid queueContext (queueContext not equal to 0) is needed for this function. 
- Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, - Pal::uint64 cpuSubmissionTimestamp, - Pal::uint64 cpuCompletionTimestamp, - const TimedQueueSemaphoreInfo& timedSignalInfo); - - /// Queries the engine and memory clocks from DeviceProperties - Pal::Result SampleGpuClocks(GpuClocksSample* pGpuClocksSample) const; - - /// Samples the timing clocks if queue timing is enabled and adds a clock sample entry to the current session. - Pal::Result SampleTimingClocks(); - - /// Moves the session from the _reset_ state to the _building_ state. - /// - /// Invalid to call Begin() on a session that isn't in the _reset_ state. - /// - /// @param [in] info Information about the gpa sessions desired behavior. - /// - /// @returns Success if the session was successfully moved to the _building_ state. Otherwise, possible errors - /// include: - /// + ErrorUnavailable if the sessions isn't current in the _reset_ state. - Pal::Result Begin(const GpaSessionBeginInfo& info); - - /// Moves the session from the _building_ state to the _complete_ state. - /// - /// Invalid to call End() on a session that isn't in the _building_ state. The implementation _may_ insert GPU - /// commands into the specified pCmdBuf - in the case of a session that spans multiple command buffers, the - /// command buffer specified to End() _must_ be the last command buffer of the session that is submitted. - /// - /// @param [in] pCmdBuf Last (normally _only_) command buffer of the session. Can be used by implementation - /// to insert GPU commands required after all samples are inserted (e.g., to confirm session - /// completion). - /// - /// @returns Success if the session was successfully moved to the _complete_ state. Otherwise, possible errors - /// include: - /// + ErrorUnavailable if the sessions isn't current in the _building_ state. 
- Pal::Result End(Pal::ICmdBuffer* pCmdBuf); - - /// Marks the beginning of a range of GPU operations to be measured and specifies what data should be recorded. - /// - /// It is possible the sample will not succeed due to internal memory allocation failure, etc. In those cases, - /// the session will be marked invalid and no sample commands will be inserted. Reporting of this error is - /// delayed until GetResults(). - /// - /// A note for GpuBlock::SqWgp - /// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity - /// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock - /// gating (CGCG) and power gating. PAL expose this feature to clients. - /// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same - /// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. - /// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features - /// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. - /// The counter data is still reported per WGP (not aggregated for the whole SE). - /// - /// Check the following two documents for details: - /// - /// @param [in] pCmdBuf Command buffer to issue the begin sample commands. All operations performed - /// between executing the BeginSample() and EndSample() GPU commands will contribute to - /// the sample results. - /// @param [in] sampleConfig Describes what data should be sampled. - /// @param [out] pSampleId An ID corresponding to this sample. This ID should be recorded and passed back to - /// EndSample() when the sampled command buffer range is complete. This ID should also - /// be passed to GetResults() when the session is in the _ready_ state in order to get - /// the results of this sample. 
- /// - /// @returns Success if the update was successful. Unsupported if the sample config type is not supported. - /// Otherwise, possible errors include: - /// + ErrorInvalidPointer if pCmdBuf or pSampleId is nullptr. - Pal::Result BeginSample( - Pal::ICmdBuffer* pCmdBuf, - const GpaSampleConfig& sampleConfig, - Pal::uint32* pSampleId); - - /// Updates the trace parameters for a specific sample. - /// - /// @param [in] pCmdBuf Command buffer to issue the update commands. - /// @param [in] sampleId Identifies the sample to be updated, if required by the mode. This should be a value - /// returned by BeginSample(), and must correspond to a thread trace sample. - /// @param [in] updateMode The way the sample parameters should be set. Some modes have additional restrictions. - /// @see UpdateSampleTraceMode - /// - /// @returns Success if the update was successful. Otherwise, possible errors - /// include: - /// + ErrorInvalidPointer if pCmdBuf is nullptr. - /// + ErrorInvalidObjectType if a sample is required and the sample associated with sampleId is not a - /// trace sample. - Pal::Result UpdateSampleTraceParams( - Pal::ICmdBuffer* pCmdBuf, - Pal::uint32 sampleId, - UpdateSampleTraceMode updateMode); - - /// Marks the end of a range of command buffer operations to be measured. - /// - /// @param [in] pCmdBuf Command buffer to issue the end sample commands. All operations performed between - /// executing the BeginSample() and EndSample() GPU commands will contribute to the sample - /// results. _Cumulative_ samples (i.e., global performance counter samples) must never span - /// multiple command buffers (EndSample() should be called in the same command buffer as - /// BeginSample()). - /// @param [in] sampleId Identifies the sample to be ended. This should be the value returned by BeginSample() - /// for the sample that is being ended. 
- /// - /// @note BeginSample() must be called before EndSample() _and_ the GPU commands inserted by BeginSample() must be - /// executed before the command inserted by EndSample(). Since a session is a single-threaded object, this - /// will normally happen naturally. - void EndSample( - Pal::ICmdBuffer* pCmdBuf, - Pal::uint32 sampleId); - - /// Copies the DF SPM trace buffer to the GpaSession result buffer - /// - /// @param [in] pCmdBuf Command buffer to issue the copy commands. - /// @param [in] sampleId Identifies the sample to be copied. - /// @note This must be called after a command buffer with the dfSpmTraceEnd CmdBufInfo flag - /// and with a separate command buffer. DF SPM traces are on a per command buffer granularity - /// because they are started and stopped by the KMD. - void CopyDfSpmTraceResults( - Pal::ICmdBuffer* pCmdBuf, - Pal::uint32 sampleId); - - /// Provides API specific information about an RGP trace. - /// - /// @param [in] traceApiInfo Const reference to the struct of API specific information. - /// @param [in] sampleId Sample ID (returned by BeginSample) for the RGP trace type sample info is being - /// provided for. - void SetSampleTraceApiInfo( - const SampleTraceApiInfo& traceApiInfo, - Pal::uint32 sampleId) const; - - /// Reports if GPU execution of this session has completed and results are _ready_ for querying from the CPU via - /// GetResults(). - /// - /// @returns true if all samples in the session have completed GPU execution. - bool IsReady() const; - - /// Reports results of a particular sample. Only valid for sessions in the _ready_ state. - /// - /// Results will be formatted depending on the sample type: - /// + Cumulative: Results will be an array of uint64 values in the order of perf counter IDs specified by - /// BeginSample(). - /// + SqThreadTrace: Results will be a binary blob in the RGP file format. - /// - /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). 
- /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space - /// available in pData, and *pSizeInBytes will be set to the amount of space written - /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space - /// required. - /// @param [out] pData Can be null to query how much size is required (should only be necessary when - /// getting RGP data). If non-null, the sample results will be written to this - /// location. - /// - /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required - /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: - /// + ErrorUnavailable if the session is not in the _ready_ state. - /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. - /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. - Pal::Result GetResults( - Pal::uint32 sampleId, - size_t* pSizeInBytes, - void* pData) const; - - /// Retrieves the SQTT results. Only valid for sessions in the _complete_ state. - /// - /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). - /// @param [in] traceIndex The index of the trace to get. - /// @param [out] pTraceInfoOut Optional pointer to a structure which will be written with information about the trace. - /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space - /// available in pData, and *pSizeInBytes will be set to the amount of space written - /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space - /// required. - /// @param [out] pData Can be null to query how much size is required. - /// If non-null, the sample results will be written to this location. 
- /// - /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required - /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: - /// + ErrorUnavailable if the session is not in the _ready_ state. - /// + NotFound if the given index is not valid. - /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. - /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. - // + ErrorInvalidPointer if pSizeInBytes is NULL. - Pal::Result GetSqttTraceData( - Pal::uint32 sampleId, - Pal::uint32 traceIndex, - SqttTraceInfo* pTraceInfo, - size_t* pSizeInBytes, - void* pData) const; - - /// Retrieves the SPM trace results of a particular sample. Only valid for 'Trace' type samples and sessions - /// in the _complete_ state. - /// - /// Results in the output buffer are a binary blob formatted according to the RGP specification. - /// The data layout of the populated output buffer is as follows: - /// - Timestamps array [size: "numTimestamps * sizeof(uint64)" bytes] - /// - SpmCounterInfo array [size: "numSpmCounters * sizeof(SpmCounterInfo)" bytes] - /// - SPM Counter Data matrix [size: "*pSizeInBytes - (timestamps array + SpmCounterInfo array size)" bytes] - /// - /// The SPM Counter Data matrix is laid out linearly in a row-major format. There are "numSpmCounters" rows and - /// "numTimestamps" columns. Each element in the matrix is either 16- or 32-bits, based on the "dataSize" field - /// of the corresponding "SpmCounterInfo" entry. - /// - /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). - /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with trace metadata. - /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space - /// available in pData. 
- /// If pData is null, *pSizeInBytes will be set to the amount of space - /// required. - /// @param [out] pData Can be null to query how much size is required. - /// If non-null, the sample results will be written to this location. - /// - /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required - /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: - /// + ErrorUnavailable if the session is not in the _ready_ state. - /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. - /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. - Pal::Result GetSpmTraceData( - Pal::uint32 sampleId, - SpmTraceInfo* pTraceInfo, - size_t* pSizeInBytes, - void* pData) const; - - /// Retrieves the Queue Timings data from the active GpaSession. - /// Only valid when the GpaSession had `enableQueueTiming` flag set. - /// - /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with metadata. - /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space - /// available in pData. - /// If pData is null, *pSizeInBytes will be set to the amount of space - /// required. - /// @param [out] pData Can be null to query how much size is required. - /// If non-null, the sample results will be written to this location. - /// - /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required - /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: - /// + ErrorUnavailable if the session was not configured with `enableQueueTiming`. 
- Pal::Result GetQueueTimingsData( - QueueTimingsTraceInfo* pTraceInfo, - size_t* pSizeInBytes, - void* pData) const; - - /// Moves the session to the _reset_ state, marking all sessions resources as unused and available for reuse when - /// the session is re-built. - /// - /// @warning This function cannot be called when the session is queued for execution on the GPU. The client must - /// confirm this is not the case using IsReady(), fences, etc. - /// - /// @returns Success if the session was successfully moved to the _reset_ state. Otherwise, possible errors - /// include: - /// + ErrorUnknown if an internal PAL error occurs. - Pal::Result Reset(); - - /// Uses the GPU to copy results from a nested command buffer's session into a root-level command buffer's per- - /// invocation session data. - /// - /// This command will implicitly wait for the source session (as specified in the copy constructor) to be complete - /// then use the GPU to update this session's data. This allows the client to get accurate sample data in the - /// case where a nested command buffer is launched multiple times from the same root-level command buffer. - /// - /// The session remains in the _complete_ state after calling this, and the client should submit the commands - /// and verify their completion to move to the _ready_ state. - /// - /// @param pCmdBuf Command buffer where the session copy should be performed. - void CopyResults(Pal::ICmdBuffer* pCmdBuf); - - /// Register pipeline with GpaSession for obtaining shader dumps and load events in the RGP file. - /// - /// @param [in] pPipeline The PAL pipeline to be tracked. - /// @param [in] clientInfo API-dependent information for this pipeline to also be recorded. - /// - /// @returns Success if the pipeline has been registered with GpaSession successfully. - /// + AlreadyExists if a duplicate pipeline is provided. 
- Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); - - /// Unregister pipeline with GpaSession for obtaining unload events in the RGP file. - /// This should be called immediately before destroying the PAL pipeline object. - /// - /// @param [in] pPipeline The PAL pipeline to be tracked. - /// - /// @returns Success if the pipeline has been unregistered with GpaSession successfully. - Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); - - /// Register library with GpaSession for obtaining shader dumps and load events in the RGP file. - /// - /// @param [in] pLibrary The PAL library to be tracked. - /// @param [in] clientInfo API-dependent information for this library to also be recorded. - /// - /// @returns Success if the library has been registered with GpaSession successfully. - /// + AlreadyExists if a duplicate library is provided. - Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); - - /// Unregister library with GpaSession for obtaining unload events in the RGP file. - /// This should be called immediately before destroying the PAL library object. - /// - /// @param [in] pLibrary The PAL library to be tracked. - /// - /// @returns Success if the library has been unregistered with GpaSession successfully. - Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); - - /// Register ELF binary with GpaSession for obtaining kernel dumps and load events in the RGP file. - /// - /// @param [in] elfBinaryInfo Contains information about the Elf binary to be recorded. - /// - /// @returns Success if the Elf binary has been registered with GpaSession successfully. - Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); - - /// Unregister Elf binary with GpaSession for obtaining unload events in the RGP file. - /// This should be called immediately before destroying the Elf binary. 
- /// - /// @param [in] elfBinaryInfo Contains the elf binary info to be removed from tracking. - /// - /// @returns Success if the library has been unregistered with GpaSession successfully. - Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); - - /// Given a Pal device, validate a list of perfcounters. - /// - /// @param [in] pDevice a given device - /// @param [in] pCounters a list of perf counters. - /// @param [in] numCounters perf counter counts. - /// - /// @returns Success if counters are valid. - Pal::Result ValidatePerfCounters(Pal::IDevice* pDevice, - const PerfCounterId* pCounters, - const Pal::uint32 numCounters); - -private: - // Tracking structure for a single IGpuMemory allocation owned by a GpaSession::GpaSession. In particular, it - // tracks the associated CPU pointer since these allocations remain mapped for CPU access for their lifetime. - struct GpuMemoryInfo - { - Pal::IGpuMemory* pGpuMemory; - void* pCpuAddr; - }; - - // Event type for code object load events - enum class CodeObjectLoadEventType - { - LoadToGpuMemory = 0, - UnloadFromGpuMemory - }; - - // Represents all information to be contained in one SqttCodeObjectLoaderEventRecord - struct CodeObjectLoadEventRecord - { - CodeObjectLoadEventType eventType; - Pal::uint64 baseAddress; - Pal::ShaderHash codeObjectHash; - Pal::uint64 timestamp; - }; - - // Represents all information to be contained in one SqttPsoCorrelationRecord - struct PsoCorrelationRecord - { - Pal::uint64 apiPsoHash; - Pal::PipelineHash internalPipelineHash; - }; - - // Registers a single (non-archive) pipeline with the GpaSession. Returns AlreadyExists on duplicate PAL pipeline. - Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); - - // Unregisters a single (non-archive) pipeline from the GpaSession. 
- Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); - - Pal::IDevice*const m_pDevice; // Device associated with this GpaSession. - Pal::DeviceProperties m_deviceProps; - Pal::SetClockModeOutput m_peakClockFrequency; // Output of query for stable peak, values in Mhz - Pal::PerfExperimentProperties m_perfExperimentProps; - Pal::uint32 m_timestampAlignment; // Pre-calculated timestamp data alignment. - ApiType m_apiType; // API type, e.g. Vulkan, used in RGP dumps. - Pal::uint16 m_apiMajorVer; // API major version, used in RGP dumps. - Pal::uint16 m_apiMinorVer; // API minor version, used in RGP dumps. - Pal::uint16 m_instrumentationSpecVersion; // Spec version of RGP instrumetation. - Pal::uint16 m_instrumentationApiVersion; // Api version of RGP instrumetation. - - Pal::IGpuEvent* m_pGpuEvent; - GpaSessionState m_sessionState; - - const GpaSession* const m_pSrcSession; // source session for session created via copy c'tor - - // Tracks the current GPU memory object and offset being sub-allocated for AcquireGpuMem(). - GpuMemoryInfo m_curGartGpuMem; - Pal::gpusize m_curGartGpuMemOffset; - GpuMemoryInfo m_curLocalGpuMem; - Pal::gpusize m_curLocalGpuMemOffset; - GpuMemoryInfo m_curInvisGpuMem; - Pal::gpusize m_curInvisGpuMemOffset; - - // Locks for the local-invisible, gart and local memory subdivision (and their pools) - Util::Mutex m_gartGpuMemLock; - Util::Mutex m_localGpuMemLock; - Util::Mutex m_invisGpuMemLock; - - // Counts number of samples that are active in this GpaSession. - Pal::uint32 m_sampleCount; - - Pal::IPlatform*const m_pPlatform; // Platform associated with this GpaSesion. - - // GartHeap / LocalHeap / InvisHeap GPU chunk pools. 
- Util::Deque m_availableGartGpuMem; - Util::Deque m_busyGartGpuMem; - Util::Deque m_availableLocalGpuMem; - Util::Deque m_busyLocalGpuMem; - Util::Deque m_availableInvisGpuMem; - Util::Deque m_busyInvisGpuMem; - - struct SampleItem; - class PerfSample; - class CounterSample; - class TraceSample; - class TimingSample; - class QuerySample; - - Util::Vector m_sampleItemArray; - PerfExpMemDeque* m_pAvailablePerfExpMem; - - // Unique pipelines registered with this GpaSession. - Util::HashSet m_registeredPipelines; - // Unique API PSOs registered with this GpaSession. - Util::HashSet m_registeredApiHashes; - - // List of cached pipeline code object records that will be copied to the final database at the end of a trace - Util::Deque m_codeObjectRecordsCache; - // List of pipeline code object records that were registered during a trace - Util::Deque m_curCodeObjectRecords; - - // List of cached code object load event records that will be copied to the final database at the end of a trace - Util::Deque m_codeObjectLoadEventRecordsCache; - // List of code object load event records that were registered during a trace - Util::Deque m_curCodeObjectLoadEventRecords; - - // List of cached PSO correlation records that will be copied to the final database at the end of a trace - Util::Deque m_psoCorrelationRecordsCache; - // List of PSO correlation records that were registered during a trace - Util::Deque m_curPsoCorrelationRecords; - - Util::RWLock m_registerPipelineLock; - - // Event type for timed queue events - enum class TimedQueueEventType : Pal::uint32 - { - Submit, - Signal, - Wait, - Present, - ExternalSignal, - ExternalWait - }; - - // Struct that contains information about a specific timed queue event. 
- struct TimedQueueEventItem - { - TimedQueueEventType eventType; // Type of event - Pal::uint64 cpuTimestamp; // Time when the event was processed on the cpu - Pal::uint64 apiId; // The api specific id for the queue event - Pal::uint32 sqttCmdBufId; // The sqtt command buffer id value associated with a submit event - Pal::uint32 submitSubIndex; // The sub index of an event within a submission event. - Pal::uint32 queueIndex; // The index of the associated queue in the m_timedQueuesArray - Pal::uint64 frameIndex; // The index of the current frame being rendered - union - { - struct - { - GpuMemoryInfo memInfo[2]; // The gpu memory for the timestamps associated with the event - Pal::gpusize offsets[2]; // Memory offsets for the associated timestamp gpu memory - } gpuTimestamps; - - Pal::uint64 cpuCompletionTimestamp; // The time when the event completed on the cpu - }; - }; - - // Struct for keeping track of timed operation on a specific queue - struct TimedQueueState - { - Pal::IQueue* pQueue; // Pal Queue - Pal::uint64 queueId; // Api specific queue id - Pal::uint64 queueContext; // Api specific queue context - Pal::QueueType queueType; // Queue type - Pal::EngineType engineType; // Engine type - bool valid; // Used to track if the queue is valid - Util::Deque* pAvailableCmdBuffers; // List of available cmdbuffers - Util::Deque* pBusyCmdBuffers; // List of busy cmdbuffers - Pal::IFence* pFence; // Used to track queue - // operations - }; - - // Flags for the current session. 
- GpaSessionFlags m_flags; - - // Array containing all of the queues registered for timing operations - Util::Vector m_timedQueuesArray; - Util::RWLock m_timedQueuesArrayLock; - - // List of timed queue events for the current session - Util::Vector m_queueEvents; - Util::Mutex m_queueEventsLock; - - // List of timestamp calibration samples - Util::Vector m_timestampCalibrations; - - // The most recent gpu clocks sample - GpuClocksSample m_lastGpuClocksSample; - - // Internal command allocator used for timing command buffers - Pal::ICmdAllocator* m_pCmdAllocator; - - // Finds the TimedQueueState associated with pQueue. - Pal::Result FindTimedQueue(Pal::IQueue* pQueue, - TimedQueueState** ppQueueState, - Pal::uint32* pQueueIndex); - - // Finds the TimedQueueState associated with queueContext. - Pal::Result FindTimedQueueByContext(Pal::uint64 queueContext, - TimedQueueState** ppQueueState, - Pal::uint32* pQueueIndex); - - /// Injects an external timed queue semaphore operation event - Pal::Result ExternalTimedQueueSemaphoreOperation(Pal::uint64 queueContext, - Pal::uint64 cpuSubmissionTimestamp, - Pal::uint64 cpuCompletionTimestamp, - const TimedQueueSemaphoreInfo& timedSemaphoreInfo, - bool isSignalOperation); - - /// Converts a CPU timestamp to a GPU timestamp using a CalibratedTimestamps struct - Pal::uint64 ConvertCpuTimestampToGpuTimestamp(Pal::uint64 cpuTimestamp, - const Pal::CalibratedTimestamps& calibration) const; - - /// Extracts a GPU timestamp from a queue event - Pal::uint64 ExtractGpuTimestampFromQueueEvent(const TimedQueueEventItem& queueEvent) const; - - // Creates a new command buffer for use on pQueue - Pal::Result CreateCmdBufferForQueue(Pal::IQueue* pQueue, - Pal::ICmdBuffer** ppCmdBuffer); - - // Acquires a command buffer from the TimedQueueState's command buffer pool - Pal::Result AcquireTimedQueueCmdBuffer(TimedQueueState* pQueueState, - Pal::ICmdBuffer** ppCmdBuffer); - - // Recycles busy command buffers in pQueueState - Pal::Result 
RecycleTimedQueueCmdBuffers(TimedQueueState* pQueueState); - - // Preallocates a fixed number of command buffers for pQueueState and adds them to the command buffer pool - Pal::Result PreallocateTimedQueueCmdBuffers(TimedQueueState* pQueueState, - Pal::uint32 numCmdBuffers); - - // Resets all per session state in pQueueState - Pal::Result ResetTimedQueueState(TimedQueueState* pQueueState); - - // Destroys the memory and resources for pQueueState - void DestroyTimedQueueState(TimedQueueState* pQueueState); - - // Helper function to import one sample item from a source session to copy session. - Pal::Result ImportSampleItem(const SampleItem* pSrcSampleItem); - - // Acquires a range of queue-owned GPU memory for use by the next command buffer submission. - Pal::Result AcquireGpuMem( - Pal::gpusize size, - Pal::gpusize alignment, - Pal::GpuHeap heapType, - Pal::GpuMemMallPolicy mallPolicy, - GpuMemoryInfo* pGpuMem, - Pal::gpusize* pOffset); - - // Acquires a GpaSession-owned performance experiment based on the device's active perf counter requests. - Pal::Result AcquirePerfExperiment( - GpaSession::SampleItem* pSampleItem, - const GpaSampleConfig& sampleConfig, - GpuMemoryInfo* pGpuMem, - Pal::gpusize* pOffset, - GpuMemoryInfo* pSecondaryGpuMem, - Pal::gpusize* pSecondaryOffset, - Pal::gpusize* pHeapSize, - Pal::IPerfExperiment** ppExperiment); - - // Acquires a session-owned pipeline stats query. - Pal::Result AcquirePipeStatsQuery( - GpuMemoryInfo* pGpuMem, - Pal::gpusize* pOffset, - Pal::gpusize* pHeapSize, - Pal::IQueryPool** ppQuery); - - // Dump SQ thread trace data in rgp format - Pal::Result DumpRgpData(const GpaSampleConfig* pTraceConfig, - TraceSample* pTraceSample, - void* pRgpOutput, - size_t* pTraceSize) const; - - // Dumps the spm trace data in the buffer provided. - Pal::Result AppendSpmTraceData(TraceSample* pTraceSample, - size_t bufferSize, - void* pData, - Pal::gpusize* pSizeInBytes) const; - - // Dumps the df spm trace data in the buffer provided. 
- Pal::Result AppendDfSpmTraceData(TraceSample* pTraceSample, - size_t bufferSize, - void* pData, - Pal::gpusize* pSizeInBytes) const; - - Pal::Result AddCodeObjectLoadEvent(const Pal::IPipeline* pPipeline, CodeObjectLoadEventType eventType); - Pal::Result AddCodeObjectLoadEvent(const Pal::IShaderLibrary* pLibrary, CodeObjectLoadEventType eventType); - Pal::Result AddCodeObjectLoadEvent(const ElfBinaryInfo& elfBinaryInfo, CodeObjectLoadEventType eventType); - - // Recycle used Gart rafts and put back to available pool - void RecycleGartGpuMem(); - - // Recycle used Local rafts and put back to available pool - void RecycleLocalGpuMem(); - - // Recycle used Invisible rafts and put back to available pool - void RecycleInvisGpuMem(); - - // Destroy and free one sample item and its sub-items. - void FreeSampleItem(GpaSession::SampleItem* pSampleItem); - - // Destroy and free the m_sampleItemArray and associated memory allocation - void FreeSampleItemArray(); - - // Destroy the sub-items in m_sampleItemArray but keep associated memory allocations. - void RecycleSampleItemArray(); - - // Helper function to destroy the GpuMemoryInfo object - void DestroyGpuMemoryInfo(GpuMemoryInfo* pGpuMemoryInfo); - - PAL_DISALLOW_DEFAULT_CTOR(GpaSession); - GpaSession& operator =(const GpaSession&); -}; -} // GpuUtil +/* + *********************************************************************************************************************** + * + * Copyright (c) 2016-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpaSession.h + * @brief PAL GPU utility GpaSession class. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palDeque.h" +#include "palDevice.h" +#include "palGpuUtil.h" +#include "palHashSet.h" +#include "palMutex.h" +#include "palPipeline.h" +#include "palVector.h" +#include "palPlatform.h" +#include "palSysMemory.h" +#include "palGpuMemory.h" +#include "palMemTrackerImpl.h" + +// Forward declarations. 
+namespace Pal +{ + class ICmdAllocator; + class ICmdBuffer; + class IDevice; + class IGpuEvent; + class IGpuMemory; + class IPerfExperiment; + class IQueue; + class IQueueSemaphore; + struct GlobalCounterLayout; + struct MultiSubmitInfo; + struct ThreadTraceLayout; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 + enum PipelineStageFlag : uint32; +#else + enum HwPipePoint : uint32; +#endif +} +struct SqttFileChunkCpuInfo; +struct SqttFileChunkAsicInfo; +struct SqttCodeObjectDatabaseRecord; + +struct GpuMemoryInfo; + +namespace GpuUtil +{ +// Sample id initialization value. +constexpr Pal::uint32 InvalidSampleId = 0xFFFFFFFF; + +/// The available states of GpaSession +enum class GpaSessionState : Pal::uint32 +{ + Reset = 0, + Building = 1, + Complete = 2, + Ready = 3, +}; + +/// The various ways you can change trace options after it has started. +enum class UpdateSampleTraceMode : Pal::uint32 +{ + MinimalToFullMask = 0, ///< Used to convert a minimal trace (needed for context in compute presents) to a full + /// trace according to the options in the active trace. Requires enableSampleUpdates. + /// Additionally, this must be called between BeginSample() and EndSample() and + /// queue timing must also be enabled on the GpaSession when this function is called. + StartInstructionTrace = 1, ///< Used to enable instruction-level trace globally at any time. Can be run without an + /// active sample. Useful for targeting specific parts of a frame. + StopInstructionTrace = 2, ///< Used to disable instruction-level trace globally at any time. Can be run without an + /// active sample. +}; + +/// Specifies basic type of sample to perfom - either a normal set of "global" perf counters, or a trace consisting +/// of SQ thread trace and/or streaming performance counters. +enum class GpaSampleType : Pal::uint32 +{ + None = 0x0, ///< No profile will be done. 
+ Cumulative = 0x1, ///< One 64-bit result will be returned per performance counter representing the cumulative delta + /// for that counter over the sample period. Cumulative samples must begin and end in the same + /// command buffer. + Trace = 0x2, ///< A GPU memory buffer will be filled with hw-specific SQ thread trace and/or streaming + /// performance counter data. Trace samples may span multiple command buffers. + Timing = 0x3, ///< Two 64-bit results will be recorded in beginTs and endTs to gather timestamp data. + Query = 0x4, ///< A set of 11 pipeline stats will be collected. + Count +}; + +/// Specifies a specific performance counter to be sampled with GpaSession::BeginSample() and GpaSession::EndSample(). +/// +/// This identifies a specific counter in a particular HW block instance, e.g., TCC instance 3 counter #19. It is up +/// to the client to know the meaning of a particular counter, e.g., TCC #19 is TCC_PERF_SEL_MISS on Fiji. Eventually, +/// PAL may want to support certain counters without the client needing HW-specific knowledge (i.e., select an enum +/// called L2MissRate from PAL rather than needing to know that counter is TCC #19 on Fiji), but GPA currently works in +/// this low-level mode with other drivers, and wants to keep the flexibility. +struct PerfCounterId +{ + Pal::GpuBlock block; ///< Which GPU block to reference (e.g., CB, DB, TCC). + Pal::uint32 instance; ///< Which instance of the specified GPU block to sample. E.g., Tahiti has 12 TCC blocks + /// (this number is returned per-block in the @ref Pal::GpuBlockPerfProperties structure). + /// There is no shortcut to get results for all instances of block in the whole chip, the + /// client must explicitly sample each instance and sum the results. + Pal::uint32 eventId; ///< Counter ID to sample. Note that the meaning of a particular eventId for a block can + /// change between chips. 
+ + union + { + struct + { + Pal::uint32 spm32Bit : 1; ///< For SPM counters, collect in 32bit instead of 16bit + Pal::uint32 reserved : 31; ///< Reserved for future use + }; + + Pal::uint32 u32All; ///< Union value for copying + + } flags; + + // Some blocks have additional per-counter controls. They must be properly programmed when adding counters for + // the relevant blocks. It's recommended to zero them out when not in use. + union + { + struct + { + Pal::uint32 eventQualifier; ///< The DF counters have an event-specific qualifier bitfield. + } df; + + struct + { + Pal::uint16 eventThreshold; ///< Threshold value for those UMC counters having event-specific threshold. + Pal::uint8 eventThresholdEn; ///< Threshold enable (0 for disabled,1 for threshold) + Pal::uint8 rdWrMask; ///< Read/Write mask select (1 for Read, 2 for Write). + } umc; + + Pal::uint32 rs64Cntl; ///< CP blocks CPG and CPC have events that can be further filtered for processor events + + Pal::uint32 u32All; ///< Union value for copying, must be increased in size if any element of the union exceeds + } subConfig; +}; + +/// Defines a set of flags for a particular gpa session. +union GpaSessionFlags +{ + struct + { + /// Enables timing of queue operations via Timed* functions. + Pal::uint32 enableQueueTiming : 1; + + /// Enables sample updates via the UpdateSampleTraceParams function. + Pal::uint32 enableSampleUpdates : 1; + + /// Indicates that the client will use the internal Timed*QueueSemaphore() functions for queue semaphore timing + /// data. When not set it indicates the client will provide ETW data via the ExteralTimed* functions. + Pal::uint32 useInternalQueueSemaphoreTiming : 1; + + /// Reserved for future use. + Pal::uint32 reserved : 29; + }; + + /// Flags packed as 32-bit uint. + Pal::uint32 u32All; +}; + +/// Specifies options that direct the gpa session behavior. +struct GpaSessionBeginInfo +{ + /// Gpa Session flags used to control behavior. 
+ GpaSessionFlags flags; +}; + +/// Input structure for CmdBeginGpuProfilerSample. +/// +/// Defines a set of global performance counters and/or SQ thread trace data to be sampled. +struct GpaSampleConfig +{ + /// Selects what type of data should be gathered for this sample. This can either be _cumulative_ to gather + /// simple deltas for the specified set of perf counters over the sample period, or it can be _trace_ to generate + /// a blob of RGP-formatted data containing SQ thread trace and/or streaming performance monitor data. + GpaSampleType type; + + union + { + struct + { + Pal::uint32 sampleInternalOperations : 1; ///< Include BLTs and internal driver operations in the + /// results. + Pal::uint32 cacheFlushOnCounterCollection : 1; ///< Insert cache flush and invalidate events before and + /// after every sample. + Pal::uint32 sqShaderMask : 1; ///< If sqShaderMask is valid. + Pal::uint32 sqWgpShaderMask : 1; ///< If sqWgpShaderMask is valid. + Pal::uint32 reserved : 28; ///< Reserved for future use. + }; + Pal::uint32 u32All; ///< Bit flags packed as uint32. + } flags; ///< Bit flags controlling sample operation for all sample + /// types. + + Pal::PerfExperimentShaderFlags sqShaderMask; ///< Which shader stages are sampled by GpuBlock::Sq counters. + ///< Only used if flags.sqShaderMask is set to 1. + Pal::PerfExperimentShaderFlags sqWgpShaderMask; ///< Which shader stages are sampled by GpuBlock::SqWgp counters. + ///< Only used if flags.sqWgpShaderMask is set to 1. + + struct + { + /// Number of entries in pIds. + Pal::uint32 numCounters; + + /// List of performance counters to be gathered for a sample. If the sample type is _cumulative_ this will + /// result in "global" perf counters being sampled at the beginning of the sample period; if the sample type + /// is _trace_ this will result in SPM data being added to the sample's resulting RGP blob. + /// + /// Note that it is up to the client to respect the hardware counter limit per block. 
This can be + /// determined by the maxGlobalOnlyCounters, maxGlobalSharedCounters, maxSpmCounters, and instanceGroupSize + /// fields of @ref Pal::GpuBlockPerfProperties. + const PerfCounterId* pIds; + + /// Period for SPM sample collection in cycles. Only relevant for _trace_ samples. + Pal::uint32 spmTraceSampleInterval; + + /// Maximum amount of GPU memory in bytes this sample can allocate for SPM data. Only relevant for _trace_ + /// samples. + Pal::gpusize gpuMemoryLimit; + } perfCounters; ///< Performance counter selection (valid for both _cumulative_ and _trace_ samples). + + struct + { + /// Number of entries in pIds. + Pal::uint32 numCounters; + + /// Period for DF SPM sample collection in nano seconds. + Pal::uint32 sampleInterval; + + /// Maximum amount of GPU memory in bytes this sample can allocate for DF SPM data. + Pal::gpusize gpuMemoryLimit; + + /// List of performance counters to be gathered for a df sample. This has to be separate from the list + /// list of normal counters because it is a completely different mechanism for gathering data. + /// + /// Note that it is up to the client to respect the hardware counter limit per block. This can be + /// determined by the maxSpmCounters fields of + /// @ref Pal::GpuBlockPerfProperties. + const PerfCounterId* pIds; + } dfSpmPerfCounters; + + struct + { + union + { + struct + { + Pal::uint32 enable : 1; ///< Include SQTT data in the trace. + Pal::uint32 supressInstructionTokens : 1; ///< Prevents capturing instruction-level SQTT tokens, + /// significantly reducing the amount of sample data. + Pal::uint32 stallMode : 2; ///< Describes behavior when buffer full + Pal::uint32 stallAllSimds : 1; ///< Stall all SIMDs for thread trace stall. 
+ Pal::uint32 excludeNonDetailShaderData : 1; ///< Only emit shader tokens from the SIMD that have been + /// selected for detail instruction tracing +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 899 + Pal::uint32 enableExecPopTokens : 1; ///< Output exec tokens +#else + Pal::uint32 placeholder2 : 1; +#endif + Pal::uint32 reserved : 25; ///< Reserved for future use. + }; + Pal::uint32 u32All; ///< Bit flags packed as uint32. + } flags; ///< Bit flags controlling SQTT samples. + Pal::uint32 seMask; ///< Mask that determines which specific SEs to run Thread trace on. + /// If 0, all SEs are enabled + Pal::uint32 seDetailedMask; ///< Mask that selects which specific SEs to reveal Thread trace detailed info. + /// If 0, all SEs will reveal detailed thread trace + Pal::gpusize gpuMemoryLimit; ///< Maximum amount of GPU memory in bytes this sample can allocate for the SQTT + /// buffer. If 0, allocate maximum size to prevent dropping tokens toward the + /// end of the sample. + Pal::uint32 tokenMask; ///< Mask indicating which SQTT tokens are requested for capture. If a tokenMask is + /// not provided, PAL will default to collecting all tokens or tokens except + /// instruction tokens if the supressInstructionTokens flag is set. Instruction + /// tokens will always be filtered out if supressInstructionTokens = true. + } sqtt; ///< SQ thread trace configuration (only valid for _trace_ samples). + + struct + { +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 900 + Pal::PipelineStageFlag preSample; ///< The pipeline stage in the GPU pipeline where the begin timestamp should + /// take place. + Pal::PipelineStageFlag postSample; ///< The pipeline stage in the GPU pipeline where the end timestamp should + /// take place. +#else + Pal::HwPipePoint preSample; ///< The point in the GPU pipeline where the begin timestamp should take place. + Pal::HwPipePoint postSample; ///< The point in the GPU pipeline where the end timestamp should take place. 
+#endif + } timing; ///< Timestamp configuration. (only valid for timing samples) +}; + +/// Extra metadata about a command buffer submission +struct TimedSubmitInfo +{ + const Pal::uint64* pApiCmdBufIds; ///< Array of api specific command buffer ids + const Pal::uint32* pSqttCmdBufIds; ///< Array of sqtt command buffer ids + Pal::uint64 frameIndex; ///< The global frame index for the application. +}; + +/// Extra metadata about a queue semaphore operation +struct TimedQueueSemaphoreInfo +{ + Pal::uint64 semaphoreID; ///< Api specific id associated with a semaphore. +}; + +/// Extra metadata about a queue present operation +struct TimedQueuePresentInfo +{ + Pal::uint64 presentID; ///< Api specific id associated with a present. +}; + +/// Struct for storing information about gpu clock speeds. +struct GpuClocksSample +{ + Pal::uint32 gpuEngineClockSpeed; // Current speed of the gpu engine clock in MHz + Pal::uint32 gpuMemoryClockSpeed; // Current speed of the gpu memory clock in MHz +}; + +/// Struct for storing CPU-side allocations of Pal::IPerfExperiment's. +struct PerfExperimentMemory +{ + void* pMemory; // Memory allocated for an IPerfExperiment. + size_t memorySize; // Size of the memory allocated in pMemory. +}; + +/// Struct for supplying API-dependent information about pipelines. +struct RegisterPipelineInfo +{ + Pal::uint64 apiPsoHash; ///< Client-provided PSO hash. +}; + +/// Struct for supplying API-dependent information about libraries. +struct RegisterLibraryInfo +{ + Pal::uint64 apiHash; ///< Client-provided api hash. +}; + +/// Struct for supplying Elf binary. +struct ElfBinaryInfo +{ + const void* pBinary; ///< FAT Elf binary. + Pal::uint32 binarySize; ///< FAT Elf binary size. + Pal::IGpuMemory* pGpuMemory; ///< GPU Memory where the compiled ISA resides. + Pal::gpusize offset; ///< Offset inside GPU memory object + Pal::uint64 originalHash; ///< Original source/binary hash. + Pal::uint64 compiledHash; ///< Compiled binary hash. 
+}; + +/// Enumeration of RGP trace profiling modes +enum class TraceProfilingMode : Pal::uint32 +{ + Present = 0, ///< Present triggered capture + UserMarkers = 1, ///< Capture triggered by user marker + FrameNumber = 2, ///< Capture based on frame number + Tags = 3, ///< Tag based capture +}; + +/// Constant defines the maximum length for a user marker string. +static constexpr Pal::uint32 UserMarkerStringLength = 256; + +/// Defines data specific to each profiling mode used to capture an RGP trace. +union TraceProfilingModeData +{ + struct + { + char start[UserMarkerStringLength]; ///< User marker string used to start trace capture. + char end[UserMarkerStringLength]; ///< User marker string used to end trace capture. + } userMarkerData; + + struct + { + Pal::uint32 start; ///< Frame number used to start the trace. + Pal::uint32 end; ///< Frame number used to end the trace. + } frameNumberData; + + struct + { + Pal::uint64 start; ///< Tag used to start the trace. + Pal::uint64 end; ///< Tag used to end the trace. + } tagData; +}; + +/// Enumerates the different instruction level data modes for an RGP trace +enum class InstructionTraceMode : Pal::uint32 +{ + Disabled = 0, ///< Instruction level data was disabled for trace. + FullFrame = 1, ///< Instruction level data was enabled for the full trace. + ApiPso = 2, ///< Instruction level data was enabled only for a single API PSO. +}; + +/// Defines the data used to control enabling of instruction level data. +struct InstructionTraceModeData +{ + Pal::uint64 apiPsoHash; ///< Hash of the API PSO targeted for instruction level data. +}; + +/// Struct for supplying API specific information about an RGP trace +struct SampleTraceApiInfo +{ + TraceProfilingMode profilingMode; ///< Profiling mode used to trigger the trace. + TraceProfilingModeData profilingModeData; ///< Profiling mode specific data. + InstructionTraceMode instructionTraceMode; ///< Instruction trace mode for the trace. 
+ InstructionTraceModeData instructionTraceModeData; ///< Instruction trace mode data. +}; + +/// An enumeration of the API types. +enum class ApiType : Pal::uint32 +{ + DirectX12 = 0, ///< Represents DirectX12 API type. + Vulkan = 1, ///< Represents Vulkan API type. + Generic = 2, ///< Represents Generic API type. + OpenCl = 3, ///< Represents OpenCL API type. + Hip = 5, ///< Represents HIP API type. +}; + +/// Struct used for storing SQTT-specific trace information +struct SqttTraceInfo +{ + Pal::uint32 shaderEngine; ///< Shader engine index + Pal::uint32 computeUnit; ///< Compute unit index + Pal::uint32 sqttVersion; ///< SQTT version + Pal::uint64 bufferSize; ///< SQTT trace buffer size +}; + +/// Struct used for storing SPM-specific trace information +struct SpmTraceInfo +{ + Pal::uint32 numSpmCounters; ///< The number of SPM counters sampled in the trace + Pal::uint32 numTimestamps; ///< The number of timestamps that samples were taken + Pal::uint32 sampleFrequency; ///< The SPM counter sampling frequency +}; + +/// Struct used for storing QueueTimings-specific trace information +struct QueueTimingsTraceInfo +{ + Pal::uint32 numQueueInfoRecords; + Pal::uint32 numQueueEventRecords; + Pal::uint32 queueInfoTableSize; + Pal::uint32 queueEventTableSize; +}; + +/** +*********************************************************************************************************************** +* @class GpaSession +* @brief Helper class providing common driver functionality required by all PAL clients that support the GPUPerfAPI +* (GPA). Abstracts IPerfExperiment creation, memory management, completion confirmation, and results reporting +* at a level convenient for GPA. Each PAL client driver will need to publish an API extension exposing this +* support for use by GPA. +* +* A GpaSession is a container for a set of _samples_ of performance counter and/or SQ thread trace data. 
Its main +* purpose is to manage resources (IPerfExperiments and their backing system/GPU memory) in an efficient manner that is +* consistent with command buffer management in modern APIs. Consider GpaSession as a peer of DX12's command +* allocator or Vulkan's command pool objects. +* +* Basic flow of usage: +* - Newly created sessions are in the _reset_ state. +* - A session is moved from the _reset_ state to the _building_ state by calling Begin(). +* - Samples are added to a session by specifying desired data for each query and marking a begin and end location +* in ICmdBuffers as they are built. Internally required resources, like GPU memory where counters will be +* written, are allocated from internal pools managed by the session. +* - A session is moved from the _building_ state to the _complete_ state by calling End(). +* - The application will submit all command buffers referenced by the session. +* - The session is confirmed as _ready_, either using standard PAL fences to confirm all associated submissions have +* completed, or by polling IsReady() on the session. +* - Results for all samples in the session can be queried via GetResults(). +* - Reset() should be called once results have been gathered and before building a new session. Resources are +* retained by the session object for use in the newly built session. The session object must be destroyed in +* order to fully release all resources back to the system. +* +* Cumulative-type samples may not span multiple command buffers, as other apps could interfere with the counts and +* the final data doesn't have the time-based visibility needed to detect that this happened. +* +* @warning GpaSession is not thread safe. Performing samples in command buffers being built simultaneously by multiple +* threads should use multiple GpaSession objects.
+*********************************************************************************************************************** +*/ +class GpaSession +{ + typedef Pal::IPlatform GpaAllocator; +public: + typedef Util::Deque PerfExpMemDeque; + + /// Constructor. + GpaSession( + Pal::IPlatform* pPlatform, + Pal::IDevice* pDevice, + Pal::uint16 apiMajorVer, + Pal::uint16 apiMinorVer, + ApiType apiType, + Pal::uint16 rgpInstrumentationSpecVer = 0, + Pal::uint16 rgpInstrumentationApiVer = 0, + PerfExpMemDeque* pAvailablePerfExpMem = nullptr); + + ~GpaSession(); + + /// Copy constructor creates an empty copy of a session. + /// + /// Newly constructed session copies the GPU memory allocations and their layout from the source session, making + /// this a valid destination for a CopyResults command. This new object is effectively in the _complete_ state. + /// + /// The purpose of such objects is to handle sampling data from bundles or nested command buffers where the same + /// set of commands might be executed multiple times from a single root-level command buffer. The client should + /// note such cases, and create a copy of the bundle's session for each invocation, then call CopyResults() from + /// the original session into the copy after the invocation. + /// + /// @param [in] src Session to be copied. Must either be in the _complete_ or _ready_ state. + explicit GpaSession(const GpaSession& src); + + /// Initialize the newly constructed GPA session. + Pal::Result Init(); + + /// Registers a queue with the GpaSession that will be submitted to using TimedSubmit. This must be called on any + /// queues that are submitted to via the Timed* functions. For Timed* signal and wait queue semaphore events, a + /// valid queueContext will be required (queueContext not equal to 0). 
+ Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, + Pal::uint64 queueId, + Pal::uint64 queueContext); + + /// Unregisters a queue prior to object destruction, and ensure that associated resources are destroyed. Work can + /// no longer be submitted on the queue after this has been called. + Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); + + Pal::Result TimedSubmit(Pal::IQueue* pQueue, + const Pal::MultiSubmitInfo& submitInfo, + const TimedSubmitInfo& timedSubmitInfo); + + /// Executes a timed queue semaphore signal through the given queue. The HW time is measured when the queue semaphore + /// is signaled. + Pal::Result TimedSignalQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedSignalInfo, + Pal::uint64 value = 0); + + /// Executes a timed queue semaphore wait through the given queue. The HW time is measured when the queue semaphore + /// wait finishes. + Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedWaitInfo, + Pal::uint64 value = 0); + + /// Injects a timed queue present event. + Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, + const TimedQueuePresentInfo& timedPresentInfo); + + /// Injects a timed wait queue semaphore event using information supplied by an external source. + /// A valid queueContext (queueContext not equal to 0) is needed for this function. + Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedWaitInfo); + + /// Injects a timed signal queue semaphore event using information supplied by an external source. + /// A valid queueContext (queueContext not equal to 0) is needed for this function. 
+ Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSignalInfo); + + /// Queries the engine and memory clocks from DeviceProperties + Pal::Result SampleGpuClocks(GpuClocksSample* pGpuClocksSample) const; + + /// Samples the timing clocks if queue timing is enabled and adds a clock sample entry to the current session. + Pal::Result SampleTimingClocks(); + + /// Moves the session from the _reset_ state to the _building_ state. + /// + /// Invalid to call Begin() on a session that isn't in the _reset_ state. + /// + /// @param [in] info Information about the GPA session's desired behavior. + /// + /// @returns Success if the session was successfully moved to the _building_ state. Otherwise, possible errors + /// include: + /// + ErrorUnavailable if the session isn't currently in the _reset_ state. + Pal::Result Begin(const GpaSessionBeginInfo& info); + + /// Moves the session from the _building_ state to the _complete_ state. + /// + /// Invalid to call End() on a session that isn't in the _building_ state. The implementation _may_ insert GPU + /// commands into the specified pCmdBuf - in the case of a session that spans multiple command buffers, the + /// command buffer specified to End() _must_ be the last command buffer of the session that is submitted. + /// + /// @param [in] pCmdBuf Last (normally _only_) command buffer of the session. Can be used by implementation + /// to insert GPU commands required after all samples are inserted (e.g., to confirm session + /// completion). + /// + /// @returns Success if the session was successfully moved to the _complete_ state. Otherwise, possible errors + /// include: + /// + ErrorUnavailable if the session isn't currently in the _building_ state.
+ Pal::Result End(Pal::ICmdBuffer* pCmdBuf); + + /// Marks the beginning of a range of GPU operations to be measured and specifies what data should be recorded. + /// + /// It is possible the sample will not succeed due to internal memory allocation failure, etc. In those cases, + /// the session will be marked invalid and no sample commands will be inserted. Reporting of this error is + /// delayed until GetResults(). + /// + /// A note for GpuBlock::SqWgp + /// Client of palPerfExperiment may configure counters of GpuBlock::SqWgp based on a per-wgp granularity + /// only if the following are disabled: GFXOFF, virtualization/SRIOV, VDDGFX (power down features), clock + /// gating (CGCG) and power gating. PAL expose this feature to clients. + /// If any of the conditions above cannot be met, it's the client's job to set all WGPs in the same SE to the same + /// perf counter programming. In this case, GpuBlock::SqWgp's perf counter works on a per-SE granularity. + /// Strictly speaking, it's not true that the counters work on a per-SE granularity when those power features + /// are enabled. It's all still per-WGP in HW, we just can't support different counter configs within the same SE. + /// The counter data is still reported per WGP (not aggregated for the whole SE). + /// + /// Check the following two documents for details: + /// + /// @param [in] pCmdBuf Command buffer to issue the begin sample commands. All operations performed + /// between executing the BeginSample() and EndSample() GPU commands will contribute to + /// the sample results. + /// @param [in] sampleConfig Describes what data should be sampled. + /// @param [out] pSampleId An ID corresponding to this sample. This ID should be recorded and passed back to + /// EndSample() when the sampled command buffer range is complete. This ID should also + /// be passed to GetResults() when the session is in the _ready_ state in order to get + /// the results of this sample. 
+ /// + /// @returns Success if the update was successful. Unsupported if the sample config type is not supported. + /// Otherwise, possible errors include: + /// + ErrorInvalidPointer if pCmdBuf or pSampleId is nullptr. + Pal::Result BeginSample( + Pal::ICmdBuffer* pCmdBuf, + const GpaSampleConfig& sampleConfig, + Pal::uint32* pSampleId); + + /// Updates the trace parameters for a specific sample. + /// + /// @param [in] pCmdBuf Command buffer to issue the update commands. + /// @param [in] sampleId Identifies the sample to be updated, if required by the mode. This should be a value + /// returned by BeginSample(), and must correspond to a thread trace sample. + /// @param [in] updateMode The way the sample parameters should be set. Some modes have additional restrictions. + /// @see UpdateSampleTraceMode + /// + /// @returns Success if the update was successful. Otherwise, possible errors + /// include: + /// + ErrorInvalidPointer if pCmdBuf is nullptr. + /// + ErrorInvalidObjectType if a sample is required and the sample associated with sampleId is not a + /// trace sample. + Pal::Result UpdateSampleTraceParams( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId, + UpdateSampleTraceMode updateMode); + + /// Marks the end of a range of command buffer operations to be measured. + /// + /// @param [in] pCmdBuf Command buffer to issue the end sample commands. All operations performed between + /// executing the BeginSample() and EndSample() GPU commands will contribute to the sample + /// results. _Cumulative_ samples (i.e., global performance counter samples) must never span + /// multiple command buffers (EndSample() should be called in the same command buffer as + /// BeginSample()). + /// @param [in] sampleId Identifies the sample to be ended. This should be the value returned by BeginSample() + /// for the sample that is being ended. 
+ /// + /// @note BeginSample() must be called before EndSample() _and_ the GPU commands inserted by BeginSample() must be + /// executed before the command inserted by EndSample(). Since a session is a single-threaded object, this + /// will normally happen naturally. + void EndSample( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId); + + /// Copies the DF SPM trace buffer to the GpaSession result buffer + /// + /// @param [in] pCmdBuf Command buffer to issue the copy commands. + /// @param [in] sampleId Identifies the sample to be copied. + /// @note This must be called after a command buffer with the dfSpmTraceEnd CmdBufInfo flag + /// and with a separate command buffer. DF SPM traces are on a per command buffer granularity + /// because they are started and stopped by the KMD. + void CopyDfSpmTraceResults( + Pal::ICmdBuffer* pCmdBuf, + Pal::uint32 sampleId); + + /// Provides API specific information about an RGP trace. + /// + /// @param [in] traceApiInfo Const reference to the struct of API specific information. + /// @param [in] sampleId Sample ID (returned by BeginSample) for the RGP trace type sample info is being + /// provided for. + void SetSampleTraceApiInfo( + const SampleTraceApiInfo& traceApiInfo, + Pal::uint32 sampleId) const; + + /// Reports if GPU execution of this session has completed and results are _ready_ for querying from the CPU via + /// GetResults(). + /// + /// @returns true if all samples in the session have completed GPU execution. + bool IsReady() const; + + /// Reports results of a particular sample. Only valid for sessions in the _ready_ state. + /// + /// Results will be formatted depending on the sample type: + /// + Cumulative: Results will be an array of uint64 values in the order of perf counter IDs specified by + /// BeginSample(). + /// + SqThreadTrace: Results will be a binary blob in the RGP file format. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). 
+ /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData, and *pSizeInBytes will be set to the amount of space written + /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required (should only be necessary when + /// getting RGP data). If non-null, the sample results will be written to this + /// location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + Pal::Result GetResults( + Pal::uint32 sampleId, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the SQTT results. Only valid for sessions in the _complete_ state. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). + /// @param [in] traceIndex The index of the trace to get. + /// @param [out] pTraceInfo Optional pointer to a structure which will be written with information about the trace. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData, and *pSizeInBytes will be set to the amount of space written + /// to pData. If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location.
+ /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + NotFound if the given index is not valid. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + /// + ErrorInvalidPointer if pSizeInBytes is null. + Pal::Result GetSqttTraceData( + Pal::uint32 sampleId, + Pal::uint32 traceIndex, + SqttTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the SPM trace results of a particular sample. Only valid for 'Trace' type samples and sessions + /// in the _complete_ state. + /// + /// Results in the output buffer are a binary blob formatted according to the RGP specification. + /// The data layout of the populated output buffer is as follows: + /// - Timestamps array [size: "numTimestamps * sizeof(uint64)" bytes] + /// - SpmCounterInfo array [size: "numSpmCounters * sizeof(SpmCounterInfo)" bytes] + /// - SPM Counter Data matrix [size: "*pSizeInBytes - (timestamps array + SpmCounterInfo array size)" bytes] + /// + /// The SPM Counter Data matrix is laid out linearly in a row-major format. There are "numSpmCounters" rows and + /// "numTimestamps" columns. Each element in the matrix is either 16- or 32-bits, based on the "dataSize" field + /// of the corresponding "SpmCounterInfo" entry. + /// + /// @param [in] sampleId Sample to be reported. Corresponds to value returned by BeginSample(). + /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with trace metadata. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData.
+ /// If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session is not in the _ready_ state. + /// + ErrorOutOfGpuMemory if the session wasn't properly built due to running out of GPU memory resources. + /// + ErrorInvalidMemorySize if *pSizeInBytes isn't big enough to hold the results. + Pal::Result GetSpmTraceData( + Pal::uint32 sampleId, + SpmTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Retrieves the Queue Timings data from the active GpaSession. + /// Only valid when the GpaSession had `enableQueueTiming` flag set. + /// + /// @param [out] pTraceInfo Optional. If non-null, this structure is populated with metadata. + /// @param [in,out] pSizeInBytes If pData is non-null, the input value of *pSizeInBytes is the amount of space + /// available in pData. + /// If pData is null, *pSizeInBytes will be set to the amount of space + /// required. + /// @param [out] pData Can be null to query how much size is required. + /// If non-null, the sample results will be written to this location. + /// + /// @returns Success if the sample results are successfully written to pData (or, if pData is null, the required + /// size is successfully written to pSizeInBytes). Otherwise, possible errors include: + /// + ErrorUnavailable if the session was not configured with `enableQueueTiming`. 
+ Pal::Result GetQueueTimingsData( + QueueTimingsTraceInfo* pTraceInfo, + size_t* pSizeInBytes, + void* pData) const; + + /// Moves the session to the _reset_ state, marking all sessions resources as unused and available for reuse when + /// the session is re-built. + /// + /// @warning This function cannot be called when the session is queued for execution on the GPU. The client must + /// confirm this is not the case using IsReady(), fences, etc. + /// + /// @returns Success if the session was successfully moved to the _reset_ state. Otherwise, possible errors + /// include: + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result Reset(); + + /// Uses the GPU to copy results from a nested command buffer's session into a root-level command buffer's per- + /// invocation session data. + /// + /// This command will implicitly wait for the source session (as specified in the copy constructor) to be complete + /// then use the GPU to update this session's data. This allows the client to get accurate sample data in the + /// case where a nested command buffer is launched multiple times from the same root-level command buffer. + /// + /// The session remains in the _complete_ state after calling this, and the client should submit the commands + /// and verify their completion to move to the _ready_ state. + /// + /// @param pCmdBuf Command buffer where the session copy should be performed. + void CopyResults(Pal::ICmdBuffer* pCmdBuf); + + /// Register pipeline with GpaSession for obtaining shader dumps and load events in the RGP file. + /// + /// @param [in] pPipeline The PAL pipeline to be tracked. + /// @param [in] clientInfo API-dependent information for this pipeline to also be recorded. + /// + /// @returns Success if the pipeline has been registered with GpaSession successfully. + /// + AlreadyExists if a duplicate pipeline is provided. 
+ Pal::Result RegisterPipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + + /// Unregister pipeline with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the PAL pipeline object. + /// + /// @param [in] pPipeline The PAL pipeline to be tracked. + /// + /// @returns Success if the pipeline has been unregistered with GpaSession successfully. + Pal::Result UnregisterPipeline(const Pal::IPipeline* pPipeline); + + /// Register library with GpaSession for obtaining shader dumps and load events in the RGP file. + /// + /// @param [in] pLibrary The PAL library to be tracked. + /// @param [in] clientInfo API-dependent information for this library to also be recorded. + /// + /// @returns Success if the library has been registered with GpaSession successfully. + /// + AlreadyExists if a duplicate library is provided. + Pal::Result RegisterLibrary(const Pal::IShaderLibrary* pLibrary, const RegisterLibraryInfo& clientInfo); + + /// Unregister library with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the PAL library object. + /// + /// @param [in] pLibrary The PAL library to be tracked. + /// + /// @returns Success if the library has been unregistered with GpaSession successfully. + Pal::Result UnregisterLibrary(const Pal::IShaderLibrary* pLibrary); + + /// Register ELF binary with GpaSession for obtaining kernel dumps and load events in the RGP file. + /// + /// @param [in] elfBinaryInfo Contains information about the Elf binary to be recorded. + /// + /// @returns Success if the Elf binary has been registered with GpaSession successfully. + Pal::Result RegisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + /// Unregister Elf binary with GpaSession for obtaining unload events in the RGP file. + /// This should be called immediately before destroying the Elf binary. 
+ /// + /// @param [in] elfBinaryInfo Contains the elf binary info to be removed from tracking. + /// + /// @returns Success if the Elf binary has been unregistered with GpaSession successfully. + Pal::Result UnregisterElfBinary(const ElfBinaryInfo& elfBinaryInfo); + + /// Given a Pal device, validate a list of perf counters. + /// + /// @param [in] pDevice The given device. + /// @param [in] pCounters A list of perf counters. + /// @param [in] numCounters Number of perf counters in pCounters. + /// + /// @returns Success if counters are valid. + Pal::Result ValidatePerfCounters(Pal::IDevice* pDevice, + const PerfCounterId* pCounters, + const Pal::uint32 numCounters); + +private: + // Tracking structure for a single IGpuMemory allocation owned by a GpaSession::GpaSession. In particular, it + // tracks the associated CPU pointer since these allocations remain mapped for CPU access for their lifetime. + struct GpuMemoryInfo + { + Pal::IGpuMemory* pGpuMemory; + void* pCpuAddr; + }; + + // Event type for code object load events + enum class CodeObjectLoadEventType + { + LoadToGpuMemory = 0, + UnloadFromGpuMemory + }; + + // Represents all information to be contained in one SqttCodeObjectLoaderEventRecord + struct CodeObjectLoadEventRecord + { + CodeObjectLoadEventType eventType; + Pal::uint64 baseAddress; + Pal::ShaderHash codeObjectHash; + Pal::uint64 timestamp; + }; + + // Represents all information to be contained in one SqttPsoCorrelationRecord + struct PsoCorrelationRecord + { + Pal::uint64 apiPsoHash; + Pal::PipelineHash internalPipelineHash; + }; + + // Registers a single (non-archive) pipeline with the GpaSession. Returns AlreadyExists on duplicate PAL pipeline. + Pal::Result RegisterSinglePipeline(const Pal::IPipeline* pPipeline, const RegisterPipelineInfo& clientInfo); + + // Unregisters a single (non-archive) pipeline from the GpaSession.
+ Pal::Result UnregisterSinglePipeline(const Pal::IPipeline* pPipeline); + + Pal::IDevice*const m_pDevice; // Device associated with this GpaSession. + Pal::DeviceProperties m_deviceProps; + Pal::SetClockModeOutput m_peakClockFrequency; // Output of query for stable peak, values in Mhz + Pal::PerfExperimentProperties m_perfExperimentProps; + Pal::uint32 m_timestampAlignment; // Pre-calculated timestamp data alignment. + ApiType m_apiType; // API type, e.g. Vulkan, used in RGP dumps. + Pal::uint16 m_apiMajorVer; // API major version, used in RGP dumps. + Pal::uint16 m_apiMinorVer; // API minor version, used in RGP dumps. + Pal::uint16 m_instrumentationSpecVersion; // Spec version of RGP instrumetation. + Pal::uint16 m_instrumentationApiVersion; // Api version of RGP instrumetation. + + Pal::IGpuEvent* m_pGpuEvent; + GpaSessionState m_sessionState; + + const GpaSession* const m_pSrcSession; // source session for session created via copy c'tor + + // Tracks the current GPU memory object and offset being sub-allocated for AcquireGpuMem(). + GpuMemoryInfo m_curGartGpuMem; + Pal::gpusize m_curGartGpuMemOffset; + GpuMemoryInfo m_curLocalGpuMem; + Pal::gpusize m_curLocalGpuMemOffset; + GpuMemoryInfo m_curInvisGpuMem; + Pal::gpusize m_curInvisGpuMemOffset; + + // Locks for the local-invisible, gart and local memory subdivision (and their pools) + Util::Mutex m_gartGpuMemLock; + Util::Mutex m_localGpuMemLock; + Util::Mutex m_invisGpuMemLock; + + // Counts number of samples that are active in this GpaSession. + Pal::uint32 m_sampleCount; + + Pal::IPlatform*const m_pPlatform; // Platform associated with this GpaSesion. + + // GartHeap / LocalHeap / InvisHeap GPU chunk pools. 
+ Util::Deque m_availableGartGpuMem; + Util::Deque m_busyGartGpuMem; + Util::Deque m_availableLocalGpuMem; + Util::Deque m_busyLocalGpuMem; + Util::Deque m_availableInvisGpuMem; + Util::Deque m_busyInvisGpuMem; + + struct SampleItem; + class PerfSample; + class CounterSample; + class TraceSample; + class TimingSample; + class QuerySample; + + Util::Vector m_sampleItemArray; + PerfExpMemDeque* m_pAvailablePerfExpMem; + + // Unique pipelines registered with this GpaSession. + Util::HashSet m_registeredPipelines; + // Unique API PSOs registered with this GpaSession. + Util::HashSet m_registeredApiHashes; + + // List of cached pipeline code object records that will be copied to the final database at the end of a trace + Util::Deque m_codeObjectRecordsCache; + // List of pipeline code object records that were registered during a trace + Util::Deque m_curCodeObjectRecords; + + // List of cached code object load event records that will be copied to the final database at the end of a trace + Util::Deque m_codeObjectLoadEventRecordsCache; + // List of code object load event records that were registered during a trace + Util::Deque m_curCodeObjectLoadEventRecords; + + // List of cached PSO correlation records that will be copied to the final database at the end of a trace + Util::Deque m_psoCorrelationRecordsCache; + // List of PSO correlation records that were registered during a trace + Util::Deque m_curPsoCorrelationRecords; + + Util::RWLock m_registerPipelineLock; + + // Event type for timed queue events + enum class TimedQueueEventType : Pal::uint32 + { + Submit, + Signal, + Wait, + Present, + ExternalSignal, + ExternalWait + }; + + // Struct that contains information about a specific timed queue event. 
+ struct TimedQueueEventItem + { + TimedQueueEventType eventType; // Type of event + Pal::uint64 cpuTimestamp; // Time when the event was processed on the cpu + Pal::uint64 apiId; // The api specific id for the queue event + Pal::uint32 sqttCmdBufId; // The sqtt command buffer id value associated with a submit event + Pal::uint32 submitSubIndex; // The sub index of an event within a submission event. + Pal::uint32 queueIndex; // The index of the associated queue in the m_timedQueuesArray + Pal::uint64 frameIndex; // The index of the current frame being rendered + union + { + struct + { + GpuMemoryInfo memInfo[2]; // The gpu memory for the timestamps associated with the event + Pal::gpusize offsets[2]; // Memory offsets for the associated timestamp gpu memory + } gpuTimestamps; + + Pal::uint64 cpuCompletionTimestamp; // The time when the event completed on the cpu + }; + }; + + // Struct for keeping track of timed operation on a specific queue + struct TimedQueueState + { + Pal::IQueue* pQueue; // Pal Queue + Pal::uint64 queueId; // Api specific queue id + Pal::uint64 queueContext; // Api specific queue context + Pal::QueueType queueType; // Queue type + Pal::EngineType engineType; // Engine type + bool valid; // Used to track if the queue is valid + Util::Deque* pAvailableCmdBuffers; // List of available cmdbuffers + Util::Deque* pBusyCmdBuffers; // List of busy cmdbuffers + Pal::IFence* pFence; // Used to track queue + // operations + }; + + // Flags for the current session. 
+ GpaSessionFlags m_flags; + + // Array containing all of the queues registered for timing operations + Util::Vector m_timedQueuesArray; + Util::RWLock m_timedQueuesArrayLock; + + // List of timed queue events for the current session + Util::Vector m_queueEvents; + Util::Mutex m_queueEventsLock; + + // List of timestamp calibration samples + Util::Vector m_timestampCalibrations; + + // The most recent gpu clocks sample + GpuClocksSample m_lastGpuClocksSample; + + // Internal command allocator used for timing command buffers + Pal::ICmdAllocator* m_pCmdAllocator; + + // Finds the TimedQueueState associated with pQueue. + Pal::Result FindTimedQueue(Pal::IQueue* pQueue, + TimedQueueState** ppQueueState, + Pal::uint32* pQueueIndex); + + // Finds the TimedQueueState associated with queueContext. + Pal::Result FindTimedQueueByContext(Pal::uint64 queueContext, + TimedQueueState** ppQueueState, + Pal::uint32* pQueueIndex); + + /// Injects an external timed queue semaphore operation event + Pal::Result ExternalTimedQueueSemaphoreOperation(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSemaphoreInfo, + bool isSignalOperation); + + /// Converts a CPU timestamp to a GPU timestamp using a CalibratedTimestamps struct + Pal::uint64 ConvertCpuTimestampToGpuTimestamp(Pal::uint64 cpuTimestamp, + const Pal::CalibratedTimestamps& calibration) const; + + /// Extracts a GPU timestamp from a queue event + Pal::uint64 ExtractGpuTimestampFromQueueEvent(const TimedQueueEventItem& queueEvent) const; + + // Creates a new command buffer for use on pQueue + Pal::Result CreateCmdBufferForQueue(Pal::IQueue* pQueue, + Pal::ICmdBuffer** ppCmdBuffer); + + // Acquires a command buffer from the TimedQueueState's command buffer pool + Pal::Result AcquireTimedQueueCmdBuffer(TimedQueueState* pQueueState, + Pal::ICmdBuffer** ppCmdBuffer); + + // Recycles busy command buffers in pQueueState + Pal::Result 
RecycleTimedQueueCmdBuffers(TimedQueueState* pQueueState); + + // Preallocates a fixed number of command buffers for pQueueState and adds them to the command buffer pool + Pal::Result PreallocateTimedQueueCmdBuffers(TimedQueueState* pQueueState, + Pal::uint32 numCmdBuffers); + + // Resets all per session state in pQueueState + Pal::Result ResetTimedQueueState(TimedQueueState* pQueueState); + + // Destroys the memory and resources for pQueueState + void DestroyTimedQueueState(TimedQueueState* pQueueState); + + // Helper function to import one sample item from a source session to copy session. + Pal::Result ImportSampleItem(const SampleItem* pSrcSampleItem); + + // Acquires a range of queue-owned GPU memory for use by the next command buffer submission. + Pal::Result AcquireGpuMem( + Pal::gpusize size, + Pal::gpusize alignment, + Pal::GpuHeap heapType, + Pal::GpuMemMallPolicy mallPolicy, + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset); + + // Acquires a GpaSession-owned performance experiment based on the device's active perf counter requests. + Pal::Result AcquirePerfExperiment( + GpaSession::SampleItem* pSampleItem, + const GpaSampleConfig& sampleConfig, + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset, + GpuMemoryInfo* pSecondaryGpuMem, + Pal::gpusize* pSecondaryOffset, + Pal::gpusize* pHeapSize, + Pal::IPerfExperiment** ppExperiment); + + // Acquires a session-owned pipeline stats query. + Pal::Result AcquirePipeStatsQuery( + GpuMemoryInfo* pGpuMem, + Pal::gpusize* pOffset, + Pal::gpusize* pHeapSize, + Pal::IQueryPool** ppQuery); + + // Dump SQ thread trace data in rgp format + Pal::Result DumpRgpData(const GpaSampleConfig* pTraceConfig, + TraceSample* pTraceSample, + void* pRgpOutput, + size_t* pTraceSize) const; + + // Dumps the spm trace data in the buffer provided. + Pal::Result AppendSpmTraceData(TraceSample* pTraceSample, + size_t bufferSize, + void* pData, + Pal::gpusize* pSizeInBytes) const; + + // Dumps the df spm trace data in the buffer provided. 
+ Pal::Result AppendDfSpmTraceData(TraceSample* pTraceSample, + size_t bufferSize, + void* pData, + Pal::gpusize* pSizeInBytes) const; + + Pal::Result AddCodeObjectLoadEvent(const Pal::IPipeline* pPipeline, CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent(const Pal::IShaderLibrary* pLibrary, CodeObjectLoadEventType eventType); + Pal::Result AddCodeObjectLoadEvent(const ElfBinaryInfo& elfBinaryInfo, CodeObjectLoadEventType eventType); + + // Recycle used Gart rafts and put back to available pool + void RecycleGartGpuMem(); + + // Recycle used Local rafts and put back to available pool + void RecycleLocalGpuMem(); + + // Recycle used Invisible rafts and put back to available pool + void RecycleInvisGpuMem(); + + // Destroy and free one sample item and its sub-items. + void FreeSampleItem(GpaSession::SampleItem* pSampleItem); + + // Destroy and free the m_sampleItemArray and associated memory allocation + void FreeSampleItemArray(); + + // Destroy the sub-items in m_sampleItemArray but keep associated memory allocations. + void RecycleSampleItemArray(); + + // Helper function to destroy the GpuMemoryInfo object + void DestroyGpuMemoryInfo(GpuMemoryInfo* pGpuMemoryInfo); + + PAL_DISALLOW_DEFAULT_CTOR(GpaSession); + GpaSession& operator =(const GpaSession&); +}; +} // GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h index 87f515b4dd..a201fcc8c2 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palGpuUtil.h @@ -1,141 +1,141 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palGpuUtil.h - * @brief Common include for the PAL GPU utility collection. Defines common types, macros, enums, etc. - *********************************************************************************************************************** - */ - -#pragma once - -#include "pal.h" - -// Forward declarations. -namespace Pal -{ - struct DeviceProperties; - class IImage; - class IGpuMemory; - struct ImageCopyRegion; - struct TypedBufferCopyRegion; - struct MemoryImageCopyRegion; -} - -/// Library-wide namespace encapsulating all PAL GPU utility entities. -namespace GpuUtil -{ - -/// Validate image copy region. 
-/// -/// @param [in] properties The device properties. -/// @param [in] engineType Engine to validate. -/// @param [in] src Src image. -/// @param [in] dst Des image. -/// @param [in] region Copy region. -/// -/// @returns true if the image copy is supported by the specific engine, otherwise false. -extern bool ValidateImageCopyRegion( - const Pal::DeviceProperties& properties, - Pal::EngineType engineType, - const Pal::IImage& src, - const Pal::IImage& dst, - const Pal::ImageCopyRegion& region); - -/// Validate typed buffer copy region. -/// -/// @param [in] properties The device properties. -/// @param [in] engineType Engine to validate. -/// @param [in] region Copy region. -/// -/// @returns true if the typed buffer copy is supported by the specific engine, otherwise false. -extern bool ValidateTypedBufferCopyRegion( - const Pal::DeviceProperties& properties, - Pal::EngineType engineType, - const Pal::TypedBufferCopyRegion& region); - -/// Validate image-memory copy region. -/// -/// @param [in] properties The device properties. -/// @param [in] engineType Engine to validate. -/// @param [in] image The IImage object. -/// @param [in] region Copy region. -/// -/// @returns true if the image-memory copy is supported by the specific engine, otherwise false. 
-extern bool ValidateMemoryImageRegion( - const Pal::DeviceProperties& properties, - Pal::EngineType engineType, - const Pal::IImage& image, - const Pal::IGpuMemory& memory, - const Pal::MemoryImageCopyRegion& region); - -/// Generate a 64-bit uniqueId for a GPU memory allocation -/// -/// @param [in] isInterprocess Indicates this uniqueId is for an externally shareable GPU memory allocation -/// -/// @returns 64-bit uniqueId -extern Pal::uint64 GenerateGpuMemoryUniqueId( - bool isInterprocess); - -} // GpuUtil - -/** - *********************************************************************************************************************** - * @page GpuUtilOverview GPU Utility Collection - * - * In addition to the generic, OS-abstracted software utilities, PAL provides GPU-specific utilities in the @ref GpuUtil - * namespace. The PAL GPU Utility Collection relies on both PAL core and PAL Utility. They are also available for use by - * its clients. - * - * All available PAL GPU utilities are defined in the @ref GpuUtil namespace, and are briefly summarized below. See the - * Reference topics for more detailed information on specific classes, enums, etc. - * - * ### TextWriter - * The TextWriter GPU utility class provides a method for clients to write text directly to an image. This can be used - * for debugging purposes. PAL's internal DbgOverlay uses the TextWriter class to write information about the current - * FPS and total allocated GPU video memory usage. - * - * The TextWriter class is broken up into palTextWriter.h and palTextWriterImpl.h. The intention is that palTextWriter.h - * will be included from other header files that need a full TextWriter definition, while palTextWriterImpl.h will be - * included by .cpp files that actually interact with the TextWriter. This should keep build times down versus putting - * all implementations directly in palTextWriter.h. 
- * - * Also included in the TextWriter is the TextWriterFont namespace, which defines the shader IL for drawing the text via - * a compute shader. It also defines the Font data, which is a packed binary that represents which pixels of a 10x16 - * rectangle to render. The font is monospaced. - * - * ### Helper Functions - * ValidateImageCopyRegion - Validate the image copy region, returns true if the image copy is supported by the specific - * engine, otherwise false. - * - * ValidateTypedBufferCopyRegion - Validate the typed buffer copy region, returns true if the typed buffer copy is - * supported by the specific engine, otherwise false. - * - * ValidateMemoryImageRegion - Validate the image-memory copy region, returns true if the image-memory copy is supported - * by the specific engine, otherwise false. - * - * Next: @ref Overview - *********************************************************************************************************************** - */ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palGpuUtil.h + * @brief Common include for the PAL GPU utility collection. Defines common types, macros, enums, etc. + *********************************************************************************************************************** + */ + +#pragma once + +#include "pal.h" + +// Forward declarations. +namespace Pal +{ + struct DeviceProperties; + class IImage; + class IGpuMemory; + struct ImageCopyRegion; + struct TypedBufferCopyRegion; + struct MemoryImageCopyRegion; +} + +/// Library-wide namespace encapsulating all PAL GPU utility entities. +namespace GpuUtil +{ + +/// Validate image copy region. +/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] src Src image. +/// @param [in] dst Des image. +/// @param [in] region Copy region. +/// +/// @returns true if the image copy is supported by the specific engine, otherwise false. +extern bool ValidateImageCopyRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::IImage& src, + const Pal::IImage& dst, + const Pal::ImageCopyRegion& region); + +/// Validate typed buffer copy region. +/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] region Copy region. +/// +/// @returns true if the typed buffer copy is supported by the specific engine, otherwise false. 
+extern bool ValidateTypedBufferCopyRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::TypedBufferCopyRegion& region); + +/// Validate image-memory copy region. +/// +/// @param [in] properties The device properties. +/// @param [in] engineType Engine to validate. +/// @param [in] image The IImage object. +/// @param [in] region Copy region. +/// +/// @returns true if the image-memory copy is supported by the specific engine, otherwise false. +extern bool ValidateMemoryImageRegion( + const Pal::DeviceProperties& properties, + Pal::EngineType engineType, + const Pal::IImage& image, + const Pal::IGpuMemory& memory, + const Pal::MemoryImageCopyRegion& region); + +/// Generate a 64-bit uniqueId for a GPU memory allocation +/// +/// @param [in] isInterprocess Indicates this uniqueId is for an externally shareable GPU memory allocation +/// +/// @returns 64-bit uniqueId +extern Pal::uint64 GenerateGpuMemoryUniqueId( + bool isInterprocess); + +} // GpuUtil + +/** + *********************************************************************************************************************** + * @page GpuUtilOverview GPU Utility Collection + * + * In addition to the generic, OS-abstracted software utilities, PAL provides GPU-specific utilities in the @ref GpuUtil + * namespace. The PAL GPU Utility Collection relies on both PAL core and PAL Utility. They are also available for use by + * its clients. + * + * All available PAL GPU utilities are defined in the @ref GpuUtil namespace, and are briefly summarized below. See the + * Reference topics for more detailed information on specific classes, enums, etc. + * + * ### TextWriter + * The TextWriter GPU utility class provides a method for clients to write text directly to an image. This can be used + * for debugging purposes. PAL's internal DbgOverlay uses the TextWriter class to write information about the current + * FPS and total allocated GPU video memory usage. 
+ * + * The TextWriter class is broken up into palTextWriter.h and palTextWriterImpl.h. The intention is that palTextWriter.h + * will be included from other header files that need a full TextWriter definition, while palTextWriterImpl.h will be + * included by .cpp files that actually interact with the TextWriter. This should keep build times down versus putting + * all implementations directly in palTextWriter.h. + * + * Also included in the TextWriter is the TextWriterFont namespace, which defines the shader IL for drawing the text via + * a compute shader. It also defines the Font data, which is a packed binary that represents which pixels of a 10x16 + * rectangle to render. The font is monospaced. + * + * ### Helper Functions + * ValidateImageCopyRegion - Validate the image copy region, returns true if the image copy is supported by the specific + * engine, otherwise false. + * + * ValidateTypedBufferCopyRegion - Validate the typed buffer copy region, returns true if the typed buffer copy is + * supported by the specific engine, otherwise false. + * + * ValidateMemoryImageRegion - Validate the image-memory copy region, returns true if the image-memory copy is supported + * by the specific engine, otherwise false. + * + * Next: @ref Overview + *********************************************************************************************************************** + */ diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h index 222a187725..a9b95610db 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palQueueTimingsTraceSource.h @@ -1,238 +1,236 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "palGpuUtil.h" -#include "palTraceSession.h" -#include "palGpaSession.h" - -#include - -struct SqttQueueEventRecord; -struct SqttQueueInfoRecord; - -namespace Pal -{ -class Platform; -} - -namespace GpuUtil -{ -namespace TraceChunk -{ - -/// "QueueInfo" RDF chunk identifier & version -constexpr char QueueInfoChunkId[TextIdentifierSize] = "QueueInfo"; -constexpr Pal::uint32 QueueInfoChunkVersion = 1; - -/// Enum describing logical queue types -enum class QueueType : Pal::uint8 -{ - Unknown = 0, - Universal = 1, - Compute = 2, - Dma = 3, - Encode = 4, - Decode = 5, - Security = 6, - VideoProcessor = 7 -}; - -/// Enum describing hardware engine types -enum class HwEngineType : Pal::uint8 -{ - Unknown = 0, - Universal = 1, - Compute = 2, - ExclusiveCompute = 3, - Dma = 4, - Decode = 5, - Encode = 6, - HighPriorityUniversal = 7, - HighPriorityGraphics = 8, - Security = 9, - Vpe = 10 -}; - -/// Structure describing a queue's properties -struct QueueInfo -{ - Pal::uint32 pciId; ///< The ID of the GPU queried - Pal::uint64 queueId; ///< API-specific queue ID - Pal::uint64 queueContext; ///< OS-level queue context value from Windows KMD to correlate with ETW data. - /// Only applicable to D3D on Windows; 0 otherwise. 
- QueueType queueType; ///< The logical queue type - HwEngineType engineType; ///< The hardware engine that the queue is mapped to -}; - -// ------------------------------------------------------------------------------------------- // - -/// "QueueEvent" RDF chunk identifier & version -constexpr char QueueEventChunkId[TextIdentifierSize] = "QueueEvent"; -constexpr Pal::uint32 QueueEventChunkVersion = 1; - -/// The type of queue-level timings event -enum class QueueEventType : Pal::uint32 -{ - CmdBufSubmit = 0, - SignalSemaphore = 1, - WaitSemaphore = 2, - Present = 3 -}; - -/// Structure describing a queue-level timings event -struct QueueEvent -{ - Pal::uint32 pciId; ///< The ID of the GPU queried - Pal::uint64 queueId; ///< The API-specific queue ID which triggered the event - QueueEventType eventType; ///< The type of the queue-timing event - Pal::uint32 sqttCmdBufId; ///< [`CmdBufSubmit` only; 0 otherwise] - /// SQTT command buffer ID matching CmdBufStart user data marker - Pal::uint64 frameIndex; ///< [`CmdBufSubmit` & `Present` only; 0 otherwise] - /// Global frame index incremented for each "Present" call - Pal::uint32 submitSubIndex; ///< [`CmdBufSubmit` only; 0 otherwise] - /// Sub-index of event within submission. - /// When there is only one CmdBuffer per submission, `submitSubIndex` is 0. - /// When there are multiple command buffers per submission, `submitSubIndex` - /// is incremented by one for each command buffer within the submission. 
- Pal::uint64 apiEventId; ///< [`CmdBufSubmit`] API-specific command buffer ID signaled - /// [`SignalSemaphore`] API-specific semaphore ID signaled - /// [`WaitSemaphore`] API-specific semaphore ID waited on - /// [`Present`] N/A (set to 0) - Pal::uint64 cpuTimestamp; ///< CPU start timestamp of when this event is triggered in clock cycle units - Pal::uint64 gpuTimestamp1; ///< [`CmdBufSubmit`] GPU timestamp when the HW execution of command buffer began - /// [`SignalSemaphore`] GPU timestamp when the HW signaled the queue semaphore - /// [`WaitSemaphore`] GPU timestamp when HW finished waiting on the semaphore - /// [`Present`] GPU timestamp when HW processed the Present call - /// - /// All timestamps are expressed in clock cycle units. - Pal::uint64 gpuTimestamp2; ///< [`CmdBufSubmit` only; 0 otherwise] - /// GPU timestamp when the HW execution of command buffer finished -}; - -} // namespace TraceChunk - -// QueueTimings Trace Source name & version -constexpr char QueueTimingsTraceSourceName[] = "queuetimings"; -constexpr Pal::uint32 QueueTimingsTraceSourceVersion = 2; - -// ===================================================================================================================== -// This trace source captures queue timings data through GPA session & produces "QueueInfo" and "QueueEvent" RDF chunks -class QueueTimingsTraceSource : public ITraceSource -{ -public: - explicit QueueTimingsTraceSource(Pal::IPlatform* pPlatform); - virtual ~QueueTimingsTraceSource(); - - // ==== TraceSource Native Functions ========================================================================== // - Pal::Result Init(Pal::IDevice* pDevice); - - Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, - Pal::uint64 queueId, - Pal::uint64 queueContext); - - Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); - - Pal::Result TimedSubmit(Pal::IQueue* pQueue, - const Pal::MultiSubmitInfo& submitInfo, - const TimedSubmitInfo& timedSubmitInfo); - - Pal::Result 
TimedSignalQueueSemaphore(Pal::IQueue* pQueue, - Pal::IQueueSemaphore* pQueueSemaphore, - const TimedQueueSemaphoreInfo& timedSignalInfo, - Pal::uint64 value = 0); - - Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, - Pal::IQueueSemaphore* pQueueSemaphore, - const TimedQueueSemaphoreInfo& timedWaitInfo, - Pal::uint64 value = 0); - - Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, - const TimedQueuePresentInfo& timedPresentInfo); - - Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, - Pal::uint64 cpuSubmissionTimestamp, - Pal::uint64 cpuCompletionTimestamp, - const TimedQueueSemaphoreInfo& timedWaitInfo); - - Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, - Pal::uint64 cpuSubmissionTimestamp, - Pal::uint64 cpuCompletionTimestamp, - const TimedQueueSemaphoreInfo& timedSignalInfo); - - bool IsTimingInProgress() const; - - // ==== Base Class Overrides =================================================================================== // -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < COMPRESSION_ARG_VERSION - virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { } -#endif - - virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override; -#else - virtual void OnTraceAccepted() override; -#endif - virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { }; -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - virtual void OnPostambleEnd( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer* pCmdBuf) override; - virtual void OnTraceEnd( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer* pCmdBuf) override {}; -#else - virtual void OnTraceEnd( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer* pCmdBuf) override; -#endif - virtual void OnTraceFinished() override; - - virtual const char* GetName() const override { return QueueTimingsTraceSourceName; } - virtual 
Pal::uint32 GetVersion() const override { return QueueTimingsTraceSourceVersion; } - -private: - void WriteQueueInfoChunks( - const SqttQueueInfoRecord* pQueueInfoRecords, - size_t numQueueInfoRecords); - - void WriteQueueEventChunks( - const SqttQueueInfoRecord* pQueueInfoRecords, - size_t numQueueInfoRecords, - const SqttQueueEventRecord* pQueueEventRecords, - size_t numQueueEventRecords); - - void ReportInternalError(const char* pErrorMsg, Pal::Result result); - - Pal::IPlatform* const m_pPlatform; // IPlatform owning the parent TraceSession - GpaSession* m_pGpaSession; // Handle to GpaSession object for tracking queue timings - bool m_traceIsHealthy; // Internal flag for tracking resource and state health - std::atomic m_timingInProgress; // Flag for tracking if queue timings operations are ongoing - -}; - -} // namespace GpuUtil +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "palGpuUtil.h" +#include "palTraceSession.h" +#include "palGpaSession.h" + +#include + +struct SqttQueueEventRecord; +struct SqttQueueInfoRecord; + +namespace Pal +{ +class Platform; +} + +namespace GpuUtil +{ +namespace TraceChunk +{ + +/// "QueueInfo" RDF chunk identifier & version +constexpr char QueueInfoChunkId[TextIdentifierSize] = "QueueInfo"; +constexpr Pal::uint32 QueueInfoChunkVersion = 1; + +/// Enum describing logical queue types +enum class QueueType : Pal::uint8 +{ + Unknown = 0, + Universal = 1, + Compute = 2, + Dma = 3, + Encode = 4, + Decode = 5, + Security = 6, + VideoProcessor = 7 +}; + +/// Enum describing hardware engine types +enum class HwEngineType : Pal::uint8 +{ + Unknown = 0, + Universal = 1, + Compute = 2, + ExclusiveCompute = 3, + Dma = 4, + Decode = 5, + Encode = 6, + HighPriorityUniversal = 7, + HighPriorityGraphics = 8, + Security = 9, + Vpe = 10 +}; + +/// Structure describing a queue's properties +struct QueueInfo +{ + Pal::uint32 pciId; ///< The ID of the GPU queried + Pal::uint64 queueId; ///< API-specific queue ID + Pal::uint64 queueContext; ///< OS-level queue context value from Windows KMD to correlate with ETW data. + /// Only applicable to D3D on Windows; 0 otherwise. 
+ QueueType queueType; ///< The logical queue type + HwEngineType engineType; ///< The hardware engine that the queue is mapped to +}; + +// ------------------------------------------------------------------------------------------- // + +/// "QueueEvent" RDF chunk identifier & version +constexpr char QueueEventChunkId[TextIdentifierSize] = "QueueEvent"; +constexpr Pal::uint32 QueueEventChunkVersion = 1; + +/// The type of queue-level timings event +enum class QueueEventType : Pal::uint32 +{ + CmdBufSubmit = 0, + SignalSemaphore = 1, + WaitSemaphore = 2, + Present = 3 +}; + +/// Structure describing a queue-level timings event +struct QueueEvent +{ + Pal::uint32 pciId; ///< The ID of the GPU queried + Pal::uint64 queueId; ///< The API-specific queue ID which triggered the event + QueueEventType eventType; ///< The type of the queue-timing event + Pal::uint32 sqttCmdBufId; ///< [`CmdBufSubmit` only; 0 otherwise] + /// SQTT command buffer ID matching CmdBufStart user data marker + Pal::uint64 frameIndex; ///< [`CmdBufSubmit` & `Present` only; 0 otherwise] + /// Global frame index incremented for each "Present" call + Pal::uint32 submitSubIndex; ///< [`CmdBufSubmit` only; 0 otherwise] + /// Sub-index of event within submission. + /// When there is only one CmdBuffer per submission, `submitSubIndex` is 0. + /// When there are multiple command buffers per submission, `submitSubIndex` + /// is incremented by one for each command buffer within the submission. 
+ Pal::uint64 apiEventId; ///< [`CmdBufSubmit`] API-specific command buffer ID signaled + /// [`SignalSemaphore`] API-specific semaphore ID signaled + /// [`WaitSemaphore`] API-specific semaphore ID waited on + /// [`Present`] N/A (set to 0) + Pal::uint64 cpuTimestamp; ///< CPU start timestamp of when this event is triggered in clock cycle units + Pal::uint64 gpuTimestamp1; ///< [`CmdBufSubmit`] GPU timestamp when the HW execution of command buffer began + /// [`SignalSemaphore`] GPU timestamp when the HW signaled the queue semaphore + /// [`WaitSemaphore`] GPU timestamp when HW finished waiting on the semaphore + /// [`Present`] GPU timestamp when HW processed the Present call + /// + /// All timestamps are expressed in clock cycle units. + Pal::uint64 gpuTimestamp2; ///< [`CmdBufSubmit` only; 0 otherwise] + /// GPU timestamp when the HW execution of command buffer finished +}; + +} // namespace TraceChunk + +// QueueTimings Trace Source name & version +constexpr char QueueTimingsTraceSourceName[] = "queuetimings"; +constexpr Pal::uint32 QueueTimingsTraceSourceVersion = 2; + +// ===================================================================================================================== +// This trace source captures queue timings data through GPA session & produces "QueueInfo" and "QueueEvent" RDF chunks +class QueueTimingsTraceSource : public ITraceSource +{ +public: + explicit QueueTimingsTraceSource(Pal::IPlatform* pPlatform); + virtual ~QueueTimingsTraceSource(); + + // ==== TraceSource Native Functions ========================================================================== // + Pal::Result Init(Pal::IDevice* pDevice); + + Pal::Result RegisterTimedQueue(Pal::IQueue* pQueue, + Pal::uint64 queueId, + Pal::uint64 queueContext); + + Pal::Result UnregisterTimedQueue(Pal::IQueue* pQueue); + + Pal::Result TimedSubmit(Pal::IQueue* pQueue, + const Pal::MultiSubmitInfo& submitInfo, + const TimedSubmitInfo& timedSubmitInfo); + + Pal::Result 
TimedSignalQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedSignalInfo, + Pal::uint64 value = 0); + + Pal::Result TimedWaitQueueSemaphore(Pal::IQueue* pQueue, + Pal::IQueueSemaphore* pQueueSemaphore, + const TimedQueueSemaphoreInfo& timedWaitInfo, + Pal::uint64 value = 0); + + Pal::Result TimedQueuePresent(Pal::IQueue* pQueue, + const TimedQueuePresentInfo& timedPresentInfo); + + Pal::Result ExternalTimedWaitQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedWaitInfo); + + Pal::Result ExternalTimedSignalQueueSemaphore(Pal::uint64 queueContext, + Pal::uint64 cpuSubmissionTimestamp, + Pal::uint64 cpuCompletionTimestamp, + const TimedQueueSemaphoreInfo& timedSignalInfo); + + bool IsTimingInProgress() const; + + // ==== Base Class Overrides =================================================================================== // + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override { }; + + virtual Pal::uint64 QueryGpuWorkMask() const override { return 0; } + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override; +#else + virtual void OnTraceAccepted() override; +#endif + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) override { }; +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override; + virtual void OnTraceEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override {}; +#else + virtual void OnTraceEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) override; +#endif + virtual void OnTraceFinished() override; + + virtual const char* GetName() const override { return QueueTimingsTraceSourceName; } + virtual Pal::uint32 GetVersion() const override { return 
QueueTimingsTraceSourceVersion; } + +private: + void WriteQueueInfoChunks( + const SqttQueueInfoRecord* pQueueInfoRecords, + size_t numQueueInfoRecords); + + void WriteQueueEventChunks( + const SqttQueueInfoRecord* pQueueInfoRecords, + size_t numQueueInfoRecords, + const SqttQueueEventRecord* pQueueEventRecords, + size_t numQueueEventRecords); + + void ReportInternalError(const char* pErrorMsg, Pal::Result result); + + Pal::IPlatform* const m_pPlatform; // IPlatform owning the parent TraceSession + GpaSession* m_pGpaSession; // Handle to GpaSession object for tracking queue timings + bool m_traceIsHealthy; // Internal flag for tracking resource and state health + std::atomic m_timingInProgress; // Flag for tracking if queue timings operations are ongoing + +}; + +} // namespace GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h index 4954aed5ef..ce27cf212b 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palRenderOpTraceController.h @@ -1,155 +1,150 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "palTraceSession.h" - -namespace Pal -{ -class IPlatform; -class IQueue; -class ICmdBuffer; -class Device; -} - -namespace GpuUtil -{ - -/// Supported render operations used to advance the trace -enum RenderOp : Pal::uint8 -{ - RenderOpDraw = (1u << 0), - RenderOpDispatch = (1u << 1) -}; - -/// Structure used to batch submit render operations on queue submission -/// This struct should have a `*Count` field for each @ref RenderOp enumeration above -struct RenderOpCounts -{ - Pal::uint32 drawCount; - Pal::uint32 dispatchCount; -}; - -constexpr Pal::uint32 RenderOpTraceControllerVersion = 4; -constexpr char RenderOpTraceControllerName[] = "renderop"; - -// ===================================================================================================================== -class RenderOpTraceController : public ITraceController -{ -public: -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 - using RenderOp = GpuUtil::RenderOp; -#endif - RenderOpTraceController(Pal::IPlatform* pPlatform, Pal::IDevice* pDevice); - virtual ~RenderOpTraceController(); - - virtual const char* GetName() const override { return RenderOpTraceControllerName; } - virtual Pal::uint32 GetVersion() const override { return RenderOpTraceControllerVersion; } - - virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override; - virtual Pal::Result 
OnTraceRequested() override; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) override; -#endif - virtual Pal::Result OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; - virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; - virtual Pal::Result OnEndPostambleGpuWork( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer** ppCmdBuffer) override; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 - void RecordRenderOp(Pal::IQueue* pQueue, RenderOp renderOp); -#endif - - void FinishTrace(); - - // Cancel the trace currently in progress. - virtual Pal::Result OnTraceCanceled() override; - - /// This function must be called by client drivers implementing the RenderOp controller. - /// On every queue submission, this function is called with the cumulative counts of render operations - /// recorded into that queue's command buffers. - /// Based on the controller's internal mask, set by the user during trace configuration, - /// the trace controller may advance its state. 
- void RecordRenderOps(Pal::IQueue* pQueue, const RenderOpCounts& renderOpCounts); - - // Force a controller update - virtual void OnUpdated() override { OnRenderOpUpdated(0); } - - virtual Pal::IQueue* GetTraceQueue() const override { return m_pQueue; } - -private: - /// Controls whether the trace proceeds on absolute render op counts or relative - enum class CaptureMode : Pal::uint8 - { - Relative = 0, ///< Relative to when the trace request is received - Absolute ///< Absolute render op index - }; - - Pal::Result AcceptTrace(); - Pal::Result BeginTrace(); - - Pal::Result SubmitBeginTraceGpuWork() const; - Pal::Result SubmitEndTraceGpuWork(); - Pal::Result SubmitEndPostambleGpuWork(); - - Pal::Result WaitForTraceEndGpuWorkCompletion() const; - Pal::Result CreateFence(Pal::IFence** ppFence) const; - Pal::Result CreateCommandBuffer(bool traceEnd, Pal::ICmdBuffer** ppCmdBuf) const; - Pal::Result CreateCmdAllocator(); - - void OnRenderOpUpdated(Pal::uint64 countRecorded); - void FreeResources(); - void AbortTrace(); - - Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceController - Pal::IDevice* m_pDevice; // Device associated with this TraceController - Pal::ICmdAllocator* m_pCmdAllocator; // Command allocator for the TraceController - - TraceSession* m_pTraceSession; // TraceSession owning this TraceController - Pal::uint64 m_supportedGpuMask; // Bit mask of GPU indices that are capable of participating in the trace - Pal::uint8 m_renderOpMask; // Bitmask of RenderOp modes, indicating which are accepted - CaptureMode m_captureMode; // Modality for determining the starting renderop index of the trace - Pal::uint64 m_renderOpCount; // The "global" count, incremented on every render op - Pal::uint64 m_prepStartRenderOp; // Relative or absolute render op number indicating trace begin - Pal::uint64 m_numPrepRenderOps; // Number of "warm-up" frames before the start frame - Pal::uint64 m_captureRenderOpCount; // Number of frames to wait before 
ending the trace - Pal::uint64 m_renderOpTraceAccepted; // The frame number when the trace was accepted - - Util::Mutex m_renderOpLock; // Lock over UpdateFrame/OnFrameUpdated - Pal::IQueue* m_pQueue; // The queue being used to submit Begin/End GPU trace command buffers -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - Pal::ICmdBuffer* m_pCmdBufTracePrepare; // Command buffer for recording during the prep phase -#endif - Pal::ICmdBuffer* m_pCmdBufTraceBegin; // Command buffer to submit Trace Begin - Pal::ICmdBuffer* m_pCmdBufTraceEnd; // Command buffer to submit Trace End - Pal::ICmdBuffer* m_pCmdBufPostambleEnd; // Command buffer to submit Postamble End - Pal::IFence* m_pFenceTraceEnd; // Fence to wait for Trace End command buffer completion - Pal::IFence* m_pFencePostambleEnd; // Fence to wait for Postamble End command buffer completion -}; - -} // namespace GpuUtil +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "palTraceSession.h" + +namespace Pal +{ +class IPlatform; +class IQueue; +class ICmdBuffer; +class Device; +} + +namespace GpuUtil +{ + +/// Supported render operations used to advance the trace +enum RenderOp : Pal::uint8 +{ + RenderOpDraw = (1u << 0), + RenderOpDispatch = (1u << 1) +}; + +/// Structure used to batch submit render operations on queue submission +/// This struct should have a `*Count` field for each @ref RenderOp enumeration above +struct RenderOpCounts +{ + Pal::uint32 drawCount; + Pal::uint32 dispatchCount; +}; + +constexpr Pal::uint32 RenderOpTraceControllerVersion = 4; +constexpr char RenderOpTraceControllerName[] = "renderop"; + +// ===================================================================================================================== +class RenderOpTraceController : public ITraceController +{ +public: +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 + using RenderOp = GpuUtil::RenderOp; +#endif + RenderOpTraceController(Pal::IPlatform* pPlatform, Pal::IDevice* pDevice); + virtual ~RenderOpTraceController(); + + virtual const char* GetName() const override { return RenderOpTraceControllerName; } + virtual Pal::uint32 GetVersion() const override { return RenderOpTraceControllerVersion; } + + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) override; + virtual Pal::Result OnTraceRequested() override; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) override; +#endif + virtual Pal::Result 
OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; + virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuffer) override; + virtual Pal::Result OnEndPostambleGpuWork( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer** ppCmdBuffer) override; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 896 + void RecordRenderOp(Pal::IQueue* pQueue, RenderOp renderOp); +#endif + + void FinishTrace(); + + // Cancel the trace currently in progress. + virtual Pal::Result OnTraceCanceled() override; + + /// This function must be called by client drivers implementing the RenderOp controller. + /// On every queue submission, this function is called with the cumulative counts of render operations + /// recorded into that queue's command buffers. + /// Based on the controller's internal mask, set by the user during trace configuration, + /// the trace controller may advance its state. + void RecordRenderOps(Pal::IQueue* pQueue, const RenderOpCounts& renderOpCounts); + +private: + /// Controls whether the trace proceeds on absolute render op counts or relative + enum class CaptureMode : Pal::uint8 + { + Relative = 0, ///< Relative to when the trace request is received + Absolute ///< Absolute render op index + }; + + Pal::Result AcceptTrace(); + Pal::Result BeginTrace(); + + Pal::Result SubmitBeginTraceGpuWork() const; + Pal::Result SubmitEndTraceGpuWork(); + Pal::Result SubmitEndPostambleGpuWork(); + + Pal::Result WaitForTraceEndGpuWorkCompletion() const; + Pal::Result CreateFence(Pal::IFence** ppFence) const; + Pal::Result CreateCommandBuffer(bool traceEnd, Pal::ICmdBuffer** ppCmdBuf) const; + Pal::Result CreateCmdAllocator(); + + void OnRenderOpUpdated(Pal::uint64 countRecorded); + void FreeResources(); + void AbortTrace(); + + Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceController + Pal::IDevice* m_pDevice; // Device associated with this TraceController + Pal::ICmdAllocator* m_pCmdAllocator; // Command allocator 
for the TraceController + + TraceSession* m_pTraceSession; // TraceSession owning this TraceController + Pal::uint64 m_supportedGpuMask; // Bit mask of GPU indices that are capable of participating in the trace + Pal::uint8 m_renderOpMask; // Bitmask of RenderOp modes, indicating which are accepted + CaptureMode m_captureMode; // Modality for determining the starting renderop index of the trace + Pal::uint64 m_renderOpCount; // The "global" count, incremented on every render op + Pal::uint64 m_prepStartRenderOp; // Relative or absolute render op number indicating trace begin + Pal::uint64 m_numPrepRenderOps; // Number of "warm-up" frames before the start frame + Pal::uint64 m_captureRenderOpCount; // Number of frames to wait before ending the trace + Pal::uint64 m_renderOpTraceAccepted; // The frame number when the trace was accepted + + Util::Mutex m_renderOpLock; // Lock over UpdateFrame/OnFrameUpdated + Pal::IQueue* m_pQueue; // The queue being used to submit Begin/End GPU trace command buffers +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + Pal::ICmdBuffer* m_pCmdBufTracePrepare; // Command buffer for recording during the prep phase +#endif + Pal::ICmdBuffer* m_pCmdBufTraceBegin; // Command buffer to submit Trace Begin + Pal::ICmdBuffer* m_pCmdBufTraceEnd; // Command buffer to submit Trace End + Pal::ICmdBuffer* m_pCmdBufPostambleEnd; // Command buffer to submit Postamble End + Pal::IFence* m_pFenceTraceEnd; // Fence to wait for Trace End command buffer completion + Pal::IFence* m_pFencePostambleEnd; // Fence to wait for Postamble End command buffer completion +}; + +} // namespace GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h index 810986eee0..b5cf7bda3b 100644 --- a/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h +++ b/shared/amdgpu-windows-interop/pal/inc/gpuUtil/palTraceSession.h @@ -1,829 +1,737 @@ -/* - 
*********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palTraceSession.h - * @brief PAL GPU utility TraceSession class. 
- *********************************************************************************************************************** - */ - -#pragma once - -#include "palPlatform.h" -#include "palDeque.h" -#include "palDevice.h" -#include "palGpuUtil.h" -#include "palHashMap.h" -#include "palMutex.h" -#include "palPipeline.h" -#include "palQueue.h" -#include "palSysMemory.h" -#include "palGpuMemory.h" -#include "palMemTrackerImpl.h" -#include "palVector.h" - -struct rdfStream; -struct rdfChunkFileWriter; - -namespace DevDriver -{ -class IStructuredWriter; -class IStructuredReader; -class StructuredValue; -} - -namespace GpuUtil -{ - -class TraceSession; -class ITraceController; -class ITraceSource; - -constexpr Pal::uint16 TextIdentifierSize = 16; - -/// Information required to create a new chunk of trace data in a TraceSession -/// -/// This data inside this structure is expected to be produced by trace source implementations. The specific fields -/// included within this structure are intended to support compatibility with the Radeon Data Format (RDF) spec. 
-struct TraceChunkInfo -{ - char id[TextIdentifierSize]; ///< Text identifier of the chunk - Pal::uint32 version; ///< Version number of the chunk - const void* pHeader; ///< [in] Pointer to a buffer that contains the header data for the chunk - Pal::int64 headerSize; ///< Size of the buffer pointed to by pHeader - const void* pData; ///< [in] Pointer to a buffer that contains the data for the chunk - Pal::int64 dataSize; ///< Size of the buffer pointed to by pData - bool enableCompression; ///< Indicates if the chunk's data should be compressed or not -}; - -/// The available states of TraceSession -enum class TraceSessionState : Pal::uint32 -{ - Ready = 0, ///< New trace ready to begin - Requested = 1, ///< A trace has been requested and awaiting acceptance - Preparing = 2, ///< Trace has been accepted and is preparing resources before beginning - Beginning = 3, ///< Commands are now being submitted to the GPU to begin tracing - Running = 4, ///< Trace is in progress -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - Postamble = 5, ///< The detailed frame trace has ended but its data has not yet been written - /// into the session. Some trace sources may still collect data during this time. - PostambleWaiting = 6, ///< Waiting for Postamble to complete. - Completed = 7, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). - Count = 8 -#else - Waiting = 5, ///< Trace has ended, but data has not been written into the session - Completed = 6, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). - Count = 7 -#endif -}; - -/// Defines the type of payload. Currently only strings are supported but in the future can include JSON, structs, etc. 
-enum class TraceErrorPayload : Pal::uint32 -{ - None, //< Should be set when there is no additional information to be sent with the error - ErrorString //< Should be set when the error payload is string data -}; - -/// Chunk header for the error tracing chunk -struct TraceErrorHeader -{ - char chunkId[TextIdentifierSize]; ///< Text identifier of the failing chunk - Pal::uint32 chunkIndex; ///< Chunk index of the failing chunk - Pal::Result resultCode; ///< PAL Result code of the failure - TraceErrorPayload payloadType; ///< Type of error chunk payload -}; - -constexpr char ErrorChunkTextIdentifier[TextIdentifierSize] = "TraceError"; -constexpr Pal::uint32 ErrorTraceChunkVersion = 1; - -/// Function type for TraceSession state change callback -typedef void (PAL_STDCALL *TraceStateChangeCallback)( - const TraceSession& pTraceSession, - TraceSessionState newState, - void* pPrivateData); - -/** -*********************************************************************************************************************** -* @interface ITraceController -* @brief Interface that allows for control of a trace operation through TraceSession. -* -* Trace controllers are responsible for driving the high-level steps of a trace operation. 
Users of this interface are -* expected to create their own implementation of this interface, register it with a TraceSession, then call the -* following TraceSession functions to drive the trace process: -* -* TraceSession::AcceptTrace -* TraceSession::BeginTrace -* TraceSession::EndTrace -* TraceSession::EndPostamble -* TraceSession::FinishTrace -*********************************************************************************************************************** -*/ -class ITraceController -{ -public: - /// Returns the name of the controller - /// - /// @returns the name of the controller as a null terminated string - virtual const char* GetName() const = 0; - - /// Returns the version of the controller - /// - /// @returns the version of the controller as an unsigned integer value - virtual Pal::uint32 GetVersion() const = 0; - - /// Called by the associated session to update the current trace configuration - /// - /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object - virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; - - /// Called by the associated session to notify the controller that a trace has been requested and it can take - /// control of the TraceSession when desired. - virtual Pal::Result OnTraceRequested() = 0; - - /// Called by the associated session to notify the controller that a trace has been canceled and it can start - /// canceling the trace when ready. - virtual Pal::Result OnTraceCanceled() = 0; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - /// Called by TraceSession to indicate that GPU work is required on the indicated GPU during the preparation phase. - /// The command buffer must be ready to record commands; however, the trace controller should not submit it - /// until the trace begins. 
- /// - /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU - /// upon successful completion of this function via ppCmdBuf. - /// - /// This function will be called once per trace for each GPU that's considered relevant by the current set of - /// trace sources. - /// - /// Note: This command buffer should be submitted at the same time as the command buffer provided in - /// `OnBeginGpuWork`. They may be the same command buffer or separate; the goal is to allow trace sources - /// to frontload recording GPU work before the trace formally begins. - /// - /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs - /// to be capable of recording new commands. - /// - /// @param [in] gpuIndex The index of the target GPU - /// @param [out] ppCmdBuf A command buffer that can be used to record GPU work before a trace starts executing. - /// Note that this command buffer shouldn't be submitted until the trace begins. - /// - /// @returns Success if the command buffer was successfully returned - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; -#endif - - /// Called by TraceSession to indicate that GPU work is required to begin a trace on the indicated GPU - /// - /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU - /// upon successful completion of this function via ppCmdBuf. - /// - /// This function will be called once per trace for each GPU that's considered relevant by the current set of - /// trace sources. - /// - /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs - /// to be capable of recording new commands. 
- /// - /// @param [in] gpuIndex The index of the target GPU - /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace - /// - /// @returns Success if the command buffer was successfully returned - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Pal::Result OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; - - /// Called by TraceSession to indicate that GPU work is required to end a trace on the indicated GPU - /// - /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU - /// upon successful completion of this function via ppCmdBuf. - /// - /// This function will be called once per trace for each GPU that's considered relevant by the current set of - /// trace sources. - /// - /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs - /// to be capable of recording new commands. - /// - /// @param [in] gpuIndex The index of the target GPU - /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the trace - /// - /// @returns Success if the command buffer was successfully returned - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; - - /// Called by TraceSession to indicate that GPU work is required to end the postamble on the indicated GPU - /// - /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU - /// upon successful completion of this function via ppCmdBuf. - /// - /// This function will be called once per trace for each GPU that's considered relevant by the current set of - /// trace sources. 
- /// - /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs - /// to be capable of recording new commands. - /// - /// @param [in] gpuIndex The index of the target GPU - /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble - /// - /// @returns Success if the command buffer was successfully returned - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - virtual Pal::Result OnEndPostambleGpuWork( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer** ppCmdBuf) = 0; - - /// Called by the associated session to force a controller update and drive the session to completion when there - /// is an insufficient number of update events to accomplish that. This is primarily used in single frame/dispatch - /// captures, during which, the controller won't be automatically updated and we have to force it to return the - /// trace session to a clean state. - virtual void OnUpdated() = 0; - - /// Returns the queue tracked in the active trace controller - /// - /// Returns the queue used for submitting begin and end-trace gpu-work. The queue is tracked by the active - /// controller - /// - /// @returns A valid queue pointer used for submitting gpu-work - //// Or a nullptr if no such queue exists - virtual Pal::IQueue* GetTraceQueue() const = 0; -}; - -#define COMPRESSION_ARG_VERSION 949 - -/** -*********************************************************************************************************************** -* @interface ITraceSource -* @brief Interface that enables developers to emit arbitrary data chunks into a trace through TraceSession. -* -* Trace sources are used to implement any surrounding logic required to produce a trace data chunk. 
Users of this -* interface are expected to create their own implementation of this interface, register it with a TraceSession, then -* call TraceSession::WriteDataChunk during a trace operation whenever a data chunk should be produced. -*********************************************************************************************************************** -*/ -class ITraceSource -{ -public: -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= COMPRESSION_ARG_VERSION - - /// Base class constructor - ITraceSource() : m_useCompression(false) - { } - - /// Called by the associated session to update the current trace configuration. Will parse out common config options - /// then pass to OnConfigUpdated to allow derived classes to parse other options. - /// - /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object - void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig); -#else - /// Called by the associated session to update the current trace configuration - /// - /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object - virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; -#endif - - /// Returns a bitmask that represents which GPUs are relevant to this trace source - /// - /// If the bit at index N is set, GPU N must execute work on the GPU in order to produce trace data - virtual Pal::uint64 QueryGpuWorkMask() const = 0; - - /// Called by the associated session to notify the source that a new trace has been accepted - /// - /// The source may use this notification to do any preparation work that might be required before the trace begins. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 - /// A command buffer is provided for the trace source to insert any work into. Note that the work will not be - /// submitted until the trace begins (at the same time as `OnTraceBegin`). 
This allows for frontloading of - /// expensive operations, such as the construction of a GpaSession sample, that would affect runtime speed - /// or behavior during trace exeecution. - /// - /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf - /// @param [in] pCmdBuf A command buffer that can be used to record any GPU work required during the - /// preparation phase of the trace. Not submitted until `OnTraceBegin`. - virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; -#else - virtual void OnTraceAccepted() = 0; -#endif - - /// Called by the associated session to notify the source that it should begin a trace - /// - /// The source should use the provided command buffer to execute any GPU work that's required for the source to - /// begin a trace operation. - /// - /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to - /// participate in the trace. All GPUs that begin a trace are required to end it later. Sources are not expected - /// to handle cases where the begin/end function calls are mismatched during a trace operation. - /// - /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf - /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace - virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; - - /// Called by the associated session to notify the source that it should end the current trace - /// - /// The source should use the provided command buffer to execute any GPU work that's required for the source to - /// end a trace operation. - /// - /// The command buffer associated with the OnTraceBegin function is not guaranteed to have finished GPU execution - /// when this function is called. The command buffer associated with this function is also not guaranteed to finish - /// execution until OnTraceFinished is called. 
- /// - /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to - /// participate in the trace. - /// - /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf - /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the trace - virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; - - /// Called by the associated session to notify the source that it should end the postamble - /// - /// The source should use the provided command buffer to execute any GPU work that's required for the source to - /// end its postamble operation. - /// - /// The command buffer associated with the OnTraceBegin and OnTraceEnd functions are not guaranteed to have - /// finished GPU execution when this function is called. The command buffer associated with this function is also - /// not guaranteed to finish execution until OnTraceFinished is called. - /// - /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to - /// participate in the trace. - /// - /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf - /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - virtual void OnPostambleEnd( - Pal::uint32 gpuIndex, - Pal::ICmdBuffer* pCmdBuf) = 0; -#endif - - /// Called by the associated session to notify the source that the current trace has finished - /// - /// When this function is called, all prior command buffers provided to the source during the trace operation have - /// finished execution. The source should use this function to collect any data generated by the GPU and emit it - /// via TraceSession::WriteDataChunk. 
- virtual void OnTraceFinished() = 0; - - /// Returns the name of the source - /// - /// @returns the name of the source as a null terminated string - virtual const char* GetName() const = 0; - - /// Returns the version of the source - /// - /// @returns the version of the source as an unsigned integer value - virtual Pal::uint32 GetVersion() const = 0; - - /// Whether multiple instances of the trace source are allowed - /// - /// @returns true if multiple instances of this trace sources can co-exist in one session, false otherwise. - virtual bool AllowMultipleInstances() const { return false; } - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= COMPRESSION_ARG_VERSION -protected: - /// Called by OnConfigUpdated to allow derived classes to update the current trace configuration. - /// Default implementation is empty. - /// - /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object - virtual void OnConfigUpdatedDerived(DevDriver::StructuredValue* pJsonConfig) { } - - bool m_useCompression; -#endif -}; - -/** -*********************************************************************************************************************** -* @class TraceSession -* @brief Helper class providing common driver functionality for collecting arbitrary data traces. -* -* Due to the global nature of the trace functionality, only one TraceSession is typically used at a time. -* An interface to acquire a session exists on IPlatform. Users who need to interact with an instance of this object -* should expect to acquire it there. -* -* @see IPlatform::GetTraceSession() -*********************************************************************************************************************** -*/ -class TraceSession final -{ -public: - /// Constructor. 
- /// - /// @param [in] pPlatform Platform associated with this TraceSesion - TraceSession(Pal::IPlatform* pPlatform); - - /// Destructor - ~TraceSession(); - - /// Initialize the trace session before requesting a trace. - /// - /// @returns Success if initalization was successful, or ErrorUnknown upon failure. - Pal::Result Init(); - - /// Returns whether tracing has been formally enabled via UberTrace or not. - /// If 'true', this means that tool-side applications have requested this - /// TraceSession to capture traces. This has implications for PAL clients. - /// - /// @returns True if tracing has been enabled, and false otherwise. - bool IsTracingEnabled() const { return m_tracingEnabled; } - - /// Attempts to update the current trace configuration - /// - /// This function will only succeed if there is currently to trace in progress - /// - /// TODO: The JSON configuration interface will likely be replaced with driver settings in the future - /// - /// @param [in] pData Buffer that stores the Json-formatted configuration data - /// @param [in] dataSize Configuration data-size - /// - /// @returns Success if the trace configuration was successfully updated. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - /// + ErrorUnavailable if a trace is currently in progress - /// + ErrorInvalidPointer pData is nullptr - /// + ErrorInvalidParameter pData is not valid json - Pal::Result UpdateTraceConfig(const void* pData, size_t dataSize); - - /// Attempts to request a new trace operation on the trace session. - /// - /// Once a trace is successfully requested, it will become available for a registered trace controller to accept. - /// When a controller accepts the trace, it becomes responsible for managing the rest of the trace operation and - /// notifying the session upon trace completion. 
- /// - /// Since the session can only run a single trace at a time, this function will not succeed if another trace is - /// is already requested or in progress. - /// - /// @returns Success if the trace operation was successfully requested. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - /// + ErrorUnavailable if there is a trace in progress already and a new one cannot be started - Pal::Result RequestTrace(); - - /// Cancels a trace currently in progress. - /// - /// @returns Success if the trace was successfully canceled. - /// Otherwise, one of the following errors may be returned: - /// + NotReady if the trace is not ready to be canceled. - /// + ErrorUnknown if an internal PAL error occurs. - Pal::Result CancelTrace(); - - /// Cancels an invalid trace in progress. - /// - /// Cancels traces that have not been cleanly collected cleanly or actively canceled and returns the trace session - /// to a clean state. It forces a controller update, drives the session to completion and discards any trace data. - void CancelInvalidTrace(); - - /// Cleans up the RDF chunk stream and makes it ready for a new trace again. - /// - /// @returns Success if the trace session and rdf streams were successfully cleaned up and returned to the - /// initialization state - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - Pal::Result CleanupChunkStream(); - - /// Attempts to consume any trace data stored within the trace session. - /// - /// This function will only successfully return trace data after a trace operation is completed on the session. - /// - /// TODO: This function should be replaced with one that uses a callback so we can avoid needing to store the trace - /// data into memory twice. 
- /// - /// @param [out] pData (Optional) Destination buffer to copy the trace data into - /// If this parameter is nullptr, the size of the trace data in bytes will be - /// returned via pDataSize instead of consuming any trace data. - /// @param [in/out] pDataSize If pData is nullptr, then this parameter is used to return the trace data - /// size in bytes. - /// If pData is valid, this parameter represents the size of the buffer - /// pointed to by pData. - /// - /// @returns Success if the trace data was successfully consumed or the size of the trace data was returned. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library - /// + ErrorUnavailable if trace data is not available for collection at this time - /// + ErrorInvalidPointer if nullptr is passed as pDataSize - /// + ErrorInvalidMemorySize if *pDataSize indicates that pData is too small to contain the trace data - Pal::Result CollectTrace(void* pData, size_t* pDataSize); - - /// Attempts to register a trace controller - /// - /// Once registered, trace controllers can receive configuration updates from the session. - /// They may also manage the trace operation by calling AcceptTrace, BeginTrace, EndTrace, EndPostamble and FinishTrace. - /// - /// Trace controllers can only be registered when there is no trace in progress - /// - /// @param [in] pController The trace controller to register with the session - /// - /// @returns Success if the controller was successfully registered. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. 
- /// + AlreadyExists if this controller has already been registered - /// + ErrorUnavailable if a trace is in progress - /// + ErrorInvalidPointer if nullptr is passed as pController - Pal::Result RegisterController(ITraceController* pController); - - /// Attempts to unregister a previously registered trace controller - /// - /// @param [in] pController The trace controller to unregister from the session - /// - /// @returns Success if the controller was successfully unregistered. - /// Otherwise, one of the following errors may be returned: - /// + NotFound if the provided controller was not previously registered - /// + ErrorUnknown if an internal PAL error occurs. - /// + ErrorUnavailable if a trace is in progress - Pal::Result UnregisterController(ITraceController* pController); - - /// Attempts to register a trace source - /// - /// Once registered, trace sources can receive configuration updates from the session. - /// They may also emit data during trace operations by calling WriteDataChunk. - /// - /// Trace sources can only be registered when there is no trace in progress - /// - /// @param [in] pSource The trace source to register with the session - /// - /// @returns Success if the source was successfully registered. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - /// + AlreadyExists if this source has already been registered - /// + ErrorUnavailable if a trace is in progress - /// + ErrorInvalidPointer if nullptr is passed as pSource - Pal::Result RegisterSource(ITraceSource* pSource); - - /// Attempts to unregister a previously registered trace source - /// - /// @param [in] pSource The trace source to unregister from the session - /// - /// @returns Success if the source was successfully unregistered. 
- /// Otherwise, one of the following errors may be returned: - /// + NotFound if the provided source was not previously registered - /// + ErrorUnknown if an internal PAL error occurs. - /// + ErrorUnavailable if a trace is in progress - Pal::Result UnregisterSource(ITraceSource* pSource); - - /// Attempts to accept a previously requested trace with the provided controller - /// - /// Once a trace is successfully accepted by a controller, that controller becomes responsible for managing the - /// rest of the trace operation. Also, once a requested trace is accepted by a controller, no other controllers - /// will be able to accept that trace. Accept is a "consuming" operation. - /// - /// @param [in] pController The trace controller to accept the trace with - /// @param [in] supportedGpuMask Bit mask of GPU indices that are capable of participating in the trace - /// - /// The GPU mask provided to this function is used to determine which GPUs will be involved in the trace. In order - /// to decide which GPUs require GPU work, the session creates a combined mask from all registered sources and - /// checks it against the mask provided by this function. Only GPUs that are present in both masks will be able to - /// submit GPU work during the trace. - /// - /// @returns Success if the trace was successfully accepted. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal PAL error occurs. - /// + ErrorUnavailable if no trace has been requested or a trace is currently in progress - /// + ErrorInvalidPointer if nullptr is passed as pController - Pal::Result AcceptTrace(ITraceController* pController, Pal::uint64 supportedGpuMask); - - /// Begins the trace that was previously accepted by the provided controller - /// - /// This function MUST be called after a successful call to AcceptTrace. 
When this function is called, the session - /// will communicate with all registered trace sources and instruct them to begin the trace operation. The provided - /// trace controller will be notified if any GPU work is required via ITraceController::OnBeginGpuWork. The command - /// buffers returned by OnBeginGpuWork will be passed to each relevant trace source to record required work. - /// - /// The command buffers generated in response to this this call MUST be submitted BEFORE the command buffers - /// generated in response to the EndTrace call! - /// - /// In situations where multiple GPUs are present, the OnBeginGpuWork function will be called once per GPU index, - /// for all GPUs that are relevant for the current trace sources. - /// - /// @returns Success if the trace was successfully started. - /// Otherwise, the error generated by OnBeginGpuWork will be returned. - Pal::Result BeginTrace(); - - /// Ends the trace that was previously started by the provided controller - /// - /// This function MUST be called after BeginTrace. When this function is called, the session will communicate with - /// all registered trace sources and instruct them to end the trace operation. The provided trace controller will - /// trace controller will be notified if any GPU work is required via ITraceController::OnEndGpuWork. The command - /// buffers returned by OnEndGpuWork will be passed to each relevant trace source to record required work. - /// - /// The command buffers generated in response to this this call MUST be submitted AFTER the command buffers - /// generated in response to the previous BeginTrace call! The generated command buffers MUST also complete - /// execution on the GPU BEFORE FinishTrace is called! - /// - /// In situations where multiple GPUs are present, the OnEndGpuWork function will be called once per GPU index - /// for all GPUs that are relevant for the current trace sources. 
- /// - /// The Trace Session will enter Postamble phase after EndTrace is called. - /// - /// @returns Success if the trace was successfully ended. - /// Otherwise, the error generated by OnEndGpuWork will be returned. - Pal::Result EndTrace(); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 - /// Ends the postamble phase, which typically runs until the detailed trace data is available. - /// This function MUST be called after EndTrace. When this function is called, the session will communicate with - /// all registered trace sources and notify them of the end of the postamble phase. The provided trace controller - /// will be notified if any GPU work is required via ITraceController::OnEndPostambleGpuWork. The command - /// buffers returned by OnEndPostambleGpuWork will be passed to each relevant trace source to record required work. - /// - /// The command buffers generated in response to this this call MUST be submitted AFTER the command buffers - /// generated in response to the previous EndTrace call! The generated command buffers MUST also complete - /// execution on the GPU BEFORE FinishPostamble is called! - /// - /// In situations where multiple GPUs are present, the OnEndPostambleGpuWork function will be called once per GPU index - /// for all GPUs that are relevant for the current trace sources. - /// - /// @returns Success if the trace was successfully ended. - /// Otherwise, the error generated by OnEndPostambleGpuWork will be returned. - Pal::Result EndPostamble(); -#endif - - /// Notifies the session that the trace operation started by the provided controller has finished. - /// - /// This function MUST be called after EndPostamble. When this function is called, the session will communicate with - /// all registered trace sources and notify them that all GPU work is complete. This notification is typically - /// used by sources to retrieve data produced by the GPU and write it into the session's trace data. 
- void FinishTrace(); - - /// Writes a chunk of trace data into the session. - /// - /// Trace sources are expected to call this function whenever they produce a new data chunk that should be added - /// into the session's trace data. - /// - /// This function may ONLY be called AFTER the BeginTrace function returns and BEFORE the FinishTrace call returns! - /// - /// @param [in] pSource The trace source that generated the provided data chunk - /// @param [in] info Information about the provided chunk that will be written into the trace data - /// - /// @returns Success if the incoming data chunk was successfully written/appended into the current data stream. - /// Otherwise, one of the following errors may be returned: - /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library - Pal::Result WriteDataChunk(ITraceSource* pSource, const TraceChunkInfo& info); - - /// Returns the current TraceSession state - /// - /// @returns Enum value of the current TraceSessionState - TraceSessionState GetTraceSessionState() const - { - return m_sessionState; - } - - /// Sets the TraceSession state based on external operations - /// - /// @param [in] sessionState TraceSessionState value to be assigned as the current state - void SetTraceSessionState(TraceSessionState sessionState); - - /// Returns the current active controller - /// - /// @returns Pointer to the current active controller driving the TraceSession - ITraceController* GetActiveController() const - { - return m_pActiveController; - } - - /// Reports an error encountered during an active trace by inserting a "TraceError" chunk to the trace stream - /// - /// If, during a trace or the construction of an RDF chunk, an error is encountered and a chunk that was - /// expected to be written can no longer be, this function may be called to insert an error chunk in place - /// of the expected chunk. 
- /// - /// @param [in] chunkId Text identifier of the failed RDF chunk - /// @param [in] pPayload Pointer to the data sent for the error - /// If the payloadType is a string, the string must be null-terminated - /// @param [in] payloadSize Size of the data in the payload - /// @param [in] payloadType Type of payload data represented by `pPayload` - /// @param [in] errorResult The PAL result code of the encountered error - /// - /// @returns Success if the error chunk was written successfully - Pal::Result ReportError( - const char chunkId[TextIdentifierSize], - const void* pPayload, - Pal::uint64 payloadSize, - TraceErrorPayload payloadType, - Pal::Result errorResult); - - /// Explicitly activates this TraceSession for managing traces. - /// - /// This should be called during Platform Init in response to a tool-side request to enable UberTrace tracing. - /// This signals that an active connection has been made to tool-side applications and that profiling via - /// PAL Trace should be prioritized in client drivers. - void EnableTracing() - { - m_tracingEnabled = true; - } - - /// Returns a pointer to a byte array containing the trace configuration. - /// - /// @param [out] pTraceConfigSize Sets *pTraceConfigSize to the number of bytes in the trace config - /// - /// @returns A pointer to the trace configuration data - const void* GetTraceConfig(size_t* pTraceConfigSize) const - { - PAL_ASSERT(pTraceConfigSize != nullptr); - (*pTraceConfigSize) = m_configDataSize; - return m_pConfigData; - } - - /// Indicates if a cancel-trace signal has been received and that a cancelation is in progress. - /// - /// @return true if a cancelation is in progress. - bool IsCancelingTrace() const { return m_cancelingTrace; } - - /// Register a function to be called when the Trace Session state changes. 
- /// - /// @param [in] pfnCallback The function to be called - /// @param [in] pPrivateData A pointer to pass to the callback function when called - /// - /// @returns Success if the callback was successfully registered - /// AlreadyExists if the given Callback+PrivateData has already been registered - /// ErrorInvalidValue if the given callback is not valid - Pal::Result RegisterTraceStateChangeCallback( - TraceStateChangeCallback pfnCallback, - void* pPrivateData); - - /// Unregister a previously registered Trace Session state change callback. - /// - /// @param [in] pfnCallback The function which was previously registered as a callback - /// @param [in] pPrivateData The pointer which is associated with the callback to unregister - /// - /// @returns Success if the callback was successfully unregistered - /// NotFound if the given pfnCallback+pPrivateData pair was not found - Pal::Result UnregisterTraceStateChangeCallback( - TraceStateChangeCallback pfnCallback, - void* pPrivateData); -private: - typedef Pal::IPlatform TraceAllocator; - - Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceSesion - DevDriver::IStructuredReader* m_pReader; // Stores the current JSON-based config of the TraceSession - - // RW Locks for trace sources, controllers, and RDF streams - Util::RWLock m_registerTraceSourceLock; - Util::RWLock m_registerTraceControllerLock; - Util::RWLock m_chunkAppendLock; - - // Trace sources registered with this TraceSession. - using TraceSourcesVec = Util::Vector; - TraceSourcesVec m_registeredTraceSources; - - // TraceSources and corresponding configs - typedef Util::HashMap TraceSourcesConfigMap; - TraceSourcesConfigMap m_traceSourcesConfigs; - - // Unique trace controllers registered with this TraceSession. - typedef Util::HashMap TraceControllersMap; - TraceControllersMap m_registeredTraceControllers; - - ITraceController* m_pActiveController; // The controller currently driving the TraceSession. 
- // We can have only one active controller at a time. - TraceSessionState m_sessionState; // Current state of the TraceSession - rdfChunkFileWriter* m_pChunkFileWriter; // Helper struct that manages create chunk file streams - // and write data chunks - rdfStream* m_pCurrentStream; // Active RDF stream for writing chunks - Pal::int32 m_currentChunkIndex; // The current chunk index of the RDF stream - bool m_tracingEnabled; // Flag indicating UberTrace tracing is enabled tool-side - void* m_pConfigData; // Buffer containing the cached trace configurationn - size_t m_configDataSize; // Size of the cached trace config buffer - bool m_cancelingTrace; // Indicates that a cancel signal has been received and trace cancelation - // is in progress. - - Util::Mutex m_stateChangeCallbackLock; // RW lock for state change callbacks - - // Default capacity for the Trace Session state change callback vector - static constexpr Pal::uint32 TraceStateChangeCallbacksVecDefaultCapacity = 4; - - /// The data required to call a state change callback - struct TraceStateChangeCallbackInfo - { - TraceStateChangeCallback pfnCallback; - void* pPrivateData; - }; - - using TraceStateChangeCallbacksVec = Util::Vector; - TraceStateChangeCallbacksVec m_traceStateChangeCallbacks; // Registered state change callbacks -}; -} // GpuUtil +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palTraceSession.h + * @brief PAL GPU utility TraceSession class. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palPlatform.h" +#include "palDeque.h" +#include "palDevice.h" +#include "palGpuUtil.h" +#include "palHashMap.h" +#include "palMutex.h" +#include "palPipeline.h" +#include "palSysMemory.h" +#include "palGpuMemory.h" +#include "palMemTrackerImpl.h" +#include "palVector.h" + +struct rdfStream; +struct rdfChunkFileWriter; + +namespace DevDriver +{ +class IStructuredWriter; +class IStructuredReader; +class StructuredValue; +} + +namespace GpuUtil +{ + +class ITraceController; +class ITraceSource; + +constexpr Pal::uint16 TextIdentifierSize = 16; + +/// Information required to create a new chunk of trace data in a TraceSession +/// +/// This data inside this structure is expected to be produced by trace source implementations. The specific fields +/// included within this structure are intended to support compatibility with the Radeon Data Format (RDF) spec. 
+struct TraceChunkInfo +{ + char id[TextIdentifierSize]; ///< Text identifier of the chunk + Pal::uint32 version; ///< Version number of the chunk + const void* pHeader; ///< [in] Pointer to a buffer that contains the header data for the chunk + Pal::int64 headerSize; ///< Size of the buffer pointed to by pHeader + const void* pData; ///< [in] Pointer to a buffer that contains the data for the chunk + Pal::int64 dataSize; ///< Size of the buffer pointed to by pData + bool enableCompression; ///< Indicates if the chunk's data should be compressed or not +}; + +/// The available states of TraceSession +enum class TraceSessionState : Pal::uint32 +{ + Ready = 0, ///< New trace ready to begin + Requested = 1, ///< A trace has been requested and awaiting acceptance + Preparing = 2, ///< Trace has been accepted and is preparing resources before beginning + Running = 3, ///< Trace is in progress +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + Postamble = 4, ///< The detailed frame trace has ended but its data has not yet been written + /// into the session. Some trace sources may still collect data during this time. + PostambleWaiting = 5, ///< Waiting for Postamble to complete. + Completed = 6, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). + Count = 7 +#else + Waiting = 4, ///< Trace has ended, but data has not been written into the session + Completed = 5, ///< Trace has fully completed. RDF trace data is ready to be pulled out by CollectTrace(). + Count = 6 +#endif +}; + +/// Defines the type of payload. Currently only strings are supported but in the future can include JSON, structs, etc. 
+enum class TraceErrorPayload : Pal::uint32 +{ + None, ///< Should be set when there is no additional information to be sent with the error + ErrorString ///< Should be set when the error payload is string data +}; + +/// Chunk header for the error tracing chunk +struct TraceErrorHeader +{ + char chunkId[TextIdentifierSize]; ///< Text identifier of the failing chunk + Pal::uint32 chunkIndex; ///< Chunk index of the failing chunk + Pal::Result resultCode; ///< PAL Result code of the failure + TraceErrorPayload payloadType; ///< Type of error chunk payload +}; + +constexpr char ErrorChunkTextIdentifier[TextIdentifierSize] = "TraceError"; +constexpr Pal::uint32 ErrorTraceChunkVersion = 1; + +/** +*********************************************************************************************************************** +* @interface ITraceController +* @brief Interface that allows for control of a trace operation through TraceSession. +* +* Trace controllers are responsible for driving the high-level steps of a trace operation.
Users of this interface are +* expected to create their own implementation of this interface, register it with a TraceSession, then call the +* following TraceSession functions to drive the trace process: +* +* TraceSession::AcceptTrace +* TraceSession::BeginTrace +* TraceSession::EndTrace +* TraceSession::EndPostamble +* TraceSession::FinishTrace +*********************************************************************************************************************** +*/ +class ITraceController +{ +public: + /// Returns the name of the controller + /// + /// @returns the name of the controller as a null terminated string + virtual const char* GetName() const = 0; + + /// Returns the version of the controller + /// + /// @returns the version of the controller as an unsigned integer value + virtual Pal::uint32 GetVersion() const = 0; + + /// Called by the associated session to update the current trace configuration + /// + /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; + + /// Called by the associated session to notify the controller that a trace has been requested and it can take + /// control of the TraceSession when desired. + virtual Pal::Result OnTraceRequested() = 0; + + /// Called by the associated session to notify the controller that a trace has been canceled and it can start + /// canceling the trace when ready. + virtual Pal::Result OnTraceCanceled() = 0; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + /// Called by TraceSession to indicate that GPU work is required on the indicated GPU during the preparation phase. + /// The command buffer must be ready to record commands; however, the trace controller should not submit it + /// until the trace begins. 
+ /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: This command buffer should be submitted at the same time as the command buffer provided in + /// `OnBeginGpuWork`. They may be the same command buffer or separate; the goal is to allow trace sources + /// to frontload recording GPU work before the trace formally begins. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to record GPU work before a trace starts executing. + /// Note that this command buffer shouldn't be submitted until the trace begins. + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnPreparationGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; +#endif + + /// Called by TraceSession to indicate that GPU work is required to begin a trace on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. 
+ /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnBeginGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; + + /// Called by TraceSession to indicate that GPU work is required to end a trace on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. + /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the trace + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnEndGpuWork(Pal::uint32 gpuIndex, Pal::ICmdBuffer** ppCmdBuf) = 0; + + /// Called by TraceSession to indicate that GPU work is required to end the postamble on the indicated GPU + /// + /// The controller MUST return a valid command buffer that is ready to record commands for the target GPU + /// upon successful completion of this function via ppCmdBuf. + /// + /// This function will be called once per trace for each GPU that's considered relevant by the current set of + /// trace sources. 
+ /// + /// Note: The command buffer provided by this function does not need to be a new command buffer. It just needs + /// to be capable of recording new commands. + /// + /// @param [in] gpuIndex The index of the target GPU + /// @param [out] ppCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble + /// + /// @returns Success if the command buffer was successfully returned + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + virtual Pal::Result OnEndPostambleGpuWork( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer** ppCmdBuf) = 0; +}; + +/** +*********************************************************************************************************************** +* @interface ITraceSource +* @brief Interface that enables developers to emit arbitrary data chunks into a trace through TraceSession. +* +* Trace sources are used to implement any surrounding logic required to produce a trace data chunk. Users of this +* interface are expected to create their own implementation of this interface, register it with a TraceSession, then +* call TraceSession::WriteDataChunk during a trace operation whenever a data chunk should be produced. 
+*********************************************************************************************************************** +*/ +class ITraceSource +{ +public: + /// Called by the associated session to update the current trace configuration + /// + /// @param [in] pJsonConfig Configuration data formatted as json and stored as DevDriver's StructuredValue object + virtual void OnConfigUpdated(DevDriver::StructuredValue* pJsonConfig) = 0; + + /// Returns a bitmask that represents which GPUs are relevant to this trace source + /// + /// If the bit at index N is set, GPU N must execute work on the GPU in order to produce trace data + virtual Pal::uint64 QueryGpuWorkMask() const = 0; + + /// Called by the associated session to notify the source that a new trace has been accepted + /// + /// The source may use this notification to do any preparation work that might be required before the trace begins. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 908 + /// A command buffer is provided for the trace source to insert any work into. Note that the work will not be + /// submitted until the trace begins (at the same time as `OnTraceBegin`). This allows for frontloading of + /// expensive operations, such as the construction of a GpaSession sample, that would affect runtime speed + /// or behavior during trace execution. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to record any GPU work required during the + /// preparation phase of the trace. Not submitted until `OnTraceBegin`. + virtual void OnTraceAccepted(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; +#else + virtual void OnTraceAccepted() = 0; +#endif + + /// Called by the associated session to notify the source that it should begin a trace + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// begin a trace operation.
+ /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. All GPUs that begin a trace are required to end it later. Sources are not expected + /// to handle cases where the begin/end function calls are mismatched during a trace operation. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to begin the trace + virtual void OnTraceBegin(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; + + /// Called by the associated session to notify the source that it should end the current trace + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// end a trace operation. + /// + /// The command buffer associated with the OnTraceBegin function is not guaranteed to have finished GPU execution + /// when this function is called. The command buffer associated with this function is also not guaranteed to finish + /// execution until OnTraceFinished is called. + /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the trace + virtual void OnTraceEnd(Pal::uint32 gpuIndex, Pal::ICmdBuffer* pCmdBuf) = 0; + + /// Called by the associated session to notify the source that it should end the postamble + /// + /// The source should use the provided command buffer to execute any GPU work that's required for the source to + /// end its postamble operation. + /// + /// The command buffer associated with the OnTraceBegin and OnTraceEnd functions are not guaranteed to have + /// finished GPU execution when this function is called. 
The command buffer associated with this function is also + /// not guaranteed to finish execution until OnTraceFinished is called. + /// + /// In situations where multiple GPUs are present, this function will be called for all GPUs that are expected to + /// participate in the trace. + /// + /// @param [in] gpuIndex The index of the GPU that owns pCmdBuf + /// @param [in] pCmdBuf A command buffer that can be used to perform any GPU work required to end the postamble +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + virtual void OnPostambleEnd( + Pal::uint32 gpuIndex, + Pal::ICmdBuffer* pCmdBuf) = 0; +#endif + + /// Called by the associated session to notify the source that the current trace has finished + /// + /// When this function is called, all prior command buffers provided to the source during the trace operation have + /// finished execution. The source should use this function to collect any data generated by the GPU and emit it + /// via TraceSession::WriteDataChunk. + virtual void OnTraceFinished() = 0; + + /// Returns the name of the source + /// + /// @returns the name of the source as a null terminated string + virtual const char* GetName() const = 0; + + /// Returns the version of the source + /// + /// @returns the version of the source as an unsigned integer value + virtual Pal::uint32 GetVersion() const = 0; + + /// Whether multiple instances of the trace source are allowed + /// + /// @returns true if multiple instances of this trace source can co-exist in one session, false otherwise. + virtual bool AllowMultipleInstances() const { return false; } +}; + +/** +*********************************************************************************************************************** +* @class TraceSession +* @brief Helper class providing common driver functionality for collecting arbitrary data traces. +* +* Due to the global nature of the trace functionality, only one TraceSession is typically used at a time.
+* An interface to acquire a session exists on IPlatform. Users who need to interact with an instance of this object +* should expect to acquire it there. +* +* @see IPlatform::GetTraceSession() +*********************************************************************************************************************** +*/ +class TraceSession final +{ +public: + /// Constructor. + /// + /// @param [in] pPlatform Platform associated with this TraceSession + TraceSession(Pal::IPlatform* pPlatform); + + /// Destructor + ~TraceSession(); + + /// Initialize the trace session before requesting a trace. + /// + /// @returns Success if initialization was successful, or ErrorUnknown upon failure. + Pal::Result Init(); + + /// Returns whether tracing has been formally enabled via UberTrace or not. + /// If 'true', this means that tool-side applications have requested this + /// TraceSession to capture traces. This has implications for PAL clients. + /// + /// @returns True if tracing has been enabled, and false otherwise. + bool IsTracingEnabled() const { return m_tracingEnabled; } + + /// Attempts to update the current trace configuration + /// + /// This function will only succeed if there is currently no trace in progress + /// + /// TODO: The JSON configuration interface will likely be replaced with driver settings in the future + /// + /// @param [in] pData Buffer that stores the Json-formatted configuration data + /// @param [in] dataSize Configuration data-size + /// + /// @returns Success if the trace configuration was successfully updated. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is currently in progress + /// + ErrorInvalidPointer pData is nullptr + /// + ErrorInvalidParameter pData is not valid json + Pal::Result UpdateTraceConfig(const void* pData, size_t dataSize); + + /// Attempts to request a new trace operation on the trace session.
+ /// + /// Once a trace is successfully requested, it will become available for a registered trace controller to accept. + /// When a controller accepts the trace, it becomes responsible for managing the rest of the trace operation and + /// notifying the session upon trace completion. + /// + /// Since the session can only run a single trace at a time, this function will not succeed if another trace is + /// already requested or in progress. + /// + /// @returns Success if the trace operation was successfully requested. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if there is a trace in progress already and a new one cannot be started + Pal::Result RequestTrace(); + + /// Cancels a trace currently in progress. + /// + /// @returns Success if the trace was successfully canceled. + /// Otherwise, one of the following errors may be returned: + /// + NotReady if the trace is not ready to be canceled. + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result CancelTrace(); + + /// Cleans up the RDF chunk stream and makes it ready for a new trace again. + /// + /// @returns Success if the trace session and rdf streams were successfully cleaned up and returned to the + /// initialization state + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + Pal::Result CleanupChunkStream(); + + /// Attempts to consume any trace data stored within the trace session. + /// + /// This function will only successfully return trace data after a trace operation is completed on the session. + /// + /// TODO: This function should be replaced with one that uses a callback so we can avoid needing to store the trace + /// data into memory twice.
+ /// + /// @param [out] pData (Optional) Destination buffer to copy the trace data into + /// If this parameter is nullptr, the size of the trace data in bytes will be + /// returned via pDataSize instead of consuming any trace data. + /// @param [in/out] pDataSize If pData is nullptr, then this parameter is used to return the trace data + /// size in bytes. + /// If pData is valid, this parameter represents the size of the buffer + /// pointed to by pData. + /// + /// @returns Success if the trace data was successfully consumed or the size of the trace data was returned. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library + /// + ErrorUnavailable if trace data is not available for collection at this time + /// + ErrorInvalidPointer if nullptr is passed as pDataSize + /// + ErrorInvalidMemorySize if *pDataSize indicates that pData is too small to contain the trace data + Pal::Result CollectTrace(void* pData, size_t* pDataSize); + + /// Attempts to register a trace controller + /// + /// Once registered, trace controllers can receive configuration updates from the session. + /// They may also manage the trace operation by calling AcceptTrace, BeginTrace, EndTrace, EndPostamble and FinishTrace. + /// + /// Trace controllers can only be registered when there is no trace in progress + /// + /// @param [in] pController The trace controller to register with the session + /// + /// @returns Success if the controller was successfully registered. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. 
+ /// + AlreadyExists if this controller has already been registered + /// + ErrorUnavailable if a trace is in progress + /// + ErrorInvalidPointer if nullptr is passed as pController + Pal::Result RegisterController(ITraceController* pController); + + /// Attempts to unregister a previously registered trace controller + /// + /// @param [in] pController The trace controller to unregister from the session + /// + /// @returns Success if the controller was successfully unregistered. + /// Otherwise, one of the following errors may be returned: + /// + NotFound if the provided controller was not previously registered + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is in progress + Pal::Result UnregisterController(ITraceController* pController); + + /// Attempts to register a trace source + /// + /// Once registered, trace sources can receive configuration updates from the session. + /// They may also emit data during trace operations by calling WriteDataChunk. + /// + /// Trace sources can only be registered when there is no trace in progress + /// + /// @param [in] pSource The trace source to register with the session + /// + /// @returns Success if the source was successfully registered. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + AlreadyExists if this source has already been registered + /// + ErrorUnavailable if a trace is in progress + /// + ErrorInvalidPointer if nullptr is passed as pSource + Pal::Result RegisterSource(ITraceSource* pSource); + + /// Attempts to unregister a previously registered trace source + /// + /// @param [in] pSource The trace source to unregister from the session + /// + /// @returns Success if the source was successfully unregistered. 
+ /// Otherwise, one of the following errors may be returned: + /// + NotFound if the provided source was not previously registered + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if a trace is in progress + Pal::Result UnregisterSource(ITraceSource* pSource); + + /// Attempts to accept a previously requested trace with the provided controller + /// + /// Once a trace is successfully accepted by a controller, that controller becomes responsible for managing the + /// rest of the trace operation. Also, once a requested trace is accepted by a controller, no other controllers + /// will be able to accept that trace. Accept is a "consuming" operation. + /// + /// @param [in] pController The trace controller to accept the trace with + /// @param [in] supportedGpuMask Bit mask of GPU indices that are capable of participating in the trace + /// + /// The GPU mask provided to this function is used to determine which GPUs will be involved in the trace. In order + /// to decide which GPUs require GPU work, the session creates a combined mask from all registered sources and + /// checks it against the mask provided by this function. Only GPUs that are present in both masks will be able to + /// submit GPU work during the trace. + /// + /// @returns Success if the trace was successfully accepted. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal PAL error occurs. + /// + ErrorUnavailable if no trace has been requested or a trace is currently in progress + /// + ErrorInvalidPointer if nullptr is passed as pController + Pal::Result AcceptTrace(ITraceController* pController, Pal::uint64 supportedGpuMask); + + /// Begins the trace that was previously accepted by the provided controller + /// + /// This function MUST be called after a successful call to AcceptTrace. 
When this function is called, the session + /// will communicate with all registered trace sources and instruct them to begin the trace operation. The provided + /// trace controller will be notified if any GPU work is required via ITraceController::OnBeginGpuWork. The command + /// buffers returned by OnBeginGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted BEFORE the command buffers + /// generated in response to the EndTrace call! + /// + /// In situations where multiple GPUs are present, the OnBeginGpuWork function will be called once per GPU index, + /// for all GPUs that are relevant for the current trace sources. + /// + /// @returns Success if the trace was successfully started. + /// Otherwise, the error generated by OnBeginGpuWork will be returned. + Pal::Result BeginTrace(); + + /// Ends the trace that was previously started by the provided controller + /// + /// This function MUST be called after BeginTrace. When this function is called, the session will communicate with + /// all registered trace sources and instruct them to end the trace operation. The provided trace controller will + /// be notified if any GPU work is required via ITraceController::OnEndGpuWork. The command + /// buffers returned by OnEndGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted AFTER the command buffers + /// generated in response to the previous BeginTrace call! The generated command buffers MUST also complete + /// execution on the GPU BEFORE FinishTrace is called! + /// + /// In situations where multiple GPUs are present, the OnEndGpuWork function will be called once per GPU index + /// for all GPUs that are relevant for the current trace sources.
+ /// + /// The Trace Session will enter Postamble phase after EndTrace is called. + /// + /// @returns Success if the trace was successfully ended. + /// Otherwise, the error generated by OnEndGpuWork will be returned. + Pal::Result EndTrace(); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 939 + /// Ends the postamble phase, which typically runs until the detailed trace data is available. + /// This function MUST be called after EndTrace. When this function is called, the session will communicate with + /// all registered trace sources and notify them of the end of the postamble phase. The provided trace controller + /// will be notified if any GPU work is required via ITraceController::OnEndPostambleGpuWork. The command + /// buffers returned by OnEndPostambleGpuWork will be passed to each relevant trace source to record required work. + /// + /// The command buffers generated in response to this call MUST be submitted AFTER the command buffers + /// generated in response to the previous EndTrace call! The generated command buffers MUST also complete + /// execution on the GPU BEFORE FinishTrace is called! + /// + /// In situations where multiple GPUs are present, the OnEndPostambleGpuWork function will be called once per GPU index + /// for all GPUs that are relevant for the current trace sources. + /// + /// @returns Success if the postamble was successfully ended. + /// Otherwise, the error generated by OnEndPostambleGpuWork will be returned. + Pal::Result EndPostamble(); +#endif + + /// Notifies the session that the trace operation started by the provided controller has finished. + /// + /// This function MUST be called after EndPostamble. When this function is called, the session will communicate with + /// all registered trace sources and notify them that all GPU work is complete. This notification is typically + /// used by sources to retrieve data produced by the GPU and write it into the session's trace data.
+ void FinishTrace(); + + /// Writes a chunk of trace data into the session. + /// + /// Trace sources are expected to call this function whenever they produce a new data chunk that should be added + /// into the session's trace data. + /// + /// This function may ONLY be called AFTER the BeginTrace function returns and BEFORE the FinishTrace call returns! + /// + /// @param [in] pSource The trace source that generated the provided data chunk + /// @param [in] info Information about the provided chunk that will be written into the trace data + /// + /// @returns Success if the incoming data chunk was successfully written/appended into the current data stream. + /// Otherwise, one of the following errors may be returned: + /// + ErrorUnknown if an internal error occurs in PAL or an unknown error is thrown by external library + Pal::Result WriteDataChunk(ITraceSource* pSource, const TraceChunkInfo& info); + + /// Returns the current TraceSession state + /// + /// @returns Enum value of the current TraceSessionState + TraceSessionState GetTraceSessionState() const + { + return m_sessionState; + } + + /// Sets the TraceSession state based on external operations + /// + /// @param [in] sessionState TraceSessionState value to be assigned as the current state + void SetTraceSessionState(TraceSessionState sessionState) + { + m_sessionState = sessionState; + } + + /// Returns the current active controller + /// + /// @returns Pointer to the current active controller driving the TraceSession + ITraceController* GetActiveController() const + { + return m_pActiveController; + } + + /// Reports an error encountered during an active trace by inserting a "TraceError" chunk to the trace stream + /// + /// If, during a trace or the construction of an RDF chunk, an error is encountered and a chunk that was + /// expected to be written can no longer be, this function may be called to insert an error chunk in place + /// of the expected chunk. 
+ /// + /// @param [in] chunkId Text identifier of the failed RDF chunk + /// @param [in] pPayload Pointer to the data sent for the error + /// If the payloadType is a string, the string must be null-terminated + /// @param [in] payloadSize Size of the data in the payload + /// @param [in] payloadType Type of payload data represented by `pPayload` + /// @param [in] errorResult The PAL result code of the encountered error + /// + /// @returns Success if the error chunk was written successfully + Pal::Result ReportError( + const char chunkId[TextIdentifierSize], + const void* pPayload, + Pal::uint64 payloadSize, + TraceErrorPayload payloadType, + Pal::Result errorResult); + + /// Explicitly activates this TraceSession for managing traces. + /// + /// This should be called during Platform Init in response to a tool-side request to enable UberTrace tracing. + /// This signals that an active connection has been made to tool-side applications and that profiling via + /// PAL Trace should be prioritized in client drivers. + void EnableTracing() + { + m_tracingEnabled = true; + } + + /// Returns a pointer to a byte array containing the trace configuration. + /// + /// @param [out] pTraceConfigSize Sets *pTraceConfigSize to the number of bytes in the trace config + /// + /// @returns A pointer to the trace configuration data + const void* GetTraceConfig(size_t* pTraceConfigSize) const + { + PAL_ASSERT(pTraceConfigSize != nullptr); + (*pTraceConfigSize) = m_configDataSize; + return m_pConfigData; + } + + /// Indicates if a cancel-trace signal has been received and that a cancelation is in progress. + /// + /// @return true if a cancelation is in progress. 
+ bool IsCancelingTrace() const { return m_cancelingTrace; } + +private: + typedef Pal::IPlatform TraceAllocator; + + Pal::IPlatform* const m_pPlatform; // Platform associated with this TraceSession + DevDriver::IStructuredReader* m_pReader; // Stores the current JSON-based config of the TraceSession + + // RW Locks for trace sources, controllers, and RDF streams + Util::RWLock m_registerTraceSourceLock; + Util::RWLock m_registerTraceControllerLock; + Util::RWLock m_chunkAppendLock; + + // Trace sources registered with this TraceSession. + using TraceSourcesVec = Util::Vector; + TraceSourcesVec m_registeredTraceSources; + + // TraceSources and corresponding configs + typedef Util::HashMap TraceSourcesConfigMap; + TraceSourcesConfigMap m_traceSourcesConfigs; + + // Unique trace controllers registered with this TraceSession. + typedef Util::HashMap TraceControllersMap; + TraceControllersMap m_registeredTraceControllers; + + ITraceController* m_pActiveController; // The controller currently driving the TraceSession. + // We can have only one active controller at a time. + TraceSessionState m_sessionState; // Current state of the TraceSession + rdfChunkFileWriter* m_pChunkFileWriter; // Helper struct that manages create chunk file streams + // and write data chunks + rdfStream* m_pCurrentStream; // Active RDF stream for writing chunks + Pal::int32 m_currentChunkIndex; // The current chunk index of the RDF stream + bool m_tracingEnabled; // Flag indicating UberTrace tracing is enabled tool-side + void* m_pConfigData; // Buffer containing the cached trace configuration + size_t m_configDataSize; // Size of the cached trace config buffer + bool m_cancelingTrace; // Indicates that a cancel signal has been received and trace cancelation + // is in progress.
+}; +} // GpuUtil diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h b/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h index 70357377a6..cded304a46 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palAssert.h @@ -1,368 +1,368 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palAssert.h - * @brief PAL utility collection assert macros. 
- *********************************************************************************************************************** - */ - -#pragma once - -#include "palDbgPrint.h" -#include "palUtil.h" - -#if defined(_WIN32) -#include -/// OS-independent macro to force a break into the debugger. -#define PAL_DEBUG_BREAK() [[unlikely]] __debugbreak() -#else -#include -/// OS-independent macro to force a break into the debugger. -#define PAL_DEBUG_BREAK() [[unlikely]] raise(SIGTRAP); -#endif - -#if defined(_MSC_VER) && !defined(__clang__) -/// Macro to direct static code analysis to assume the specified expression will always be true. -/// Purpose is to suppress warnings from MSVC's /analysis setting. -/// Only pertains to static code analysis. Does not impact compile optimization. Not the same as C++23's [[assume]]. -# define PAL_ANALYSIS_ASSUME(_expr) __analysis_assume(_expr) -#else -/// This macro is only useful on MSVC builds. It has no meaning for other builds. -# define PAL_ANALYSIS_ASSUME(_expr) ((void)0) -#endif - -namespace Util -{ - -/// A helper function to check the size-in-bits of a 'reserved' member in a bitfield. -/// This is intended for use with static_asserts to ensure things don't go out-of-sync. -/// -/// @param [in] expectedTotalBitWidth Number of bits expected in the whole type -/// @param [in] expectedReservedBits Number of bits in the 'reserved' field -/// -/// @return true if the bit lengths of the type T match the values in the args. -/// true if the compiler lacks support to do this at compile time. -/// -/// @note This may not work properly with old compilers, but this is meant for linting anyhow. 
-template -constexpr bool CheckReservedBits( - uint32 expectedTotalBitWidth, - uint32 expectedReservedBits) -{ - bool match = false; - - // Fail if the whole size is different - if (sizeof(T) * 8 == expectedTotalBitWidth) - { - // Get the width of the reserved field by detecting when it stops filling bits - T sample = {}; - uint64 mask = 0; - uint32 reservedBits = 0; - do - { - sample = {}; - mask = (mask << 1) | 1; - reservedBits++; - sample.reserved = mask; - } while ((sample.reserved == mask) && (reservedBits < sizeof(T) * 8)); - // when the loop terminates, it's one past the size of the field. - match = (reservedBits - 1) == expectedReservedBits; - } - return match; -} - -/// A helper function to check that a series of static numeric values are sequential. -/// This is intended for use with static_asserts to ensure things don't go out-of-sync. -/// -/// @param [in] args Array of numeric values to check -/// @param [in] interval Expected interval between each (default 1, 4 is also common for field offsets) -/// -/// @return true if all the values are sequential -/// true if the compiler lacks support to do this at compile time. -/// -/// @note This may not work properly with old compilers, but this is meant for linting anyhow. -template -constexpr bool CheckSequential( - const T (&args)[N], - T interval = 1) -{ - bool isSequential = true; - for (int i = 0; i < (N - 1); i++) - { - if ((args[i] + interval) != args[i + 1]) - { - isSequential = false; - break; - } - } - return isSequential; -} - -#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) - -/// Specifies how severe an triggered assert (or alert) is. -/// -/// Both asserts and alerts can print out a debug string and break into the debugger. Asserts are to be used to verify -/// the known, assumed state of the program at any time. 
Alerts are to be used to notify the developer of a _possible_, -/// but unexpected condition such as memory allocation failure, an OS call failure, or an application behavior that is -/// known to be slow. -enum AssertCategory : uint32 -{ - AssertCatAssert = 0, - AssertCatAlert, - AssertCatCount -}; - -/// Enables/disables the specified assert category. -/// -/// Probably controlled by a setting and set during initialization. -/// -/// @param [in] category Assert category to enable/disable (asserts or alerts). -/// @param [in] enable True to enable the specified assert category, false to disable it. -extern void EnableAssertMode( - AssertCategory category, - bool enable); - -/// Returns true if the specified assert category is enabled and false otherwise. -/// -/// @param [in] category Assert category to check -extern bool IsAssertCategoryEnabled( - AssertCategory category); - -#endif - -} // namespace Util - -#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) -/// Prints an error message with the specified reason via the debug print system. A debug break will also be triggered -/// if they're currently enabled for asserts. -/// -/// @note This version of assert inlines an 'int 3' every time it is used so that each occurrence can be zapped -/// independently. This macro cannot be used in assignment operations. -#define PAL_TRIGGER_ASSERT(_pFormat, ...) [[unlikely]] \ -do { \ - PAL_DPERROR(_pFormat, ##__VA_ARGS__); \ - if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAssert)) \ - { \ - PAL_DEBUG_BREAK(); \ - } \ -} while (false) - -/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the -/// specified reason. -/// -/// @note This assert should not be used in constant evaluated contexts (e.g., constexpr functions). 
-// -// This previously said: -// if (_expr_eval == false) [[unlikely]] -// { -// PAL_TRIGGER_ASSERT(...); -// } -// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot -// attach it to a compound statement. So: -// 1. we ignore PAL coding standards and don't use a compound statement; -// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ASSERT already has one. -#define PAL_ASSERT_MSG(_expr, _pReasonFmt, ...) \ -do { \ - const bool _expr_eval = static_cast(_expr); \ - if (_expr_eval == false) \ - PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ - PAL_ANALYSIS_ASSUME(_expr_eval); \ -} while (false) - -#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ < 6) - -// Function to circumvent gcc 5.x inability to use lambdas in unevaluated constant expression contexts. -constexpr void PalTriggerAssertImpl( - const char* pFormat, - const char* pExpr, - const char* pFile, - int line, - const char* pFunc) -{ - // pExpr is always not nullptr, as it's supposed to be a preprocessor string, but it does convince gcc - // to compile PalTriggerAssertImpl() as potentially constexpr - pExpr != nullptr ? - [&] - { - Util::DbgPrintf( - Util::DbgPrintCatErrorMsg, - Util::DbgPrintStyleDefault, - pFormat, - pExpr, - pFile, - line, - pFunc); - if (Util::IsAssertCategoryEnabled(Util::AssertCatAssert)) - { - PAL_DEBUG_BREAK(); - } - return 0; - }() - : 0; -} - -// gcc 5.4 implementation of PAL_CONSTEXPR_ASSERT_MSG that ignores the additional reason for the assertion -// -// This previously said: -// if (_expr_eval == false) [[unlikely]] -// { -// PalTriggerAssertImpl(...); -// } -// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot -// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. -#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...) 
\ -do { \ - const bool _expr_eval = static_cast(_expr); \ - if (_expr_eval == false) [[unlikely]] \ - PalTriggerAssertImpl("Assertion failed: %s (%s:%d:%s)", #_expr, __FILE__, __LINE__, __func__); \ - PAL_ANALYSIS_ASSUME(_expr_eval); \ -} while (false) - -#else - -/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the -/// specified reason. -/// -/// @note This assert should be used in constant evaluated contexts (e.g., constexpr functions). -/// @note This assert uses an immediately-invoked function expression in the form of an internal lambda to signal a -/// failed assert. Since PAL_TRIGGER_ASSERT is not constexpr, an _expr that evaluates to false will fail to -/// compile the function operator of the lambda. -// -// This previously said: -// if (_expr_eval == false) [[unlikely]] -// { -// [&] { PAL_TRIGGER_ASSERT(...); }(); -// } -// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot -// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. -#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...) \ -do { \ - const bool _expr_eval = static_cast(_expr); \ - if (_expr_eval == false) [[unlikely]] \ - [&] { PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); }(); \ - PAL_ANALYSIS_ASSUME(_expr_eval); \ -} while (false) - -#endif - -/// Calls the PAL_ASSERT_MSG macro with a generic reason string -#define PAL_ASSERT(_expr) PAL_ASSERT_MSG(_expr, "%s", "Unknown") - -/// Calls the PAL_CONSTEXPR_ASSERT_MSG macro with a generic reason string -#define PAL_CONSTEXPR_ASSERT(_expr) PAL_CONSTEXPR_ASSERT_MSG(_expr, "%s", "Unknown") - -#if DEBUG -/// Debug build only PAL assert, the typical usage is when make an assertion on a debug-only variables. -/// The only difference than PAL assert is it's empty in release mode. 
-#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) \ -do { \ - PAL_ASSERT(_expr); \ -} while (false) -#else -#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) -#endif - -/// If the expression evaluates to true, then a warning message with the specified reason will be printed via the -/// debug print system. A debug break will also be triggered if they're currently enabled for alerts. -/// -/// @note This is the opposite polarity of asserts. The assert macro _asserts_ that the specified condition is true. -/// While the alert macro _alerts_ the developer if the specified condition is true. -/// -/// This macro should be used in places where an assert is inappropriate because an error condition is _possible_, but -/// not typically expected. For example, asserting that an OS call succeeded should be avoided since there cannot be an -/// assumption that it will succeed. Nonetheless, a developer may want to be alerted immediately and dropped into the -/// debugger when such a failure occurs. -#define PAL_TRIGGER_ALERT(_pFormat, ...) [[unlikely]] \ -do { \ - PAL_DPWARN(_pFormat, ##__VA_ARGS__); \ - if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAlert)) \ - { \ - PAL_DEBUG_BREAK(); \ - } \ -} while (false) - -// -// This previously said: -// if (_expr) [[unlikely]] -// { -// PAL_TRIGGER_ASSERT(...); -// } -// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot -// attach it to a compound statement. So: -// 1. we ignore PAL coding standards and don't use a compound statement; -// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ASSERT already has one. -#define PAL_ALERT_MSG(_expr, _pReasonFmt, ...) 
\ -do { \ - if (_expr) \ - PAL_TRIGGER_ALERT("Alert triggered: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ -} while (false) - -/// Calls the PAL_ALERT_MSG macro with a generic reason string -#define PAL_ALERT(_expr) PAL_ALERT_MSG(_expr, "%s", "Unknown") - -/// Convenience macro that asserts if something has never been tested. -#define PAL_NOT_TESTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code Not Tested! | Reason: " _pReasonFmt, ##__VA_ARGS__) -#define PAL_NOT_TESTED() PAL_NOT_TESTED_MSG("%s", "Unknown") - -/// Convenience macro that asserts if something has not been implemented. -#define PAL_NOT_IMPLEMENTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Not Implemented! | Reason: " _pReasonFmt, ##__VA_ARGS__) -#define PAL_NOT_IMPLEMENTED() PAL_NOT_IMPLEMENTED_MSG("%s", "Unknown") - -/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. -#define PAL_NEVER_CALLED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code should never be called! | Reason: " _pReasonFmt, ##__VA_ARGS__) -#define PAL_NEVER_CALLED() PAL_NEVER_CALLED_MSG("%s", "Unknown") - -/// Convenience macro that always asserts. Expect this to be used instead of PAL_ASSERT(false). -#define PAL_ASSERT_ALWAYS_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Unconditional Assert | Reason: " _pReasonFmt, ##__VA_ARGS__) -#define PAL_ASSERT_ALWAYS() PAL_ASSERT_ALWAYS_MSG("%s", "Unknown") - -/// Convenience macro that always alerts. Expect this to be used instead of PAL_ALERT(true). -#define PAL_ALERT_ALWAYS_MSG(_pReasonFmt, ...) PAL_TRIGGER_ALERT("Unconditional Alert | Reason: " _pReasonFmt, ##__VA_ARGS__) -#define PAL_ALERT_ALWAYS() PAL_ALERT_ALWAYS_MSG("%s", "Unknown") - -#else - -#define PAL_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) -#define PAL_CONSTEXPR_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) -#define PAL_ASSERT_MSG(_expr, ...) PAL_ANALYSIS_ASSUME(_expr) -#define PAL_CONSTEXPR_ASSERT_MSG(_expr, ...) 
PAL_ANALYSIS_ASSUME(_expr) -#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) -#define PAL_ALERT(_expr) ((void)0) -#define PAL_ALERT_MSG(_expr, ...) ((void)0) -#define PAL_NOT_TESTED() [[unlikely]] ((void)0) -#define PAL_NOT_TESTED_MSG(...) [[unlikely]] ((void)0) -#define PAL_NOT_IMPLEMENTED() [[unlikely]] ((void)0) -#define PAL_NOT_IMPLEMENTED_MSG(...) [[unlikely]] ((void)0) -#define PAL_NEVER_CALLED() [[unlikely]] ((void)0) -#define PAL_NEVER_CALLED_MSG(...) [[unlikely]] ((void)0) -#define PAL_ASSERT_ALWAYS() [[unlikely]] ((void)0) -#define PAL_ASSERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) -#define PAL_ALERT_ALWAYS() [[unlikely]] ((void)0) -#define PAL_ALERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) - -#endif - +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palAssert.h + * @brief PAL utility collection assert macros. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palDbgPrint.h" +#include "palUtil.h" + +#if defined(_WIN32) +#include +/// OS-independent macro to force a break into the debugger. +#define PAL_DEBUG_BREAK() [[unlikely]] __debugbreak() +#else +#include +/// OS-independent macro to force a break into the debugger. +#define PAL_DEBUG_BREAK() [[unlikely]] raise(SIGTRAP); +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +/// Macro to direct static code analysis to assume the specified expression will always be true. +/// Purpose is to suppress warnings from MSVC's /analysis setting. +/// Only pertains to static code analysis. Does not impact compile optimization. Not the same as C++23's [[assume]]. +# define PAL_ANALYSIS_ASSUME(_expr) __analysis_assume(_expr) +#else +/// This macro is only useful on MSVC builds. It has no meaning for other builds. +# define PAL_ANALYSIS_ASSUME(_expr) ((void)0) +#endif + +namespace Util +{ + +/// A helper function to check the size-in-bits of a 'reserved' member in a bitfield. +/// This is intended for use with static_asserts to ensure things don't go out-of-sync. 
+/// +/// @param [in] expectedTotalBitWidth Number of bits expected in the whole type +/// @param [in] expectedReservedBits Number of bits in the 'reserved' field +/// +/// @return true if the bit lengths of the type T match the values in the args. +/// true if the compiler lacks support to do this at compile time. +/// +/// @note This may not work properly with old compilers, but this is meant for linting anyhow. +template +constexpr bool CheckReservedBits( + uint32 expectedTotalBitWidth, + uint32 expectedReservedBits) +{ + bool match = false; + + // Fail if the whole size is different + if (sizeof(T) * 8 == expectedTotalBitWidth) + { + // Get the width of the reserved field by detecting when it stops filling bits + T sample = {}; + uint64 mask = 0; + uint32 reservedBits = 0; + do + { + sample = {}; + mask = (mask << 1) | 1; + reservedBits++; + sample.reserved = mask; + } while ((sample.reserved == mask) && (reservedBits < sizeof(T) * 8)); + // when the loop terminates, it's one past the size of the field. + match = (reservedBits - 1) == expectedReservedBits; + } + return match; +} + +/// A helper function to check that a series of static numeric values are sequential. +/// This is intended for use with static_asserts to ensure things don't go out-of-sync. +/// +/// @param [in] args Array of numeric values to check +/// @param [in] interval Expected interval between each (default 1, 4 is also common for field offsets) +/// +/// @return true if all the values are sequential +/// true if the compiler lacks support to do this at compile time. +/// +/// @note This may not work properly with old compilers, but this is meant for linting anyhow. 
+template +constexpr bool CheckSequential( + const T (&args)[N], + T interval = 1) +{ + bool isSequential = true; + for (int i = 0; i < (N - 1); i++) + { + if ((args[i] + interval) != args[i + 1]) + { + isSequential = false; + break; + } + } + return isSequential; +} + +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) + +/// Specifies how severe a triggered assert (or alert) is. +/// +/// Both asserts and alerts can print out a debug string and break into the debugger. Asserts are to be used to verify +/// the known, assumed state of the program at any time. Alerts are to be used to notify the developer of a _possible_, +/// but unexpected condition such as memory allocation failure, an OS call failure, or an application behavior that is +/// known to be slow. +enum AssertCategory : uint32 +{ + AssertCatAssert = 0, + AssertCatAlert, + AssertCatCount +}; + +/// Enables/disables the specified assert category. +/// +/// Probably controlled by a setting and set during initialization. +/// +/// @param [in] category Assert category to enable/disable (asserts or alerts). +/// @param [in] enable True to enable the specified assert category, false to disable it. +extern void EnableAssertMode( + AssertCategory category, + bool enable); + +/// Returns true if the specified assert category is enabled and false otherwise. +/// +/// @param [in] category Assert category to check +extern bool IsAssertCategoryEnabled( + AssertCategory category); + +#endif + +} // namespace Util + +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) +/// Prints an error message with the specified reason via the debug print system. A debug break will also be triggered +/// if they're currently enabled for asserts. +/// +/// @note This version of assert inlines an 'int 3' every time it is used so that each occurrence can be zapped +/// independently. This macro cannot be used in assignment operations. +#define PAL_TRIGGER_ASSERT(_pFormat, ...) 
[[unlikely]] \ +do { \ + PAL_DPERROR(_pFormat, ##__VA_ARGS__); \ + if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAssert)) \ + { \ + PAL_DEBUG_BREAK(); \ + } \ +} while (false) + +/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the +/// specified reason. +/// +/// @note This assert should not be used in constant evaluated contexts (e.g., constexpr functions). +// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// PAL_TRIGGER_ASSERT(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So: +// 1. we ignore PAL coding standards and don't use a compound statement; +// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ASSERT already has one. +#define PAL_ASSERT_MSG(_expr, _pReasonFmt, ...) \ +do { \ + const bool _expr_eval = static_cast(_expr); \ + if (_expr_eval == false) \ + PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#if !defined(__clang__) && !defined(_MSC_VER) && (__GNUC__ < 6) + +// Function to circumvent gcc 5.x inability to use lambdas in unevaluated constant expression contexts. +constexpr void PalTriggerAssertImpl( + const char* pFormat, + const char* pExpr, + const char* pFile, + int line, + const char* pFunc) +{ + // pExpr is always not nullptr, as it's supposed to be a preprocessor string, but it does convince gcc + // to compile PalTriggerAssertImpl() as potentially constexpr + pExpr != nullptr ? 
+ [&] + { + Util::DbgPrintf( + Util::DbgPrintCatErrorMsg, + Util::DbgPrintStyleDefault, + pFormat, + pExpr, + pFile, + line, + pFunc); + if (Util::IsAssertCategoryEnabled(Util::AssertCatAssert)) + { + PAL_DEBUG_BREAK(); + } + return 0; + }() + : 0; +} + +// gcc 5.4 implementation of PAL_CONSTEXPR_ASSERT_MSG that ignores the additional reason for the assertion +// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// PalTriggerAssertImpl(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...) \ +do { \ + const bool _expr_eval = static_cast(_expr); \ + if (_expr_eval == false) [[unlikely]] \ + PalTriggerAssertImpl("Assertion failed: %s (%s:%d:%s)", #_expr, __FILE__, __LINE__, __func__); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#else + +/// If the expression evaluates to false, then it calls the PAL_TRIGGER_ASSERT macro with an error message with the +/// specified reason. +/// +/// @note This assert should be used in constant evaluated contexts (e.g., constexpr functions). +/// @note This assert uses an immediately-invoked function expression in the form of an internal lambda to signal a +/// failed assert. Since PAL_TRIGGER_ASSERT is not constexpr, an _expr that evaluates to false will fail to +/// compile the function operator of the lambda. +// +// This previously said: +// if (_expr_eval == false) [[unlikely]] +// { +// [&] { PAL_TRIGGER_ASSERT(...); }(); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So we ignore PAL coding standards and don't use a compound statement. +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, _pReasonFmt, ...) 
\ +do { \ + const bool _expr_eval = static_cast(_expr); \ + if (_expr_eval == false) [[unlikely]] \ + [&] { PAL_TRIGGER_ASSERT("Assertion failed: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); }(); \ + PAL_ANALYSIS_ASSUME(_expr_eval); \ +} while (false) + +#endif + +/// Calls the PAL_ASSERT_MSG macro with a generic reason string +#define PAL_ASSERT(_expr) PAL_ASSERT_MSG(_expr, "%s", "Unknown") + +/// Calls the PAL_CONSTEXPR_ASSERT_MSG macro with a generic reason string +#define PAL_CONSTEXPR_ASSERT(_expr) PAL_CONSTEXPR_ASSERT_MSG(_expr, "%s", "Unknown") + +#if DEBUG +/// Debug-build-only PAL assert; the typical usage is when making an assertion on debug-only variables. +/// The only difference from PAL_ASSERT is that it's empty in release mode. +#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) \ +do { \ + PAL_ASSERT(_expr); \ +} while (false) +#else +#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) +#endif + +/// If the expression evaluates to true, then a warning message with the specified reason will be printed via the +/// debug print system. A debug break will also be triggered if they're currently enabled for alerts. +/// +/// @note This is the opposite polarity of asserts. The assert macro _asserts_ that the specified condition is true. +/// While the alert macro _alerts_ the developer if the specified condition is true. +/// +/// This macro should be used in places where an assert is inappropriate because an error condition is _possible_, but +/// not typically expected. For example, asserting that an OS call succeeded should be avoided since there cannot be an +/// assumption that it will succeed. Nonetheless, a developer may want to be alerted immediately and dropped into the +/// debugger when such a failure occurs. +#define PAL_TRIGGER_ALERT(_pFormat, ...) 
[[unlikely]] \ +do { \ + PAL_DPWARN(_pFormat, ##__VA_ARGS__); \ + if (::Util::IsAssertCategoryEnabled(::Util::AssertCatAlert)) \ + { \ + PAL_DEBUG_BREAK(); \ + } \ +} while (false) + +// +// This previously said: +// if (_expr) [[unlikely]] +// { +// PAL_TRIGGER_ALERT(...); +// } +// However there is a bug in the initial gcc implementation of [[unlikely]] that means you cannot +// attach it to a compound statement. So: +// 1. we ignore PAL coding standards and don't use a compound statement; +// 2. we don't use [[unlikely]] as the expansion of PAL_TRIGGER_ALERT already has one. +#define PAL_ALERT_MSG(_expr, _pReasonFmt, ...) \ +do { \ + if (_expr) \ + PAL_TRIGGER_ALERT("Alert triggered: %s | Reason: " _pReasonFmt, #_expr, ##__VA_ARGS__); \ +} while (false) + +/// Calls the PAL_ALERT_MSG macro with a generic reason string +#define PAL_ALERT(_expr) PAL_ALERT_MSG(_expr, "%s", "Unknown") + +/// Convenience macro that asserts if something has never been tested. +#define PAL_NOT_TESTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code Not Tested! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NOT_TESTED() PAL_NOT_TESTED_MSG("%s", "Unknown") + +/// Convenience macro that asserts if something has not been implemented. +#define PAL_NOT_IMPLEMENTED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Not Implemented! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NOT_IMPLEMENTED() PAL_NOT_IMPLEMENTED_MSG("%s", "Unknown") + +/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. +#define PAL_NEVER_CALLED_MSG(_pReasonFmt, ...) PAL_TRIGGER_ASSERT("Code should never be called! | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_NEVER_CALLED() PAL_NEVER_CALLED_MSG("%s", "Unknown") + +/// Convenience macro that always asserts. Expect this to be used instead of PAL_ASSERT(false). +#define PAL_ASSERT_ALWAYS_MSG(_pReasonFmt, ...) 
PAL_TRIGGER_ASSERT("Unconditional Assert | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_ASSERT_ALWAYS() PAL_ASSERT_ALWAYS_MSG("%s", "Unknown") + +/// Convenience macro that always alerts. Expect this to be used instead of PAL_ALERT(true). +#define PAL_ALERT_ALWAYS_MSG(_pReasonFmt, ...) PAL_TRIGGER_ALERT("Unconditional Alert | Reason: " _pReasonFmt, ##__VA_ARGS__) +#define PAL_ALERT_ALWAYS() PAL_ALERT_ALWAYS_MSG("%s", "Unknown") + +#else + +#define PAL_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_CONSTEXPR_ASSERT(_expr) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_ASSERT_MSG(_expr, ...) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_CONSTEXPR_ASSERT_MSG(_expr, ...) PAL_ANALYSIS_ASSUME(_expr) +#define PAL_DEBUG_BUILD_ONLY_ASSERT(_expr) ((void)0) +#define PAL_ALERT(_expr) ((void)0) +#define PAL_ALERT_MSG(_expr, ...) ((void)0) +#define PAL_NOT_TESTED() [[unlikely]] ((void)0) +#define PAL_NOT_TESTED_MSG(...) [[unlikely]] ((void)0) +#define PAL_NOT_IMPLEMENTED() [[unlikely]] ((void)0) +#define PAL_NOT_IMPLEMENTED_MSG(...) [[unlikely]] ((void)0) +#define PAL_NEVER_CALLED() [[unlikely]] ((void)0) +#define PAL_NEVER_CALLED_MSG(...) [[unlikely]] ((void)0) +#define PAL_ASSERT_ALWAYS() [[unlikely]] ((void)0) +#define PAL_ASSERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) +#define PAL_ALERT_ALWAYS() [[unlikely]] ((void)0) +#define PAL_ALERT_ALWAYS_MSG(...) [[unlikely]] ((void)0) + +#endif + diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h b/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h index f9105cd62c..765a6b70e7 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palAutoBuffer.h @@ -1,212 +1,212 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palAutoBuffer.h - * @brief PAL utility collection AutoBuffer class definition. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palSpan.h" -#include "palSysMemory.h" - -namespace Util -{ - -/** - *********************************************************************************************************************** - * @brief Safe version of C99's variable-length arrays. 
- * - * The general idea is that this class encapsulates a variable-length array where we expect the size required to not - * exceed the 'defaultCapacity' template parameter most of the time. In those "normal" cases, this buffer will - * reference a static array of size 'defaultCapacity', but if the constructor's parameter exceeds defaultCapacity, then - * a dynamic array will be allocated from the heap to satisfy the space requirements. The destructor will clean-up any - * dynamic allocation made by the constructor. - * - * This class violates several PAL coding conventions, but for good reason: - * - * - We have overloaded the [] (array-element-accessor) operator to make using this class just like using a regular - * array, which it semantically represents. - * - In order to return array elements by-reference instead of by-value, we need to use C++ references in the - * overloaded operators because this is required by C++. - * - * This class __does not__ clear the contents of the static or dynamic arrays, for performance reasons. If a client - * needs the buffer to be cleared, it must do the memset itself. (However, if 'Item' is a class type rather than - * plain-old-data, the default c'tor will be invoked.) - *********************************************************************************************************************** - */ -template -class AutoBuffer -{ -public: - /// Constructor. - /// - /// The object is initialized to use the static array of items if the required capacity is less than or equal to the - /// default capacity. Otherwise, a larger array is allocated on the heap. - /// - /// @param [in] requiredCapacity Number of items actually required (unknown until runtime). - /// @param [in] pAllocator The allocator that will allocate memory if required. 
- AutoBuffer( - size_t requiredCapacity, - Allocator*const pAllocator) - : - m_capacity(requiredCapacity), - m_pBuffer(reinterpret_cast(m_localBuffer)), - m_pAllocator(pAllocator) - { - if (requiredCapacity > defaultCapacity) - { - // Create dynamically allocated array, by allocating memory and constructing its objects. - // On failure, to avoid subtle bugs from misuse, AutoBuffer will be in a zombie state with zero capacity. - m_pBuffer = PAL_NEW_ARRAY(Item, requiredCapacity, pAllocator, AllocInternalTemp); - if (m_pBuffer == nullptr) - { - m_capacity = 0; - } - } - else if (!std::is_trivial::value) - { - // Explicitly construct all objects of non-trivial type in the local buffer. - for (uint32 idx = 0; idx < m_capacity; ++idx) - { - PAL_PLACEMENT_NEW(m_pBuffer + idx) Item(); - } - } - } - - /// Destructor. - /// - /// Cleans up the dynamically allocated buffer if we allocated one. - ~AutoBuffer() - { - if (m_pBuffer != reinterpret_cast(m_localBuffer)) - { - // Destory dynamically allocated array, by destroying its objects and freeing memory. - PAL_SAFE_DELETE_ARRAY(m_pBuffer, m_pAllocator); - } - else if (!std::is_trivial::value) - { - // Explicitly destroy all objects of non-trivial type from the local buffer. - for (uint32 idx = 0; idx < m_capacity; ++idx) - { - m_pBuffer[idx].~Item(); - } - } - } - - /// Getter for the capacity of the buffer. - /// - /// Clients can use this function to determine if the constuctor's allocation succeeded. - /// - /// @returns Size of the array in bytes. Should match the requiredCapacity parameter passed to the constructor - /// unless a dynamic memory allocation failed. - constexpr size_t Capacity() const noexcept { return m_capacity; } - - /// Getter for the size of this buffer, in bytes. - constexpr size_t SizeBytes() const noexcept { return (sizeof(Item) * m_capacity); } - - /// Accessor for the nth element of this buffer. 
- const Item& operator[](size_t n) const - { - PAL_ASSERT(n < m_capacity); - return m_pBuffer[n]; - } - - /// Non-const accessor for the nth element of this buffer. - Item& operator[](size_t n) - { - PAL_ASSERT(n < m_capacity); - return m_pBuffer[n]; - } - - ///@{ - /// Implicitly gets the current contents of the buffer as a Span. - /// - /// @returns The contents of the buffer as a Span; same as Span(Data(), Size()). - operator Span() { return Span(Data(), Capacity()); } - operator Span() const { return Span(Data(), Capacity()); } - ///@} - - /// Returns pointer to the underlying buffer serving as data storage. - /// The returned pointer defines always valid range [Data(), Data() + Capacity()). - /// - /// @returns Pointer to the underlying data storage for read & write access. - /// The returned pointer contains address of the first element. - constexpr Item* Data() noexcept { return m_pBuffer; } - - /// Returns pointer to the underlying buffer serving as data storage. - /// The returned pointer defines always valid range [Data(), Data() + Capacity()), - /// even if the container is empty (Data() is not dereferenceable in that case). - /// - /// @returns Pointer to the underlying data storage for read only access. - /// The returned pointer contains address of the first element. - constexpr const Item* Data() const noexcept { return m_pBuffer; } - - ///@{ - /// @internal Satisfies concept `range_expression`, using Item* as `iterator` and 64-bit size and difference types - /// - /// @note - These are a convenience intended to be used by c++ language features such as range-based-for-loops. 
- using value_type = Item; - using reference = Item&; - using const_reference = const Item&; - using iterator = Item*; - using const_iterator = const Item*; - using difference_type = ptrdiff_t; - using size_type = size_t; - - constexpr iterator begin() noexcept { return Data(); } - constexpr iterator end() noexcept { return Data() + Capacity(); } - constexpr const_iterator begin() const noexcept { return Data(); } - constexpr const_iterator end() const noexcept { return Data() + Capacity(); } - constexpr const_iterator cbegin() const noexcept { return Data(); } - constexpr const_iterator cend() const noexcept { return Data() + Capacity(); } - [[nodiscard]] constexpr bool empty() const noexcept { return Capacity() == 0; } - constexpr size_type size() const noexcept { return Capacity(); } - ///@} - -private: - // This is a POD-type that exactly fits one Item value. - using ValueStorage = typename std::aligned_storage::type; - - // Capacity of this buffer (in Items). - size_t m_capacity; - - // Buffer pointer this object uses to access the buffer's elements: if the required capacity exceeds the default - // capacity, this points to a dynamic array of Items. Otherwise, this points to m_localBuffer. - Item* m_pBuffer; - - // Static array providing storage for Items which we expect most objects of this type to end up using. - ValueStorage m_localBuffer[defaultCapacity]; - - // Allocator for this AutoBuffer. - Allocator*const m_pAllocator; - - PAL_DISALLOW_DEFAULT_CTOR(AutoBuffer); - PAL_DISALLOW_COPY_AND_ASSIGN(AutoBuffer); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palAutoBuffer.h + * @brief PAL utility collection AutoBuffer class definition. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palSpan.h" +#include "palSysMemory.h" + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Safe version of C99's variable-length arrays. 
+ * + * The general idea is that this class encapsulates a variable-length array where we expect the size required to not + * exceed the 'defaultCapacity' template parameter most of the time. In those "normal" cases, this buffer will + * reference a static array of size 'defaultCapacity', but if the constructor's parameter exceeds defaultCapacity, then + * a dynamic array will be allocated from the heap to satisfy the space requirements. The destructor will clean-up any + * dynamic allocation made by the constructor. + * + * This class violates several PAL coding conventions, but for good reason: + * + * - We have overloaded the [] (array-element-accessor) operator to make using this class just like using a regular + * array, which it semantically represents. + * - In order to return array elements by-reference instead of by-value, we need to use C++ references in the + * overloaded operators because this is required by C++. + * + * This class __does not__ clear the contents of the static or dynamic arrays, for performance reasons. If a client + * needs the buffer to be cleared, it must do the memset itself. (However, if 'Item' is a class type rather than + * plain-old-data, the default c'tor will be invoked.) + *********************************************************************************************************************** + */ +template +class AutoBuffer +{ +public: + /// Constructor. + /// + /// The object is initialized to use the static array of items if the required capacity is less than or equal to the + /// default capacity. Otherwise, a larger array is allocated on the heap. + /// + /// @param [in] requiredCapacity Number of items actually required (unknown until runtime). + /// @param [in] pAllocator The allocator that will allocate memory if required. 
+ AutoBuffer( + size_t requiredCapacity, + Allocator*const pAllocator) + : + m_capacity(requiredCapacity), + m_pBuffer(reinterpret_cast(m_localBuffer)), + m_pAllocator(pAllocator) + { + if (requiredCapacity > defaultCapacity) + { + // Create dynamically allocated array, by allocating memory and constructing its objects. + // On failure, to avoid subtle bugs from misuse, AutoBuffer will be in a zombie state with zero capacity. + m_pBuffer = PAL_NEW_ARRAY(Item, requiredCapacity, pAllocator, AllocInternalTemp); + if (m_pBuffer == nullptr) + { + m_capacity = 0; + } + } + else if (!std::is_trivial::value) + { + // Explicitly construct all objects of non-trivial type in the local buffer. + for (uint32 idx = 0; idx < m_capacity; ++idx) + { + PAL_PLACEMENT_NEW(m_pBuffer + idx) Item(); + } + } + } + + /// Destructor. + /// + /// Cleans up the dynamically allocated buffer if we allocated one. + ~AutoBuffer() + { + if (m_pBuffer != reinterpret_cast(m_localBuffer)) + { + // Destory dynamically allocated array, by destroying its objects and freeing memory. + PAL_SAFE_DELETE_ARRAY(m_pBuffer, m_pAllocator); + } + else if (!std::is_trivial::value) + { + // Explicitly destroy all objects of non-trivial type from the local buffer. + for (uint32 idx = 0; idx < m_capacity; ++idx) + { + m_pBuffer[idx].~Item(); + } + } + } + + /// Getter for the capacity of the buffer. + /// + /// Clients can use this function to determine if the constuctor's allocation succeeded. + /// + /// @returns Size of the array in bytes. Should match the requiredCapacity parameter passed to the constructor + /// unless a dynamic memory allocation failed. + constexpr size_t Capacity() const noexcept { return m_capacity; } + + /// Getter for the size of this buffer, in bytes. + constexpr size_t SizeBytes() const noexcept { return (sizeof(Item) * m_capacity); } + + /// Accessor for the nth element of this buffer. 
+ const Item& operator[](size_t n) const + { + PAL_ASSERT(n < m_capacity); + return m_pBuffer[n]; + } + + /// Non-const accessor for the nth element of this buffer. + Item& operator[](size_t n) + { + PAL_ASSERT(n < m_capacity); + return m_pBuffer[n]; + } + + ///@{ + /// Implicitly gets the current contents of the buffer as a Span. + /// + /// @returns The contents of the buffer as a Span; same as Span(Data(), Size()). + operator Span() { return Span(Data(), Capacity()); } + operator Span() const { return Span(Data(), Capacity()); } + ///@} + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + Capacity()). + /// + /// @returns Pointer to the underlying data storage for read & write access. + /// The returned pointer contains address of the first element. + constexpr Item* Data() noexcept { return m_pBuffer; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + Capacity()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @returns Pointer to the underlying data storage for read only access. + /// The returned pointer contains address of the first element. + constexpr const Item* Data() const noexcept { return m_pBuffer; } + + ///@{ + /// @internal Satisfies concept `range_expression`, using Item* as `iterator` and 64-bit size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as range-based-for-loops. 
+ using value_type = Item; + using reference = Item&; + using const_reference = const Item&; + using iterator = Item*; + using const_iterator = const Item*; + using difference_type = ptrdiff_t; + using size_type = size_t; + + constexpr iterator begin() noexcept { return Data(); } + constexpr iterator end() noexcept { return Data() + Capacity(); } + constexpr const_iterator begin() const noexcept { return Data(); } + constexpr const_iterator end() const noexcept { return Data() + Capacity(); } + constexpr const_iterator cbegin() const noexcept { return Data(); } + constexpr const_iterator cend() const noexcept { return Data() + Capacity(); } + [[nodiscard]] constexpr bool empty() const noexcept { return Capacity() == 0; } + constexpr size_type size() const noexcept { return Capacity(); } + ///@} + +private: + // This is a POD-type that exactly fits one Item value. + using ValueStorage = typename std::aligned_storage::type; + + // Capacity of this buffer (in Items). + size_t m_capacity; + + // Buffer pointer this object uses to access the buffer's elements: if the required capacity exceeds the default + // capacity, this points to a dynamic array of Items. Otherwise, this points to m_localBuffer. + Item* m_pBuffer; + + // Static array providing storage for Items which we expect most objects of this type to end up using. + ValueStorage m_localBuffer[defaultCapacity]; + + // Allocator for this AutoBuffer. 
+ Allocator*const m_pAllocator; + + PAL_DISALLOW_DEFAULT_CTOR(AutoBuffer); + PAL_DISALLOW_COPY_AND_ASSIGN(AutoBuffer); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h index 139566961b..3f3a9b8628 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocator.h @@ -1,202 +1,202 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palBuddyAllocator.h - * @brief PAL utility BuddyAllocator class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palUtil.h" -#include "palHashSet.h" -#include "palHashMap.h" -#include "palMutex.h" - -namespace Util -{ - -/** - *********************************************************************************************************************** - * @brief Buddy Allocator - * - * Responsible for managing small GPU memory requests by allocating a large base allocation and dividing it into - * appropriately sized suballocation blocks. - *********************************************************************************************************************** - */ -template -class BuddyAllocator -{ -public: - /// Constructor. - /// - /// @param [in] pAllocator The allocator that will allocate memory if required. - /// @param [in] baseAllocSize The size of the base allocation this buddy allocator suballocates. - /// @param [in] minAllocSize The size of the smallest block this buddy allocator can allocate. - BuddyAllocator( - Allocator* pAllocator, - gpusize baseAllocSize, - gpusize minAllocSize); - ~BuddyAllocator(); - - /// Initializes the buddy allocator. - /// - /// @returns Success if the buddy allocator has been successfully initialized. - Result Init(); - - /// Suballocates a block from the base allocation that this buddy allocator manages. Expects @ref ClaimGpuMemory to - /// be called directly before it. 
If a memory manager with multiple buddyAllocators is used, use pattern should - /// be: Iterate through buddyAllocators calling ClaimGpuMemory, if one returns @ref Success break out of the loop, - /// then call Allocate on that buddyAllocator. If none return @ref Success, then a new buddyAllocator needs to be - /// created. The purpose of splitting up buddyAllocator selection and Allocation is to reduce lock contention in - /// multithreaded memory managers. - /// - /// @param [in] size The size of the requested suballocation. - /// @param [in] alignment The alignment requirements of the requested suballocation. - /// @param [out] pOffset The offset the suballocated block starts within the base allocation. - /// - /// @returns Success if the allocation succeeded, @ref ErrorOutOfMemory if there isn't enough system memory to - /// fulfill the request, or @ref ErrorOutOfGpuMemory if there isn't a large enough block free in the - /// base allocation to fulfill the request. - /// - /// @warning Unless @ref ClaimGpuMemory is called before every single call, the results of @ref Allocate will - /// be invalid. If @ref ClaimGpuMemory returns @ref Success, then @ref ErrorOutOfGpuMemory will never be - /// returned. - Result Allocate( - gpusize size, - gpusize alignment, - gpusize* pOffset); - - /// Frees a previously allocated suballocation. - /// - /// @param [in] offset The offset the suballocated block starts within the base allocation. - /// @param [in] size Optional parameter specifying the size of the original allocation. - /// @param [in] alignment Optional parameter specifying the alignment of the original allocation. - void Free( - gpusize offset, - gpusize size = 0, - gpusize alignment = 0); - - /// Tells whether the base allocation is completely free. If the returned value is true then the caller is safe - /// to deallocate the base allocation. 
- bool IsEmpty() const - { - return (m_numSuballocations == 0); - } - - /// Returns the size of the largest allocation that can be suballocated with this buddy allocator. - gpusize MaximumAllocationSize() const; - - /// Claims (doesn't allocate) some memory, used to quickly determine if a pool of memory has availible memory. - /// Doesn't affect internal state unless Result::Success is returned - /// - /// @param [in] size The size of the requested suballocation. - /// @param [in] alignment The alignment requirements of the requested suballocation. - /// - /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, - /// @ref ErrorOutOfGpuMemory if there is not enough memory - /// - /// @warning Unless this is called to test availible memory before every call to Allocate, then the results will not - /// be valid. - Result ClaimGpuMemory( - gpusize size, - gpusize alignment); - - /// Checks if @ref ClaimGpuMemory can actually claim memory, can be used to find the best fit pool. This function - /// does NOT acquire a lock on the structures ClaimGpuMemory uses, and does NOT claim or allocate the memory. - /// - /// @param [in] size The size of the requested suballocation. - /// @param [in] alignment The alignment requirements of the requested suballocation. - /// @param [out] pKval The highest kval that will need to be split will be stored here. 
- /// - /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, - /// @ref ErrorOutOfGpuMemory if there is not enough memory - /// - Result CheckIfOpenMemory( - gpusize size, - gpusize alignment, - uint32* pKval); - -private: - typedef Util::HashSet FreeSet; - typedef Util::HashMap UsedMap; - - Result GetNextFreeBlock( - uint32 kval, - gpusize* pOffset); - - Result FreeBlock(gpusize offset); - - static constexpr gpusize KvalToSize(uint32 kVal) { return (1ull << kVal); } - - static uint32 SizeToKval(gpusize size) { return Log2(size); } - - Allocator* const m_pAllocator; - - const uint32 m_baseAllocKval; - const uint32 m_minKval; - - // Array of hashSets of blocks that are free at each level - FreeSet* m_pFreeBlockSets; - - // Hashmap of blocks that are used, key=offset, value=level (kval) - UsedMap* m_pUsedBlockMap; - // List of the free memory at each level - uint32* m_pNumFreeList; - // The highest Kval that has at least 1 free block (used in ClaimGpuMemory) - uint32 m_highestFreeKval; - - uint32 m_numSuballocations; - - // mutex on altering the numFreeList - Util::Mutex m_numFreeMutex; - // mutex on the used block map - Util::Mutex m_usedBlockMapMutex; - // array of mutexes, one for each freeBlockSet - Util::Mutex* m_pFreeSetMutexes; - // mutex on the freeing. Serialize freeing blocks and don't allow allocating blocks while one is freeing. Based on - // testing, applications typically don't try to free and allocate memory at the same time, and almost all of the - // memory freeing is done at the end of the application. - Util::RWLock m_freeLock; - - // Set to true if ClaimGpuMemory is ever called on this buddyAllocator. 
This signals to free to not merge blocks - // if m_pNumFreeList[kval - m_minKval] = 0 - bool m_usedClaim; - - // HashSet and HashMap utility functions - Result InsertToFreeSet(gpusize offset, uint32 kval); - bool GetKvalUsed(gpusize offset, uint32* pKval); - Result SetKvalUsed(gpusize offset, uint32 kval); - Result PopFromFreeSet(gpusize* pOffset, uint32 kval); - bool IsOffsetFree(gpusize offset, uint32 kval); - Result RemoveOffsetFromFreeSet(gpusize offset, uint32 kval); - Result RemoveOffsetFromUsedMap(gpusize offset); - - PAL_DISALLOW_COPY_AND_ASSIGN(BuddyAllocator); - PAL_DISALLOW_DEFAULT_CTOR(BuddyAllocator); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palBuddyAllocator.h + * @brief PAL utility BuddyAllocator class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include "palHashSet.h" +#include "palHashMap.h" +#include "palMutex.h" + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Buddy Allocator + * + * Responsible for managing small GPU memory requests by allocating a large base allocation and dividing it into + * appropriately sized suballocation blocks. + *********************************************************************************************************************** + */ +template +class BuddyAllocator +{ +public: + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + /// @param [in] baseAllocSize The size of the base allocation this buddy allocator suballocates. + /// @param [in] minAllocSize The size of the smallest block this buddy allocator can allocate. + BuddyAllocator( + Allocator* pAllocator, + gpusize baseAllocSize, + gpusize minAllocSize); + ~BuddyAllocator(); + + /// Initializes the buddy allocator. + /// + /// @returns Success if the buddy allocator has been successfully initialized. + Result Init(); + + /// Suballocates a block from the base allocation that this buddy allocator manages. Expects @ref ClaimGpuMemory to + /// be called directly before it. 
If a memory manager with multiple buddyAllocators is used, use pattern should + /// be: Iterate through buddyAllocators calling ClaimGpuMemory, if one returns @ref Success break out of the loop, + /// then call Allocate on that buddyAllocator. If none return @ref Success, then a new buddyAllocator needs to be + /// created. The purpose of splitting up buddyAllocator selection and Allocation is to reduce lock contention in + /// multithreaded memory managers. + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// @param [out] pOffset The offset the suballocated block starts within the base allocation. + /// + /// @returns Success if the allocation succeeded, @ref ErrorOutOfMemory if there isn't enough system memory to + /// fulfill the request, or @ref ErrorOutOfGpuMemory if there isn't a large enough block free in the + /// base allocation to fulfill the request. + /// + /// @warning Unless @ref ClaimGpuMemory is called before every single call, the results of @ref Allocate will + /// be invalid. If @ref ClaimGpuMemory returns @ref Success, then @ref ErrorOutOfGpuMemory will never be + /// returned. + Result Allocate( + gpusize size, + gpusize alignment, + gpusize* pOffset); + + /// Frees a previously allocated suballocation. + /// + /// @param [in] offset The offset the suballocated block starts within the base allocation. + /// @param [in] size Optional parameter specifying the size of the original allocation. + /// @param [in] alignment Optional parameter specifying the alignment of the original allocation. + void Free( + gpusize offset, + gpusize size = 0, + gpusize alignment = 0); + + /// Tells whether the base allocation is completely free. If the returned value is true then the caller is safe + /// to deallocate the base allocation. 
+ bool IsEmpty() const + { + return (m_numSuballocations == 0); + } + + /// Returns the size of the largest allocation that can be suballocated with this buddy allocator. + gpusize MaximumAllocationSize() const; + + /// Claims (doesn't allocate) some memory, used to quickly determine if a pool of memory has availible memory. + /// Doesn't affect internal state unless Result::Success is returned + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// + /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, + /// @ref ErrorOutOfGpuMemory if there is not enough memory + /// + /// @warning Unless this is called to test availible memory before every call to Allocate, then the results will not + /// be valid. + Result ClaimGpuMemory( + gpusize size, + gpusize alignment); + + /// Checks if @ref ClaimGpuMemory can actually claim memory, can be used to find the best fit pool. This function + /// does NOT acquire a lock on the structures ClaimGpuMemory uses, and does NOT claim or allocate the memory. + /// + /// @param [in] size The size of the requested suballocation. + /// @param [in] alignment The alignment requirements of the requested suballocation. + /// @param [out] pKval The highest kval that will need to be split will be stored here. 
+ /// + /// @returns Success if there is enough memory in this buddyAllocator to allocate the requested size of memory, + /// @ref ErrorOutOfGpuMemory if there is not enough memory + /// + Result CheckIfOpenMemory( + gpusize size, + gpusize alignment, + uint32* pKval); + +private: + typedef Util::HashSet FreeSet; + typedef Util::HashMap UsedMap; + + Result GetNextFreeBlock( + uint32 kval, + gpusize* pOffset); + + Result FreeBlock(gpusize offset); + + static constexpr gpusize KvalToSize(uint32 kVal) { return (1ull << kVal); } + + static uint32 SizeToKval(gpusize size) { return Log2(size); } + + Allocator* const m_pAllocator; + + const uint32 m_baseAllocKval; + const uint32 m_minKval; + + // Array of hashSets of blocks that are free at each level + FreeSet* m_pFreeBlockSets; + + // Hashmap of blocks that are used, key=offset, value=level (kval) + UsedMap* m_pUsedBlockMap; + // List of the free memory at each level + uint32* m_pNumFreeList; + // The highest Kval that has at least 1 free block (used in ClaimGpuMemory) + uint32 m_highestFreeKval; + + uint32 m_numSuballocations; + + // mutex on altering the numFreeList + Util::Mutex m_numFreeMutex; + // mutex on the used block map + Util::Mutex m_usedBlockMapMutex; + // array of mutexes, one for each freeBlockSet + Util::Mutex* m_pFreeSetMutexes; + // mutex on the freeing. Serialize freeing blocks and don't allow allocating blocks while one is freeing. Based on + // testing, applications typically don't try to free and allocate memory at the same time, and almost all of the + // memory freeing is done at the end of the application. + Util::RWLock m_freeLock; + + // Set to true if ClaimGpuMemory is ever called on this buddyAllocator. 
This signals to free to not merge blocks + // if m_pNumFreeList[kval - m_minKval] = 0 + bool m_usedClaim; + + // HashSet and HashMap utility functions + Result InsertToFreeSet(gpusize offset, uint32 kval); + bool GetKvalUsed(gpusize offset, uint32* pKval); + Result SetKvalUsed(gpusize offset, uint32 kval); + Result PopFromFreeSet(gpusize* pOffset, uint32 kval); + bool IsOffsetFree(gpusize offset, uint32 kval); + Result RemoveOffsetFromFreeSet(gpusize offset, uint32 kval); + Result RemoveOffsetFromUsedMap(gpusize offset); + + PAL_DISALLOW_COPY_AND_ASSIGN(BuddyAllocator); + PAL_DISALLOW_DEFAULT_CTOR(BuddyAllocator); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h index 225d03cf77..18f4839c89 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palBuddyAllocatorImpl.h @@ -1,607 +1,607 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palBuddyAllocatorImpl.h - * @brief PAL utility BuddyAllocator class implementation. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palBuddyAllocator.h" -#include "palHashMapImpl.h" -#include "palHashSetImpl.h" -#include "palInlineFuncs.h" -#include "palSysMemory.h" - -namespace Util -{ - -// ===================================================================================================================== -template -BuddyAllocator::BuddyAllocator( - Allocator* pAllocator, - gpusize baseAllocSize, - gpusize minAllocSize) - : - m_pAllocator(pAllocator), - m_baseAllocKval(SizeToKval(baseAllocSize)), - m_minKval(SizeToKval(minAllocSize)), - m_pFreeBlockSets(nullptr), - m_pUsedBlockMap(nullptr), - m_pNumFreeList(nullptr), - m_numSuballocations(0), - m_pFreeSetMutexes(nullptr), - m_usedClaim(false) -{ - // Allocator must be non-null - PAL_ASSERT(m_pAllocator != nullptr); - - // Base allocation size must be POT - PAL_ASSERT(KvalToSize(m_baseAllocKval) == baseAllocSize); - - // Minimum allocation size must be POT - PAL_ASSERT(KvalToSize(m_minKval) == minAllocSize); -} - -// 
===================================================================================================================== -template -BuddyAllocator::~BuddyAllocator() -{ - // lock this here to ensure no other thread was doing anything with the buddyAllocator when the destructor is called - RWLockAuto freeLock(&m_freeLock); - if (m_pFreeBlockSets != nullptr) - { - const uint32 numKvals = m_baseAllocKval - m_minKval; - for (uint32 i = 0; i < numKvals; ++i) - { - // Call the destructor - m_pFreeBlockSets[i].~HashSet(); - } - - // Free the block list array - PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); - } - if (m_pUsedBlockMap != nullptr) - { - PAL_SAFE_DELETE(m_pUsedBlockMap, m_pAllocator); - } - if (m_pNumFreeList != nullptr) - { - PAL_SAFE_DELETE_ARRAY(m_pNumFreeList, m_pAllocator); - } - if (m_pFreeSetMutexes != nullptr) - { - PAL_SAFE_DELETE_ARRAY(m_pFreeSetMutexes, m_pAllocator); - } -} - -// ===================================================================================================================== -// Gets maximum allocation size supported by this buddy allocator. -template -gpusize BuddyAllocator::MaximumAllocationSize() const -{ - // NOTE: Report one less than our base allocation k-value because there's no sense in suballocating a memory - // request which is larger than half a chunk - return KvalToSize(m_baseAllocKval - 1); -} - -// ===================================================================================================================== -// Initializes the buddy allocator. -template -Result BuddyAllocator::Init() -{ - PAL_ASSERT(m_pFreeBlockSets == nullptr); - PAL_ASSERT(m_pUsedBlockMap == nullptr); - PAL_ASSERT(m_pNumFreeList == nullptr); - PAL_ASSERT(m_pFreeSetMutexes == nullptr); - - // start out with success and take it away if something fails. 
- Result result = Result::Success; - - const uint32 numKvals = m_baseAllocKval - m_minKval; - - // one hashSet per kval - m_pFreeBlockSets = static_cast(PAL_MALLOC(sizeof(FreeSet) * numKvals, - m_pAllocator, - AllocInternal)); - // Initialize the hashSets. - if (m_pFreeBlockSets != nullptr) - { - for (uint32 i = 0; i < numKvals; ++i) - { - // max number of entries at a level is: 2^distFromTop - const uint32 maxEntriesKval = 1 << (m_baseAllocKval - (i + m_minKval)); - // 32 is a suitable max, however its the higher kvals won't even need 32 buckets. - const uint32 bucketsNeeded = Min(maxEntriesKval / (PAL_CACHE_LINE_BYTES) + 1, 32u); - - PAL_PLACEMENT_NEW(&m_pFreeBlockSets[i]) FreeSet(bucketsNeeded, m_pAllocator); - result = m_pFreeBlockSets[i].Init(); - - // if we failed the Init of the hashSet, delete the ones we did create, and free the array. This avoids - // having to keep track of the hashSets we did initialize in the destructor by just destroying it here. - if (result != Result::Success) - { - for (uint32 j = 0; j <= i; j++) - { - m_pFreeBlockSets[j].~HashSet(); - } - PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); - break; - } - } - } - else - { - result = Result::ErrorOutOfMemory; - } - - if (result == Result::Success) - { - m_pNumFreeList = static_cast(PAL_NEW_ARRAY(uint32, numKvals, m_pAllocator, AllocInternal)); - if (m_pNumFreeList == nullptr) - { - result = Result::ErrorOutOfMemory; - } - } - - if (result == Result::Success) - { - m_pFreeSetMutexes = static_cast(PAL_NEW_ARRAY(Mutex, numKvals, m_pAllocator, AllocInternal)); - if (m_pFreeSetMutexes == nullptr) - { - result = Result::ErrorOutOfMemory; - } - } - - const uint32 maxUsedEntries = 1 << (m_baseAllocKval - m_minKval); - const uint32 usedBucketsNeeded = maxUsedEntries / (PAL_CACHE_LINE_BYTES * 8) + 1; - - if (result == Result::Success) - { - // one hashMap for getting the kval a used block is at - m_pUsedBlockMap = static_cast(PAL_NEW(UsedMap, m_pAllocator, AllocInternal) - (usedBucketsNeeded, 
m_pAllocator)); - if (m_pUsedBlockMap != nullptr) - { - result = m_pUsedBlockMap->Init(); - } - else - { - result = Result::ErrorOutOfMemory; - } - } - - // if we successfully allocated all the memory we need, create the first two free blocks. - if (result == Result::Success) - { - memset(m_pNumFreeList, 0, sizeof(uint32) * numKvals); - // We need to create the first two largest-size blocks and add them to the last block list - const uint32 blockKval = (m_baseAllocKval - 1); - const gpusize blockSize = KvalToSize(blockKval); - FreeSet* pTopFreeSet = &m_pFreeBlockSets[blockKval - m_minKval]; - - // mark both of these as free blocks - result = pTopFreeSet->Insert(0); - if (result == Result::Success) - { - // even though this will never be reached, to pass the asserts, this needs to be - // as this kval - result = m_pUsedBlockMap->Insert(0, blockKval + 1); - } - if (result == Result::Success) - { - result = pTopFreeSet->Insert(blockSize); - } - m_pNumFreeList[blockKval - m_minKval] = 2; - m_highestFreeKval = blockKval; - } - PAL_ALERT(result != Result::Success); - return result; -} - -// ===================================================================================================================== -// Suballocates a block from the base allocation that this buddy allocator manages. If no free space is found then an -// appropriate error is returned. -// In order for m_pNumFreeList bookkeeping to be correct, ClaimGpuMemory MUST be called directly before this call to -// Allocate. The buddyAllocator will still work without this, but the results of ClaimGpuMemory will not be correct. -// unless it is called before every call to Allocate. 
-template -Result BuddyAllocator::Allocate( - gpusize size, - gpusize alignment, - gpusize* pOffset) -{ - PAL_ASSERT(m_pFreeBlockSets != nullptr); - PAL_ASSERT(m_pUsedBlockMap != nullptr); - PAL_ASSERT(m_pNumFreeList != nullptr); - PAL_ASSERT(m_pFreeSetMutexes != nullptr); - PAL_ASSERT(pOffset != nullptr); - PAL_ASSERT(size <= MaximumAllocationSize()); - - // Pad the requested allocation size to the nearest POT of the size and alignment - const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); - - RWLockAuto freeLock(&m_freeLock); - Result result = GetNextFreeBlock(kval, pOffset); - // mark this kval as used here. - if (result == Result::Success) - { - result = SetKvalUsed(*pOffset, kval); - } - - if (result == Result::Success) - { - // Increment the number of suballocations this buddy allocator manages - AtomicIncrement(&m_numSuballocations); - } - return result; -} - -// ===================================================================================================================== -// Gets the next free block by recursively dividing larger blocks until a suitible sized block is created. -template -Result BuddyAllocator::GetNextFreeBlock( - uint32 kval, - gpusize* pOffset) -{ - Result result = Result::ErrorOutOfGpuMemory; - if (kval < m_baseAllocKval) - { - // this lock can not get any more fine grained - MutexAuto freeSetLock(&(m_pFreeSetMutexes[kval - m_minKval])); - result = PopFromFreeSet(pOffset, kval); - - if (result == Result::ErrorOutOfGpuMemory) - { // we didn't find a block at this kval, search the next level up - result = GetNextFreeBlock(kval + 1, pOffset); - - if (result == Result::Success) - { - // insert our buddy to the free set - gpusize buddyOffset = *pOffset + KvalToSize(kval); - result = InsertToFreeSet(buddyOffset, kval); - PAL_ASSERT(result == Result::Success); - } - } - else - { - // only two valid options are ErrorOutOfGpuMemory and Success, other result means the hashing failed. 
- PAL_ASSERT(result == Result::Success); - } - } - PAL_ALERT_MSG(result != Result::Success, - "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); - return result; -} - -// ===================================================================================================================== -// Frees the memory at the given offset, if it's buddy is also free, merges the two and recursively calls this again. -// This doesn't need any internal locks because Free accquires an exclusive lock on the entire allocator (freeLock), and -// the lock on the m_pNumFreeList. These locks could potentially be more fine grained, however freeing and allocating -// don't typically happen at the same time, and Freeing is already much faster than allocating. -template -Result BuddyAllocator::FreeBlock( - gpusize offset) -{ - Result result = Result::ErrorUnknown; - uint32 usedKval; - bool offsetUsed = GetKvalUsed(offset, &usedKval); - PAL_ASSERT(offsetUsed); - PAL_ASSERT(usedKval >= m_minKval && usedKval < m_baseAllocKval); - - gpusize buddyOffset = offset ^ KvalToSize(usedKval); - gpusize offsetUp = Min(offset, buddyOffset); - - // we don't want merge if we are on the top level. We also don't want to merge if a call to claim was made that - // claimed the buddy we are about to free. - if (IsOffsetFree(buddyOffset, usedKval) && (usedKval < m_baseAllocKval -1) && - ((m_pNumFreeList[usedKval - m_minKval] > 0) || (m_usedClaim == false))) - { // We can combine the two blocks and mark the one in the level above as free - // And do this recursively - result = RemoveOffsetFromFreeSet(buddyOffset, usedKval); - if (result == Result::Success) - { - // even though the block is going to be freed, need to set the kval as used - // so that on the recursive call it will be found and freed again. 
- PAL_ASSERT_MSG((m_pNumFreeList[usedKval - m_minKval] != 0) || (m_usedClaim == false), - "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); - m_pNumFreeList[usedKval - m_minKval] -= 1; - result = SetKvalUsed(offsetUp, usedKval + 1); - } - // if this offset isn't the one that will be set as free in the next level up, we just need to remove it. - if ((result == Result::Success) && (offset != offsetUp)) - { - result = RemoveOffsetFromUsedMap(offset); - } - if (result == Result::Success) - { - result = FreeBlock(offsetUp); - } - } - else - { // We mark this block as free in this level - result = InsertToFreeSet(offset, usedKval); - - if (result == Result::Success) - { - m_pNumFreeList[usedKval - m_minKval] += 1; - m_highestFreeKval = Util::Max(usedKval, m_highestFreeKval); - if (offsetUp == offset) - { // if on the same offset as level up, move where the used block is - result = SetKvalUsed(offsetUp, usedKval + 1); - } - else - { // if at the top of this offset, remove is from used map - result = RemoveOffsetFromUsedMap(offset); - } - } - } - return result; -} -// ===================================================================================================================== -// Frees a suballocated block making it available for future re-use. 
-template -void BuddyAllocator::Free( - gpusize offset, - gpusize size, - gpusize alignment) -{ - RWLockAuto freeLock(&m_freeLock); - MutexAuto numFreeMutex(&m_numFreeMutex); - - PAL_ASSERT(m_pFreeBlockSets != nullptr); - PAL_ASSERT(m_pUsedBlockMap != nullptr); - PAL_ASSERT(m_pNumFreeList != nullptr); - PAL_ASSERT(m_pFreeSetMutexes != nullptr); - - Result result = FreeBlock(offset); - - // Freeing should always succeed unless something went wrong with the allocation scheme - PAL_ASSERT(result == Result::Success); - - // Decrement the number of suballocations this buddy allocator manages - AtomicDecrement(&m_numSuballocations); -} - -// ===================================================================================================================== -// Claims the memory that will be used when Allocate is called. -// Returns ErrorOutOfGpuMemory if this buddyAllocator has no free blocks, otherwise returns Success. -template -Result BuddyAllocator::ClaimGpuMemory( - gpusize size, - gpusize alignment) -{ - // Set this to true as soon as the first call to claim is done to signal to Free that claim is being used. - m_usedClaim = true; - - PAL_ASSERT(m_pNumFreeList != nullptr); - // Pad the requested allocation size to the nearest POT of the size and alignment - uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); - PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); - - Result result = Result::ErrorOutOfGpuMemory; - - // Do this check twice to avoid taking the lock at all if we have no chance of Claiming the memory. This will stop - // this thread from locking on this, as well as other threads from waiting longer for no reason. 
- if (kval <= m_highestFreeKval) - { - MutexAuto numFreeLock(&m_numFreeMutex); - if (kval <= m_highestFreeKval) - { - PAL_ASSERT(m_pNumFreeList[m_highestFreeKval - m_minKval] != 0); - result = Result::Success; - // First we add one to each level for every buddy we'll insert - while (m_pNumFreeList[kval - m_minKval] == 0) - { - m_pNumFreeList[kval - m_minKval] += 1; - kval++; - } - - PAL_ASSERT(kval <= m_highestFreeKval); - PAL_ASSERT_MSG(m_pNumFreeList[kval - m_minKval] > 0, - "This should only fail if ClaimGpuMemory() is not called before every call to Allocate()."); - // Then we subtract one for the block we will use or split to the lower level - m_pNumFreeList[kval - m_minKval] -= 1; - - PAL_ASSERT(m_highestFreeKval >= m_minKval); - while (m_pNumFreeList[m_highestFreeKval - m_minKval] == 0) - { - m_highestFreeKval--; - // in this case, there will be no more space left on the entire buddyAllocator - if (m_highestFreeKval < m_minKval) - { - break; - } - } - } - } - return result; -} - -// ===================================================================================================================== -// Used to search through pools before claiming memory to find the one that will fragment the least. pKval will have -// be the highest level needed to be split up for this pool, so the pool with the lowest value will be best. Can NOT -// guarantee the memory will still be availible by the time this thread calls ClaimGpuMemory. 
-template -Result BuddyAllocator::CheckIfOpenMemory( - gpusize size, - gpusize alignment, - uint32* pKval) -{ - PAL_ASSERT(m_pNumFreeList != nullptr); - // Pad the requested allocation size to the nearest POT of the size and alignment - const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); - PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); - - Result result = Result::ErrorOutOfGpuMemory; - if ((kval <= m_highestFreeKval)) - { - result = Result::Success; - if (pKval != nullptr) - { - uint32 topKval = kval; - for (; topKval < m_baseAllocKval; topKval++) - { - if (m_pNumFreeList[topKval - m_minKval] != 0) - { - *pKval = topKval; - break; - } - } - } - } - return result; -} - -// Hashset helper functions. -// ===================================================================================================================== -template -Result BuddyAllocator::InsertToFreeSet( - gpusize offset, - uint32 kval) -{ - FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; - PAL_ASSERT(pFreeSet->Contains(offset) == false); - Result result = pFreeSet->Insert(offset); - - return result; -} - -// ===================================================================================================================== -template -bool BuddyAllocator::GetKvalUsed( - gpusize offset, - uint32* pKval) -{ - bool isUsed; - MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); - uint32* usedKval = m_pUsedBlockMap->FindKey(offset); - if (usedKval == nullptr) - { - isUsed = false; - } - else - { - isUsed = true; - if (pKval != nullptr) - { - *pKval = *usedKval; - } - } - return isUsed; -} - -// ===================================================================================================================== -template -Result BuddyAllocator::SetKvalUsed( - gpusize offset, - uint32 kval) -{ - uint32* pKval; - bool existed; - MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); - Result result = m_pUsedBlockMap->FindAllocate(offset, &existed, &pKval); - if (result == 
Result::Success) - { - *pKval = kval; - } - PAL_ASSERT(result == Result::Success); - return result; -} - -// ===================================================================================================================== -// If there are free blocks at this level, removes one, if not, returns Result::ErrorOutOfGpuMemory -template -Result BuddyAllocator::PopFromFreeSet( - gpusize* pOffset, - uint32 kval) -{ - Result result = Result::ErrorUnknown; - - FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; - PAL_ASSERT(pFreeSet != nullptr); - - auto freeSetIt = pFreeSet->Begin(); - if (freeSetIt.Get() != nullptr) - { - *pOffset = freeSetIt.Get()->key; - bool eraseRes = pFreeSet->Erase(*pOffset); - if (eraseRes) - { - result = Result::Success; - } - else - { - // we got the offset from the iterator, no reason for it to fail. - PAL_ASSERT_ALWAYS(); - } - } - else - { - result = Result::ErrorOutOfGpuMemory; - } - return result; -} - -// ===================================================================================================================== -template -bool BuddyAllocator::IsOffsetFree( - gpusize offset, - uint32 kval) -{ - bool isIn = m_pFreeBlockSets[kval - m_minKval].Contains(offset); - - return isIn; -} - -// ===================================================================================================================== -template -Result BuddyAllocator::RemoveOffsetFromFreeSet( - gpusize offset, - uint32 kval) -{ - FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; - bool eraseRes = pFreeSet->Erase(offset); - return (eraseRes) ? 
Result::Success : Result::ErrorInvalidValue; -} - -// ===================================================================================================================== -template -Result BuddyAllocator::RemoveOffsetFromUsedMap( - gpusize offset) -{ - Result result = Result::Success; - MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); - bool removeRes = m_pUsedBlockMap->Erase(offset); - - if (removeRes == false) - { - result = Result::ErrorInvalidValue; - } - return result; -} -} // Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palBuddyAllocatorImpl.h + * @brief PAL utility BuddyAllocator class implementation. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palBuddyAllocator.h" +#include "palHashMapImpl.h" +#include "palHashSetImpl.h" +#include "palInlineFuncs.h" +#include "palSysMemory.h" + +namespace Util +{ + +// ===================================================================================================================== +template +BuddyAllocator::BuddyAllocator( + Allocator* pAllocator, + gpusize baseAllocSize, + gpusize minAllocSize) + : + m_pAllocator(pAllocator), + m_baseAllocKval(SizeToKval(baseAllocSize)), + m_minKval(SizeToKval(minAllocSize)), + m_pFreeBlockSets(nullptr), + m_pUsedBlockMap(nullptr), + m_pNumFreeList(nullptr), + m_numSuballocations(0), + m_pFreeSetMutexes(nullptr), + m_usedClaim(false) +{ + // Allocator must be non-null + PAL_ASSERT(m_pAllocator != nullptr); + + // Base allocation size must be POT + PAL_ASSERT(KvalToSize(m_baseAllocKval) == baseAllocSize); + + // Minimum allocation size must be POT + PAL_ASSERT(KvalToSize(m_minKval) == minAllocSize); +} + +// ===================================================================================================================== +template +BuddyAllocator::~BuddyAllocator() +{ + // lock this here to ensure no other thread was doing anything with the buddyAllocator when the destructor is called + RWLockAuto freeLock(&m_freeLock); + if (m_pFreeBlockSets != nullptr) + { + const uint32 numKvals = m_baseAllocKval - m_minKval; + for (uint32 i = 0; i < numKvals; ++i) + { + // Call the destructor + m_pFreeBlockSets[i].~HashSet(); + } + + // Free the 
block list array + PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); + } + if (m_pUsedBlockMap != nullptr) + { + PAL_SAFE_DELETE(m_pUsedBlockMap, m_pAllocator); + } + if (m_pNumFreeList != nullptr) + { + PAL_SAFE_DELETE_ARRAY(m_pNumFreeList, m_pAllocator); + } + if (m_pFreeSetMutexes != nullptr) + { + PAL_SAFE_DELETE_ARRAY(m_pFreeSetMutexes, m_pAllocator); + } +} + +// ===================================================================================================================== +// Gets maximum allocation size supported by this buddy allocator. +template +gpusize BuddyAllocator::MaximumAllocationSize() const +{ + // NOTE: Report one less than our base allocation k-value because there's no sense in suballocating a memory + // request which is larger than half a chunk + return KvalToSize(m_baseAllocKval - 1); +} + +// ===================================================================================================================== +// Initializes the buddy allocator. +template +Result BuddyAllocator::Init() +{ + PAL_ASSERT(m_pFreeBlockSets == nullptr); + PAL_ASSERT(m_pUsedBlockMap == nullptr); + PAL_ASSERT(m_pNumFreeList == nullptr); + PAL_ASSERT(m_pFreeSetMutexes == nullptr); + + // start out with success and take it away if something fails. + Result result = Result::Success; + + const uint32 numKvals = m_baseAllocKval - m_minKval; + + // one hashSet per kval + m_pFreeBlockSets = static_cast(PAL_MALLOC(sizeof(FreeSet) * numKvals, + m_pAllocator, + AllocInternal)); + // Initialize the hashSets. + if (m_pFreeBlockSets != nullptr) + { + for (uint32 i = 0; i < numKvals; ++i) + { + // max number of entries at a level is: 2^distFromTop + const uint32 maxEntriesKval = 1 << (m_baseAllocKval - (i + m_minKval)); + // 32 is a suitable max, however its the higher kvals won't even need 32 buckets. 
+ const uint32 bucketsNeeded = Min(maxEntriesKval / (PAL_CACHE_LINE_BYTES) + 1, 32u); + + PAL_PLACEMENT_NEW(&m_pFreeBlockSets[i]) FreeSet(bucketsNeeded, m_pAllocator); + result = m_pFreeBlockSets[i].Init(); + + // if we failed the Init of the hashSet, delete the ones we did create, and free the array. This avoids + // having to keep track of the hashSets we did initialize in the destructor by just destroying it here. + if (result != Result::Success) + { + for (uint32 j = 0; j <= i; j++) + { + m_pFreeBlockSets[j].~HashSet(); + } + PAL_SAFE_FREE(m_pFreeBlockSets, m_pAllocator); + break; + } + } + } + else + { + result = Result::ErrorOutOfMemory; + } + + if (result == Result::Success) + { + m_pNumFreeList = static_cast(PAL_NEW_ARRAY(uint32, numKvals, m_pAllocator, AllocInternal)); + if (m_pNumFreeList == nullptr) + { + result = Result::ErrorOutOfMemory; + } + } + + if (result == Result::Success) + { + m_pFreeSetMutexes = static_cast(PAL_NEW_ARRAY(Mutex, numKvals, m_pAllocator, AllocInternal)); + if (m_pFreeSetMutexes == nullptr) + { + result = Result::ErrorOutOfMemory; + } + } + + const uint32 maxUsedEntries = 1 << (m_baseAllocKval - m_minKval); + const uint32 usedBucketsNeeded = maxUsedEntries / (PAL_CACHE_LINE_BYTES * 8) + 1; + + if (result == Result::Success) + { + // one hashMap for getting the kval a used block is at + m_pUsedBlockMap = static_cast(PAL_NEW(UsedMap, m_pAllocator, AllocInternal) + (usedBucketsNeeded, m_pAllocator)); + if (m_pUsedBlockMap != nullptr) + { + result = m_pUsedBlockMap->Init(); + } + else + { + result = Result::ErrorOutOfMemory; + } + } + + // if we successfully allocated all the memory we need, create the first two free blocks. 
+ if (result == Result::Success) + { + memset(m_pNumFreeList, 0, sizeof(uint32) * numKvals); + // We need to create the first two largest-size blocks and add them to the last block list + const uint32 blockKval = (m_baseAllocKval - 1); + const gpusize blockSize = KvalToSize(blockKval); + FreeSet* pTopFreeSet = &m_pFreeBlockSets[blockKval - m_minKval]; + + // mark both of these as free blocks + result = pTopFreeSet->Insert(0); + if (result == Result::Success) + { + // even though this will never be reached, to pass the asserts, this needs to be + // as this kval + result = m_pUsedBlockMap->Insert(0, blockKval + 1); + } + if (result == Result::Success) + { + result = pTopFreeSet->Insert(blockSize); + } + m_pNumFreeList[blockKval - m_minKval] = 2; + m_highestFreeKval = blockKval; + } + PAL_ALERT(result != Result::Success); + return result; +} + +// ===================================================================================================================== +// Suballocates a block from the base allocation that this buddy allocator manages. If no free space is found then an +// appropriate error is returned. +// In order for m_pNumFreeList bookkeeping to be correct, ClaimGpuMemory MUST be called directly before this call to +// Allocate. The buddyAllocator will still work without this, but the results of ClaimGpuMemory will not be correct. +// unless it is called before every call to Allocate. 
+template +Result BuddyAllocator::Allocate( + gpusize size, + gpusize alignment, + gpusize* pOffset) +{ + PAL_ASSERT(m_pFreeBlockSets != nullptr); + PAL_ASSERT(m_pUsedBlockMap != nullptr); + PAL_ASSERT(m_pNumFreeList != nullptr); + PAL_ASSERT(m_pFreeSetMutexes != nullptr); + PAL_ASSERT(pOffset != nullptr); + PAL_ASSERT(size <= MaximumAllocationSize()); + + // Pad the requested allocation size to the nearest POT of the size and alignment + const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); + + RWLockAuto freeLock(&m_freeLock); + Result result = GetNextFreeBlock(kval, pOffset); + // mark this kval as used here. + if (result == Result::Success) + { + result = SetKvalUsed(*pOffset, kval); + } + + if (result == Result::Success) + { + // Increment the number of suballocations this buddy allocator manages + AtomicIncrement(&m_numSuballocations); + } + return result; +} + +// ===================================================================================================================== +// Gets the next free block by recursively dividing larger blocks until a suitible sized block is created. +template +Result BuddyAllocator::GetNextFreeBlock( + uint32 kval, + gpusize* pOffset) +{ + Result result = Result::ErrorOutOfGpuMemory; + if (kval < m_baseAllocKval) + { + // this lock can not get any more fine grained + MutexAuto freeSetLock(&(m_pFreeSetMutexes[kval - m_minKval])); + result = PopFromFreeSet(pOffset, kval); + + if (result == Result::ErrorOutOfGpuMemory) + { // we didn't find a block at this kval, search the next level up + result = GetNextFreeBlock(kval + 1, pOffset); + + if (result == Result::Success) + { + // insert our buddy to the free set + gpusize buddyOffset = *pOffset + KvalToSize(kval); + result = InsertToFreeSet(buddyOffset, kval); + PAL_ASSERT(result == Result::Success); + } + } + else + { + // only two valid options are ErrorOutOfGpuMemory and Success, other result means the hashing failed. 
+ PAL_ASSERT(result == Result::Success); + } + } + PAL_ALERT_MSG(result != Result::Success, + "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); + return result; +} + +// ===================================================================================================================== +// Frees the memory at the given offset, if it's buddy is also free, merges the two and recursively calls this again. +// This doesn't need any internal locks because Free accquires an exclusive lock on the entire allocator (freeLock), and +// the lock on the m_pNumFreeList. These locks could potentially be more fine grained, however freeing and allocating +// don't typically happen at the same time, and Freeing is already much faster than allocating. +template +Result BuddyAllocator::FreeBlock( + gpusize offset) +{ + Result result = Result::ErrorUnknown; + uint32 usedKval; + bool offsetUsed = GetKvalUsed(offset, &usedKval); + PAL_ASSERT(offsetUsed); + PAL_ASSERT(usedKval >= m_minKval && usedKval < m_baseAllocKval); + + gpusize buddyOffset = offset ^ KvalToSize(usedKval); + gpusize offsetUp = Min(offset, buddyOffset); + + // we don't want merge if we are on the top level. We also don't want to merge if a call to claim was made that + // claimed the buddy we are about to free. + if (IsOffsetFree(buddyOffset, usedKval) && (usedKval < m_baseAllocKval -1) && + ((m_pNumFreeList[usedKval - m_minKval] > 0) || (m_usedClaim == false))) + { // We can combine the two blocks and mark the one in the level above as free + // And do this recursively + result = RemoveOffsetFromFreeSet(buddyOffset, usedKval); + if (result == Result::Success) + { + // even though the block is going to be freed, need to set the kval as used + // so that on the recursive call it will be found and freed again. 
+ PAL_ASSERT_MSG((m_pNumFreeList[usedKval - m_minKval] != 0) || (m_usedClaim == false), + "This should only fail if ClaimGpuMemory() is not called before this call to Allocate()."); + m_pNumFreeList[usedKval - m_minKval] -= 1; + result = SetKvalUsed(offsetUp, usedKval + 1); + } + // if this offset isn't the one that will be set as free in the next level up, we just need to remove it. + if ((result == Result::Success) && (offset != offsetUp)) + { + result = RemoveOffsetFromUsedMap(offset); + } + if (result == Result::Success) + { + result = FreeBlock(offsetUp); + } + } + else + { // We mark this block as free in this level + result = InsertToFreeSet(offset, usedKval); + + if (result == Result::Success) + { + m_pNumFreeList[usedKval - m_minKval] += 1; + m_highestFreeKval = Util::Max(usedKval, m_highestFreeKval); + if (offsetUp == offset) + { // if on the same offset as level up, move where the used block is + result = SetKvalUsed(offsetUp, usedKval + 1); + } + else + { // if at the top of this offset, remove is from used map + result = RemoveOffsetFromUsedMap(offset); + } + } + } + return result; +} +// ===================================================================================================================== +// Frees a suballocated block making it available for future re-use. 
+template +void BuddyAllocator::Free( + gpusize offset, + gpusize size, + gpusize alignment) +{ + RWLockAuto freeLock(&m_freeLock); + MutexAuto numFreeMutex(&m_numFreeMutex); + + PAL_ASSERT(m_pFreeBlockSets != nullptr); + PAL_ASSERT(m_pUsedBlockMap != nullptr); + PAL_ASSERT(m_pNumFreeList != nullptr); + PAL_ASSERT(m_pFreeSetMutexes != nullptr); + + Result result = FreeBlock(offset); + + // Freeing should always succeed unless something went wrong with the allocation scheme + PAL_ASSERT(result == Result::Success); + + // Decrement the number of suballocations this buddy allocator manages + AtomicDecrement(&m_numSuballocations); +} + +// ===================================================================================================================== +// Claims the memory that will be used when Allocate is called. +// Returns ErrorOutOfGpuMemory if this buddyAllocator has no free blocks, otherwise returns Success. +template +Result BuddyAllocator::ClaimGpuMemory( + gpusize size, + gpusize alignment) +{ + // Set this to true as soon as the first call to claim is done to signal to Free that claim is being used. + m_usedClaim = true; + + PAL_ASSERT(m_pNumFreeList != nullptr); + // Pad the requested allocation size to the nearest POT of the size and alignment + uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); + PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); + + Result result = Result::ErrorOutOfGpuMemory; + + // Do this check twice to avoid taking the lock at all if we have no chance of Claiming the memory. This will stop + // this thread from locking on this, as well as other threads from waiting longer for no reason. 
+ if (kval <= m_highestFreeKval) + { + MutexAuto numFreeLock(&m_numFreeMutex); + if (kval <= m_highestFreeKval) + { + PAL_ASSERT(m_pNumFreeList[m_highestFreeKval - m_minKval] != 0); + result = Result::Success; + // First we add one to each level for every buddy we'll insert + while (m_pNumFreeList[kval - m_minKval] == 0) + { + m_pNumFreeList[kval - m_minKval] += 1; + kval++; + } + + PAL_ASSERT(kval <= m_highestFreeKval); + PAL_ASSERT_MSG(m_pNumFreeList[kval - m_minKval] > 0, + "This should only fail if ClaimGpuMemory() is not called before every call to Allocate()."); + // Then we subtract one for the block we will use or split to the lower level + m_pNumFreeList[kval - m_minKval] -= 1; + + PAL_ASSERT(m_highestFreeKval >= m_minKval); + while (m_pNumFreeList[m_highestFreeKval - m_minKval] == 0) + { + m_highestFreeKval--; + // in this case, there will be no more space left on the entire buddyAllocator + if (m_highestFreeKval < m_minKval) + { + break; + } + } + } + } + return result; +} + +// ===================================================================================================================== +// Used to search through pools before claiming memory to find the one that will fragment the least. pKval will have +// be the highest level needed to be split up for this pool, so the pool with the lowest value will be best. Can NOT +// guarantee the memory will still be availible by the time this thread calls ClaimGpuMemory. 
+template +Result BuddyAllocator::CheckIfOpenMemory( + gpusize size, + gpusize alignment, + uint32* pKval) +{ + PAL_ASSERT(m_pNumFreeList != nullptr); + // Pad the requested allocation size to the nearest POT of the size and alignment + const uint32 kval = Max(SizeToKval(Pow2Pad(Max(size, alignment))), m_minKval); + PAL_ASSERT(kval >= m_minKval && kval < m_baseAllocKval); + + Result result = Result::ErrorOutOfGpuMemory; + if ((kval <= m_highestFreeKval)) + { + result = Result::Success; + if (pKval != nullptr) + { + uint32 topKval = kval; + for (; topKval < m_baseAllocKval; topKval++) + { + if (m_pNumFreeList[topKval - m_minKval] != 0) + { + *pKval = topKval; + break; + } + } + } + } + return result; +} + +// Hashset helper functions. +// ===================================================================================================================== +template +Result BuddyAllocator::InsertToFreeSet( + gpusize offset, + uint32 kval) +{ + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + PAL_ASSERT(pFreeSet->Contains(offset) == false); + Result result = pFreeSet->Insert(offset); + + return result; +} + +// ===================================================================================================================== +template +bool BuddyAllocator::GetKvalUsed( + gpusize offset, + uint32* pKval) +{ + bool isUsed; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + uint32* usedKval = m_pUsedBlockMap->FindKey(offset); + if (usedKval == nullptr) + { + isUsed = false; + } + else + { + isUsed = true; + if (pKval != nullptr) + { + *pKval = *usedKval; + } + } + return isUsed; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::SetKvalUsed( + gpusize offset, + uint32 kval) +{ + uint32* pKval; + bool existed; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + Result result = m_pUsedBlockMap->FindAllocate(offset, &existed, &pKval); + if (result == 
Result::Success) + { + *pKval = kval; + } + PAL_ASSERT(result == Result::Success); + return result; +} + +// ===================================================================================================================== +// If there are free blocks at this level, removes one, if not, returns Result::ErrorOutOfGpuMemory +template +Result BuddyAllocator::PopFromFreeSet( + gpusize* pOffset, + uint32 kval) +{ + Result result = Result::ErrorUnknown; + + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + PAL_ASSERT(pFreeSet != nullptr); + + auto freeSetIt = pFreeSet->Begin(); + if (freeSetIt.Get() != nullptr) + { + *pOffset = freeSetIt.Get()->key; + bool eraseRes = pFreeSet->Erase(*pOffset); + if (eraseRes) + { + result = Result::Success; + } + else + { + // we got the offset from the iterator, no reason for it to fail. + PAL_ASSERT_ALWAYS(); + } + } + else + { + result = Result::ErrorOutOfGpuMemory; + } + return result; +} + +// ===================================================================================================================== +template +bool BuddyAllocator::IsOffsetFree( + gpusize offset, + uint32 kval) +{ + bool isIn = m_pFreeBlockSets[kval - m_minKval].Contains(offset); + + return isIn; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::RemoveOffsetFromFreeSet( + gpusize offset, + uint32 kval) +{ + FreeSet* pFreeSet = &m_pFreeBlockSets[kval - m_minKval]; + bool eraseRes = pFreeSet->Erase(offset); + return (eraseRes) ? 
Result::Success : Result::ErrorInvalidValue; +} + +// ===================================================================================================================== +template +Result BuddyAllocator::RemoveOffsetFromUsedMap( + gpusize offset) +{ + Result result = Result::Success; + MutexAuto usedBlockMapLock(&m_usedBlockMapMutex); + bool removeRes = m_pUsedBlockMap->Erase(offset); + + if (removeRes == false) + { + result = Result::ErrorInvalidValue; + } + return result; +} +} // Pal diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h b/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h index 91e9638ebe..791569cdc3 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palDbgPrint.h @@ -1,364 +1,364 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palDbgPrint.h - * @brief Defines PAL utility collection debug print functionality. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palUtil.h" -#include - -#if PAL_ENABLE_LOGGING -#include "palDbgLogHelper.h" -#endif - -namespace Util -{ -#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) -// Forward declarations. -class File; -enum FileAccessMode : uint32; - -/// Specifies the category of a debug print. -/// -/// Driver developer can enable/disable each category separately through settings. -enum DbgPrintCategory : uint32 -{ - DbgPrintCatInfoMsg = 0, ///< Generic informational messages. - DbgPrintCatWarnMsg, ///< Warning messages. - DbgPrintCatErrorMsg, ///< Error messages. - DbgPrintCatScMsg, ///< Generic shader compiler messages. - DbgPrintCatEventPrintMsg, ///< System event messages. - DbgPrintCatEventPrintCallbackMsg, ///< System event messages via callback. - DbgPrintCatMsgFile, ///< All Messages via file. - DbgPrintCatCount -}; - -/// Specifies the debug print mode: disabled, print to debugger, or print to file. -enum class DbgPrintMode : uint32 -{ - Disable, ///< Debug print is ignored. - Print, ///< Debug print is routed to the debug window or stdout. - File, ///< Debug print is routed to a file. 
- PrintCallback, ///< Debug print is routed to the print callback only -}; - -/// Flags specifying style controls for a debug print. -enum DbgPrintStyle : uint32 -{ - DbgPrintStyleDefault = 0x0, ///< Normal mode: has a prefix and a CR-LF. - DbgPrintStyleNoPrefix = 0x1, ///< Skip the prefix. - DbgPrintStyleNoCrLf = 0x2, ///< Skip the CR-LF. - DbgPrintStyleNoPrefixNoCrLf = 0x3, ///< Skip both the prefix and the CR-LF. -}; - -/// Definition for debug print callback. -/// -/// @param [in] pUserData User data that is installed with the callback for use by the installer. -/// @param [in] category Debug print category that the message belongs to. -/// @param [in] pText Text data to be printed by the callback. -typedef void (PAL_STDCALL *DbgPrintCallbackFunc)( - void* pUserdata, - DbgPrintCategory category, - const char* pText); - -/// Debug print callback struct that bundles the callback function and its userdata pointer. -struct DbgPrintCallback -{ - DbgPrintCallbackFunc pCallbackFunc; - void* pUserdata; -}; - -/// Generic debug printf function to be used when the caller wishes to specify the output category and style. Clients -/// should use the PAL_DPF macro instead of calling this function directly. -/// -/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). -/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). -/// @param [in] pFormat Printf-style format string. -extern void DbgPrintf( - DbgPrintCategory category, - DbgPrintStyle style, - const char* pFormat, - ...); - -/// Generic printf function to be used when the caller wishes to specify the output category and style, and has -/// pre-started the variable arg list (va_list argument instead of ...). -/// -/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). -/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). -/// @param [in] pFormat Printf-style format string. -/// @param [in] argList Variable argument list. 
-extern void DbgVPrintf( - DbgPrintCategory category, - DbgPrintStyle style, - const char* pFormat, - va_list argList); - -#endif - -#if PAL_ENABLE_PRINTS_ASSERTS -/// Sets the debug print mode (output to debugger, write to file, or disabled) for the specified category of messages. -/// -/// Probably controlled by a setting and set during initialization. -/// -/// @param [in] category Message category to control (e.g., CS dumps, SC output, etc.). -/// @param [in] mode New mode to be used for this message category (print to file, etc.). -extern void SetDbgPrintMode( - DbgPrintCategory category, - DbgPrintMode mode); - -/// Opens a file that resides in the selected log directory. -/// -/// This function exists in all build configurations. -/// -/// @param [in,out] pFile File object to represent the opened file. -/// @param [in] pFilename Filename to open. -/// @param [in] flags ORed mask of FileAccessMode values specifying how this file will be accessed. -/// -/// @returns Success if successful, otherwise an appropriate error. -extern Result OpenLogFile( - File* pFile, - const char* pFilename, - uint32 flags); - -/// Sets the global debug print callback. -/// -/// @param [in] callback Debug print callback struct that contains the callback function and a userdata pointer -extern void SetDbgPrintCallback( - const DbgPrintCallback& callback); - -#endif - -// Forward declarations. -template class StringView; - -/// Logs a text string via client callback when provided. -/// -/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pLogCbInfo -/// parameter to CreatePlatform() will be passed back to the client on every log callback. -/// @param [in] level Log priority level associated with the message. -/// @param [in] categoryMask Log category mask that represents what category fields the message relates to. -/// @param [in] pFormat Format string for the log message. 
-/// @param [in] args Variable arguments that correspond to the format string. -typedef void (PAL_STDCALL *LogCallbackFunc)( - void* pClientData, - uint32 level, - uint64 categoryMask, - const char* pFormat, - va_list args); - -/// Specifies client-provided logging callbacks. Used as a parameter to Pal::CreatePlatform(). -/// -/// @ingroup LibInit -struct LogCallbackInfo -{ - void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to - /// every @ref LogCallbackFunc call made by PAL. - LogCallbackFunc pfnLogCb; ///< Debug print logging callback. @see LogCallbackFunc. -}; - -/// Compiler-specific wrapper of the standard snprintf implementation. -/// -/// @param [out] pOutput Output string. -/// @param [in] bufSize Available space in pOutput. -/// @param [in] pFormat Printf-style format string. -/// -/// @returns The resultant length of the formatted string. -extern int32 Snprintf( - char* pOutput, - size_t bufSize, - const char* pFormat, - ...); - -/// Compiler-specific wrapper of the standard vsnprintf implementation. -/// -/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be -/// printed had a buffer with enough space been provided. -/// @param [in] bufSize Available space in pOutput. -/// @param [in] pFormat Printf-style format string. -/// @param [in] argList variable argument list. -/// -/// @returns The resultant length of the formatted string. -extern int32 Vsnprintf( - char* pOutput, - size_t bufSize, - const char* pFormat, - va_list argList); - -/// Compiler-specific wrapper of the standard snprintf implementation. -/// -/// @param [out] pOutput Output string. -/// @param [in] bufSize Available space in pOutput. -/// @param [in] pFormat Printf-style format string. -/// -/// @returns The resultant length of the formatted string. 
-extern int32 Snprintf( - wchar_t* pOutput, - size_t bufSize, - const wchar_t* pFormat, - ...); - -/// Compiler-specific wrapper of the standard vsnprintf implementation. -/// -/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be -/// printed had a buffer with enough space been provided. -/// @param [in] bufSize Available space in pOutput. -/// @param [in] pFormat Printf-style format string. -/// @param [in] argList variable argument list. -/// -/// @returns The resultant length of the formatted string. -extern int32 Vsnprintf( - wchar_t* pOutput, - size_t bufSize, - const wchar_t* pFormat, - va_list argList); - -/// Copy an arbitrary string into the provided buffer, encoding as necessary to avoid characters that are illegal -/// in filenames (assuming the more restrictive Windows rules, even on non-Windows OSs). -/// -/// Any byte that would be illegal is encoded as % then two hex digits, like in a URL. -/// -/// @param [out] pOutput Output string. -/// @param bufSize Available space in pOutput. -/// @param [in] input Input string -/// @param allowSpace Allow (do not % encode) space -/// @param allowDirSeparator Allow (do not % encode) / and \ characters -/// -/// @returns Works like C++ standard snprintf: -/// - If the provided buffer is big enough, it returns the number of bytes written, excluding the -/// terminating \0. -/// - If the provided buffer is not big enough, then the result string is truncated to fit, and the -/// function returns the number of bytes that would have been written if the buffer had been long -/// enough, excluding the terminating \0. -/// - Passing 0 buffer length is allowed as a special case of that, and nullptr pOutput is then allowed. -extern size_t EncodeAsFilename( - char* pOutput, - size_t bufSize, - const StringView& input, - bool allowSpace, - bool allowDirSeparator); - -/// Generate a log filename. -/// -/// @param [inout] pFilenameBuffer Buffer to hold the filename. 
-/// @param maxSize Max size of the pFilenameBuffer. -/// @param nextPost The next write position. -/// @param [in] pExt The filename extension. -/// @param logDuplicate Log duplicate objects. -extern void GenLogFilename( - char* pFilenameBuffer, - size_t maxSize, - size_t nextPos, - const char* const pExt, - bool logDuplicate); - -} // Util - -/// PAL_ENABLE_LOGGING enables the new logging code. At this time, both, the current and new logging -/// code will be active for development purpose if both macros are enabled. -#if (PAL_ENABLE_PRINTS_ASSERTS && PAL_ENABLE_LOGGING) -/// Debug printf macro. -#define PAL_DPF ::Util::DbgPrintf -/// Debug info printf macro. -#define PAL_DPINFO(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ - ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug warning printf macro. -#define PAL_DPWARN(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ - ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug error printf macro. -#define PAL_DPERROR(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ - ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -#elif PAL_ENABLE_PRINTS_ASSERTS -/// Debug printf macro. -#define PAL_DPF ::Util::DbgPrintf -/// Debug info printf macro. 
-#define PAL_DPINFO(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug warning printf macro. -#define PAL_DPWARN(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug error printf macro. -#define PAL_DPERROR(_pFormat, ...) \ -{ \ - ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ - ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -#elif PAL_ENABLE_LOGGING -/// Debug printf macro. -#define PAL_DPF ::Util::DbgPrintf -/// Debug info printf macro. -#define PAL_DPINFO(_pFormat, ...) \ -{ \ - ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug warning printf macro. -#define PAL_DPWARN(_pFormat, ...) \ -{ \ - ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -/// Debug error printf macro. -#define PAL_DPERROR(_pFormat, ...) \ -{ \ - ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ - "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ -} -#else -/// Debug printf macro. -#define PAL_DPF(...) ((void)0) -/// Debug info printf macro. -#define PAL_DPINFO(...) ((void)0) -/// Debug warning printf macro. -#define PAL_DPWARN(...) ((void)0) -/// Debug error printf macro. -#define PAL_DPERROR(...) ((void)0) -#endif - +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDbgPrint.h + * @brief Defines PAL utility collection debug print functionality. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include + +#if PAL_ENABLE_LOGGING +#include "palDbgLogHelper.h" +#endif + +namespace Util +{ +#if (PAL_ENABLE_PRINTS_ASSERTS || PAL_ENABLE_LOGGING) +// Forward declarations. +class File; +enum FileAccessMode : uint32; + +/// Specifies the category of a debug print. +/// +/// Driver developer can enable/disable each category separately through settings. 
+enum DbgPrintCategory : uint32 +{ + DbgPrintCatInfoMsg = 0, ///< Generic informational messages. + DbgPrintCatWarnMsg, ///< Warning messages. + DbgPrintCatErrorMsg, ///< Error messages. + DbgPrintCatScMsg, ///< Generic shader compiler messages. + DbgPrintCatEventPrintMsg, ///< System event messages. + DbgPrintCatEventPrintCallbackMsg, ///< System event messages via callback. + DbgPrintCatMsgFile, ///< All Messages via file. + DbgPrintCatCount +}; + +/// Specifies the debug print mode: disabled, print to debugger, or print to file. +enum class DbgPrintMode : uint32 +{ + Disable, ///< Debug print is ignored. + Print, ///< Debug print is routed to the debug window or stdout. + File, ///< Debug print is routed to a file. + PrintCallback, ///< Debug print is routed to the print callback only +}; + +/// Flags specifying style controls for a debug print. +enum DbgPrintStyle : uint32 +{ + DbgPrintStyleDefault = 0x0, ///< Normal mode: has a prefix and a CR-LF. + DbgPrintStyleNoPrefix = 0x1, ///< Skip the prefix. + DbgPrintStyleNoCrLf = 0x2, ///< Skip the CR-LF. + DbgPrintStyleNoPrefixNoCrLf = 0x3, ///< Skip both the prefix and the CR-LF. +}; + +/// Definition for debug print callback. +/// +/// @param [in] pUserData User data that is installed with the callback for use by the installer. +/// @param [in] category Debug print category that the message belongs to. +/// @param [in] pText Text data to be printed by the callback. +typedef void (PAL_STDCALL *DbgPrintCallbackFunc)( + void* pUserdata, + DbgPrintCategory category, + const char* pText); + +/// Debug print callback struct that bundles the callback function and its userdata pointer. +struct DbgPrintCallback +{ + DbgPrintCallbackFunc pCallbackFunc; + void* pUserdata; +}; + +/// Generic debug printf function to be used when the caller wishes to specify the output category and style. Clients +/// should use the PAL_DPF macro instead of calling this function directly. 
+/// +/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). +/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). +/// @param [in] pFormat Printf-style format string. +extern void DbgPrintf( + DbgPrintCategory category, + DbgPrintStyle style, + const char* pFormat, + ...); + +/// Generic printf function to be used when the caller wishes to specify the output category and style, and has +/// pre-started the variable arg list (va_list argument instead of ...). +/// +/// @param [in] category Message category (e.g., CS dumps, SC output, etc.). +/// @param [in] style Text output style (i.e., has prefix and/or CR-LF). +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList Variable argument list. +extern void DbgVPrintf( + DbgPrintCategory category, + DbgPrintStyle style, + const char* pFormat, + va_list argList); + +#endif + +#if PAL_ENABLE_PRINTS_ASSERTS +/// Sets the debug print mode (output to debugger, write to file, or disabled) for the specified category of messages. +/// +/// Probably controlled by a setting and set during initialization. +/// +/// @param [in] category Message category to control (e.g., CS dumps, SC output, etc.). +/// @param [in] mode New mode to be used for this message category (print to file, etc.). +extern void SetDbgPrintMode( + DbgPrintCategory category, + DbgPrintMode mode); + +/// Opens a file that resides in the selected log directory. +/// +/// This function exists in all build configurations. +/// +/// @param [in,out] pFile File object to represent the opened file. +/// @param [in] pFilename Filename to open. +/// @param [in] flags ORed mask of FileAccessMode values specifying how this file will be accessed. +/// +/// @returns Success if successful, otherwise an appropriate error. +extern Result OpenLogFile( + File* pFile, + const char* pFilename, + uint32 flags); + +/// Sets the global debug print callback. 
+/// +/// @param [in] callback Debug print callback struct that contains the callback function and a userdata pointer +extern void SetDbgPrintCallback( + const DbgPrintCallback& callback); + +#endif + +// Forward declarations. +template class StringView; + +/// Logs a text string via client callback when provided. +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pLogCbInfo +/// parameter to CreatePlatform() will be passed back to the client on every log callback. +/// @param [in] level Log priority level associated with the message. +/// @param [in] categoryMask Log category mask that represents what category fields the message relates to. +/// @param [in] pFormat Format string for the log message. +/// @param [in] args Variable arguments that correspond to the format string. +typedef void (PAL_STDCALL *LogCallbackFunc)( + void* pClientData, + uint32 level, + uint64 categoryMask, + const char* pFormat, + va_list args); + +/// Specifies client-provided logging callbacks. Used as a parameter to Pal::CreatePlatform(). +/// +/// @ingroup LibInit +struct LogCallbackInfo +{ + void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to + /// every @ref LogCallbackFunc call made by PAL. + LogCallbackFunc pfnLogCb; ///< Debug print logging callback. @see LogCallbackFunc. +}; + +/// Compiler-specific wrapper of the standard snprintf implementation. +/// +/// @param [out] pOutput Output string. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// +/// @returns The resultant length of the formatted string. +extern int32 Snprintf( + char* pOutput, + size_t bufSize, + const char* pFormat, + ...); + +/// Compiler-specific wrapper of the standard vsnprintf implementation. +/// +/// @param [out] pOutput Output string. 
If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList variable argument list. +/// +/// @returns The resultant length of the formatted string. +extern int32 Vsnprintf( + char* pOutput, + size_t bufSize, + const char* pFormat, + va_list argList); + +/// Compiler-specific wrapper of the standard snprintf implementation. +/// +/// @param [out] pOutput Output string. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// +/// @returns The resultant length of the formatted string. +extern int32 Snprintf( + wchar_t* pOutput, + size_t bufSize, + const wchar_t* pFormat, + ...); + +/// Compiler-specific wrapper of the standard vsnprintf implementation. +/// +/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] pFormat Printf-style format string. +/// @param [in] argList variable argument list. +/// +/// @returns The resultant length of the formatted string. +extern int32 Vsnprintf( + wchar_t* pOutput, + size_t bufSize, + const wchar_t* pFormat, + va_list argList); + +/// Copy an arbitrary string into the provided buffer, encoding as necessary to avoid characters that are illegal +/// in filenames (assuming the more restrictive Windows rules, even on non-Windows OSs). +/// +/// Any byte that would be illegal is encoded as % then two hex digits, like in a URL. +/// +/// @param [out] pOutput Output string. +/// @param bufSize Available space in pOutput. 
+/// @param [in] input Input string +/// @param allowSpace Allow (do not % encode) space +/// @param allowDirSeparator Allow (do not % encode) / and \ characters +/// +/// @returns Works like C++ standard snprintf: +/// - If the provided buffer is big enough, it returns the number of bytes written, excluding the +/// terminating \0. +/// - If the provided buffer is not big enough, then the result string is truncated to fit, and the +/// function returns the number of bytes that would have been written if the buffer had been long +/// enough, excluding the terminating \0. +/// - Passing 0 buffer length is allowed as a special case of that, and nullptr pOutput is then allowed. +extern size_t EncodeAsFilename( + char* pOutput, + size_t bufSize, + const StringView& input, + bool allowSpace, + bool allowDirSeparator); + +/// Generate a log filename. +/// +/// @param [inout] pFilenameBuffer Buffer to hold the filename. +/// @param maxSize Max size of the pFilenameBuffer. +/// @param nextPost The next write position. +/// @param [in] pExt The filename extension. +/// @param logDuplicate Log duplicate objects. +extern void GenLogFilename( + char* pFilenameBuffer, + size_t maxSize, + size_t nextPos, + const char* const pExt, + bool logDuplicate); + +} // Util + +/// PAL_ENABLE_LOGGING enables the new logging code. At this time, both, the current and new logging +/// code will be active for development purpose if both macros are enabled. +#if (PAL_ENABLE_PRINTS_ASSERTS && PAL_ENABLE_LOGGING) +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro. +#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro. 
+#define PAL_DPWARN(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro. +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ + ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#elif PAL_ENABLE_PRINTS_ASSERTS +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro. +#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatInfoMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro. +#define PAL_DPWARN(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatWarnMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro. +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgPrintf(::Util::DbgPrintCatErrorMsg, ::Util::DbgPrintStyleDefault, _pFormat " (%s:%d:%s)", \ + ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#elif PAL_ENABLE_LOGGING +/// Debug printf macro. +#define PAL_DPF ::Util::DbgPrintf +/// Debug info printf macro. +#define PAL_DPINFO(_pFormat, ...) \ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Info, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug warning printf macro. +#define PAL_DPWARN(_pFormat, ...) 
\ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Warning, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +/// Debug error printf macro. +#define PAL_DPERROR(_pFormat, ...) \ +{ \ + ::Util::DbgLog(::Util::SeverityLevel::Error, ::Util::OriginationType::DebugPrint, \ + "AMD-PAL", _pFormat " (%s:%d:%s)", ##__VA_ARGS__, __FILE__, __LINE__, __func__); \ +} +#else +/// Debug printf macro. +#define PAL_DPF(...) ((void)0) +/// Debug info printf macro. +#define PAL_DPINFO(...) ((void)0) +/// Debug warning printf macro. +#define PAL_DPWARN(...) ((void)0) +/// Debug error printf macro. +#define PAL_DPERROR(...) ((void)0) +#endif + diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h b/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h index 2077705a1f..8115e052dc 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palDeque.h @@ -1,319 +1,319 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palDeque.h - * @brief PAL utility collection Deque and DequeIterator class declarations. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palAssert.h" -#include "palSysMemory.h" - -namespace Util -{ - -// Forward declarations. -template class Deque; - -/// @internal Private structure used by Deque and its iterators to store chunks of data elements. -struct DequeBlockHeader -{ - DequeBlockHeader* pPrev; ///< Pointer to the previous block. - DequeBlockHeader* pNext; ///< Pointer to the next block. - void* pStart; ///< Pointer to the first element in this block. - void* pEnd; ///< Pointer to the last element in this block. -}; - -/** - *********************************************************************************************************************** - * @brief Iterator for traversal of elements in a Deque collection. - * - * Allows traversal of all elements in a Deque going either forwards or backwards. If you traverse off either end of - * the deque, then you must create a new iterator by calling either the Deque's Begin() or End() method. 
- *********************************************************************************************************************** - */ -template -class DequeIterator -{ -public: - /// Trivial destructor. - ~DequeIterator() { } - - /// Returns a pointer to the current element. Will return null if we've gone past the end. - T* Get() const { return m_pCurrent; } - - /// Advances the iterator to the next position (move forward). - void Next(); - - /// Advances the iterator to the previous position (move backward). - void Prev(); - - /// Check if the element the iterator references is valid. - bool IsValid() const { return m_pCurrent != nullptr; } - -private: - DequeIterator(const Deque* pDeque, DequeBlockHeader* pHeader, T* pCurrent); - - const Deque*const m_pDeque; // The Deque we're iterating over. - const DequeBlockHeader* m_pCurrentHeader; // The block we're iterating over. - T* m_pCurrent; // Pointer to the current element. Null if we've gone past the - // end. - - PAL_DISALLOW_DEFAULT_CTOR(DequeIterator); - - // Although this is a transgression of coding standards, it means that Deque does not need to have a public - // interface specifically to implement this class. The added encapsulation this provides is worthwhile. - friend class Deque; -}; - -/** - *********************************************************************************************************************** - * @brief Simple templated deque container - a double-ended queue. - * - * This is meant for storing elements of an arbitrary (but uniform) type. Operations which this class supports are: - * - * - Insertion from the front and back. - * - Deletion from the front and back. - * - Forwards and reverse iteration - * - * @warning This class is not thread-safe for push, pop, or iteration! - * - * @note This class is only designed to work with native types and POD-style structures. 
If it is needed to have a Deque - * of complex objects with nontrivial destructors, copy constructors or assign operators, then a specialized - * implementation of CleanupElement() will need to be explicitly defined. - *********************************************************************************************************************** - */ -template -class Deque -{ -public: - /// Constructor. - /// - /// @param [in] pAllocator The allocator that will allocate memory if required. - Deque(Allocator*const pAllocator, size_t numElementsPerBlock = 256); - ~Deque(); - - /// Returns the number of elements in the deque. - size_t NumElements() const { return m_numElements; } - - /// Returns an iterator pointing to the first element in the deque. - /// - /// @returns An iterator pointing at the front end of the deque. - DequeIterator Begin() const { return DequeIterator(this, m_pFrontHeader, m_pFront); } - - /// Returns an iterator pointing to the last element in the deque. - /// - /// This is somewhat different from std::deque.End() which returns a pointer to the theoretical object _past_ the - /// end of the deque. - /// - /// @returns An iterator pointing at the back end of the deque. - DequeIterator End() const { return DequeIterator(this, m_pBackHeader, m_pBack); } - - ///@{ - /// Returns the element at the location specified. - /// - /// @warning Calling this function with an out-of-bounds index will cause an access violation! - /// - /// @param [in] index Integer location of the element needed. - /// - /// @returns The element at location specified by index by reference - T& At(uint32 index); - - const T& At(uint32 index) const; - - T& operator[](uint32 index); - const T& operator[](uint32 index) const; - ///@} - - /// Returns the object at the front of the deque. - /// - /// @warning This will cause an access violation if called on an empty deque! - /// - /// @returns Reference to the item stored at the front end of the deque. 
- T& Front() const - { - PAL_ASSERT(m_numElements != 0); - return *m_pFront; - } - - /// Returns the object at the tail of the deque. - /// - /// @warning This will cause an access violation if called on an empty deque! - /// - /// @returns Reference to the item stored at the back end of the deque. - T& Back() const - { - PAL_ASSERT(m_numElements != 0); - return *m_pBack; - } - - /// Pushes a copy of the specified item onto the front of the deque. - /// - /// @param [in] data Item to be added to the front of the deque. - /// - /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation - /// failed because of an internal failure to allocate system memory. - Result PushFront(const T& data); - - /// Emplaces a newly constructed item onto the front of the deque. - /// - /// @param [in] args arguments used to construct the new item. - /// - /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation - /// failed because of an internal failure to allocate system memory. - template - Result EmplaceFront(Args&&... args); - - /// Pushes a copy of the specified item onto the back of the deque. - /// - /// @param [in] data Item to be added to the back of the deque. - /// - /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation - /// failed because of an internal failure to allocate system memory. - Result PushBack(const T& data); - - /// Emplaces a newly constructed item onto the back of the deque. - /// - /// @param [in] args arguments used to construct the new item. - /// - /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation - /// failed because of an internal failure to allocate system memory. - template - Result EmplaceBack(Args&&... args); - - /// Pops the first item off the front of the deque, returning the popped value. 
- /// - /// @param [out] pOut Item popped off the front of the deque. - /// - /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque - /// is empty. - Result PopFront(T* pOut); - - /// Pops the first item off the back of the deque, returning the popped value. - /// - /// @param [out] pOut Item popped off the back of the deque. - /// - /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque - /// is empty. - Result PopBack(T* pOut); - -private: - Result AllocateFront(T**); - Result AllocateBack(T**); - DequeBlockHeader* AllocateNewBlock(); - void FreeUnusedBlock(DequeBlockHeader* pHeader); - - // A helper function to avoid duplication in const and non-const versions of At(). - T& InternalAt(uint32 index) const; - - size_t m_numElements; // Number of elements - const size_t m_numElementsPerBlock; // Block granularity when we need to alloc a new one - - DequeBlockHeader* m_pFrontHeader; // First block of data elements, null for empty deques. - DequeBlockHeader* m_pBackHeader; // Last block of data elements, null for empty deques/ - - T* m_pFront; // First data element, null for empty deques. - T* m_pBack; // Last data element, null for empty deques. - - DequeBlockHeader* m_pLazyFreeHeader; // Cached pointer to the most-recently freed block. - - Allocator*const m_pAllocator; // Pointer to the allocator for this deque. - - PAL_DISALLOW_COPY_AND_ASSIGN(Deque); - - // Although this is a transgression of coding standards, it prevents DequeIterator requiring a public constructor; - // constructing a 'bare' DequeIterator (i.e. without calling Deque::GetIterator) can never be a legal operation, so - // this means that these two classes are much safer to use. 
- friend class DequeIterator; -}; - -// ===================================================================================================================== -template -Deque::Deque( - Allocator*const pAllocator, - size_t numElementsPerBlock) - : - m_numElements(0), - m_numElementsPerBlock(numElementsPerBlock), - m_pFrontHeader(nullptr), - m_pBackHeader(nullptr), - m_pFront(nullptr), - m_pBack(nullptr), - m_pLazyFreeHeader(nullptr), - m_pAllocator(pAllocator) -{ -} - -// ===================================================================================================================== -// Frees all of the blocks this object allocated over its lifetime. -template -Deque::~Deque() -{ - if (!std::is_trivial::value) - { - while (m_pFrontHeader != nullptr) - { - // Explicitly destroy the removed value since it's non-trivial and advance. - // We must destroy all of them in the current block before freeing it. - m_pFront->~T(); - ++m_pFront; - --m_numElements; - - if ((m_pFront == m_pFrontHeader->pEnd) || (m_numElements == 0)) - { - // Okay, the front block is now empty. Free it and advance to the next block. - DequeBlockHeader* pBlockToFree = m_pFrontHeader; - m_pFrontHeader = m_pFrontHeader->pNext; - PAL_SAFE_FREE(pBlockToFree, m_pAllocator); - - if (m_pFrontHeader != nullptr) - { - // Fixup to the new block. - m_pFront = static_cast(m_pFrontHeader->pStart); - } - } - } - } - else - { - // Elements are trivial so skip iterating through elements and free each block. - while (m_pFrontHeader != nullptr) - { - DequeBlockHeader* pBlockToFree = m_pFrontHeader; - m_pFrontHeader = m_pFrontHeader->pNext; - PAL_SAFE_FREE(pBlockToFree, m_pAllocator); - } - } - - if (m_pLazyFreeHeader != nullptr) - { - PAL_SAFE_FREE(m_pLazyFreeHeader, m_pAllocator); - } -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palDeque.h + * @brief PAL utility collection Deque and DequeIterator class declarations. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palSysMemory.h" + +namespace Util +{ + +// Forward declarations. +template class Deque; + +/// @internal Private structure used by Deque and its iterators to store chunks of data elements. +struct DequeBlockHeader +{ + DequeBlockHeader* pPrev; ///< Pointer to the previous block. + DequeBlockHeader* pNext; ///< Pointer to the next block. 
+ void* pStart; ///< Pointer to the first element in this block. + void* pEnd; ///< Pointer to the last element in this block. +}; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a Deque collection. + * + * Allows traversal of all elements in a Deque going either forwards or backwards. If you traverse off either end of + * the deque, then you must create a new iterator by calling either the Deque's Begin() or End() method. + *********************************************************************************************************************** + */ +template +class DequeIterator +{ +public: + /// Trivial destructor. + ~DequeIterator() { } + + /// Returns a pointer to the current element. Will return null if we've gone past the end. + T* Get() const { return m_pCurrent; } + + /// Advances the iterator to the next position (move forward). + void Next(); + + /// Advances the iterator to the previous position (move backward). + void Prev(); + + /// Check if the element the iterator references is valid. + bool IsValid() const { return m_pCurrent != nullptr; } + +private: + DequeIterator(const Deque* pDeque, DequeBlockHeader* pHeader, T* pCurrent); + + const Deque*const m_pDeque; // The Deque we're iterating over. + const DequeBlockHeader* m_pCurrentHeader; // The block we're iterating over. + T* m_pCurrent; // Pointer to the current element. Null if we've gone past the + // end. + + PAL_DISALLOW_DEFAULT_CTOR(DequeIterator); + + // Although this is a transgression of coding standards, it means that Deque does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. 
+ friend class Deque; +}; + +/** + *********************************************************************************************************************** + * @brief Simple templated deque container - a double-ended queue. + * + * This is meant for storing elements of an arbitrary (but uniform) type. Operations which this class supports are: + * + * - Insertion from the front and back. + * - Deletion from the front and back. + * - Forwards and reverse iteration + * + * @warning This class is not thread-safe for push, pop, or iteration! + * + * @note This class is only designed to work with native types and POD-style structures. If it is needed to have a Deque + * of complex objects with nontrivial destructors, copy constructors or assign operators, then a specialized + * implementation of CleanupElement() will need to be explicitly defined. + *********************************************************************************************************************** + */ +template +class Deque +{ +public: + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + Deque(Allocator*const pAllocator, size_t numElementsPerBlock = 256); + ~Deque(); + + /// Returns the number of elements in the deque. + size_t NumElements() const { return m_numElements; } + + /// Returns an iterator pointing to the first element in the deque. + /// + /// @returns An iterator pointing at the front end of the deque. + DequeIterator Begin() const { return DequeIterator(this, m_pFrontHeader, m_pFront); } + + /// Returns an iterator pointing to the last element in the deque. + /// + /// This is somewhat different from std::deque.End() which returns a pointer to the theoretical object _past_ the + /// end of the deque. + /// + /// @returns An iterator pointing at the back end of the deque. + DequeIterator End() const { return DequeIterator(this, m_pBackHeader, m_pBack); } + + ///@{ + /// Returns the element at the location specified. 
+ /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + T& At(uint32 index); + + const T& At(uint32 index) const; + + T& operator[](uint32 index); + const T& operator[](uint32 index) const; + ///@} + + /// Returns the object at the front of the deque. + /// + /// @warning This will cause an access violation if called on an empty deque! + /// + /// @returns Reference to the item stored at the front end of the deque. + T& Front() const + { + PAL_ASSERT(m_numElements != 0); + return *m_pFront; + } + + /// Returns the object at the tail of the deque. + /// + /// @warning This will cause an access violation if called on an empty deque! + /// + /// @returns Reference to the item stored at the back end of the deque. + T& Back() const + { + PAL_ASSERT(m_numElements != 0); + return *m_pBack; + } + + /// Pushes a copy of the specified item onto the front of the deque. + /// + /// @param [in] data Item to be added to the front of the deque. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + Result PushFront(const T& data); + + /// Emplaces a newly constructed item onto the front of the deque. + /// + /// @param [in] args arguments used to construct the new item. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + template + Result EmplaceFront(Args&&... args); + + /// Pushes a copy of the specified item onto the back of the deque. + /// + /// @param [in] data Item to be added to the back of the deque. 
+ /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + Result PushBack(const T& data); + + /// Emplaces a newly constructed item onto the back of the deque. + /// + /// @param [in] args arguments used to construct the new item. + /// + /// @returns @ref Success if the item was successfully added to the deque or @ref ErrorOutOfMemory if the operation + /// failed because of an internal failure to allocate system memory. + template + Result EmplaceBack(Args&&... args); + + /// Pops the first item off the front of the deque, returning the popped value. + /// + /// @param [out] pOut Item popped off the front of the deque. + /// + /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque + /// is empty. + Result PopFront(T* pOut); + + /// Pops the first item off the back of the deque, returning the popped value. + /// + /// @param [out] pOut Item popped off the back of the deque. + /// + /// @returns @ref Success if the item was successfully popped from the deque or @ref ErrorUnavailable if the deque + /// is empty. + Result PopBack(T* pOut); + +private: + Result AllocateFront(T**); + Result AllocateBack(T**); + DequeBlockHeader* AllocateNewBlock(); + void FreeUnusedBlock(DequeBlockHeader* pHeader); + + // A helper function to avoid duplication in const and non-const versions of At(). + T& InternalAt(uint32 index) const; + + size_t m_numElements; // Number of elements + const size_t m_numElementsPerBlock; // Block granularity when we need to alloc a new one + + DequeBlockHeader* m_pFrontHeader; // First block of data elements, null for empty deques. + DequeBlockHeader* m_pBackHeader; // Last block of data elements, null for empty deques/ + + T* m_pFront; // First data element, null for empty deques. + T* m_pBack; // Last data element, null for empty deques. 
+ + DequeBlockHeader* m_pLazyFreeHeader; // Cached pointer to the most-recently freed block. + + Allocator*const m_pAllocator; // Pointer to the allocator for this deque. + + PAL_DISALLOW_COPY_AND_ASSIGN(Deque); + + // Although this is a transgression of coding standards, it prevents DequeIterator requiring a public constructor; + // constructing a 'bare' DequeIterator (i.e. without calling Deque::GetIterator) can never be a legal operation, so + // this means that these two classes are much safer to use. + friend class DequeIterator; +}; + +// ===================================================================================================================== +template +Deque::Deque( + Allocator*const pAllocator, + size_t numElementsPerBlock) + : + m_numElements(0), + m_numElementsPerBlock(numElementsPerBlock), + m_pFrontHeader(nullptr), + m_pBackHeader(nullptr), + m_pFront(nullptr), + m_pBack(nullptr), + m_pLazyFreeHeader(nullptr), + m_pAllocator(pAllocator) +{ +} + +// ===================================================================================================================== +// Frees all of the blocks this object allocated over its lifetime. +template +Deque::~Deque() +{ + if (!std::is_trivial::value) + { + while (m_pFrontHeader != nullptr) + { + // Explicitly destroy the removed value since it's non-trivial and advance. + // We must destroy all of them in the current block before freeing it. + m_pFront->~T(); + ++m_pFront; + --m_numElements; + + if ((m_pFront == m_pFrontHeader->pEnd) || (m_numElements == 0)) + { + // Okay, the front block is now empty. Free it and advance to the next block. + DequeBlockHeader* pBlockToFree = m_pFrontHeader; + m_pFrontHeader = m_pFrontHeader->pNext; + PAL_SAFE_FREE(pBlockToFree, m_pAllocator); + + if (m_pFrontHeader != nullptr) + { + // Fixup to the new block. + m_pFront = static_cast(m_pFrontHeader->pStart); + } + } + } + } + else + { + // Elements are trivial so skip iterating through elements and free each block. 
+ while (m_pFrontHeader != nullptr) + { + DequeBlockHeader* pBlockToFree = m_pFrontHeader; + m_pFrontHeader = m_pFrontHeader->pNext; + PAL_SAFE_FREE(pBlockToFree, m_pAllocator); + } + } + + if (m_pLazyFreeHeader != nullptr) + { + PAL_SAFE_FREE(m_pLazyFreeHeader, m_pAllocator); + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h b/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h index f034a2ec52..d57b3626c1 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palEvent.h @@ -1,141 +1,141 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palEvent.h - * @brief PAL utility collection Event class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palTime.h" -#include "palUtil.h" - -namespace Util -{ - -/// Specifies the flags for event. -struct EventCreateFlags -{ - union - { - struct - { - uint32 manualReset : 1; ///< If true, the event is created as manual reset. - uint32 initiallySignaled : 1; ///< If true, the event is created in signaled state. -#if defined(_WIN32) - uint32 canBeInherited : 1; ///< If true, the event can be inherited by child process, it's - /// Windows-specific. - uint32 reserved : 29; ///< Reserved for future use. -#else - uint32 semaphore : 1; ///< If true, provide semaphore-like semantics for reads from the file - /// descriptor. - uint32 nonBlocking : 1; ///< If true, set the O_NONBLOCK file status flag on the new file descriptor. - uint32 closeOnExecute : 1; ///< If true, set the close-on-exec flag for the new file descriptor. - uint32 reserved : 27; ///< Reserved for future use. -#endif - }; - uint32 u32All; ///< Flags packed as 32-bit uint. - }; -}; - -/** - *********************************************************************************************************************** - * @brief Synchronization primitive that can either be in the _set_ or _reset_ state. - * - * Threads can call WaitForEvents() to block waiting for an Event object to be _set_. This is useful for fine-grain - * synchronization between threads. - * - * Event objects start out in the _reset_ state. 
- *********************************************************************************************************************** - */ -class Event -{ -public: - Event(); - ~Event(); - /// Initializes the event object. Clients must call this before using the Event object. - /// - /// @param flags Event creation flags. - /// @param pName Specified the event's name, it's Windows-specific, Windows uses this name to - /// uniquely identify fence objects across processes. - /// @returns Success if the event was successfully initialized, otherwise an appropriate error code. - Result Init( - const EventCreateFlags& flags -#if defined(_WIN32) - , - const wchar_t* pName = nullptr -#endif - ); - /// Changes the event state to _set_ - /// - /// @returns Success unless the Event has not been initialized yet (@ref ErrorUnavailable) or an unexpected internal - /// error occured when calling the OS (ErrorUnknown). - Result Set() const; - - /// Changes the event state to _reset_. - /// - /// @returns Success unless the Event has not been initialized yet (ErrorUnavailable) or an unexpected - /// internal error occured when calling the OS (ErrorUnknown). - Result Reset() const; - - /// Waits for the event to enter the _set_ state before returning control to the caller. The event will change to - /// the _reset_ state if manualReset was false on initialization. - /// - /// @param [in] timeout Max time to wait, in seconds. If zero, this call will poll the event without blocking. - /// - /// @returns Success if the wait completed successfully or Timeout if the wait did not complete but the operation - /// timed out. Otherwise, one of the following errors may be returned: - /// + ErrorInvalidValue will be returned if the timeout is negative. - /// + ErrorUnknown may be returned if an unexpected internal occurs when calling the OS. - Result Wait(fseconds timeout) const; - -#if defined(_WIN32) - /// On Windows, a handle to an OS event primitive is a HANDLE, which is just a void*. 
- typedef void* EventHandle; -#else - /// On Linux, a handle to an OS event primitive is a file descriptor, which is just an int. - typedef int32 EventHandle; -#endif - - /// Returns a handle to the actual OS event primitive associated with this object. - EventHandle GetHandle() const { return m_hEvent; } - - /// Open event handle. - Result Open(EventHandle handle, bool isReference); - - /// Constant EventHandle value which represents an invalid event object. - static const EventHandle InvalidEvent; - -private: - EventHandle m_hEvent; // OS-specific event handle. - bool m_isReference; // If true, the event is a global sharing object handle (not a duplicate) which is - // imported from external, so it can't be closed in the currect destructor, and can only - // be closed by the creater. - - PAL_DISALLOW_COPY_AND_ASSIGN(Event); -}; -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palEvent.h + * @brief PAL utility collection Event class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palTime.h" +#include "palUtil.h" + +namespace Util +{ + +/// Specifies the flags for event. +struct EventCreateFlags +{ + union + { + struct + { + uint32 manualReset : 1; ///< If true, the event is created as manual reset. + uint32 initiallySignaled : 1; ///< If true, the event is created in signaled state. +#if defined(_WIN32) + uint32 canBeInherited : 1; ///< If true, the event can be inherited by child process, it's + /// Windows-specific. + uint32 reserved : 29; ///< Reserved for future use. +#else + uint32 semaphore : 1; ///< If true, provide semaphore-like semantics for reads from the file + /// descriptor. + uint32 nonBlocking : 1; ///< If true, set the O_NONBLOCK file status flag on the new file descriptor. + uint32 closeOnExecute : 1; ///< If true, set the close-on-exec flag for the new file descriptor. + uint32 reserved : 27; ///< Reserved for future use. +#endif + }; + uint32 u32All; ///< Flags packed as 32-bit uint. + }; +}; + +/** + *********************************************************************************************************************** + * @brief Synchronization primitive that can either be in the _set_ or _reset_ state. 
+ * + * Threads can call WaitForEvents() to block waiting for an Event object to be _set_. This is useful for fine-grain + * synchronization between threads. + * + * Event objects start out in the _reset_ state. + *********************************************************************************************************************** + */ +class Event +{ +public: + Event(); + ~Event(); + /// Initializes the event object. Clients must call this before using the Event object. + /// + /// @param flags Event creation flags. + /// @param pName Specified the event's name, it's Windows-specific, Windows uses this name to + /// uniquely identify fence objects across processes. + /// @returns Success if the event was successfully initialized, otherwise an appropriate error code. + Result Init( + const EventCreateFlags& flags +#if defined(_WIN32) + , + const wchar_t* pName = nullptr +#endif + ); + /// Changes the event state to _set_ + /// + /// @returns Success unless the Event has not been initialized yet (@ref ErrorUnavailable) or an unexpected internal + /// error occured when calling the OS (ErrorUnknown). + Result Set() const; + + /// Changes the event state to _reset_. + /// + /// @returns Success unless the Event has not been initialized yet (ErrorUnavailable) or an unexpected + /// internal error occured when calling the OS (ErrorUnknown). + Result Reset() const; + + /// Waits for the event to enter the _set_ state before returning control to the caller. The event will change to + /// the _reset_ state if manualReset was false on initialization. + /// + /// @param [in] timeout Max time to wait, in seconds. If zero, this call will poll the event without blocking. + /// + /// @returns Success if the wait completed successfully or Timeout if the wait did not complete but the operation + /// timed out. Otherwise, one of the following errors may be returned: + /// + ErrorInvalidValue will be returned if the timeout is negative. 
+ /// + ErrorUnknown may be returned if an unexpected internal occurs when calling the OS. + Result Wait(fseconds timeout) const; + +#if defined(_WIN32) + /// On Windows, a handle to an OS event primitive is a HANDLE, which is just a void*. + typedef void* EventHandle; +#else + /// On Linux, a handle to an OS event primitive is a file descriptor, which is just an int. + typedef int32 EventHandle; +#endif + + /// Returns a handle to the actual OS event primitive associated with this object. + EventHandle GetHandle() const { return m_hEvent; } + + /// Open event handle. + Result Open(EventHandle handle, bool isReference); + + /// Constant EventHandle value which represents an invalid event object. + static const EventHandle InvalidEvent; + +private: + EventHandle m_hEvent; // OS-specific event handle. + bool m_isReference; // If true, the event is a global sharing object handle (not a duplicate) which is + // imported from external, so it can't be closed in the currect destructor, and can only + // be closed by the creater. + + PAL_DISALLOW_COPY_AND_ASSIGN(Event); +}; +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palFile.h b/shared/amdgpu-windows-interop/pal/inc/util/palFile.h index f67756ce47..219042feb1 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palFile.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palFile.h @@ -1,300 +1,300 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palFile.h - * @brief PAL utility collection File class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -// pal -#include "palUtil.h" -#include "palInlineFuncs.h" - -// stl -#include -#include - -#if defined(_WIN32) -/// Macro for wide string literal concatenation. -#define PAL_PATH_SEPW L"\\" -/// Macro for narrow string literal concatenation. -#define PAL_PATH_SEP "\\" -#else -/// Macro for wide string literal concatenation. -#define PAL_PATH_SEPW L"/" -/// Macro for narrow string literal concatenation. 
-#define PAL_PATH_SEP "/" -#endif - -namespace Util -{ -#if defined(_WIN32) -/// Wide-character of the platform's prefered path separator. -static constexpr wchar_t PathSepW = L'\\'; -/// Narrow-character of the platform's prefered path separator. -static constexpr char PathSep = '\\'; -#else -/// Wide-character of the platform's prefered path separator. -static constexpr wchar_t PathSepW = L'/'; -/// Narrow-character of the platform's prefered path separator. -static constexpr char PathSep = '/'; -#endif - -static constexpr uint32 MaxPathStrLen = 512; -static constexpr uint32 MaxFileNameStrLen = 256; - -/// Enumerates access modes that may be required on an opened file. -/// Can be bitwise ORed together to specify multiple simultaneous modes. -enum FileAccessMode : uint32 -{ - FileAccessRead = 0x1, ///< Read access. - FileAccessWrite = 0x2, ///< Write access. - FileAccessAppend = 0x4, ///< Append access. - FileAccessBinary = 0x8, ///< Binary access. - FileAccessNoDiscard = 0x10, ///< Don't discard existing file. - FileAccessShared = 0x20, ///< Require shared file access (simultaneous reading/writing by more than one process) -}; - -/** - *********************************************************************************************************************** - * @brief Exposes simple file I/O functionality by encapsulating standard C runtime file I/O functions like fopen, - * fwrite, etc. - *********************************************************************************************************************** - */ -class File -{ -public: - // Platform-agnostic 64-bit stat structure. - struct Stat - { - uint64 size; // Size of the file in bytes. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 922 - uint64 ctime; // Time of creation of the file (not valid on FAT). - uint64 atime; // Time of last access to the file (not valid on FAT). - uint64 mtime; // Time of last modification to the file. 
-#else - std::chrono::system_clock::time_point ctime; // Time of creation of the file (not valid on FAT). - std::chrono::system_clock::time_point atime; // Time of last access to the file (not valid on FAT). - std::chrono::system_clock::time_point mtime; // Time of last modification to the file. -#endif - uint32 nlink; // Number of hard links (always 1 on FAT on Windows). - uint32 mode; // Bitmask for the file-mode information. - uint32 dev; // Drive number of the disk containing the file. - - union - { - struct - { - uint32 isDir : 1; - uint32 isRegular : 1; - uint32 reserved : 30; - }; - uint32 u32All; - } flags; - - // Common stat members omitted from this structure: - // uid, gid, and ino because it's not used on Windows - // rdev because it's a duplicate of dev - }; - - // Where in the file to start seeking from. - enum class SeekPosition : int32 - { - // start of the file - Start = SEEK_SET, - // current file pointer position - Current = SEEK_CUR, - // end of the file - End = SEEK_END - }; - - File() : m_pFileHandle(nullptr), m_ownsHandle(false) {} - - /// Closes the file if it is still open. - ~File() { Close(); } - - /// Opens a file stream for read, write or append access. - /// - /// @param [in] pFilename Name of file to open. - /// @param [in] accessFlags Bitmask of FileAccessMode values indicating the usage of the file. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result Open(const char* pFilename, uint32 accessFlags); - - /// Borrows an externally opened C runtime file handle for use by a File object. - /// - /// The caller is still responsible for closing this handle after the File object is destroyed. - /// - /// @param [in] pFile Externally opened C runtime file handle to borrow. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result FromNative(std::FILE* pFile); - - /// Closes the file handle. - void Close(); - - /// Writes a stream of bytes to the file. 
- /// - /// @param [in] pBuffer Byte stream to be written to the file. - /// @param [in] bufferSize Number of bytes to write. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result Write(const void* pBuffer, size_t bufferSize); - - /// Reads a stream of bytes from the file. - /// - /// @param [out] pBuffer Buffer to be written with data read from file. - /// @param [in] bufferSize Size of the output buffer. - /// @param [out] pBytesRead Number of bytes actually read (can be null). - /// - /// @returns Success if successful, otherwise an appropriate error. - Result Read(void* pBuffer, size_t bufferSize, size_t* pBytesRead); - - /// Reads a single line of bytes from the file. - /// - /// @param [out] pBuffer Buffer to be written with data read from file. - /// @param [in] bufferSize Size of the output buffer. - /// @param [out] pBytesRead Number of bytes actually read (can be null). - /// - /// @returns Success if successful, otherwise an appropriate error. - Result ReadLine(void* pBuffer, size_t bufferSize, size_t* pBytesRead); - - /// Prints a formatted string to the file. - /// - /// @param [in] pFormatStr Printf-style format string. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result Printf(const char* pFormatStr, ...) const; - - /// Prints a formatted string to the file. - /// - /// @param [in] pFormatStr Printf-style format string. - /// @param [in] argList Variable argument list. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result VPrintf(const char* pFormatStr, va_list argList); - - /// Flushes pending I/O to the file. - /// - /// @returns Success if successful, otherwise an appropriate error. - Result Flush() const; - - /// Sets the file position to the beginning of the file. - void Rewind(); - - /// Sets the position indicator to a new position. 
- /// - /// @param offset Number of bytes to offset - /// @param pos File position to seek from - void Seek(int64 offset, SeekPosition pos); - - /// Sets the position indicator to a new position relative to the beginning of the file. - /// - /// @param offset Number of bytes to offset - void Seek(size_t offset) { Seek(offset, SeekPosition::Start); } - - /// Sets the position indicator to a new position relative to the end of the file - /// - /// @param offset Number of bytes to offset - void Rseek(size_t offset) { Seek(-static_cast(offset), SeekPosition::End); } - - /// Sets the file position to the end of the file. - void FastForward() { Rseek(0); } - - /// Returns true if the file is presently open. - bool IsOpen() const { return (m_pFileHandle != nullptr); } - - /// Gets the size of the file contents in bytes - /// - /// @param [in] pFilename Name of the file to check. - /// - /// @returns Size of the file in bytes, or std::numeric_limits::max() on failure. - static size_t GetFileSize(const char* pFilename); - - /// Checks if a file with the specified name exists. - /// - /// @param [in] pFilename Name of the file to check. - /// - /// @returns True if the specified file exists. - static bool Exists(const char* pFilename); - - /// Platform-agnostic 64-bit stat() function. - /// - /// @param [in] pFilename Name of the file to check. - /// @param [out] pStatus The status of that file, if it exists. - /// - /// @returns Success if the structure was retrieved, error otherwise. - static Result GetStat(const char* pFilename, Stat* pStatus); - - /// Removes/erases a file, if it exists. - /// - /// @param [in] pFilename Name of file to remove. - /// - /// @returns Success if successful, otherwise an appropriate error. - static Result Remove(const char* pFilename); - - /// Reads a file into memory. - /// - /// @param [in] pFilename Name of the file to read. - /// @param [in] pData Buffer where the file contents are written to. 
- /// @param [in] dataSize Size of the buffer in bytes. - /// @param [out] pBytesRead Number of bytes successfully read into the input buffer (can be null). - /// @param [in] binary True for binary mode, false for text. Defaults to binary. - /// - /// @returns Success if successful, otherwise an appropriate error. - /// - /// @note The input buffer must be large enough to hold the file's contents. If the buffer is larger than the file, - /// then the region of the buffer beyond the file size is _not_ modified by this function. It is the caller's - /// responsibility to _not_ read uninitialized portions of the supplied buffer after this call returns. - /// - /// @note In binary mode, the number of bytes read is equal to the file size in bytes upon a successful return. - /// In text mode, newline conversion is performed on Windows, in which case the number of bytes read may not equal - /// the file size in bytes. - /// - /// @note In text mode, should the caller treat the resulting data as a C string, it is the caller's responsibility - /// to null-terminate the buffer. - static Result ReadFile( - const char* pFilename, - void* pData, - size_t dataSize, - size_t* pBytesRead = nullptr, - bool binary = true); - - /// Gets the handle associated with this file. - /// - /// @returns A pointer to the file handle - const std::FILE* GetHandle() const { return m_pFileHandle; } - -private: - std::FILE* m_pFileHandle; - bool m_ownsHandle; // This object owns the file handle and will close it on destruction. - - PAL_DISALLOW_COPY_AND_ASSIGN(File); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palFile.h + * @brief PAL utility collection File class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +// pal +#include "palUtil.h" +#include "palInlineFuncs.h" + +// stl +#include +#include + +#if defined(_WIN32) +/// Macro for wide string literal concatenation. +#define PAL_PATH_SEPW L"\\" +/// Macro for narrow string literal concatenation. +#define PAL_PATH_SEP "\\" +#else +/// Macro for wide string literal concatenation. +#define PAL_PATH_SEPW L"/" +/// Macro for narrow string literal concatenation. 
+#define PAL_PATH_SEP "/" +#endif + +namespace Util +{ +#if defined(_WIN32) +/// Wide-character of the platform's prefered path separator. +static constexpr wchar_t PathSepW = L'\\'; +/// Narrow-character of the platform's prefered path separator. +static constexpr char PathSep = '\\'; +#else +/// Wide-character of the platform's prefered path separator. +static constexpr wchar_t PathSepW = L'/'; +/// Narrow-character of the platform's prefered path separator. +static constexpr char PathSep = '/'; +#endif + +static constexpr uint32 MaxPathStrLen = 512; +static constexpr uint32 MaxFileNameStrLen = 256; + +/// Enumerates access modes that may be required on an opened file. +/// Can be bitwise ORed together to specify multiple simultaneous modes. +enum FileAccessMode : uint32 +{ + FileAccessRead = 0x1, ///< Read access. + FileAccessWrite = 0x2, ///< Write access. + FileAccessAppend = 0x4, ///< Append access. + FileAccessBinary = 0x8, ///< Binary access. + FileAccessNoDiscard = 0x10, ///< Don't discard existing file. + FileAccessShared = 0x20, ///< Require shared file access (simultaneous reading/writing by more than one process) +}; + +/** + *********************************************************************************************************************** + * @brief Exposes simple file I/O functionality by encapsulating standard C runtime file I/O functions like fopen, + * fwrite, etc. + *********************************************************************************************************************** + */ +class File +{ +public: + // Platform-agnostic 64-bit stat structure. + struct Stat + { + uint64 size; // Size of the file in bytes. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 922 + uint64 ctime; // Time of creation of the file (not valid on FAT). + uint64 atime; // Time of last access to the file (not valid on FAT). + uint64 mtime; // Time of last modification to the file. 
+#else + std::chrono::system_clock::time_point ctime; // Time of creation of the file (not valid on FAT). + std::chrono::system_clock::time_point atime; // Time of last access to the file (not valid on FAT). + std::chrono::system_clock::time_point mtime; // Time of last modification to the file. +#endif + uint32 nlink; // Number of hard links (always 1 on FAT on Windows). + uint32 mode; // Bitmask for the file-mode information. + uint32 dev; // Drive number of the disk containing the file. + + union + { + struct + { + uint32 isDir : 1; + uint32 isRegular : 1; + uint32 reserved : 30; + }; + uint32 u32All; + } flags; + + // Common stat members omitted from this structure: + // uid, gid, and ino because it's not used on Windows + // rdev because it's a duplicate of dev + }; + + // Where in the file to start seeking from. + enum class SeekPosition : int32 + { + // start of the file + Start = SEEK_SET, + // current file pointer position + Current = SEEK_CUR, + // end of the file + End = SEEK_END + }; + + File() : m_pFileHandle(nullptr), m_ownsHandle(false) {} + + /// Closes the file if it is still open. + ~File() { Close(); } + + /// Opens a file stream for read, write or append access. + /// + /// @param [in] pFilename Name of file to open. + /// @param [in] accessFlags Bitmask of FileAccessMode values indicating the usage of the file. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Open(const char* pFilename, uint32 accessFlags); + + /// Borrows an externally opened C runtime file handle for use by a File object. + /// + /// The caller is still responsible for closing this handle after the File object is destroyed. + /// + /// @param [in] pFile Externally opened C runtime file handle to borrow. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result FromNative(std::FILE* pFile); + + /// Closes the file handle. + void Close(); + + /// Writes a stream of bytes to the file. 
+ /// + /// @param [in] pBuffer Byte stream to be written to the file. + /// @param [in] bufferSize Number of bytes to write. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Write(const void* pBuffer, size_t bufferSize); + + /// Reads a stream of bytes from the file. + /// + /// @param [out] pBuffer Buffer to be written with data read from file. + /// @param [in] bufferSize Size of the output buffer. + /// @param [out] pBytesRead Number of bytes actually read (can be null). + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Read(void* pBuffer, size_t bufferSize, size_t* pBytesRead); + + /// Reads a single line of bytes from the file. + /// + /// @param [out] pBuffer Buffer to be written with data read from file. + /// @param [in] bufferSize Size of the output buffer. + /// @param [out] pBytesRead Number of bytes actually read (can be null). + /// + /// @returns Success if successful, otherwise an appropriate error. + Result ReadLine(void* pBuffer, size_t bufferSize, size_t* pBytesRead); + + /// Prints a formatted string to the file. + /// + /// @param [in] pFormatStr Printf-style format string. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Printf(const char* pFormatStr, ...) const; + + /// Prints a formatted string to the file. + /// + /// @param [in] pFormatStr Printf-style format string. + /// @param [in] argList Variable argument list. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result VPrintf(const char* pFormatStr, va_list argList); + + /// Flushes pending I/O to the file. + /// + /// @returns Success if successful, otherwise an appropriate error. + Result Flush() const; + + /// Sets the file position to the beginning of the file. + void Rewind(); + + /// Sets the position indicator to a new position. 
+ /// + /// @param offset Number of bytes to offset + /// @param pos File position to seek from + void Seek(int64 offset, SeekPosition pos); + + /// Sets the position indicator to a new position relative to the beginning of the file. + /// + /// @param offset Number of bytes to offset + void Seek(size_t offset) { Seek(offset, SeekPosition::Start); } + + /// Sets the position indicator to a new position relative to the end of the file + /// + /// @param offset Number of bytes to offset + void Rseek(size_t offset) { Seek(-static_cast(offset), SeekPosition::End); } + + /// Sets the file position to the end of the file. + void FastForward() { Rseek(0); } + + /// Returns true if the file is presently open. + bool IsOpen() const { return (m_pFileHandle != nullptr); } + + /// Gets the size of the file contents in bytes + /// + /// @param [in] pFilename Name of the file to check. + /// + /// @returns Size of the file in bytes, or std::numeric_limits::max() on failure. + static size_t GetFileSize(const char* pFilename); + + /// Checks if a file with the specified name exists. + /// + /// @param [in] pFilename Name of the file to check. + /// + /// @returns True if the specified file exists. + static bool Exists(const char* pFilename); + + /// Platform-agnostic 64-bit stat() function. + /// + /// @param [in] pFilename Name of the file to check. + /// @param [out] pStatus The status of that file, if it exists. + /// + /// @returns Success if the structure was retrieved, error otherwise. + static Result GetStat(const char* pFilename, Stat* pStatus); + + /// Removes/erases a file, if it exists. + /// + /// @param [in] pFilename Name of file to remove. + /// + /// @returns Success if successful, otherwise an appropriate error. + static Result Remove(const char* pFilename); + + /// Reads a file into memory. + /// + /// @param [in] pFilename Name of the file to read. + /// @param [in] pData Buffer where the file contents are written to. 
+ /// @param [in] dataSize Size of the buffer in bytes. + /// @param [out] pBytesRead Number of bytes successfully read into the input buffer (can be null). + /// @param [in] binary True for binary mode, false for text. Defaults to binary. + /// + /// @returns Success if successful, otherwise an appropriate error. + /// + /// @note The input buffer must be large enough to hold the file's contents. If the buffer is larger than the file, + /// then the region of the buffer beyond the file size is _not_ modified by this function. It is the caller's + /// responsibility to _not_ read uninitialized portions of the supplied buffer after this call returns. + /// + /// @note In binary mode, the number of bytes read is equal to the file size in bytes upon a successful return. + /// In text mode, newline conversion is performed on Windows, in which case the number of bytes read may not equal + /// the file size in bytes. + /// + /// @note In text mode, should the caller treat the resulting data as a C string, it is the caller's responsibility + /// to null-terminate the buffer. + static Result ReadFile( + const char* pFilename, + void* pData, + size_t dataSize, + size_t* pBytesRead = nullptr, + bool binary = true); + + /// Gets the handle associated with this file. + /// + /// @returns A pointer to the file handle + const std::FILE* GetHandle() const { return m_pFileHandle; } + +private: + std::FILE* m_pFileHandle; + bool m_ownsHandle; // This object owns the file handle and will close it on destruction. 
+ + PAL_DISALLOW_COPY_AND_ASSIGN(File); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h index 6997902fe4..c3b481cac9 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashBase.h @@ -1,564 +1,529 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashBase.h - * @brief PAL utility collection shared structures and class declarations used by the HashMap and HashSet containers. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palSysMemory.h" - -namespace Util -{ - -/// Entry at the end of a group allocation which contains the size and a pointer to the next group. -template -struct GroupFooter -{ - Entry* pNextGroup; - uint32 numEntries; -}; - -// Forward declarations. -template class HashBase; - -/// Pointer hash functor. -/// -/// Just directly returns bits 31-6 of the key's first dword. This is a decent hash if the key is a pointer. -template -struct PointerHashFunc -{ - /// Shifts the key to the right and use the resulting bits as a uint hash. - /// - /// @param [in] pVoidKey Pointer to the key to be hashed. If the key is a pointer, which is the best use case for - /// this hash function, then this is really a pointer to a pointer. - /// @param [in] keyLen Ignored. - /// - /// @returns 32-bit uint hash value. - uint32 operator()(const void* pVoidKey, uint32 keyLen) const; - - static constexpr uint32 ShiftNum = 6; ///< Right shift bit number - - /// Makes sure the hashing result always contain at least minNumBits bits. - void Init(uint32 minNumBits) const - { - PAL_ASSERT((Min(sizeof(Key), sizeof(uint32)) * 8) >= (minNumBits + ShiftNum)); - static_assert(std::is_pointer_v, "Usage of PointerHashFunc for non-pointer types!"); - } -}; - -/// Jenkins hash functor. -/// -/// Compute hash value according to the Jenkins algorithm. A description of the algorithm is found here: -/// http://burtleburtle.net/bob/hash/doobs.html -/// By Bob Jenkins, 1996. 
bob_jenkins@compuserve.com. You may use this -/// code any way you wish, private, educational, or commercial. It's free. -/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm -/// Use for hash table lookup, or anything where one collision in 2^^32 is -/// acceptable. Do NOT use for cryptographic purposes. -template -struct JenkinsHashFunc -{ - /// Hashes the specified key value via the Jenkins hash algorithm. - /// - /// @param [in] pVoidKey Pointer to the key to be hashed. - /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. - /// - /// @returns 32-bit uint hash value. - uint32 operator()(const void* pVoidKey, uint32 keyLen) const; - - /// No init job. Defined to be compatible with default hash func. - void Init(uint32) const { } -}; - -/// Jenkins hash functor for C-style strings. -/// -/// Compute hash value according to the Jenkins algorithm. A description of the algorithm is found here: -/// http://burtleburtle.net/bob/hash/doobs.html -/// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this -/// code any way you wish, private, educational, or commercial. It's free. -/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm -/// Use for hash table lookup, or anything where one collision in 2^^32 is -/// acceptable. Do NOT use for cryptographic purposes. -/// -/// @note This hash function is for char* keys only, since the regular JenkinsHashFunc will attempt to do a hash on the -/// address of the pointer, as opposed to the actual string. -template -struct StringJenkinsHashFunc : JenkinsHashFunc -{ - /// Hashes the specified C-style string key via the Jenkins hash algorithm. - /// - /// @param [in] pVoidKey Pointer to the key string (i.e., this is a char**) to be hashed. - /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. Should always be sizeof(char*). - /// - /// @returns 32-bit uint hash value. 
- uint32 operator()(const void* pVoidKey, uint32 keyLen) const; -}; - -/// Generic compare functor for types with arbitrary size. -/// -/// Used by @ref HashBase to prevent defining compare functions for each type. -template -struct DefaultEqualFunc -{ - /// Returns true if key1 and key2 are equal (have identical memory contents). - bool operator()(const Key& key1, const Key& key2) const - { - return (memcmp(&key1, &key2, sizeof(Key)) == 0); - } -}; - -/// String compare functor for use with C-style strings. memcmp doesn't work well for strings, so this uses strcmp. -template -struct StringEqualFunc -{ - /// Returns true if the strings in key1 and key2 are equal. - bool operator()(const Key& key1, const Key& key2) const; -}; - -template -using DefaultHashFunc = std::conditional_t, PointerHashFunc, JenkinsHashFunc>; - -/** - *********************************************************************************************************************** - * @brief Fixed-size, growable, and lazy-free memory pool allocator. - * - * Memory is divided into blocks and stored in a fixed-sized structure array. One blocks is made of fixed-sized groups. - * Blocks grows exponentially, that is, each block has twice the number of groups than the previous one. - * - * @warning This class is not thread-safe! - *********************************************************************************************************************** - */ -template -class HashAllocator -{ -public: - /// Constructor. - /// - /// @param [in] groupSize Fixed allocation size. Allocate() will only be able to create allocations of this size. - /// @param [in] alignment Required alignment of the allocation in bytes. - /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. - HashAllocator(size_t groupSize, uint32 alignment, Allocator*const pAllocator); - - ~HashAllocator(); - - /// Allocates a new block of memory. 
- /// - /// No size parameter, the size of allocation is fixed to the groupSize parameter specified in the constructor. - /// - /// @returns A pointer to the allocate memory, or null if the allocation failed. - void* Allocate(); - - /// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. - void Reset(); - - /// Allocates memory using allocator callbacks. - /// - /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define an - /// Alloc(const AllocInfo&) function. - /// - /// @param [in] allocInfo Structure containing information about memory allocation. - /// - /// @returns Pointer to memory allocated. - void* Alloc( - const AllocInfo& allocInfo) - { return m_pAllocator->Alloc(allocInfo); } - - /// Frees memory using allocator callbacks. - /// - /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define a - /// Free(const FreeInfo&) function. - /// - /// @param [in] freeInfo Structure containing information about memory needing to be freed. - void Free( - const FreeInfo& freeInfo) - { return m_pAllocator->Free(freeInfo); } - - /// Returns true if the allocator is in the 'reset' state and has no active allocations. - /// - /// @note There may still be reused but clean allocations present. - bool IsClean() const { return m_curBlock == -1; } - -private: - struct MemBlock - { - void* pMemory; // Pointer to the memory allocated for this block. - uint32 numGroups; // Number of groups in the block. - uint32 curGroup; // Current group index to be allocated. - }; - - // For the i-th block, it will hold Pow(2,i) groups, the whole array could have 4G groups. - static constexpr int32 NumBlocks = 32; // Number of blocks. - - MemBlock m_blocks[NumBlocks]; // Memory blocks holding exponentially growing memory. - const size_t m_groupSize; // Fixed-group-size for each group in one block. - uint32 m_alignment; // Required alignment of the allocation in bytes. 
- int32 m_curBlock; // Current block index memory is being allocated from. -1 indicates the - // allocator has just been created and hasn't created any blocks yet. - Allocator*const m_pAllocator; // Allocator for this hash allocation function. -}; - -// ===================================================================================================================== -template -HashAllocator::HashAllocator( - size_t groupSize, // Fixed allocation size. Allocate() will always create allocations of this size. - uint32 alignment, // Required alignment of the allocation in bytes. - Allocator*const pAllocator) // Allocator for this hash allocation function. - : - m_groupSize(groupSize), - m_alignment(alignment), - m_curBlock(-1), - m_pAllocator(pAllocator) -{ - for (int32 i = 0; i < NumBlocks; i++) - { - m_blocks[i].pMemory = nullptr; - m_blocks[i].curGroup = 0; - m_blocks[i].numGroups = (1 << i); - } -} - -// ===================================================================================================================== -template -HashAllocator::~HashAllocator() -{ - for (int32 i = 0; i < NumBlocks; i++) - { - if (m_blocks[i].pMemory == nullptr) - { - break; - } - else - { - PAL_SAFE_FREE(m_blocks[i].pMemory, m_pAllocator); - } - } -} - -/** - *********************************************************************************************************************** - * @brief Iterator for traversal of elements in a Hash container. - * - * Backward iterating is not supported since there is no "footer" or "header" for a hash container. - *********************************************************************************************************************** - */ -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -class HashIterator -{ -public: - /// Convenience typedef for the associated container for this templated iterator. 
- typedef HashBase Container; - - ~HashIterator() { } - - /// Returns a pointer to current entry. Will return null if the iterator has been advanced off the end of the - /// container. - Entry* Get() const { return m_pCurrentEntry; } - - /// Advances the iterator to the next position (move forward). - void Next(); - - /// Resets the iterator to its starting point. - void Reset(); - -private: - HashIterator(const Container* pHashContainer, uint32 startBucket); - - const Container* const m_pContainer; // Hash container that we're iterating over. - const uint32 m_startBucket; // Bucket where we start iterating. - uint32 m_currentBucket; // Current bucket we're iterating. - Entry* m_pCurrentGroup; // Current group we're iterating (belongs to the current bucket). - Entry* m_pCurrentEntry; // Current entry we're at now (belongs to the current group). - uint32 m_indexInGroup; // Index of current entry in the group. - PAL_DISALLOW_DEFAULT_CTOR(HashIterator); - - // Although this is a transgression of coding standards, it means that Container does not need to have a public - // interface specifically to implement this class. The added encapsulation this provides is worthwhile. - friend class HashBase; -}; - -/** - *********************************************************************************************************************** - * @brief Templated base class for HashMap and HashSet, supporting the ability to store, find, and remove entries. - * - * The hash container has a fixed number of buckets. These buckets contain a growable number of entry groups. Each - * entry group contains a fixed number of entries and a pointer to the next entry group in the bucket. - * - * The following restrictions are made in order to tune it to the desired usage: - * - * - The item must be significantly smaller than a cache line. - * - The key must be POD-style type. 
- * - * This class aims to be very efficient when looking up the key and storing small attached items is the primary concern. - * It's therefore not desired to have the key associated with a pointer to the attached data, because the attached data - * may be of similar or smaller size than the pointer anyway, it would also introduce much unnecessary memory - * management, and it would imply a minimum of two cache misses in the typical lookup case. - * - * The idea is that these entry groups can be exactly the size of a cache line, so an entry group can be scanned with - * only a single cache miss. This extends the load factor that the hash-map can manage before performance begins to - * degrade. For the very small items that we expect, this should be a significant advantage; we expect one cache miss - * pretty much always, so packing the items together would not be a significant gain, and the cost in memory usage is - * (relatively) small. - * - * The initial hash container will use up about (buckets * GroupSize) bytes. - *********************************************************************************************************************** - */ -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -class HashBase -{ -public: - /// Convenience typedef for iterators of this templated HashBase. - typedef HashIterator Iterator; - - /// Initializes the hash container. This no longer needs to be called by a client of this API; instead - /// subclasses call InitAndFindBucket() instead of FindBucket() in any method that might insert a - /// new entry. - /// - /// @returns @ref Success if the initialization completed successfully, or ErrorOutOfMemory if the operation failed - /// due to an internal failure to allocate system memory. - Result Init(); - - /// Returns number of entries in the container. 
- uint32 GetNumEntries() const { return m_numEntries; } - - /// Returns an iterator pointing to the first entry. - Iterator Begin() const; - - /// Empty the hash container. - void Reset(); - - /// Removes an entry that matches the specified key. - /// - /// @param [in] key Key of the entry to erase. - /// - /// @returns True if the erase completed successfully, false if an entry for this key did not exist. - bool Erase(const Key& key); - - /// Returns true if the specified key exists in the set. - /// - /// @param [in] key Key to search for. - /// - /// @returns True if the specified key exists in the set. - bool Contains(const Key& key) const; - -protected: - /// @internal Constructor - /// - /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will - /// take (buckets * GroupSize) bytes. - /// @param [in] pAllocator The allocator that will allocate memory if required. - explicit HashBase(uint32 numBuckets, Allocator*const pAllocator); - ~HashBase() { PAL_SAFE_FREE(m_pMemory, &m_allocator); } - - /// @internal Ensures that the hash table has been allocated, then finds the bucket that matches - /// the specified key - /// - /// @param [in] key Key to find matching bucket for. - /// - /// @returns Pointer to the bucket corresponding to the specified key. - Entry* InitAndFindBucket(const Key& key); - - /// @internal Finds the bucket that matches the specified key. A subclass should use this only if it - /// is searching for an entry. If it might want to insert a new entry, it should use InitAndFindBucket() - /// instead. - /// - /// @param [in] key Key to find matching bucket for. - /// - /// @returns Pointer to the bucket corresponding to the specified key. - Entry* FindBucket(const Key& key) const; - - /// @internal Finds a given entry. - /// - /// @param [in] key Key to find matching bucket for. - /// - /// @returns Pointer to the entry corresponding to the specified key or nullptr. 
- Entry* FindEntry(const Key& key) const; - - /// @internal Finds a given entry; if no entry was found, allocate it. - /// - /// @param [in] key Key to search for. - /// @param [out] pExisted True if an entry for the specified key existed before this call was made. False indicates - /// that a new entry was allocated as a result of this call. - /// @param [out] ppValue Readable/writeable value in the hash map corresponding to the specified key. - /// - /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed - /// because an internal memory allocation failed. - Result FindAllocateEntry(const Key& key, bool* pExisted, Entry** ppValue); - - /// @internal Returns pointer to the next group of the specified group. - /// - /// @param [in] pGroup Current group to find next group for. - /// - /// @returns Pointer to the next group. - static Entry* GetNextGroup(Entry* pGroup); - - /// @internal Helper function which returns a pointer to the footer - /// - /// @param [in] pGroup Group which contains the footer we want. - /// - /// @returns Pointer to the footer - static GroupFooter* GetGroupFooter(Entry* pGroup); - - /// @internal Helper function which returns the number of entries from the footer - /// - /// @param [in] pGroup Group which contains the footer we want. - /// - /// @returns The number of entries from the footer - static uint32 GetGroupFooterNumEntries(Entry* pGroup); - - /// @internal Helper function which sets the number of entries in the footer - /// - /// @param [in] pGroup Group which contains the footer we want. - /// @param [in] numEntries Number of entries that should be set. - static void SetGroupFooterNumEntries(Entry* pGroup, uint32 numEntries); - - /// @internal Helper function which returns the next group from the footer - /// - /// @param [in] pGroup Group which contains the footer we want. 
- /// - /// @returns Pointer to the next group from the footer - static Entry* GetGroupFooterNextGroup(Entry* pGroup); - - /// @internal Helper function which sets the pointer to the next group in the footer - /// - /// @param [in] pGroup Group which contains the footer we want. - /// @param [in] pNextGroup The next group that should be written into the footer. - static void SetGroupFooterNextGroup(Entry* pGroup, Entry* pNextGroup); - - /// @internal Allocates a new group if the footer of the specified group is null. - /// - /// @param [in] pGroup Current group to allocate a next group for. - /// - /// @returns Pointer to the next group. - Entry* AllocateNextGroup(Entry* pGroup); - - const HashFunc m_hashFunc; ///< @internal Hash functor object. - const EqualFunc m_equalFunc; ///< @internal Key compare function object. - AllocFunc m_allocator; ///< @internal Allocator object. - - uint32 m_numBuckets; ///< @internal Buckets in the hash table; Padded to power of 2. - uint32 m_numEntries; ///< @internal Entries in the table. - size_t m_memorySize; ///< @internal Memory allocation size for m_pMemory. - void* m_pMemory; ///< @internal Base address as allocated (before alignment). - - static constexpr size_t EntrySize = sizeof(Entry); ///< @internal Size (in bytes) of a single entry. - - /// Size (in bytes) of the footer space of a group linking to next group. - static constexpr size_t GroupFooterSize = sizeof(GroupFooter); - - /// Number of entries in a single group. - static constexpr uint32 EntriesInGroup = ((GroupSize - GroupFooterSize) / EntrySize); - - // There must be at least one entry in each group. - static_assert((EntriesInGroup >= 1), "Hash container entry is too big."); - -private: - PAL_DISALLOW_DEFAULT_CTOR(HashBase); - PAL_DISALLOW_COPY_AND_ASSIGN(HashBase); - - // Although this is a transgression of coding standards, it prevents HashIterator requiring a public constructor; - // constructing a 'bare' HashIterator (i.e. 
without calling HashSet::GetIterator) can never be a legal operation, so - // this means that these two classes are much safer to use. - friend class HashIterator; -}; - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -HashIterator::HashIterator( - const Container* pContainer, ///< [retained] The hash container to iterate over - uint32 startBucket) ///< The beginning bucket - : - m_pContainer(pContainer), - m_startBucket(startBucket), - m_currentBucket(m_startBucket), - m_indexInGroup(0) -{ - if (m_startBucket < m_pContainer->m_numBuckets) - { - m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, - m_startBucket * GroupSize)); - } - else - { - m_pCurrentGroup = nullptr; - } - - m_pCurrentEntry = m_pCurrentGroup; -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -HashBase::HashBase( - uint32 numBuckets, - Allocator*const pAllocator) - : - m_hashFunc(), - m_equalFunc(), - m_allocator(GroupSize, alignof(Entry), pAllocator), - m_numBuckets((numBuckets > 0) ? Pow2Pad(numBuckets) : 1), // We always need at least one bucket. - m_numEntries(0), - m_memorySize(m_numBuckets * GroupSize), - m_pMemory(nullptr) -{ -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashBase.h + * @brief PAL utility collection shared structures and class declarations used by the HashMap and HashSet containers. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palSysMemory.h" + +namespace Util +{ + +/// Entry at the end of a group allocation which contains the size and a pointer to the next group. +template +struct GroupFooter +{ + Entry* pNextGroup; + uint32 numEntries; +}; + +// Forward declarations. +template class HashBase; + +/// Default hash functor. 
+/// +/// Just directly returns bits 31-6 of the key's first dword. This is a decent hash if the key is a pointer. +template +struct DefaultHashFunc +{ + /// Shifts the key to the right and use the resulting bits as a uint hash. + /// + /// @param [in] pVoidKey Pointer to the key to be hashed. If the key is a pointer, which is the best use case for + /// this hash function, then this is really a pointer to a pointer. + /// @param [in] keyLen Ignored. + /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; + + static constexpr uint32 ShiftNum = 6; ///< Right shift bit number + + /// Makes sure the hashing result always contain at least minNumBits bits. + void Init(uint32 minNumBits) const + { + PAL_ASSERT((Min(sizeof(Key), sizeof(uint32)) * 8) >= (minNumBits + ShiftNum)); + PAL_ALERT_MSG(sizeof(Key) > sizeof(void*), "Usage of DefaultHashFunc for non-pointer types!"); + } +}; + +/// Jenkins hash functor. +/// +/// Compute hash value according to the Jenkins algorithm. A description of the algorithm is found here: +/// http://burtleburtle.net/bob/hash/doobs.html +/// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +/// code any way you wish, private, educational, or commercial. It's free. +/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm +/// Use for hash table lookup, or anything where one collision in 2^^32 is +/// acceptable. Do NOT use for cryptographic purposes. +template +struct JenkinsHashFunc +{ + /// Hashes the specified key value via the Jenkins hash algorithm. + /// + /// @param [in] pVoidKey Pointer to the key to be hashed. + /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. + /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; + + /// No init job. Defined to be compatible with default hash func. + void Init(uint32) const { } +}; + +/// Jenkins hash functor for C-style strings. 
+/// +/// Compute hash value according to the Jenkins algorithm. A description of the algorithm is found here: +/// http://burtleburtle.net/bob/hash/doobs.html +/// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +/// code any way you wish, private, educational, or commercial. It's free. +/// See http://ourworld.compuserve.com/homepages/bob_jenkins/evahash.htm +/// Use for hash table lookup, or anything where one collision in 2^^32 is +/// acceptable. Do NOT use for cryptographic purposes. +/// +/// @note This hash function is for char* keys only, since the regular JenkinsHashFunc will attempt to do a hash on the +/// address of the pointer, as opposed to the actual string. +template +struct StringJenkinsHashFunc : JenkinsHashFunc +{ + /// Hashes the specified C-style string key via the Jenkins hash algorithm. + /// + /// @param [in] pVoidKey Pointer to the key string (i.e., this is a char**) to be hashed. + /// @param [in] keyLen Amount of data at pVoidKey to hash, in bytes. Should always be sizeof(char*). + /// + /// @returns 32-bit uint hash value. + uint32 operator()(const void* pVoidKey, uint32 keyLen) const; +}; + +/// Generic compare functor for types with arbitrary size. +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct DefaultEqualFunc +{ + /// Returns true if key1 and key2 are equal (have identical memory contents). + bool operator()(const Key& key1, const Key& key2) const + { + return (memcmp(&key1, &key2, sizeof(Key)) == 0); + } +}; + +/// String compare functor for use with C-style strings. memcmp doesn't work well for strings, so this uses strcmp. +template +struct StringEqualFunc +{ + /// Returns true if the strings in key1 and key2 are equal. 
+ bool operator()(const Key& key1, const Key& key2) const; +}; + +/** + *********************************************************************************************************************** + * @brief Fixed-size, growable, and lazy-free memory pool allocator. + * + * Memory is divided into blocks and stored in a fixed-sized structure array. One blocks is made of fixed-sized groups. + * Blocks grows exponentially, that is, each block has twice the number of groups than the previous one. + * + * @warning This class is not thread-safe! + *********************************************************************************************************************** + */ +template +class HashAllocator +{ +public: + /// Constructor. + /// + /// @param [in] groupSize Fixed allocation size. Allocate() will only be able to create allocations of this size. + /// @param [in] alignment Required alignment of the allocation in bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. + HashAllocator(size_t groupSize, uint32 alignment, Allocator*const pAllocator); + + ~HashAllocator(); + + /// Allocates a new block of memory. + /// + /// No size parameter, the size of allocation is fixed to the groupSize parameter specified in the constructor. + /// + /// @returns A pointer to the allocate memory, or null if the allocation failed. + void* Allocate(); + + /// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. + void Reset(); + + /// Allocates memory using allocator callbacks. + /// + /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define an + /// Alloc(const AllocInfo&) function. + /// + /// @param [in] allocInfo Structure containing information about memory allocation. + /// + /// @returns Pointer to memory allocated. 
+ void* Alloc( + const AllocInfo& allocInfo) + { return m_pAllocator->Alloc(allocInfo); } + + /// Frees memory using allocator callbacks. + /// + /// @note In order for this AllocFunc to be classified as an Allocator itself, we must define a + /// Free(const FreeInfo&) function. + /// + /// @param [in] freeInfo Structure containing information about memory needing to be freed. + void Free( + const FreeInfo& freeInfo) + { return m_pAllocator->Free(freeInfo); } + + /// Returns true if the allocator is in the 'reset' state and has no active allocations. + /// + /// @note There may still be reused but clean allocations present. + bool IsClean() const { return m_curBlock == -1; } + +private: + struct MemBlock + { + void* pMemory; // Pointer to the memory allocated for this block. + uint32 numGroups; // Number of groups in the block. + uint32 curGroup; // Current group index to be allocated. + }; + + // For the i-th block, it will hold Pow(2,i) groups, the whole array could have 4G groups. + static constexpr int32 NumBlocks = 32; // Number of blocks. + + MemBlock m_blocks[NumBlocks]; // Memory blocks holding exponentially growing memory. + const size_t m_groupSize; // Fixed-group-size for each group in one block. + uint32 m_alignment; // Required alignment of the allocation in bytes. + int32 m_curBlock; // Current block index memory is being allocated from. -1 indicates the + // allocator has just been created and hasn't created any blocks yet. + Allocator*const m_pAllocator; // Allocator for this hash allocation function. +}; + +// ===================================================================================================================== +template +HashAllocator::HashAllocator( + size_t groupSize, // Fixed allocation size. Allocate() will always create allocations of this size. + uint32 alignment, // Required alignment of the allocation in bytes. + Allocator*const pAllocator) // Allocator for this hash allocation function. 
+ : + m_groupSize(groupSize), + m_alignment(alignment), + m_curBlock(-1), + m_pAllocator(pAllocator) +{ + for (int32 i = 0; i < NumBlocks; i++) + { + m_blocks[i].pMemory = nullptr; + m_blocks[i].curGroup = 0; + m_blocks[i].numGroups = (1 << i); + } +} + +// ===================================================================================================================== +template +HashAllocator::~HashAllocator() +{ + for (int32 i = 0; i < NumBlocks; i++) + { + if (m_blocks[i].pMemory == nullptr) + { + break; + } + else + { + PAL_SAFE_FREE(m_blocks[i].pMemory, m_pAllocator); + } + } +} + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a Hash container. + * + * Backward iterating is not supported since there is no "footer" or "header" for a hash container. + *********************************************************************************************************************** + */ +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +class HashIterator +{ +public: + /// Convenience typedef for the associated container for this templated iterator. + typedef HashBase Container; + + ~HashIterator() { } + + /// Returns a pointer to current entry. Will return null if the iterator has been advanced off the end of the + /// container. + Entry* Get() const { return m_pCurrentEntry; } + + /// Advances the iterator to the next position (move forward). + void Next(); + + /// Resets the iterator to its starting point. + void Reset(); + +private: + HashIterator(const Container* pHashContainer, uint32 startBucket); + + const Container* const m_pContainer; // Hash container that we're iterating over. + const uint32 m_startBucket; // Bucket where we start iterating. + uint32 m_currentBucket; // Current bucket we're iterating. 
+ Entry* m_pCurrentGroup; // Current group we're iterating (belongs to the current bucket). + Entry* m_pCurrentEntry; // Current entry we're at now (belongs to the current group). + uint32 m_indexInGroup; // Index of current entry in the group. + PAL_DISALLOW_DEFAULT_CTOR(HashIterator); + + // Although this is a transgression of coding standards, it means that Container does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class HashBase; +}; + +/** + *********************************************************************************************************************** + * @brief Templated base class for HashMap and HashSet, supporting the ability to store, find, and remove entries. + * + * The hash container has a fixed number of buckets. These buckets contain a growable number of entry groups. Each + * entry group contains a fixed number of entries and a pointer to the next entry group in the bucket. + * + * The following restrictions are made in order to tune it to the desired usage: + * + * - The item must be significantly smaller than a cache line. + * - The key must be POD-style type. + * + * This class aims to be very efficient when looking up the key and storing small attached items is the primary concern. + * It's therefore not desired to have the key associated with a pointer to the attached data, because the attached data + * may be of similar or smaller size than the pointer anyway, it would also introduce much unnecessary memory + * management, and it would imply a minimum of two cache misses in the typical lookup case. + * + * The idea is that these entry groups can be exactly the size of a cache line, so an entry group can be scanned with + * only a single cache miss. This extends the load factor that the hash-map can manage before performance begins to + * degrade. 
For the very small items that we expect, this should be a significant advantage; we expect one cache miss + * pretty much always, so packing the items together would not be a significant gain, and the cost in memory usage is + * (relatively) small. + * + * The initial hash container will use up about (buckets * GroupSize) bytes. + *********************************************************************************************************************** + */ +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +class HashBase +{ +public: + /// Convenience typedef for iterators of this templated HashBase. + typedef HashIterator Iterator; + + /// Initializes the hash container. This no longer needs to be called by a client of this API; instead + /// subclasses call InitAndFindBucket() instead of FindBucket() in any method that might insert a + /// new entry. + /// + /// @returns @ref Success if the initialization completed successfully, or ErrorOutOfMemory if the operation failed + /// due to an internal failure to allocate system memory. + Result Init(); + + /// Returns number of entries in the container. + uint32 GetNumEntries() const { return m_numEntries; } + + /// Returns an iterator pointing to the first entry. + Iterator Begin() const; + + /// Empty the hash container. + void Reset(); + +protected: + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator The allocator that will allocate memory if required. 
+ explicit HashBase(uint32 numBuckets, Allocator*const pAllocator); + virtual ~HashBase() { PAL_SAFE_FREE(m_pMemory, &m_allocator); } + + /// @internal Ensures that the hash table has been allocated, then finds the bucket that matches + /// the specified key + /// + /// @param [in] key Key to find matching bucket for. + /// + /// @returns Pointer to the bucket corresponding to the specified key. + Entry* InitAndFindBucket(const Key& key); + + /// @internal Finds the bucket that matches the specified key. A subclass should use this only if it + /// is searching for an entry. If it might want to insert a new entry, it should use InitAndFindBucket() + /// instead. + /// + /// @param [in] key Key to find matching bucket for. + /// + /// @returns Pointer to the bucket corresponding to the specified key. + Entry* FindBucket(const Key& key) const; + + /// @internal Returns pointer to the next group of the specified group. + /// + /// @param [in] pGroup Current group to find next group for. + /// + /// @returns Pointer to the next group. + static Entry* GetNextGroup(Entry* pGroup); + + /// @internal Helper function which returns a pointer to the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// + /// @returns Pointer to the footer + static GroupFooter* GetGroupFooter(Entry* pGroup); + + /// @internal Helper function which returns the number of entries from the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// + /// @returns The number of entries from the footer + static uint32 GetGroupFooterNumEntries(Entry* pGroup); + + /// @internal Helper function which sets the number of entries in the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// @param [in] numEntries Number of entries that should be set. 
+ static void SetGroupFooterNumEntries(Entry* pGroup, uint32 numEntries); + + /// @internal Helper function which returns the next group from the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// + /// @returns Pointer to the next group from the footer + static Entry* GetGroupFooterNextGroup(Entry* pGroup); + + /// @internal Helper function which sets the pointer to the next group in the footer + /// + /// @param [in] pGroup Group which contains the footer we want. + /// @param [in] pNextGroup The next group that should be written into the footer. + static void SetGroupFooterNextGroup(Entry* pGroup, Entry* pNextGroup); + + /// @internal Allocates a new group if the footer of the specified group is null. + /// + /// @param [in] pGroup Current group to allocate a next group for. + /// + /// @returns Pointer to the next group. + Entry* AllocateNextGroup(Entry* pGroup); + + const HashFunc m_hashFunc; ///< @internal Hash functor object. + const EqualFunc m_equalFunc; ///< @internal Key compare function object. + AllocFunc m_allocator; ///< @internal Allocator object. + + uint32 m_numBuckets; ///< @internal Buckets in the hash table; Padded to power of 2. + uint32 m_numEntries; ///< @internal Entries in the table. + size_t m_memorySize; ///< @internal Memory allocation size for m_pMemory. + void* m_pMemory; ///< @internal Base address as allocated (before alignment). + + static constexpr size_t EntrySize = sizeof(Entry); ///< @internal Size (in bytes) of a single entry. + + /// Size (in bytes) of the footer space of a group linking to next group. + static constexpr size_t GroupFooterSize = sizeof(GroupFooter); + + /// Number of entries in a single group. + static constexpr uint32 EntriesInGroup = ((GroupSize - GroupFooterSize) / EntrySize); + + // There must be at least one entry in each group. 
+ static_assert((EntriesInGroup >= 1), "Hash container entry is too big."); + +private: + PAL_DISALLOW_DEFAULT_CTOR(HashBase); + PAL_DISALLOW_COPY_AND_ASSIGN(HashBase); + + // Although this is a transgression of coding standards, it prevents HashIterator requiring a public constructor; + // constructing a 'bare' HashIterator (i.e. without calling HashSet::GetIterator) can never be a legal operation, so + // this means that these two classes are much safer to use. + friend class HashIterator; +}; + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashIterator::HashIterator( + const Container* pContainer, ///< [retained] The hash container to iterate over + uint32 startBucket) ///< The beginning bucket + : + m_pContainer(pContainer), + m_startBucket(startBucket), + m_currentBucket(m_startBucket), + m_indexInGroup(0) +{ + if (m_startBucket < m_pContainer->m_numBuckets) + { + m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_startBucket * GroupSize)); + } + else + { + m_pCurrentGroup = nullptr; + } + + m_pCurrentEntry = m_pCurrentGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashBase::HashBase( + uint32 numBuckets, + Allocator*const pAllocator) + : + m_hashFunc(), + m_equalFunc(), + m_allocator(GroupSize, alignof(Entry), pAllocator), + m_numBuckets((numBuckets > 0) ? Pow2Pad(numBuckets) : 1), // We always need at least one bucket. 
+ m_numEntries(0), + m_memorySize(m_numBuckets * GroupSize), + m_pMemory(nullptr) +{ +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h index d7279cb363..395186a5aa 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashBaseImpl.h @@ -1,844 +1,650 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashBaseImpl.h - * @brief PAL utility collection shared class implementations used by the HashMap and HashSet containers. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palHashBase.h" - -namespace Util -{ - -// ===================================================================================================================== -// Hash function for pointers. Simply shift the key to the right and use the resulting bits as the hash. -template -uint32 PointerHashFunc::operator()( - const void* pVoidKey, - uint32 keyLen - ) const -{ - // We need this union to do a type conversion from a Key type to a uint for the hash function. This ensures - // that our data won't change when casting and that we don't have to guess which _cast<> operation is the most - // appropriate for each data type for which this template class may be instantiated. - union KeyUint32 - { - Key key; - uint32 uint; - } hash = { }; - - // Get the raw bits. - const Key* pKey = static_cast(pVoidKey); - hash.key = *pKey; - - // Discard the low bits. - return (hash.uint >> ShiftNum); -} - -// ===================================================================================================================== -// Hashes the specified key value with the Jenkins hash algorithm. Implementation based on the algorithm description -// found here: http://burtleburtle.net/bob/hash/doobs.html. -// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this -// code any way you wish, private, educational, or commercial. It's free. 
-// See http:\\ourworld.compuserve.com\homepages\bob_jenkins\evahash.htm -// Use for hash table lookup, or anything where one collision in 2^^32 is -// acceptable. Do NOT use for cryptographic purposes. -template -uint32 JenkinsHashFunc::operator()( - const void* pVoidKey, - uint32 keyLen - ) const -{ - // Mixing table. - static const uint8 MixTable[256] = - { - 251, 175, 119, 215, 81, 14, 79, 191, 103, 49, 181, 143, 186, 157, 0, 232, - 31, 32, 55, 60, 152, 58, 17, 237, 174, 70, 160, 144, 220, 90, 57, 223, - 59, 3, 18, 140, 111, 166, 203, 196, 134, 243, 124, 95, 222, 179, 197, 65, - 180, 48, 36, 15, 107, 46, 233, 130, 165, 30, 123, 161, 209, 23, 97, 16, - 40, 91, 219, 61, 100, 10, 210, 109, 250, 127, 22, 138, 29, 108, 244, 67, - 207, 9, 178, 204, 74, 98, 126, 249, 167, 116, 34, 77, 193, 200, 121, 5, - 20, 113, 71, 35, 128, 13, 182, 94, 25, 226, 227, 199, 75, 27, 41, 245, - 230, 224, 43, 225, 177, 26, 155, 150, 212, 142, 218, 115, 241, 73, 88, 105, - 39, 114, 62, 255, 192, 201, 145, 214, 168, 158, 221, 148, 154, 122, 12, 84, - 82, 163, 44, 139, 228, 236, 205, 242, 217, 11, 187, 146, 159, 64, 86, 239, - 195, 42, 106, 198, 118, 112, 184, 172, 87, 2, 173, 117, 176, 229, 247, 253, - 137, 185, 99, 164, 102, 147, 45, 66, 231, 52, 141, 211, 194, 206, 246, 238, - 56, 110, 78, 248, 63, 240, 189, 93, 92, 51, 53, 183, 19, 171, 72, 50, - 33, 104, 101, 69, 8, 252, 83, 120, 76, 135, 85, 54, 202, 125, 188, 213, - 96, 235, 136, 208, 162, 129, 190, 132, 156, 38, 47, 1, 7, 254, 24, 4, - 216, 131, 89, 21, 28, 133, 37, 153, 149, 80, 170, 68, 6, 169, 234, 151 - }; - - const uint8* pKey = static_cast(pVoidKey); - - uint32 a = 0x9e3779b9; // The golden ratio; an arbitrary value. - uint32 b = a; - uint32 c = MixTable[pKey[0]]; // Arbitrary value. - uint32 len = keyLen; - - // Handle most of the key. 
- while (len >= 12) - { - a = a + (pKey[0] + (static_cast(pKey[1]) << 8) + - (static_cast(pKey[2]) << 16) + - (static_cast(pKey[3]) << 24)); - b = b + (pKey[4] + (static_cast(pKey[5]) << 8) + - (static_cast(pKey[6]) << 16) + - (static_cast(pKey[7]) << 24)); - c = c + (pKey[8] + (static_cast(pKey[9]) << 8) + - (static_cast(pKey[10]) << 16) + - (static_cast(pKey[11]) << 24)); - - a = a - b; a = a - c; a = a ^ (c >> 13); - b = b - c; b = b - a; b = b ^ (a << 8); - c = c - a; c = c - b; c = c ^ (b >> 13); - a = a - b; a = a - c; a = a ^ (c >> 12); - b = b - c; b = b - a; b = b ^ (a << 16); - c = c - a; c = c - b; c = c ^ (b >> 5); - a = a - b; a = a - c; a = a ^ (c >> 3); - b = b - c; b = b - a; b = b ^ (a << 10); - c = c - a; c = c - b; c = c ^ (b >> 15); - - pKey = pKey + 12; - len = len - 12; - } - - // Handle last 11 bytes. - c = c + keyLen; - switch (len) - { - case 11: c = c + (static_cast(pKey[10]) << 24); [[fallthrough]]; - case 10: c = c + (static_cast(pKey[9]) << 16); [[fallthrough]]; - case 9: c = c + (static_cast(pKey[8]) << 8); [[fallthrough]]; - // the first byte of c is reserved for the length - case 8: b = b + (static_cast(pKey[7]) << 24); [[fallthrough]]; - case 7: b = b + (static_cast(pKey[6]) << 16); [[fallthrough]]; - case 6: b = b + (static_cast(pKey[5]) << 8); [[fallthrough]]; - case 5: b = b + pKey[4]; [[fallthrough]]; - case 4: a = a + (static_cast(pKey[3]) << 24); [[fallthrough]]; - case 3: a = a + (static_cast(pKey[2]) << 16); [[fallthrough]]; - case 2: a = a + (static_cast(pKey[1]) << 8); [[fallthrough]]; - case 1: a = a + pKey[0]; - // case 0: nothing left to add - } - - a = a - b; a = a - c; a = a ^ (c >> 13); - b = b - c; b = b - a; b = b ^ (a << 8); - c = c - a; c = c - b; c = c ^ (b >> 13); - a = a - b; a = a - c; a = a ^ (c >> 12); - b = b - c; b = b - a; b = b ^ (a << 16); - c = c - a; c = c - b; c = c ^ (b >> 5); - a = a - b; a = a - c; a = a ^ (c >> 3); - b = b - c; b = b - a; b = b ^ (a << 10); - c = c - a; c = c - b; c = c ^ (b >> 
15); - - return c; -} - -// ===================================================================================================================== -// Hashes the specified C-style string key with the Jenkins hash algorithm. -template -uint32 StringJenkinsHashFunc::operator()( - const void* pVoidKey, - uint32 keyLen - ) const -{ - const Key* pKey = static_cast(pVoidKey); - const Key key = *pKey; - keyLen = static_cast(strlen(key)); - - return JenkinsHashFunc::operator()(key, keyLen); -} - -// ===================================================================================================================== -// Returns true if the strings in key1 and key2 are the same. -template -bool StringEqualFunc::operator()( - const Key& key1, - const Key& key2 - ) const -{ - bool ret = false; - - // Can't do strcmp on null. - if ((key1 != nullptr) && (key2 != nullptr)) - { - ret = (strcmp(key1, key2) == 0); - } - else if ((key1 == nullptr) && (key2 == nullptr)) - { - ret = true; - } - - return ret; -} - -// ===================================================================================================================== -// Allocates a new block of memory. -template -void* HashAllocator::Allocate() -{ - void* pMemory = nullptr; - - // Leave pBlock null if this is the first allocation made with this object. - MemBlock* pBlock = (m_curBlock >= 0) ? &m_blocks[m_curBlock] : nullptr; - - // If current block is used up (or we haven't allocated one yet), go to next. - if ((pBlock == nullptr) || (pBlock->curGroup >= pBlock->numGroups)) - { - // Only advance to the next block if the current one had memory allocated to it (which implies that it's - // full). 
- uint32_t nextBlock = m_curBlock; - - if ((pBlock == nullptr) || (pBlock->pMemory != nullptr)) - { - nextBlock++; - } - - PAL_ASSERT(nextBlock < NumBlocks); - - pBlock = &m_blocks[nextBlock]; - - PAL_ASSERT(pBlock->curGroup == 0); - - // Allocate memory if needed (note that this may rarely fail) - if (pBlock->pMemory == nullptr) - { - // Here we allocate another chunk of memory from outside, that we can later distribute internally - // to whichever bucket needs another group linked to it. - PAL_DPWARN("HashAllocator allocating more external memory, enough to hold %u Groups. " - "Consider increasing the GroupSize(%llu) in order to fit more Entries" - "In a Group.", - pBlock->numGroups, static_cast(m_groupSize)); - - pBlock->pMemory = PAL_CALLOC_ALIGNED(pBlock->numGroups * m_groupSize, m_alignment, - m_pAllocator, AllocInternal); - } - - // If we successfully allocated memory (or the block already had some), make it current - if (pBlock->pMemory != nullptr) - { - m_curBlock = nextBlock; - } - } - - if (pBlock->pMemory != nullptr) - { - pMemory = VoidPtrInc(pBlock->pMemory, ((pBlock->curGroup++) * m_groupSize)); - } - - return pMemory; -} - -// ===================================================================================================================== -// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. -template -void HashAllocator::Reset() -{ - for (int32 i = 0; i <= m_curBlock; ++i) - { - PAL_ASSERT(m_blocks[i].pMemory != nullptr); - memset(m_blocks[i].pMemory, 0, m_blocks[i].numGroups * m_groupSize); - - m_blocks[i].curGroup = 0; - } - - m_curBlock = -1; -} - -// ===================================================================================================================== -// Proceeds to the next entry, null if to the end. 
-template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -void HashIterator::Next() -{ - if (m_pCurrentEntry != nullptr) - { - PAL_ASSERT(m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup]); - - Entry* pNextGroup = Container::GetNextGroup(m_pCurrentGroup); - - // We're in the middle of a group. - uint32 numEntries = m_pContainer->GetGroupFooterNumEntries(m_pCurrentGroup); - if ((m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup - 1]) && - (m_indexInGroup + 1 < numEntries)) - { - m_pCurrentEntry++; - m_indexInGroup++; - } - // We're in the last entry of a group. - // Considering that the next chained group could be an empty group already, it is better to check the - // next group's footer->numEntries before jump to the next group. If the numEntry of the next chained - // group is 0 (invalid), we need to jump to the next bucket directly to avoid returning invalid entry. - else if ((pNextGroup != nullptr) && - (m_indexInGroup == numEntries - 1) && - (reinterpret_cast*>(&pNextGroup[Container::EntriesInGroup])->numEntries > 0)) - { - m_pCurrentGroup = pNextGroup; - m_pCurrentEntry = pNextGroup; - m_indexInGroup = 0; - } - // The current bucket is done, step to the next. 
- else - { - do - { - m_currentBucket = (m_currentBucket + 1) % m_pContainer->m_numBuckets; - - pNextGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, - m_currentBucket * GroupSize)); - - numEntries = m_pContainer->GetGroupFooterNumEntries(pNextGroup); - if (numEntries > 0) - { - m_indexInGroup = 0; - break; - } - } while(m_currentBucket != m_startBucket); - - if (m_currentBucket != m_startBucket) - { - m_pCurrentGroup = pNextGroup; - m_pCurrentEntry = pNextGroup; - m_indexInGroup = 0; - } - else - { - m_pCurrentEntry = nullptr; - } - } - } -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -void HashIterator::Reset() -{ - m_currentBucket = m_startBucket; - m_indexInGroup = 0; - - if (m_startBucket < m_pContainer->m_numBuckets) - { - m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, - m_startBucket * GroupSize)); - } - else - { - m_pCurrentGroup = nullptr; - } - - m_pCurrentEntry = m_pCurrentGroup; -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Result HashBase::Init() -{ - // Each bucket's address must be aligned as Entry required. - PAL_ASSERT(IsPow2Aligned(GroupSize, alignof(Entry))); - - // Since (m_numBuckets - 1) will mask the hashing result, the hash func should make sure the hashing result always - // contain enough effective bits. - m_hashFunc.Init(Log2(m_numBuckets)); - - // Allocate the hash table. Zero out the memory to mark all entries invalid, since a key of 0 is invalid. 
- m_pMemory = PAL_CALLOC_ALIGNED(m_memorySize, alignof(Entry), &m_allocator, AllocInternal); - - PAL_ALERT(m_pMemory == nullptr); - - return (m_pMemory != nullptr) ? Result::Success : Result::ErrorOutOfMemory; -} - -// ===================================================================================================================== -// Returns an iterator pointing to the first entry. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -HashIterator -HashBase::Begin() const -{ - uint32 bucket = 0; - - if (m_numEntries != 0) - { - PAL_ASSERT(m_pMemory != nullptr); - for (;bucket < m_numBuckets; ++bucket) - { - Entry* pEntry = static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)); - const uint32 numEntries = GetGroupFooterNumEntries(pEntry); - if (numEntries > 0) - { - break; - } - } - } - else - { - // If the backing memory does not exist we should return a null Iterator. - // This can be done by setting the start bucket such that it is off the end of the bucket list. - bucket = m_numBuckets; - } - - return Iterator(this, bucket); -} - -// ===================================================================================================================== -// Empty the hash table. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -void HashBase::Reset() -{ - if ((m_pMemory != nullptr) && ((m_numEntries != 0) || (m_allocator.IsClean() == false))) - { - // Re-zero out the hash table. - // We can skip this if: - // - m_numEntries is 0, then each group's numEntries and entry data is already reset. 
- // - the allocator is clean, then each group's chain pointer must be nullptr (nowhere to point to) - memset(m_pMemory, 0, m_memorySize); - } - - m_numEntries = 0; - - m_allocator.Reset(); -} - -// ===================================================================================================================== -// Removes an entry with the specified key. -template -bool HashBase::Erase( - const Key& key) -{ - // Get the bucket base address. - Entry* pGroup = this->FindBucket(key); - - Entry* pFoundEntry = nullptr; - Entry* pLastEntry = nullptr; - Entry* pLastEntryGroup = nullptr; - - // Find the entry to delete - while (pGroup != nullptr) - { - const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); - - // Search each group - uint32 i = 0; - for (; i < numEntries; i++) - { - if (this->m_equalFunc(pGroup[i].key, key) == true) - { - // We shouldn't find the same key twice. - PAL_ASSERT(pFoundEntry == nullptr); - - pFoundEntry = &(pGroup[i]); - } - - // keep track of last entry of all groups in bucket - pLastEntry = &(pGroup[i]); - pLastEntryGroup = pGroup; - } - - // Chain to the next entry group. - pGroup = this->GetNextGroup(pGroup); - } - - // Copy the last entry's data into the entry that we are removing and invalidate the last entry as it now appears - // earlier in the list. This also handles the case where the entry to be removed is the last entry. - if (pFoundEntry != nullptr) - { - PAL_ASSERT(pLastEntry != nullptr); - - *pFoundEntry = std::move(*pLastEntry); - memset(pLastEntry, 0, sizeof(Entry)); - - PAL_ASSERT(this->m_numEntries > 0); - this->m_numEntries--; - const uint32 numEntries = this->GetGroupFooterNumEntries(pLastEntryGroup); - this->SetGroupFooterNumEntries(pLastEntryGroup, numEntries - 1); - } - - return (pFoundEntry != nullptr); -} - -// ===================================================================================================================== -// Check if the given hashtable contains the given key. 
-template -bool HashBase::Contains( - const Key& key) const -{ - return FindEntry(key) != nullptr; -} - -// ===================================================================================================================== -// Ensures that the hash table has been allocated, then returns pointer to start group of the bucket -// corresponding to the specified key. A return of nullptr means out of memory. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Entry* HashBase::InitAndFindBucket( - const Key& key - ) -{ - if (m_pMemory == nullptr) - { - Init(); - } - return FindBucket(key); -} - -// ===================================================================================================================== -// Returns pointer to start group of the bucket corresponding to the specified key. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Entry* HashBase::FindBucket( - const Key& key - ) const -{ - const uint32 bucket = m_hashFunc(&key, sizeof(key)) & (m_numBuckets - 1); - return (m_pMemory != nullptr) ? static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)) : nullptr; -} - -// ===================================================================================================================== -// Gets a pointer to the entry that matches the key. Returns null if no entry is present matching the specified key. -template -Entry* HashBase::FindEntry( - const Key& key - ) const -{ - // Get the bucket base address. - Entry* pGroup = this->FindBucket(key); - Entry* pMatchingEntry = nullptr; - - while (pGroup != nullptr) - { - const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); - - // Search this entry group - uint32 i = 0; - for (; i < numEntries; i++) - { - if (this->m_equalFunc(pGroup[i].key, key)) - { - // We've found the entry. 
- pMatchingEntry = &(pGroup[i]); - break; - } - } - - if ((pMatchingEntry != nullptr) || (i < EntriesInGroup)) - { - break; - } - - // Chain to the next entry group. - pGroup = this->GetNextGroup(pGroup); - } - - return pMatchingEntry; -} - -// ===================================================================================================================== -// Gets a pointer to the entry that matches the key. If the key is not present, a pointer to empty space for the value -// is returned. -template -Result HashBase::FindAllocateEntry( - const Key& key, // Key to search for. - bool* pExisted, // [out] True if a matching key was found. - Entry** ppEntry) // [out] Pointer to the value entry of the hash map's entry for the specified key. -{ - PAL_ASSERT(pExisted != nullptr); - PAL_ASSERT(ppEntry != nullptr); - - Result result = Result::ErrorOutOfMemory; - - // Get the bucket base address.... - Entry* pGroup = this->InitAndFindBucket(key); - - *pExisted = false; - *ppEntry = nullptr; - - Entry* pMatchingEntry = nullptr; - - while (pGroup != nullptr) - { - const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); - - // Search this entry group. - uint32 i = 0; - for (; i < numEntries; i++) - { - if (this->m_equalFunc(pGroup[i].key, key)) - { - // We've found the entry. - pMatchingEntry = &(pGroup[i]); - *pExisted = true; - break; - } - } - - // We've reached the end of the allocated buckets and the entry was not found. - // Allocate this entry for the key. - if ((pMatchingEntry == nullptr) && (i < EntriesInGroup)) - { - pGroup[i].key = key; - pMatchingEntry = &(pGroup[i]); - this->m_numEntries++; - this->SetGroupFooterNumEntries(pGroup, numEntries + 1); - } - - if (pMatchingEntry != nullptr) - { - *ppEntry= pMatchingEntry; - result = Result::Success; - break; - } - - // Chain to the next entry group. 
- pGroup = this->AllocateNextGroup(pGroup); - } - - PAL_ASSERT(result == Result::Success); - - return result; -} - -// ===================================================================================================================== -// Returns pointer to the next group of the spcified group. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Entry* HashBase::GetNextGroup( - Entry* pGroup) -{ - // Footer of a group stores the pointer to the next group - return HashBase::GetGroupFooterNextGroup(pGroup); -} - -// ===================================================================================================================== -// Allocates a new group if the footer of the specified group is null. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Entry* HashBase::AllocateNextGroup( - Entry* pGroup) -{ - // Footer of a group stores the pointer to the next group. - Entry* pNextGroup = GetGroupFooterNextGroup(pGroup); - - if (pNextGroup == nullptr) - { - // This warning is useful in order to tune hash maps, but probably doesn't need to be enabled for anyone not - // actively tuning. We're not asking for more memory here, just assigning more of the memory chunk we have - // already asked for to a bucket. Each bucket starts with a group that it can place hash hits in. - // when the group for that bucket fills up, we call this function and link another group for the same bucket, - // in the form of a linked list, onto that. - // This is expected to happen a bit, as hash distributions aren't perfect. But if this happens too many times, - // you're really searching a linked list, not a hash map, which is much slower. - // It's at that point you need this warning: to help balance out the number of buckets and group sizes - // to better fit your use case. 
- //PAL_DPWARN("HashBase needs to allocate more internal memory after inserting %u entries. " - // "Consider increasing the NumBuckets(%u) or GroupSize(%llu) in order to " - // "fit more Entries In a Group(%u).", - // m_numEntries, m_numBuckets, GroupSize, EntriesInGroup); - - // We allocate the next entry group if it does not exist. - pNextGroup = static_cast(m_allocator.Allocate()); - SetGroupFooterNextGroup(pGroup, pNextGroup); - } - - PAL_ASSERT(pNextGroup != nullptr); - - return pNextGroup; -} - -// ===================================================================================================================== -// Return a pointer to the group footer. -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -GroupFooter* HashBase::GetGroupFooter( - Entry* pGroup) -{ - return reinterpret_cast*>(&pGroup[EntriesInGroup]); -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -uint32 HashBase::GetGroupFooterNumEntries( - Entry* pGroup) -{ - const uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + - offsetof(GroupFooter, numEntries)); - uint32 numEntries; - memcpy(&numEntries, pNumEntries, sizeof(numEntries)); - return numEntries; -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -void HashBase::SetGroupFooterNumEntries( - Entry* pGroup, uint32 numEntries) -{ - uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + - offsetof(GroupFooter, numEntries)); - 
memcpy(pNumEntries, &numEntries, sizeof(numEntries)); -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -Entry* HashBase::GetGroupFooterNextGroup( - Entry* pGroup) -{ - Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + - offsetof(GroupFooter, pNextGroup)); - Entry* pNextGroup; - memcpy(&pNextGroup, ppNextGroup, sizeof(pNextGroup)); - return pNextGroup; -} - -// ===================================================================================================================== -template< - typename Key, - typename Entry, - typename Allocator, - typename HashFunc, - typename EqualFunc, - typename AllocFunc, - size_t GroupSize> -void HashBase::SetGroupFooterNextGroup( - Entry* pGroup, Entry* pNextGroup) -{ - Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + - offsetof(GroupFooter, pNextGroup)); - memcpy(ppNextGroup, &pNextGroup, sizeof(pNextGroup)); -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashBaseImpl.h + * @brief PAL utility collection shared class implementations used by the HashMap and HashSet containers. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +// ===================================================================================================================== +// Default hash function implementation. Simply shift the key to the right and use the resulting bits as the hash. +template +uint32 DefaultHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + // We need this union to do a type conversion from a Key type to a uint for the hash function. This ensures + // that our data won't change when casting and that we don't have to guess which _cast<> operation is the most + // appropriate for each data type for which this template class may be instantiated. + union KeyUint32 + { + Key key; + uint32 uint; + } hash = { }; + + // Get the raw bits. + const Key* pKey = static_cast(pVoidKey); + hash.key = *pKey; + + // Discard the low bits. 
+ return (hash.uint >> ShiftNum); +} + +// ===================================================================================================================== +// Hashes the specified key value with the Jenkins hash algorithm. Implementation based on the algorithm description +// found here: http://burtleburtle.net/bob/hash/doobs.html. +// By Bob Jenkins, 1996. bob_jenkins@compuserve.com. You may use this +// code any way you wish, private, educational, or commercial. It's free. +// See http:\\ourworld.compuserve.com\homepages\bob_jenkins\evahash.htm +// Use for hash table lookup, or anything where one collision in 2^^32 is +// acceptable. Do NOT use for cryptographic purposes. +template +uint32 JenkinsHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + // Mixing table. + static const uint8 MixTable[256] = + { + 251, 175, 119, 215, 81, 14, 79, 191, 103, 49, 181, 143, 186, 157, 0, 232, + 31, 32, 55, 60, 152, 58, 17, 237, 174, 70, 160, 144, 220, 90, 57, 223, + 59, 3, 18, 140, 111, 166, 203, 196, 134, 243, 124, 95, 222, 179, 197, 65, + 180, 48, 36, 15, 107, 46, 233, 130, 165, 30, 123, 161, 209, 23, 97, 16, + 40, 91, 219, 61, 100, 10, 210, 109, 250, 127, 22, 138, 29, 108, 244, 67, + 207, 9, 178, 204, 74, 98, 126, 249, 167, 116, 34, 77, 193, 200, 121, 5, + 20, 113, 71, 35, 128, 13, 182, 94, 25, 226, 227, 199, 75, 27, 41, 245, + 230, 224, 43, 225, 177, 26, 155, 150, 212, 142, 218, 115, 241, 73, 88, 105, + 39, 114, 62, 255, 192, 201, 145, 214, 168, 158, 221, 148, 154, 122, 12, 84, + 82, 163, 44, 139, 228, 236, 205, 242, 217, 11, 187, 146, 159, 64, 86, 239, + 195, 42, 106, 198, 118, 112, 184, 172, 87, 2, 173, 117, 176, 229, 247, 253, + 137, 185, 99, 164, 102, 147, 45, 66, 231, 52, 141, 211, 194, 206, 246, 238, + 56, 110, 78, 248, 63, 240, 189, 93, 92, 51, 53, 183, 19, 171, 72, 50, + 33, 104, 101, 69, 8, 252, 83, 120, 76, 135, 85, 54, 202, 125, 188, 213, + 96, 235, 136, 208, 162, 129, 190, 132, 156, 38, 47, 1, 7, 254, 24, 4, + 216, 131, 89, 21, 
28, 133, 37, 153, 149, 80, 170, 68, 6, 169, 234, 151 + }; + + const uint8* pKey = static_cast(pVoidKey); + + uint32 a = 0x9e3779b9; // The golden ratio; an arbitrary value. + uint32 b = a; + uint32 c = MixTable[pKey[0]]; // Arbitrary value. + uint32 len = keyLen; + + // Handle most of the key. + while (len >= 12) + { + a = a + (pKey[0] + (static_cast(pKey[1]) << 8) + + (static_cast(pKey[2]) << 16) + + (static_cast(pKey[3]) << 24)); + b = b + (pKey[4] + (static_cast(pKey[5]) << 8) + + (static_cast(pKey[6]) << 16) + + (static_cast(pKey[7]) << 24)); + c = c + (pKey[8] + (static_cast(pKey[9]) << 8) + + (static_cast(pKey[10]) << 16) + + (static_cast(pKey[11]) << 24)); + + a = a - b; a = a - c; a = a ^ (c >> 13); + b = b - c; b = b - a; b = b ^ (a << 8); + c = c - a; c = c - b; c = c ^ (b >> 13); + a = a - b; a = a - c; a = a ^ (c >> 12); + b = b - c; b = b - a; b = b ^ (a << 16); + c = c - a; c = c - b; c = c ^ (b >> 5); + a = a - b; a = a - c; a = a ^ (c >> 3); + b = b - c; b = b - a; b = b ^ (a << 10); + c = c - a; c = c - b; c = c ^ (b >> 15); + + pKey = pKey + 12; + len = len - 12; + } + + // Handle last 11 bytes. 
+ c = c + keyLen; + switch (len) + { + case 11: c = c + (static_cast(pKey[10]) << 24); [[fallthrough]]; + case 10: c = c + (static_cast(pKey[9]) << 16); [[fallthrough]]; + case 9: c = c + (static_cast(pKey[8]) << 8); [[fallthrough]]; + // the first byte of c is reserved for the length + case 8: b = b + (static_cast(pKey[7]) << 24); [[fallthrough]]; + case 7: b = b + (static_cast(pKey[6]) << 16); [[fallthrough]]; + case 6: b = b + (static_cast(pKey[5]) << 8); [[fallthrough]]; + case 5: b = b + pKey[4]; [[fallthrough]]; + case 4: a = a + (static_cast(pKey[3]) << 24); [[fallthrough]]; + case 3: a = a + (static_cast(pKey[2]) << 16); [[fallthrough]]; + case 2: a = a + (static_cast(pKey[1]) << 8); [[fallthrough]]; + case 1: a = a + pKey[0]; + // case 0: nothing left to add + } + + a = a - b; a = a - c; a = a ^ (c >> 13); + b = b - c; b = b - a; b = b ^ (a << 8); + c = c - a; c = c - b; c = c ^ (b >> 13); + a = a - b; a = a - c; a = a ^ (c >> 12); + b = b - c; b = b - a; b = b ^ (a << 16); + c = c - a; c = c - b; c = c ^ (b >> 5); + a = a - b; a = a - c; a = a ^ (c >> 3); + b = b - c; b = b - a; b = b ^ (a << 10); + c = c - a; c = c - b; c = c ^ (b >> 15); + + return c; +} + +// ===================================================================================================================== +// Hashes the specified C-style string key with the Jenkins hash algorithm. +template +uint32 StringJenkinsHashFunc::operator()( + const void* pVoidKey, + uint32 keyLen + ) const +{ + const Key* pKey = static_cast(pVoidKey); + const Key key = *pKey; + keyLen = static_cast(strlen(key)); + + return JenkinsHashFunc::operator()(key, keyLen); +} + +// ===================================================================================================================== +// Returns true if the strings in key1 and key2 are the same. +template +bool StringEqualFunc::operator()( + const Key& key1, + const Key& key2 + ) const +{ + bool ret = false; + + // Can't do strcmp on null. 
+ if ((key1 != nullptr) && (key2 != nullptr)) + { + ret = (strcmp(key1, key2) == 0); + } + else if ((key1 == nullptr) && (key2 == nullptr)) + { + ret = true; + } + + return ret; +} + +// ===================================================================================================================== +// Allocates a new block of memory. +template +void* HashAllocator::Allocate() +{ + void* pMemory = nullptr; + + // Leave pBlock null if this is the first allocation made with this object. + MemBlock* pBlock = (m_curBlock >= 0) ? &m_blocks[m_curBlock] : nullptr; + + // If current block is used up (or we haven't allocated one yet), go to next. + if ((pBlock == nullptr) || (pBlock->curGroup >= pBlock->numGroups)) + { + // Only advance to the next block if the current one had memory allocated to it (which implies that it's + // full). + uint32_t nextBlock = m_curBlock; + + if ((pBlock == nullptr) || (pBlock->pMemory != nullptr)) + { + nextBlock++; + } + + PAL_ASSERT(nextBlock < NumBlocks); + + pBlock = &m_blocks[nextBlock]; + + PAL_ASSERT(pBlock->curGroup == 0); + + // Allocate memory if needed (note that this may rarely fail) + if (pBlock->pMemory == nullptr) + { + // Here we allocate another chunk of memory from outside, that we can later distribute internally + // to whichever bucket needs another group linked to it. + PAL_DPWARN("HashAllocator allocating more external memory, enough to hold %u Groups. 
" + "Consider increasing the GroupSize(%llu) in order to fit more Entries" + "In a Group.", + pBlock->numGroups, static_cast(m_groupSize)); + + pBlock->pMemory = PAL_CALLOC_ALIGNED(pBlock->numGroups * m_groupSize, m_alignment, + m_pAllocator, AllocInternal); + } + + // If we successfully allocated memory (or the block already had some), make it current + if (pBlock->pMemory != nullptr) + { + m_curBlock = nextBlock; + } + } + + if (pBlock->pMemory != nullptr) + { + pMemory = VoidPtrInc(pBlock->pMemory, ((pBlock->curGroup++) * m_groupSize)); + } + + return pMemory; +} + +// ===================================================================================================================== +// Recycles all allocated memory. Memory isn't actually freed, but becomes available for reuse. +template +void HashAllocator::Reset() +{ + for (int32 i = 0; i <= m_curBlock; ++i) + { + PAL_ASSERT(m_blocks[i].pMemory != nullptr); + memset(m_blocks[i].pMemory, 0, m_blocks[i].numGroups * m_groupSize); + + m_blocks[i].curGroup = 0; + } + + m_curBlock = -1; +} + +// ===================================================================================================================== +// Proceeds to the next entry, null if to the end. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashIterator::Next() +{ + if (m_pCurrentEntry != nullptr) + { + PAL_ASSERT(m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup]); + + Entry* pNextGroup = Container::GetNextGroup(m_pCurrentGroup); + + // We're in the middle of a group. + uint32 numEntries = m_pContainer->GetGroupFooterNumEntries(m_pCurrentGroup); + if ((m_pCurrentEntry < &m_pCurrentGroup[Container::EntriesInGroup - 1]) && + (m_indexInGroup + 1 < numEntries)) + { + m_pCurrentEntry++; + m_indexInGroup++; + } + // We're in the last entry of a group. 
+ // Considering that the next chained group could be an empty group already, it is better to check the + // next group's footer->numEntries before jump to the next group. If the numEntry of the next chained + // group is 0 (invalid), we need to jump to the next bucket directly to avoid returning invalid entry. + else if ((pNextGroup != nullptr) && + (m_indexInGroup == numEntries - 1) && + (reinterpret_cast*>(&pNextGroup[Container::EntriesInGroup])->numEntries > 0)) + { + m_pCurrentGroup = pNextGroup; + m_pCurrentEntry = pNextGroup; + m_indexInGroup = 0; + } + // The current bucket is done, step to the next. + else + { + do + { + m_currentBucket = (m_currentBucket + 1) % m_pContainer->m_numBuckets; + + pNextGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_currentBucket * GroupSize)); + + numEntries = m_pContainer->GetGroupFooterNumEntries(pNextGroup); + if (numEntries > 0) + { + m_indexInGroup = 0; + break; + } + } while(m_currentBucket != m_startBucket); + + if (m_currentBucket != m_startBucket) + { + m_pCurrentGroup = pNextGroup; + m_pCurrentEntry = pNextGroup; + m_indexInGroup = 0; + } + else + { + m_pCurrentEntry = nullptr; + } + } + } +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashIterator::Reset() +{ + m_currentBucket = m_startBucket; + m_indexInGroup = 0; + + if (m_startBucket < m_pContainer->m_numBuckets) + { + m_pCurrentGroup = static_cast(VoidPtrInc(m_pContainer->m_pMemory, + m_startBucket * GroupSize)); + } + else + { + m_pCurrentGroup = nullptr; + } + + m_pCurrentEntry = m_pCurrentGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + 
typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashBase::Init() +{ + // Each bucket's address must be aligned as Entry required. + PAL_ASSERT(IsPow2Aligned(GroupSize, alignof(Entry))); + + // Since (m_numBuckets - 1) will mask the hashing result, the hash func should make sure the hashing result always + // contain enough effective bits. + m_hashFunc.Init(Log2(m_numBuckets)); + + // Allocate the hash table. Zero out the memory to mark all entries invalid, since a key of 0 is invalid. + m_pMemory = PAL_CALLOC_ALIGNED(m_memorySize, alignof(Entry), &m_allocator, AllocInternal); + + PAL_ALERT(m_pMemory == nullptr); + + return (m_pMemory != nullptr) ? Result::Success : Result::ErrorOutOfMemory; +} + +// ===================================================================================================================== +// Returns an iterator pointing to the first entry. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +HashIterator +HashBase::Begin() const +{ + uint32 bucket = 0; + + if (m_numEntries != 0) + { + PAL_ASSERT(m_pMemory != nullptr); + for (;bucket < m_numBuckets; ++bucket) + { + Entry* pEntry = static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)); + const uint32 numEntries = GetGroupFooterNumEntries(pEntry); + if (numEntries > 0) + { + break; + } + } + } + else + { + // If the backing memory does not exist we should return a null Iterator. + // This can be done by setting the start bucket such that it is off the end of the bucket list. + bucket = m_numBuckets; + } + + return Iterator(this, bucket); +} + +// ===================================================================================================================== +// Empty the hash table. 
+template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::Reset() +{ + if ((m_pMemory != nullptr) && ((m_numEntries != 0) || (m_allocator.IsClean() == false))) + { + // Re-zero out the hash table. + // We can skip this if: + // - m_numEntries is 0, then each group's numEntries and entry data is already reset. + // - the allocator is clean, then each group's chain pointer must be nullptr (nowhere to point to) + memset(m_pMemory, 0, m_memorySize); + } + + m_numEntries = 0; + + m_allocator.Reset(); +} + +// ===================================================================================================================== +// Ensures that the hash table has been allocated, then returns pointer to start group of the bucket +// corresponding to the specified key. A return of nullptr means out of memory. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::InitAndFindBucket( + const Key& key + ) +{ + if (m_pMemory == nullptr) + { + Init(); + } + return FindBucket(key); +} + +// ===================================================================================================================== +// Returns pointer to start group of the bucket corresponding to the specified key. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::FindBucket( + const Key& key + ) const +{ + const uint32 bucket = m_hashFunc(&key, sizeof(key)) & (m_numBuckets - 1); + return (m_pMemory != nullptr) ? static_cast(VoidPtrInc(m_pMemory, bucket * GroupSize)) : nullptr; +} + +// ===================================================================================================================== +// Returns pointer to the next group of the spcified group. 
+template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::GetNextGroup( + Entry* pGroup) +{ + // Footer of a group stores the pointer to the next group + return HashBase::GetGroupFooterNextGroup(pGroup); +} + +// ===================================================================================================================== +// Allocates a new group if the footer of the specified group is null. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::AllocateNextGroup( + Entry* pGroup) +{ + // Footer of a group stores the pointer to the next group. + Entry* pNextGroup = GetGroupFooterNextGroup(pGroup); + + if (pNextGroup == nullptr) + { + // This warning is useful in order to tune hash maps, but probably doesn't need to be enabled for anyone not + // actively tuning. We're not asking for more memory here, just assigning more of the memory chunk we have + // already asked for to a bucket. Each bucket starts with a group that it can place hash hits in. + // when the group for that bucket fills up, we call this function and link another group for the same bucket, + // in the form of a linked list, onto that. + // This is expected to happen a bit, as hash distributions aren't perfect. But if this happens too many times, + // you're really searching a linked list, not a hash map, which is much slower. + // It's at that point you need this warning: to help balance out the number of buckets and group sizes + // to better fit your use case. + //PAL_DPWARN("HashBase needs to allocate more internal memory after inserting %u entries. 
" + // "Consider increasing the NumBuckets(%u) or GroupSize(%llu) in order to " + // "fit more Entries In a Group(%u).", + // m_numEntries, m_numBuckets, GroupSize, EntriesInGroup); + + // We allocate the next entry group if it does not exist. + pNextGroup = static_cast(m_allocator.Allocate()); + SetGroupFooterNextGroup(pGroup, pNextGroup); + } + + PAL_ASSERT(pNextGroup != nullptr); + + return pNextGroup; +} + +// ===================================================================================================================== +// Return a pointer to the group footer. +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +GroupFooter* HashBase::GetGroupFooter( + Entry* pGroup) +{ + return reinterpret_cast*>(&pGroup[EntriesInGroup]); +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +uint32 HashBase::GetGroupFooterNumEntries( + Entry* pGroup) +{ + const uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, numEntries)); + uint32 numEntries; + memcpy(&numEntries, pNumEntries, sizeof(numEntries)); + return numEntries; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::SetGroupFooterNumEntries( + Entry* pGroup, uint32 numEntries) +{ + uint32* pNumEntries = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, numEntries)); + memcpy(pNumEntries, &numEntries, sizeof(numEntries)); +} + +// 
===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +Entry* HashBase::GetGroupFooterNextGroup( + Entry* pGroup) +{ + Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, pNextGroup)); + Entry* pNextGroup; + memcpy(&pNextGroup, ppNextGroup, sizeof(pNextGroup)); + return pNextGroup; +} + +// ===================================================================================================================== +template< + typename Key, + typename Entry, + typename Allocator, + typename HashFunc, + typename EqualFunc, + typename AllocFunc, + size_t GroupSize> +void HashBase::SetGroupFooterNextGroup( + Entry* pGroup, Entry* pNextGroup) +{ + Entry** ppNextGroup = reinterpret_cast(reinterpret_cast(&pGroup[EntriesInGroup]) + + offsetof(GroupFooter, pNextGroup)); + memcpy(ppNextGroup, &pNextGroup, sizeof(pNextGroup)); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h index d56a419246..e5ea46ddb5 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashMap.h @@ -1,137 +1,143 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashMap.h - * @brief PAL utility collection HashMap class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palHashBase.h" - -namespace Util -{ - -/// Encapsulates one key/value pair in a hash map. -template -struct HashMapEntry -{ - Key key; ///< Hash map entry key. - Value value; ///< Hash map entry value. -}; - -/** - *********************************************************************************************************************** - * @brief Templated hash map container. 
- * - * This container is meant for storing elements of an arbitrary (but uniform) key/value type. Supported operations: - * - * - Searching - * - Insertion - * - Deletion - * - Iteration - * - * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: - * - * - DefaultHashFunc: Default hash function, selects best hash function based on type of key. - * - PointerHashFunc: Good choice when the key is a pointer. - * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. - * - StringJenkinsHashFunc: Good choice when the key is a C-style string. - * - * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: - * - * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. - * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. - * - * @warning This class is not thread-safe for Insert, FindAllocate, Erase, or iteration! - * @warning Init() must be called before using this container. Begin() and Reset() can be safely called before - * initialization and Begin() will always return an iterator that points to null. - * - * For more details please refer to @ref HashBase. - *********************************************************************************************************************** - */ -template class HashFunc = DefaultHashFunc, - template class EqualFunc = DefaultEqualFunc, - typename AllocFunc = HashAllocator, - size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> -class HashMap : public HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> -{ -public: - /// Convenience typedef for a templated entry of this hash map. - typedef HashMapEntry Entry; - - /// @internal Constructor - /// - /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will - /// take (buckets * GroupSize) bytes. - /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. 
- explicit HashMap(uint32 numBuckets, Allocator*const pAllocator): Base::HashBase(numBuckets, pAllocator) { } - ~HashMap() { } - - /// Finds a given entry; if no entry was found, allocate it. - /// - /// @param [in] key Key to search for. - /// @param [out] pExisted True if an entry for the specified key existed before this call was made. False indicates - /// that a new entry was allocated as a result of this call. - /// @param [out] ppValue Readable/writeable value in the hash map corresponding to the specified key. - /// - /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed - /// because an internal memory allocation failed. - Result FindAllocate(const Key& key, bool* pExisted, Value** ppValue); - - /// Gets a pointer to the value that matches the specified key. - /// - /// @param [in] key Key to search for. - /// - /// @returns A pointer to the value that matches the specified key or null if an entry for the key does not exist. - Value* FindKey(const Key& key) const; - - /// Inserts a key/value pair entry if the key doesn't already exist in the hash map. - /// - /// @warning No action will be taken if an entry matching this key already exists, even if the specified value - /// differs from the current value stored in the entry matching the specified key. - /// - /// @param [in] key Key of the new entry to insert. - /// @param [in] value Value of the new entry to insert. - /// - /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed - /// because an internal memory allocation failed. - Result Insert(const Key& key, const Value& value); - -private: - // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when - // accessing members of HashBase. 
- typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; - - PAL_DISALLOW_DEFAULT_CTOR(HashMap); - PAL_DISALLOW_COPY_AND_ASSIGN(HashMap); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashMap.h + * @brief PAL utility collection HashMap class declaration. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +/// Encapsulates one key/value pair in a hash map. +template +struct HashMapEntry +{ + Key key; ///< Hash map entry key. + Value value; ///< Hash map entry value. +}; + +/** + *********************************************************************************************************************** + * @brief Templated hash map container. + * + * This container is meant for storing elements of an arbitrary (but uniform) key/value type. Supported operations: + * + * - Searching + * - Insertion + * - Deletion + * - Iteration + * + * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: + * + * - DefaultHashFunc: Good choice when the key is a pointer. + * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. + * - StringJenkinsHashFunc: Good choice when the key is a C-style string. + * + * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: + * + * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. + * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. + * + * @warning This class is not thread-safe for Insert, FindAllocate, Erase, or iteration! + * @warning Init() must be called before using this container. Begin() and Reset() can be safely called before + * initialization and Begin() will always return an iterator that points to null. + * + * For more details please refer to @ref HashBase. 
+ *********************************************************************************************************************** + */ +template class HashFunc = DefaultHashFunc, + template class EqualFunc = DefaultEqualFunc, + typename AllocFunc = HashAllocator, + size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> +class HashMap : public HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> +{ +public: + /// Convenience typedef for a templated entry of this hash map. + typedef HashMapEntry Entry; + + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. + explicit HashMap(uint32 numBuckets, Allocator*const pAllocator): Base::HashBase(numBuckets, pAllocator) { } + virtual ~HashMap() { } + + /// Finds a given entry; if no entry was found, allocate it. + /// + /// @param [in] key Key to search for. + /// @param [out] pExisted True if an entry for the specified key existed before this call was made. False indicates + /// that a new entry was allocated as a result of this call. + /// @param [out] ppValue Readable/writeable value in the hash map corresponding to the specified key. + /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result FindAllocate(const Key& key, bool* pExisted, Value** ppValue); + + /// Gets a pointer to the value that matches the specified key. + /// + /// @param [in] key Key to search for. + /// + /// @returns A pointer to the value that matches the specified key or null if an entry for the key does not exist. + Value* FindKey(const Key& key) const; + + /// Inserts a key/value pair entry if the key doesn't already exist in the hash map. 
+ /// + /// @warning No action will be taken if an entry matching this key already exists, even if the specified value + /// differs from the current value stored in the entry matching the specified key. + /// + /// @param [in] key Key of the new entry to insert. + /// @param [in] value Value of the new entry to insert. + /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result Insert(const Key& key, const Value& value); + + /// Removes an entry that matches the specified key. + /// + /// @param [in] key Key of the entry to erase. + /// + /// @returns True if the erase completed successfully, false if an entry for this key did not exist. + bool Erase(const Key& key); + +private: + // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when + // accessing members of HashBase. + typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; + + PAL_DISALLOW_DEFAULT_CTOR(HashMap); + PAL_DISALLOW_COPY_AND_ASSIGN(HashMap); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h index 4cde3904c4..36905ae1ac 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashMapImpl.h @@ -1,114 +1,250 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashMapImpl.h - * @brief PAL utility collection HashMap class implementation. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palHashBaseImpl.h" -#include "palHashMap.h" - -namespace Util -{ - -// ===================================================================================================================== -// Gets a pointer to the value that matches the key. If the key is not present, a pointer to empty space for the value -// is returned. 
-template class HashFunc, - template class EqualFunc, - typename AllocFunc, - size_t GroupSize> -Result HashMap::FindAllocate( - const Key& key, // Key to search for. - bool* pExisted, // [out] True if a matching key was found. - Value** ppValue) // [out] Pointer to the value entry of the hash map's entry for the specified key. -{ - PAL_ASSERT(pExisted != nullptr); - PAL_ASSERT(ppValue != nullptr); - - Entry* pEntry = nullptr; - Result result = Base::FindAllocateEntry(key, pExisted, &pEntry); - if (result == Result::Success) - { - *ppValue = &pEntry->value; - } - - return result; -} - -// ===================================================================================================================== -// Gets a pointer to the value that matches the key. Returns null if no entry is present matching the specified key. -template class HashFunc, - template class EqualFunc, - typename AllocFunc, - size_t GroupSize> -Value* HashMap::FindKey( - const Key& key - ) const -{ - Entry* pEntry = Base::FindEntry(key); - return (pEntry != nullptr) ? &pEntry->value : nullptr; -} - -// ===================================================================================================================== -// Inserts a key/value pair entry if it doesn't already exist. -template class HashFunc, - template class EqualFunc, - typename AllocFunc, - size_t GroupSize> -Result HashMap::Insert( - const Key& key, - const Value& value) -{ - bool existed = true; - Entry* pEntry = nullptr; - - Result result = Base::FindAllocateEntry(key, &existed, &pEntry); - - // Add the new value if it did not exist already. If FindAllocate returns Success, pValue != nullptr. 
- if ((result == Result::Success) && (existed == false)) - { - pEntry->value = value; - } - - PAL_ASSERT(result == Result::Success); - - return result; -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashMapImpl.h + * @brief PAL utility collection HashMap class implementation. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBaseImpl.h" +#include "palHashMap.h" + +namespace Util +{ + +// ===================================================================================================================== +// Gets a pointer to the value that matches the key. If the key is not present, a pointer to empty space for the value +// is returned. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashMap::FindAllocate( + const Key& key, // Key to search for. + bool* pExisted, // [out] True if a matching key was found. + Value** ppValue) // [out] Pointer to the value entry of the hash map's entry for the specified key. +{ + PAL_ASSERT(pExisted != nullptr); + PAL_ASSERT(ppValue != nullptr); + + Result result = Result::ErrorOutOfMemory; + + // Get the bucket base address.... + Entry* pGroup = this->InitAndFindBucket(key); + + *pExisted = false; + *ppValue = nullptr; + + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group. + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + *pExisted = true; + break; + } + } + + // We've reached the end of the allocated buckets and the entry was not found. + // Allocate this entry for the key. + if ((pMatchingEntry == nullptr) && (i < Base::EntriesInGroup)) + { + pGroup[i].key = key; + pMatchingEntry = &(pGroup[i]); + this->m_numEntries++; + this->SetGroupFooterNumEntries(pGroup, numEntries + 1); + } + + if (pMatchingEntry != nullptr) + { + *ppValue = &(pMatchingEntry->value); + result = Result::Success; + break; + } + + // Chain to the next entry group. 
+ pGroup = this->AllocateNextGroup(pGroup); + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Gets a pointer to the value that matches the key. Returns null if no entry is present matching the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Value* HashMap::FindKey( + const Key& key + ) const +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + break; + } + } + + if ((pMatchingEntry != nullptr) || (i < Base::EntriesInGroup)) + { + break; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + return (pMatchingEntry != nullptr) ? &(pMatchingEntry->value) : nullptr; +} + +// ===================================================================================================================== +// Inserts a key/value pair entry if it doesn't already exist. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashMap::Insert( + const Key& key, + const Value& value) +{ + bool existed = true; + Value* pValue = nullptr; + + Result result = FindAllocate(key, &existed, &pValue); + + // Add the new value if it did not exist already. If FindAllocate returns Success, pValue != nullptr. 
+ if ((result == Result::Success) && (existed == false)) + { + *pValue = value; + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Removes an entry with the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashMap::Erase( + const Key& key) +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + + Entry* pFoundEntry = nullptr; + Entry* pLastEntry = nullptr; + Entry* pLastEntryGroup = nullptr; + + // Find the entry to delete + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search each group + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key) == true) + { + // We shouldn't find the same key twice. + PAL_ASSERT(pFoundEntry == nullptr); + + pFoundEntry = &(pGroup[i]); + } + + // keep track of last entry of all groups in bucket + pLastEntry = &(pGroup[i]); + pLastEntryGroup = pGroup; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + // Copy the last entry's data into the entry that we are removing and invalidate the last entry as it now appears + // earlier in the list. This also handles the case where the entry to be removed is the last entry. 
+ if (pFoundEntry != nullptr) + { + PAL_ASSERT(pLastEntry != nullptr); + + pFoundEntry->key = pLastEntry->key; + pFoundEntry->value = pLastEntry->value; + + memset(pLastEntry, 0, sizeof(Entry)); + + PAL_ASSERT(this->m_numEntries > 0); + this->m_numEntries--; + const uint32 numEntries = this->GetGroupFooterNumEntries(pLastEntryGroup); + this->SetGroupFooterNumEntries(pLastEntryGroup, numEntries - 1); + } + + return (pFoundEntry != nullptr); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h index 7c43d59adc..9876cd30e0 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashSet.h @@ -1,131 +1,144 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashSet.h - * @brief PAL utility collection HashSet class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palHashBase.h" - -namespace Util -{ - -/// Encapsulates one entry of a hash set. -template -struct HashSetEntry -{ - Key key; ///< Hash set entry key. -}; - -/** - *********************************************************************************************************************** - * @brief Templated hash set container. - * - * This is meant for storing elements of an arbitrary (but uniform) key type. Supported operations: - * - * - Searching - * - Insertion - * - Deletion - * - Iteration - * - * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: - * - * - DefaultHashFunc: Default hash function, selects best hash function based on type of key. - * - PointerHashFunc: Good choice when the key is a pointer. - * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. - * - StringJenkinsHashFunc: Good choice when the key is a C-style string. - * - * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: - * - * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. - * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. - * - * @warning This class is not thread-safe for Insert, Erase, or iteration! 
- * @warning Init() must be called before using this container. Begin() and Reset() can be safely called before - * initialization and Begin() will always return an iterator that points to null. - * - * For more details please refer to @ref HashBase. - *********************************************************************************************************************** - */ -template class HashFunc = DefaultHashFunc, - template class EqualFunc = DefaultEqualFunc, - typename AllocFunc = HashAllocator, - size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> -class HashSet : public HashBase, - Allocator, - HashFunc, - EqualFunc, - AllocFunc, - GroupSize> -{ -public: - /// Convenience typedef for a templated entry of this hash set. - typedef HashSetEntry Entry; - - /// @internal Constructor - /// - /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will - /// take (buckets * GroupSize) bytes. - /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. - explicit HashSet(uint32 numBuckets, Allocator*const pAllocator) : Base::HashBase(numBuckets, pAllocator) {} - ~HashSet() { } - - /// Finds a given entry; if no entry was found, allocate it. - /// - /// @param [in] ppKey Key to search for. - /// @param [out] pExisted True if an entry for the specified key existed before this call was made. - /// False indicates that a new entry was allocated as a result of this call. - /// - /// @returns @ref Success if the operation completed successfully - /// @ref ErrorOutOfMemory if the operation failed because an internal memory allocation failed. - Result FindAllocate(Key** ppKey, bool* pExisted); - - /// Inserts an entry. - /// - /// No action will be taken if an entry matching this key already exists in the set. - /// - /// @param [in] key New entry to insert. 
- /// - /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed - /// because an internal memory allocation failed. - Result Insert(const Key& key); - -private: - // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when - // accessing members of HashBase. - typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; - - PAL_DISALLOW_DEFAULT_CTOR(HashSet); - PAL_DISALLOW_COPY_AND_ASSIGN(HashSet); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashSet.h + * @brief PAL utility collection HashSet class declaration. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBase.h" + +namespace Util +{ + +/// Encapsulates one entry of a hash set. +template +struct HashSetEntry +{ + Key key; ///< Hash set entry key. +}; + +/** + *********************************************************************************************************************** + * @brief Templated hash set container. + * + * This is meant for storing elements of an arbitrary (but uniform) key type. Supported operations: + * + * - Searching + * - Insertion + * - Deletion + * - Iteration + * + * HashFunc is a functor for hashing keys. Built-in choices for HashFunc are: + * + * - DefaultHashFunc: Good choice when the key is a pointer. + * - JenkinsHashFunc: Good choice when the key is arbitrary binary data. + * - StringJenkinsHashFunc: Good choice when the key is a C-style string. + * + * EqualFunc is a functor for comparing keys. Built-in choices for EqualFunc are: + * + * - DefaultEqualFunc: Determines keys are equal by bitwise comparison. + * - StringEqualFunc: Treats keys as a char* and compares them as C-style strings. + * + * @warning This class is not thread-safe for Insert, Erase, or iteration! + * @warning Init() must be called before using this container. Begin() and Reset() can be safely called before + * initialization and Begin() will always return an iterator that points to null. + * + * For more details please refer to @ref HashBase. 
+ *********************************************************************************************************************** + */ +template class HashFunc = DefaultHashFunc, + template class EqualFunc = DefaultEqualFunc, + typename AllocFunc = HashAllocator, + size_t GroupSize = PAL_CACHE_LINE_BYTES * 2> +class HashSet : public HashBase, + Allocator, + HashFunc, + EqualFunc, + AllocFunc, + GroupSize> +{ +public: + /// Convenience typedef for a templated entry of this hash set. + typedef HashSetEntry Entry; + + /// @internal Constructor + /// + /// @param [in] numBuckets Number of buckets to allocate for this hash container. The initial hash container will + /// take (buckets * GroupSize) bytes. + /// @param [in] pAllocator Pointer to an allocator that will create system memory requested by this hash container. + explicit HashSet(uint32 numBuckets, Allocator*const pAllocator) : Base::HashBase(numBuckets, pAllocator) {} + virtual ~HashSet() { } + + /// Finds a given entry; if no entry was found, allocate it. + /// + /// @param [in] ppKey Key to search for. + /// @param [out] pExisted True if an entry for the specified key existed before this call was made. + /// False indicates that a new entry was allocated as a result of this call. + /// + /// @returns @ref Success if the operation completed successfully + /// @ref ErrorOutOfMemory if the operation failed because an internal memory allocation failed. + Result FindAllocate(Key** ppKey, bool* pExisted); + + /// Returns true if the specified key exists in the set. + /// + /// @param [in] key Key to search for. + /// + /// @returns True if the specified key exists in the set. + bool Contains(const Key& key) const; + + /// Inserts an entry. + /// + /// No action will be taken if an entry matching this key already exists in the set. + /// + /// @param [in] key New entry to insert. 
+ /// + /// @returns @ref Success if the operation completed successfully, or @ref ErrorOutOfMemory if the operation failed + /// because an internal memory allocation failed. + Result Insert(const Key& key); + + /// Removes an entry that matches the specified key. + /// + /// @param [in] key Key of the entry to erase. + /// + /// @returns True if the erase completed successfully, false if an entry for this key did not exist. + bool Erase(const Key& key); + +private: + // Typedef for the specialized 'HashBase' object we're inheriting from so we can use properly qualified names when + // accessing members of HashBase. + typedef HashBase, Allocator, HashFunc, EqualFunc, AllocFunc, GroupSize> Base; + + PAL_DISALLOW_DEFAULT_CTOR(HashSet); + PAL_DISALLOW_COPY_AND_ASSIGN(HashSet); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h index b1721f23be..87966886bb 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palHashSetImpl.h @@ -1,75 +1,231 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palHashSetImpl.h - * @brief PAL utility collection HashSet class implementation. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palHashBaseImpl.h" -#include "palHashSet.h" - -namespace Util -{ - -// ===================================================================================================================== -// Inserts a key if it doesn't already exist. -template class HashFunc, - template class EqualFunc, - typename AllocFunc, - size_t GroupSize> -Result HashSet::Insert( - const Key& key) -{ - Entry* pEntry = nullptr; - bool existed = false; - return Base::FindAllocateEntry(key, &existed, &pEntry); -} - -// ===================================================================================================================== -// Finds a given entry; if no entry was found, allocate it. 
-template class HashFunc, - template class EqualFunc, - typename AllocFunc, - size_t GroupSize> -Result HashSet::FindAllocate( - Key** ppKey, - bool* pExisted) -{ - PAL_ASSERT(ppKey != nullptr); - PAL_ASSERT(pExisted != nullptr); - - static_assert(offsetof(Entry, key) == 0); - return Base::FindAllocateEntry(**ppKey, pExisted, reinterpret_cast(ppKey)); -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palHashSetImpl.h + * @brief PAL utility collection HashSet class implementation. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palHashBaseImpl.h" +#include "palHashSet.h" + +namespace Util +{ + +// ===================================================================================================================== +// Inserts a key if it doesn't already exist. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashSet::Insert( + const Key& key) +{ + Key* pKey = const_cast(&key); + bool existed; + const Result result = FindAllocate(&pKey, &existed); + if (existed == false) + { + *pKey = key; + } + return result; +} + +// ===================================================================================================================== +// Finds a given entry; if no entry was found, allocate it. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +Result HashSet::FindAllocate( + Key** ppKey, + bool* pExisted) +{ + PAL_ASSERT(ppKey != nullptr); + PAL_ASSERT(pExisted != nullptr); + + Result result = Result::ErrorOutOfMemory; + + // Get the bucket base address. + Entry* pGroup = this->InitAndFindBucket(**ppKey); + + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + // Search this entry group. + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, **ppKey)) + { + // We've found the entry. 
+ pMatchingEntry = &(pGroup[i]); + *pExisted = true; + break; + } + } + + if ((pMatchingEntry == nullptr) && (i < Base::EntriesInGroup)) + { + // We've reached the end of the bucket and the entry was not found. Allocate this entry for the key. + *pExisted = false; + *ppKey = &pGroup[i].key; + pMatchingEntry = &(pGroup[i]); + this->m_numEntries++; + this->SetGroupFooterNumEntries(pGroup, numEntries + 1); + } + + if (pMatchingEntry != nullptr) + { + result = Result::Success; + break; + } + + // Chain to the next entry group. + pGroup = this->AllocateNextGroup(pGroup); + } + + PAL_ASSERT(result == Result::Success); + + return result; +} + +// ===================================================================================================================== +// Searches for the specified key to see if it exists. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashSet::Contains( + const Key& key + ) const +{ + // Get the bucket base address. + Entry* pGroup = this->FindBucket(key); + Entry* pMatchingEntry = nullptr; + + while (pGroup != nullptr) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry group. + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key)) + { + // We've found the entry. + pMatchingEntry = &(pGroup[i]); + break; + } + } + + if ((pMatchingEntry != nullptr) || (i < Base::EntriesInGroup)) + { + break; + } + + // Chain to the next entry group. + pGroup = this->GetNextGroup(pGroup); + } + + return (pMatchingEntry != nullptr); +} + +// ===================================================================================================================== +// Removes an entry with the specified key. +template class HashFunc, + template class EqualFunc, + typename AllocFunc, + size_t GroupSize> +bool HashSet::Erase( + const Key& key) +{ + // Get the bucket base address. 
+ Entry* pGroup = this->FindBucket(key); + + Entry* pFoundEntry = nullptr; + Entry* pLastEntry = nullptr; + + Entry* pLastEntryGroup = nullptr; + + // Find the entry to delete. + while ((pGroup != nullptr)) + { + const uint32 numEntries = this->GetGroupFooterNumEntries(pGroup); + + // Search this entry + uint32 i = 0; + for (; i < numEntries; i++) + { + if (this->m_equalFunc(pGroup[i].key, key) == true) + { + // We shouldn't find the same key twice. + PAL_ASSERT(pFoundEntry == nullptr); + + pFoundEntry = &(pGroup[i]); + } + + // keep track of last entry of all groups in bucket + pLastEntry = &(pGroup[i]); + pLastEntryGroup = pGroup; + } + + // Chain to the next entry group + pGroup = this->GetNextGroup(pGroup); + } + + // Copy the last entry's data into the entry that we are removing and invalidate the last entry as it now appears + // earlier in the list. This also handles the case where the entry to be removed is the last entry. + if (pFoundEntry != nullptr) + { + PAL_ASSERT(pLastEntry != nullptr); + + pFoundEntry->key = pLastEntry->key; + memset(pLastEntry, 0, sizeof(Entry)); + + PAL_ASSERT(this->m_numEntries > 0); + this->m_numEntries--; + const uint32 numEntries = this->GetGroupFooterNumEntries(pLastEntryGroup); + this->SetGroupFooterNumEntries(pLastEntryGroup, numEntries - 1); + } + + return (pFoundEntry != nullptr); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h b/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h index b5dafb3b07..1040e8108c 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palInlineFuncs.h @@ -1,1565 +1,1565 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palInlineFuncs.h - * @brief PAL utility collection inline functions. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palAssert.h" -#include "palStringUtil.h" -#include -#include -#include -#include -#include -#include -#include - -namespace Util -{ - -/// Describes a value type, primarily used for loading settings values. -enum class ValueType : uint32 -{ - Boolean, ///< Boolean type. - Int8, ///< 8-bit integer type. - Uint8, ///< 8-bit unsigned integer type. - Int16, ///< 16-bit integer type. 
- Uint16, ///< 16-bit unsigned integer type. - Int32, ///< 32-bit integer type. - Uint32, ///< 32-bit unsigned integer type. - Int64, ///< 64-bit integer type. - Uint64, ///< 64-bit unsigned integer type. - Float, ///< Floating point type. - Str, ///< String type. -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 905 - Int = Int32, ///< Signed integer type. - Uint = Uint32, ///< Unsigned integer type. -#endif -}; - -/// Determines the length of an array at compile-time. -/// -/// @returns The length of the array. -template -constexpr size_t ArrayLen( - const T (&array)[N]) ///< The array of arbitrary type T. -{ - return N; -} - -/// Determines the 32-bit length of an array at compile-time. -/// -/// @returns The length of the array. -template -constexpr uint32 ArrayLen32( - const T (&array)[N]) ///< The array of arbitrary type T. -{ - return N; -} - -/// Increments a const pointer by nBytes by first casting it to a const uint8*. -/// -/// @returns Incremented pointer. -constexpr const void* VoidPtrInc( - const void* p, ///< [in] Pointer to be incremented. - size_t numBytes) ///< Number of bytes to increment the pointer by. -{ - return (static_cast(p) + numBytes); -} - -/// Increments a pointer by nBytes by first casting it to a uint8*. -/// -/// @returns Incremented pointer. -constexpr void* VoidPtrInc( - void* p, ///< [in] Pointer to be incremented. - size_t numBytes) ///< Number of bytes to increment the pointer by. -{ - return (static_cast(p) + numBytes); -} - -/// Decrements a const pointer by nBytes by first casting it to a const uint8*. -/// -/// @returns Decremented pointer. -constexpr const void* VoidPtrDec( - const void* p, ///< [in] Pointer to be decremented. - size_t numBytes) ///< Number of bytes to decrement the pointer by. -{ - return (static_cast(p) - numBytes); -} - -/// Decrements a pointer by nBytes by first casting it to a uint8*. -/// -/// @returns Decremented pointer. -constexpr void* VoidPtrDec( - void* p, ///< [in] Pointer to be decremented. 
- size_t numBytes) ///< Number of bytes to decrement the pointer by. -{ - return (static_cast(p) - numBytes); -} - -/// Finds the number of bytes between two pointers by first casting them to uint8*. -/// -/// This function expects the first pointer to not be smaller than the second. -/// -/// @returns Number of bytes between the two pointers. -constexpr size_t VoidPtrDiff( - const void* p1, ///< [in] First pointer (higher address). - const void* p2) ///< [in] Second pointer (lower address). -{ - PAL_CONSTEXPR_ASSERT(p1 >= p2); - return (static_cast(p1) - static_cast(p2)); -} - -/// Returns the high 32 bits of a 64-bit integer. -/// -/// @returns Returns the high 32 bits of a 64-bit integer. -constexpr uint32 HighPart( - uint64 value) ///< 64-bit input value. -{ - return (value & 0xFFFFFFFF00000000) >> 32; -} - -/// Returns the low 32 bits of a 64-bit integer. -/// -/// @returns Returns the low 32 bits of a 64-bit integer. -constexpr uint32 LowPart( - uint64 value) ///< 64-bit input value. -{ - return (value & 0x00000000FFFFFFFF); -} - -/// Returns the high 32 bits of a 64-bit integer as a 64-bit integer. -/// -/// @returns Returns the high 32 bits of a 64-bit integer as a 64-bit integer -/// without shifting -constexpr uint64 HighPart64( - uint64 value) ///< 64-bit input value. -{ - return (value & 0xFFFFFFFF00000000); -} - -/// Combines the low and high 32 bits of a 64-bit integer. -/// -/// @returns Returns the 64-bit integer. -constexpr uint64 Uint64CombineParts( - uint32 lowPart, - uint32 highPart) -{ - return (uint64(highPart) << 32) | uint64(lowPart); -} - -/// Returns a larger value from repeating a single byte -constexpr uint32 ReplicateByteAcrossDword( - uint8 value) ///< 8-bit input value. -{ - return (value | (value << 8) | (value << 16) | (value << 24)); -} - -/// Returns a larger value from repeating a single byte -constexpr uint64 ReplicateByteAcrossQword( - uint8 value) ///< 8-bit input value. 
-{ - return ((static_cast(ReplicateByteAcrossDword(value)) << 32) | ReplicateByteAcrossDword(value)); -} - -/// Combines four characters into a uint32-based four-character-code "string". There's no null terminator so it's not a -/// real c-string, it just looks like there's a string if you view the uint in a hex editor or memcmp against a string. -/// -/// For example, FourCC('A', 'B', 'C', 'D') turns into 0x44434241. 'A' is 0x41 and it ends up in the first byte. -/// This function assumes we're running on a little endian platform (PAL only supports little-endian platforms). -/// -/// @returns Returns a uin32 four-character-code made from the given chars. -constexpr uint32 FourCc( - char c1, ///< The 1st character (lowest byte). - char c2, ///< The 2nd character. - char c3, ///< The 3rd character. - char c4) ///< The 4th character (highest byte). -{ - return (uint32(c4) << 24) | (uint32(c3) << 16) | (uint32(c2) << 8) | uint32(c1); -} - -/// Returns a bitfield from within some value. -/// -/// @returns Returns a bitfield from within some value. -template -constexpr T BitExtract( - T value, ///< Extract a bitfield from here. - uint32 firstBit, ///< The zero-based index of the first bit to extract. - uint32 lastBit) ///< The zero-based index of the last bit to extract. -{ - return (value >> firstBit) & ((1 << (lastBit - firstBit + 1)) - 1); -} - -/// Determines if any of the bits set in "test" are also set in "src". -/// -/// @returns True if any bits in "test" are set in "src", false otherwise. -constexpr bool TestAnyFlagSet( - uint32 src, ///< Source pattern. - uint32 test) ///< Test pattern. -{ - return ((src & test) != 0); -} - -/// Determines if all of the bits set in "test" are also set in "src". -/// -/// @returns True if all bits set in "test" are also set in "src", false otherwise. -constexpr bool TestAllFlagsSet( - uint32 src, ///< Source pattern. - uint32 test) ///< Test pattern. 
-{ - return ((src & test) == test); -} - -/// Determines if any of the bits set in "test" are also set in "src". -/// -/// @returns True if any bits in "test" are set in "src", false otherwise. -constexpr bool TestAnyFlagSet64( - uint64 src, ///< Source pattern. - uint64 test) ///< Test pattern. -{ - return ((src & test) != 0); -} - -/// Determines if all of the bits set in "test" are also set in "src". -/// -/// @returns True if all bits set in "test" are also set in "src", false otherwise. -constexpr bool TestAllFlagsSet64( - uint64 src, ///< Source pattern. - uint64 test) ///< Test pattern. -{ - return ((src & test) == test); -} - -/// Tests if a single bit in a bitfield is set. -/// -/// @param [in] bitfield Bitfield being tested -/// @param [in] bit Bit index to test -/// -/// @returns True if the flag is set. -template -constexpr bool BitfieldIsSet( - const T bitfield, - uint32 bit) -{ - PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); - return (bitfield & (static_cast(1) << bit)); -} - -/// Sets a single bit in a bitfield to one. -/// -/// @param [in] bitfield Reference to the bitfield being modified -/// @param [in] bit Index of the bit to set -template -void BitfieldSetBit( - T &bitfield, - uint32 bit) -{ - PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); - bitfield |= (static_cast(1) << bit); -} - -///@{ -/// Counts the number of one bits (population count) in an unsigned integer using some bitwise magic explained in the -/// Software Optimization Guide for AMD64 Processors. -/// -/// @param [in] value The value need to be counted. 
-/// -/// @returns Number of one bits in the input -template -constexpr uint32 CountSetBits( - T value) -{ - uint32 x = static_cast(value); - - x = x - ((x >> 1) & 0x55555555); - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - x = (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> ((sizeof(uint32) - 1) << 3); - - return x; -} - -constexpr uint32 CountSetBits( - uint64 value) -{ - uint64 x = value; - - x = x - ((x >> 1) & 0x5555555555555555ull); - x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); - x = (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> ((sizeof(uint64) - 1) << 3); - - return static_cast(x); -} -///@} - -/// Update a subfield of a bitfield. -/// -/// @param [in] bitFieldToUpdate Bitfield being updated -/// @param [in] updateValue Source value to update -/// @param [in] updateMask Bitmask to update -/// -/// @returns True if the flag is set. -template -void BitfieldUpdateSubfield( - T* pBitFieldToUpdate, - const T updateValue, - const T updateMask) -{ - *pBitFieldToUpdate = ((*pBitFieldToUpdate) & ~updateMask) | - (updateValue & updateMask); -} - -/// Tests if a single bit in a "wide bitfield" is set. A "wide bifield" is a bitfield which spans an array of -/// integers because there are more flags than bits in one integer. -/// -/// @param [in] bitfield Reference to the bitfield being tested -/// @param [in] bit Index of the flag to test -/// -/// @returns True if the flag is set. -template -constexpr bool WideBitfieldIsSet( - const T (&bitfield)[N], - uint32 bit) -{ - const uint32 index = (bit / (sizeof(T) << 3)); - const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); - - return (0 != (bitfield[index] & mask)); -} - -/// Checks if any bit is set in a wide bitfield. A "wide bitfield" is a bitfield which spans an array of -/// integers because there are more flags than bits in one integer. -/// -/// @param [in] bitfield Wide bitfield to count. 
-/// -/// @returns True if the wide bitfield is non-zero; false otherwise. -template -bool WideBitfieldIsAnyBitSet( - const T(&bitfield)[N]) -{ - bool isBitSet = false; - for (uint32 i = 0; i < N; i++) - { - isBitSet |= (bitfield[i] != 0); - } - - return isBitSet; -} - -/// Sets a single bit in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of -/// integers because there are more flags than bits in one integer. -/// -/// @param [in] bitfield Reference to the bitfield being modified -/// @param [in] bit Index of the flag to set -template -void WideBitfieldSetBit( - T (&bitfield)[N], - uint32 bit) -{ - const uint32 index = (bit / (sizeof(T) << 3)); - const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); - - bitfield[index] |= mask; -} - -/// Clears a single bit in a "wide bitfield" to zero. A "wide bifield" is a bitfield which spans an array of -/// integers because there are more flags than bits in one integer. -/// -/// @param [in] bitfield Reference to the bitfield being modified -/// @param [in] bit Index of the flag to set -template -void WideBitfieldClearBit( - T (&bitfield)[N], - uint32 bit) -{ - const uint32 index = (bit / (sizeof(T) << 3)); - const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); - - bitfield[index] &= ~mask; -} - -/// Sets consecutive bits in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of -/// integers because there are more flags than bits in one integer. 
-/// -/// @param [in] bitfield Reference to the bitfield being modified -/// @param [in] startingBit Index of the first flag to set -/// @param [in] numBits Count of consecutive flags to set -template -void WideBitfieldSetRange( - T (&bitfield)[N], - uint32 startingBit, - uint32 numBits) -{ - constexpr uint32 SizeInBits = (sizeof(T) << 3); - - PAL_ASSERT((startingBit + numBits) <= (SizeInBits * N)); - - uint32 index = (startingBit / SizeInBits); - - startingBit &= (SizeInBits - 1); - - while (numBits > 0) - { - const uint32 maxNumBits = SizeInBits - startingBit; - const uint32 curNumBits = (maxNumBits < numBits) ? maxNumBits : numBits; - const T bitMask = (curNumBits == SizeInBits) ? -1 : ((static_cast(1) << curNumBits) - 1); - - bitfield[index++] |= (bitMask << startingBit); - - startingBit = 0; - numBits -= curNumBits; - } -} - -/// XORs all of the bits in two "wide bitfields". A "wide bifield" is a bitfield which spans an array of integers -/// because there are more flags than bits in one integer. -/// -/// @param [in] bitfield1 Reference to the first bitfield. -/// @param [in] bitfield2 Reference to the second bitfield. -/// @param [out] pOut Result of (bitfield1 ^ bitfield2) -template -void WideBitfieldXorBits( - const T (&bitfield1)[N], - const T (&bitfield2)[N], - T* pOut) -{ - for (uint32 i = 0; i < N; i++) - { - pOut[i] = (bitfield1[i] ^ bitfield2[i]); - } -} - -/// ANDs all of the bits in two "wide bitfields". A "wide bifield" is a bitfield which spans an array of integers -/// because there are more flags than bits in one integer. -/// -/// @param [in] bitfield1 Reference to the first bitfield. -/// @param [in] bitfield2 Reference to the second bitfield. 
-/// @param [out] pOut Result of (bitfield1 & bitfield2) -template -void WideBitfieldAndBits( - const T (&bitfield1)[N], - const T (&bitfield2)[N], - T* pOut) -{ - for (uint32 i = 0; i < N; i++) - { - pOut[i] = (bitfield1[i] & bitfield2[i]); - } -} - -/// Counts the number of one bits (population count) in a wide bitfield. A "wide bitfield" is a bitfield which spans -/// an array of integers because there are more flags than bits in one integer. -/// -/// @param [in] bitfield Wide bitfield to count. -/// -/// @returns Number of one bits in the input -template -uint32 WideBitfieldCountSetBits( - const T(&bitfield)[N]) -{ - uint32 count = 0; - for (uint32 i = 0; i < N; i++) - { - count += CountSetBits(bitfield[i]); - } - - return count; -} - -/// Unsets the least-significant '1' bit in the given number. -/// Usually used in conjunction with BitMaskScanForward -/// -/// @param [in] value The value to be modified -/// -/// @returns A copy of value with the lowest '1' bit unset. -template -T UnsetLeastBit( - T val) -{ - static_assert(std::is_unsigned::value, "Must use unsigned ints here"); - return val & (val - 1); -} - -/// Scans the specified bit-mask for the least-significant '1' bit. -/// -/// @returns True if the input was nonzero; false otherwise. -template -bool BitMaskScanForward( - uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. - T mask) ///< Bit-mask to scan. -{ - // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key - // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. - bool result = false; - - if (mask != 0) - { -#if defined(_WIN64) && defined(_M_X64) - *pIndex = (sizeof(T) > 4) ? static_cast(::_tzcnt_u64(mask)) : (::_tzcnt_u32(static_cast(mask))); -#elif defined(_WIN64) - auto*const pOut = reinterpret_cast(pIndex); - (sizeof(T) > 4) ? 
(::_BitScanForward64(pOut, mask)) : (::_BitScanForward(pOut, static_cast(mask))); -#elif defined(_WIN32) && defined(_M_IX86) - const uint32 lowPart = LowPart(mask); - uint32 index = (::_tzcnt_u32(((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); - *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; -#elif defined(_WIN32) - unsigned long index; - const uint32 lowPart = LowPart(mask); - (::_BitScanForward(&index, ((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); - *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; -#elif defined(__GNUC__) - *pIndex = (sizeof(T) > 4) ? __builtin_ctzll(mask) : __builtin_ctz(static_cast(mask)); -#else - uint32 index = 0; - for (; ((mask & 0x1) == 0); mask >>= 1, ++index); - *pIndex = index; -#endif - - result = true; - } - return result; -} - -/// Scans the specified bit-mask for the most-significant '1' bit. -/// -/// @returns True if the input was nonzero; false otherwise. -template -bool BitMaskScanReverse( - uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. - T mask) ///< Bit-mask to scan. -{ - // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key - // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. - bool result = false; - - if (mask != 0) - { -#if defined(_WIN64) - auto*const pOut = reinterpret_cast(pIndex); - (sizeof(T) > 4) ? (::_BitScanReverse64(pOut, mask)) : (::_BitScanReverse(pOut, static_cast(mask))); -#elif defined(_WIN32) - unsigned long index; - const uint32 highPart = (sizeof(T) > 4) ? HighPart(mask) : 0; - (::_BitScanReverse(&index, ((sizeof(T) > 4) && (highPart != 0)) ? highPart : LowPart(mask))); - *pIndex = ((sizeof(T) > 4) && (highPart != 0)) ? (index + 32u) : index; -#elif defined(__GNUC__) - *pIndex = (sizeof(T) > 4) ? 
(63u - __builtin_clzll(mask)) : (31u - __builtin_clz(static_cast(mask))); -#else - uint32 index = 31u; - for (; (((mask >> index) & 0x1) == 0); --index); - *pIndex = index; -#endif - - result = true; - } - return result; -} - -/// Scans the specified wide bit-mask for the least-significant '1' bit. -/// -/// @returns True if input was nonzero; false otherwise. -template -bool WideBitMaskScanForward( - uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. - const T (&mask)[N]) ///< Bit-mask to scan. -{ - uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); - - // Check to see if the wide bitmask has some bits set. - uint32 index = 0; - while ((mask[index] == 0) && (++index < N)); - bool result = (index < N); - - while (result == true) - { - result = BitMaskScanForward(pIndex, mask[maskIndex]); - - if (result == false) - { - ++maskIndex; - result = (maskIndex < N); - } - else - { - (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); - break; - } - } - - return result; -} - -/// Scans the specified wide bit-mask for the most-significant '1' bit. -/// -/// @returns True if input was nonzero; false otherwise. -template -bool WideBitMaskScanReverse( - uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. - const T (&mask)[N]) ///< Bit-mask to scan. -{ - uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); - - // Check to see if the wide bitmask has some bits set. - uint32 index = N - 1; - while ((mask[index] == 0) && (--index > 0)); - bool result = (mask[index] != 0); - - while (result == true) - { - result = BitMaskScanReverse(pIndex, mask[maskIndex]); - - if (result == false) - { - const uint32 oldIndex = maskIndex--; - result = (oldIndex != 0); - } - else - { - (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); - break; - } - } - - return result; -} - -/// Generates a bitmask. 
-/// -/// @param [in] numBits Number of bits to set (starting at 0) -/// -/// @returns Bitmask in storage of type T with bits [0:numBits-1] set. -template -constexpr T BitfieldGenMask( - T numBits) -{ - PAL_CONSTEXPR_ASSERT(numBits <= (sizeof(T) * 8)); - - const T mask = (numBits < (sizeof(T) * 8)) ? ((static_cast(1) << (numBits)) - static_cast(1)) : static_cast(-1); - return mask; -} - -/// Determines if a value is a power of two. -/// -/// @returns True if it is a power of two, false otherwise. -constexpr bool IsPowerOfTwo( - uint64 value) ///< Value to check. -{ - return (value == 0) ? false : ((value & (value - 1)) == 0); -} - -/// Determines if 'value' is at least aligned to the specified power-of-2 alignment. -/// -/// @returns True if aligned, false otherwise. -constexpr bool IsPow2Aligned( - uint64 value, ///< Value to check. - uint64 alignment) ///< Desired alignment. -{ - PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); - return ((value & (alignment - 1)) == 0); -} - -/// Determines if 'ptr' is at least aligned to the specified power-of-2 alignment. -/// -/// @returns True if aligned, false otherwise. -inline bool VoidPtrIsPow2Aligned( - const void* ptr, ///< Pointer to check. - uint64 alignment) ///< Desired alignment. -{ - PAL_ASSERT(IsPowerOfTwo(alignment)); - return ((reinterpret_cast(ptr) & (alignment - 1)) == 0); -} - -/// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 -/// alignments are supported by this function. -/// -/// @returns Aligned value. -template -constexpr T Pow2Align( - T value, ///< Value to align. - uint64 alignment) ///< Desired alignment (must be a power of 2). -{ - PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); - return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); -} - -/// Rounds the specified uint 'value' up to the nearest power of 2 -/// -/// @param [in] value The value to pad. -/// -/// @returns Power of 2 padded value. 
-template -T Pow2Pad( - T value) -{ - T ret = value; - - if ((value & (value - 1)) != 0) - { - uint32 lastBitIndex = 0; - BitMaskScanReverse(&lastBitIndex, value); - ret = (static_cast(0x2) << lastBitIndex); - } - - return ret; -} - -/// Computes the base-2 logarithm of an unsigned integer. -/// -/// If the given integer is not a power of 2, this function will not provide an exact answer. -/// -/// @param [in] u Value to compute the logarithm of. -/// -/// @returns log_2(u) -template -uint32 Log2( - T u) -{ - uint32 logValue = 0; - return BitMaskScanReverse(&logValue, u) ? logValue : 0; -} - -/// Computes the base-2 logarithm of an unsigned 64-bit integer based on ceiling -/// -/// If the given integer is not a power of 2, this function will not provide an exact answer. -/// -/// @returns ceilLog_2(u) -template -uint32 CeilLog2( - T u) ///< Value to compute the ceil logarithm of. -{ - const uint32 logValue = Log2(u); - return ((static_cast(0x1ul) << logValue) < u) ? (logValue + 1) : logValue; -} - -/// Implements an alternative version of integer division in which the quotient is always rounded up instead of down. -/// -/// @returns The rounded quotient. -template -constexpr T RoundUpQuotient( - T dividend, ///< Value to divide. - T divisor) ///< Value to divide by. -{ - return ((dividend + (divisor - 1)) / divisor); -} - -/// Rounds up the specified integer to the nearest multiple of the specified alignment value. -/// -/// @returns Rounded value. -template -constexpr T RoundUpToMultiple( - T operand, ///< Value to be aligned. - T alignment) ///< Alignment desired. -{ - return (((operand + (alignment - 1)) / alignment) * alignment); -} - -/// Rounds down the specified integer to the nearest multiple of the specified alignment value. -/// -/// @returns Rounded value. -template -constexpr T RoundDownToMultiple( - T operand, ///< Value to be aligned. - T alignment) ///< Alignment desired. 
-{ - return ((operand / alignment) * alignment); -} - -/// Rounds the specified 'value' down to the nearest value meeting the specified 'alignment'. Only power of 2 -/// alignments are supported by this function. -/// -/// @returns Rounded value. -template -constexpr T Pow2AlignDown( - T value, ///< Value to align. - uint64 alignment) ///< Desired alignment (must be a power of 2). -{ - PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); - return (value & ~(alignment - 1)); -} - -/// Determines the maximum of two numbers. -/// -/// @returns The larger of the two inputs. -template -constexpr T Max( - T value1, ///< First value to check. - T value2) ///< Second value to check. -{ - return ((value1 > value2) ? value1 : value2); -} - -/// Determines the maximum of N numbers. -/// -/// @returns The largest of all the inputs. -template -constexpr T Max( - T value1, ///< First value to check. - T value2, ///< Second value to check. - Ts... values) ///< Additional values to check. -{ - return Max(((value1 > value2) ? value1 : value2), values...); -} - -/// Determines the minimum of two numbers. -/// -/// @returns The smaller of the two inputs. -template -constexpr T Min( - T value1, ///< First value to check. - T value2) ///< Second value to check. -{ - return ((value1 < value2) ? value1 : value2); -} - -/// Determines the minimum of N numbers. -/// -/// @returns The smallest of all the inputs. -template -constexpr T Min( - T value1, ///< First value to check. - T value2, ///< Second value to check. - Ts... values) ///< Additional values to check. -{ - return Min(((value1 < value2) ? value1 : value2), values...); -} - -/// Clamps the input number so that it falls in-between the lower and upper bounds (inclusive). -/// -/// @returns Clamped input number. -template -constexpr T Clamp( - T input, ///< Input number to clamp. - T lowBound, ///< Lower-bound to clamp to. - T highBound) ///< Upper-bound to clamp to. -{ - return ((input <= lowBound) ? lowBound : - (input >= highBound) ? 
highBound : input); -} - -/// Determines if the input is within the range specified (inclusive). -/// -/// @returns True if within range, False otherwise. -template -constexpr bool InRange( - T input, ///< Input number to range check. - T lowBound, ///< Low bound of the range to check (inclusive). - T highBound) ///< High bound of the range to check (inclusive). -{ - return (lowBound <= input) && (input <= highBound); -} - -/// Converts a byte value to the equivalent number of DWORDs (uint32) rounded up. I.e., 3 bytes will return 1 dword. -/// -/// @returns Number of dwords necessary to cover numBytes. -constexpr uint32 NumBytesToNumDwords( - uint32 numBytes) ///< Byte count to convert. -{ - return Pow2Align(numBytes, static_cast(sizeof(uint32))) / sizeof(uint32); -} - -/// Compare two strings ignoring case -inline int Strcasecmp( - const char* pSrc, ///< [in] The source string to be compared. - const char* pDst) ///< [in] The dest string to compare. -{ - PAL_ASSERT(pSrc != nullptr); - PAL_ASSERT(pDst != nullptr); - -#if defined(_WIN32) - return _stricmp(pDst, pSrc); -#else - return strcasecmp(pDst, pSrc); -#endif -} - -/// Performs a safe strcpy by requiring the destination buffer size. -inline void Strncpy( - char* pDst, ///< [out] Destination string. - const char* pSrc, ///< [in] Source string to be copied into destination. - size_t dstSize) ///< Size of the destination buffer in bytes. -{ - PAL_ASSERT(pDst != nullptr); - PAL_ASSERT(pSrc != nullptr); - PAL_ALERT(strlen(pSrc) >= dstSize); - - if (dstSize > 0) - { -#if defined(_WIN32) - // Clamp the copy to the size of the dst buffer (1 char reserved for the null terminator). - strncpy_s(pDst, dstSize, pSrc, _TRUNCATE); -#else - strncpy(pDst, pSrc, (dstSize - 1)); - pDst[dstSize - 1] = '\0'; -#endif - } -} - -/// Simple wrapper for wcscpy_s or wcsncpy, which are available on Windows and Linux, respectively. -inline void Wcsncpy( - wchar_t* pDst, ///< [out] Destination string. 
- const wchar_t* pSrc, ///< [in] Source string to copy. - size_t dstSize) ///< Length of the destination buffer, in wchar_t's. -{ -#if defined(_WIN32) - wcscpy_s(pDst, dstSize, pSrc); -#else - wcsncpy(pDst, pSrc, (dstSize - 1)); - pDst[dstSize - 1] = L'\0'; -#endif -} - -// Wrapper for wcscat or wcscat_s which provides a safe version of wcscat -inline void Wcscat( - wchar_t* pDst, - const wchar_t* pSrc, - size_t dstSize) -{ -#if defined(_WIN32) - wcsncat_s(pDst, dstSize, pSrc, _TRUNCATE); -#else - const size_t dstLen = std::wcslen(pDst); - wcsncat(pDst, pSrc, (dstSize - dstLen - 1)); - pDst[dstSize - 1] = L'\0'; -#endif -} - -/// Simple wrapper for strncat or strncat_s which provides a safe version of strncat. -inline void Strncat( - char* pDst, ///< [in,out] Destination string. - size_t sizeDst, ///< Length of the destination string, including the null terminator. - const char* pSrc) ///< [in] Source string. -{ - PAL_ASSERT((pDst != nullptr) && (pSrc != nullptr)); - -#if defined(_WIN32) - // MS compilers provide strncat_s, which will truncate the copy to prevent buffer overruns and always guarantee that - // pDst is null-terminated. - strncat_s(pDst, sizeDst, pSrc, _TRUNCATE); -#else - // Compute the length of the destination string to prevent buffer overruns. - const size_t dstLength = strlen(pDst); - strncat(pDst, pSrc, (sizeDst - dstLength - 1)); -#endif -} - -/// Simple wrapper for strtok_s or strtok_r which provides a safe version of strtok. -inline char* Strtok( - char* str, ///< [in] Token string. - const char* delim, ///< [in] Token delimit. - char** buf) ///< [in,out] Buffer to store the rest of the string. -{ - PAL_ASSERT((delim != nullptr) && (buf != nullptr)); - - char* pToken = nullptr; - -#if defined(_WIN32) - pToken = strtok_s(str, delim, buf); -#else - pToken = strtok_r(str, delim, buf); -#endif - - return pToken; -} - -/// Rounds the specified pointer up to the nearest value meeting the specified 'alignment'. 
Only power of 2 alignments -/// are supported by this function. -/// -/// @returns Aligned pointer. -inline void* VoidPtrAlign( - void* ptr, ///< Pointer to align. - size_t alignment) ///< Desired alignment. -{ - // This function only works for POW2 alignment - PAL_ASSERT(IsPowerOfTwo(alignment)); - - return reinterpret_cast( - (reinterpret_cast(ptr) + (alignment - 1)) & ~(alignment - 1)); -} - -/// Converts a raw string value to the correct data type. -inline void StringToValueType( - const char* pStrValue, ///< [in] Setting value in string form. - ValueType type, ///< Data type of the value being converted. - size_t valueSize, ///< Size of pValue buffer. - void* pValue) ///< [out] Converted setting value buffer. -{ - switch (type) - { - case ValueType::Boolean: - *(static_cast(pValue)) = ((atoi(pStrValue)) ? true : false); - break; - case ValueType::Int8: - *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); - break; - case ValueType::Uint8: - *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); - break; - case ValueType::Int16: - *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); - break; - case ValueType::Uint16: - *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); - break; - case ValueType::Int32: - *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); - break; - case ValueType::Uint32: - *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); - break; - case ValueType::Int64: - *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); - break; - case ValueType::Uint64: - *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); - break; - case ValueType::Float: - *(static_cast(pValue)) = static_cast(atof(pStrValue)); - break; - case ValueType::Str: - Strncpy(static_cast(pValue), pStrValue, valueSize); - break; - } -} - -/// Converts a raw string value to the correct data type, returning 'true' if parsed correctly. 
-/// When not parsed correctly, the value will be unchanged. -/// -/// @note: A string that is truncated returns false. -/// @note: If the destination type is integer, the string is parsed as either int64 or uint64, and the parsed value is -/// clamped to fit the range of the destination type. -[[nodiscard]] inline bool StringToValueTypeChecked( - const char* pStrValue, ///< [in] Setting value in string form. - ValueType type, ///< Data type of the value being converted. - size_t valueSize, ///< Size of pValue buffer. - void* pValue) ///< [out] Converted setting value buffer. -{ - auto CheckTrailingCharacters = [](char* pChar, const char* pEnd) -> bool { - while ((pChar < pEnd) && isspace(*pChar)) - { - // ignore trailing whitespace. strtoX handles leading whitespace - pChar++; - } - return (pChar == pEnd); - }; - - const size_t len = strlen(pStrValue); - const char* pTerminator = pStrValue + len; - char* pEndptr = nullptr; - bool valid = false; - - switch (type) - { - case ValueType::Boolean: - { - bool value = (strtol(pStrValue, &pEndptr, 0) != 0); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = value; - } - } - break; - case ValueType::Int8: - { - const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); - const int64 value = Clamp(parsedValue, - int64((std::numeric_limits::min)()), - int64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Uint8: - { - const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); - const uint64 value = Clamp(parsedValue, - uint64((std::numeric_limits::min)()), - uint64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Int16: - { - const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); - const int64 
value = Clamp(parsedValue, - int64((std::numeric_limits::min)()), - int64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Uint16: - { - const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); - const uint64 value = Clamp(parsedValue, - uint64((std::numeric_limits::min)()), - uint64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Int32: - { - const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); - const int64 value = Clamp(parsedValue, - int64((std::numeric_limits::min)()), - int64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Uint32: - { - const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); - const uint64 value = Clamp(parsedValue, - uint64((std::numeric_limits::min)()), - uint64((std::numeric_limits::max)())); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = static_cast(value); - } - } - break; - case ValueType::Int64: - { - const int64 value = strtoll(pStrValue, &pEndptr, 0); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = value; - } - } - break; - case ValueType::Uint64: - { - const uint64 value = strtoull(pStrValue, &pEndptr, 0); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = value; - } - } - break; - case ValueType::Float: - { - float value = static_cast(strtof(pStrValue, &pEndptr)); - valid = CheckTrailingCharacters(pEndptr, pTerminator); - if (valid) - { - *(static_cast(pValue)) = value; - } - } - break; - case ValueType::Str: - if (len + 1 <= valueSize) - { - valid = 
true; - Strncpy(static_cast(pValue), pStrValue, valueSize); - } - break; - } - return valid; -} - -/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. -/// -/// @returns 32-bit hash generated from the provided string. -template -constexpr uint32 HashString( - const Char* pStr, ///< [in] String to be hashed. - size_t strSize) ///< Size of the input string. -{ - PAL_CONSTEXPR_ASSERT((pStr != nullptr) && (strSize > 0)); - - constexpr uint32 FnvPrime = 16777619u; - constexpr uint32 FnvOffset = 2166136261u; - - uint32 hash = FnvOffset; - - for (size_t i = 0; i < strSize; i++) - { - Char c = pStr[i]; - for (uint32 j = 0; j < sizeof(Char); ++j) - { - hash ^= uint8(c); - hash *= FnvPrime; - c = (c >> 8); - } - } - - return hash; -} - -/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. -/// -/// @returns 32-bit hash generated from the provided string. -template -constexpr uint32 HashString( - const Char* pString) -{ - return HashString(pString, StringLength(pString)); -} - -/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. -/// Same as HashString() except consteval enforces that this can only be called at compile-time. -/// -/// @returns 32-bit hash generated from the provided string. -template -#if defined(__cpp_consteval) -consteval -#else -constexpr -#endif -uint32 CompileTimeHashString( - const Char* pString) -{ - return HashString(pString); -} - -/// Indicates that an object may be moved from. -/// Can be understood as preparation for possible move operation. -/// -/// @warning Do not read object after it has been moved from! -/// -/// @param [in] object Universal reference to an object that may be moved from. -/// -/// @returns Rvalue reference to the parameter object. -template -constexpr typename std::remove_reference::type&& Move(T&& object) -{ - // Cast universal reference to rvalue reference. 
- return static_cast::type&&>(object); -} - -/// Exchanges values between two variables. -/// -/// @param [in] left First variable used in swap operation. -/// @param [in] right Second variable used in swap operation. -template -constexpr void Swap(T& left, T& right) -{ - T tmp = Move(left); - left = Move(right); - right = Move(tmp); -} - -/// Convenient alias for C style arrays. -template -using Array = Element[Size]; - -/// Prevent swapping arrays because of the cost of this operation. -template -void Swap(Array& a, Array& b); - -/// Compacts an array by moving all empty slots to the end of the array. -/// +---+---+---+---+---+---+---+---+---+---+ -/// Input: | A | | C | D | | E | | A | X | J | -/// +---+---+---+---+---+---+---+---+---+---+ -/// +---+---+---+---+---+---+---+---+---+---+ -/// Output: | A | C | D | E | A | X | J | | | | -/// +---+---+---+---+---+---+---+---+---+---+ -template -void PackArray(Array& array, const Element& emptySlot) -{ - int lastOccupiedSlot = -1; - - for (size_t i = 0; i < Size; ++i) - { - if (array[i] != emptySlot) - { - Swap(array[i], array[lastOccupiedSlot + 1]); - ++lastOccupiedSlot; - } - } -} - -/// Performs a safe mbstowcs by requiring the destination buffer size. -inline void Mbstowcs( - wchar_t* pDst, ///< [out] dst string - const char* pSrc, ///< [in] src string - size_t dstSizeInWords) ///< size of the destination buffer in words -{ - PAL_ASSERT(pDst != nullptr); - PAL_ASSERT(pSrc != nullptr); - - bool result = false; - // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) -#if defined(_WIN32) - size_t bytesConverted = 0; - errno_t retCode = mbstowcs_s(&bytesConverted, pDst, dstSizeInWords, pSrc, (dstSizeInWords - 1)); - - result = (retCode != 0) ? false : true; -#else - size_t retCode = mbstowcs(pDst, pSrc, dstSizeInWords); - - result = (retCode == static_cast(-1)) ? false : true; - - if (retCode == dstSizeInWords) - { - // Alert the user when the string has been truncated. 
- PAL_ALERT_ALWAYS(); - - // NULL terminate the string. - pDst[dstSizeInWords - 1] = '\0'; - } -#endif - - if (result == false) - { - // A non-convertible character was encountered or the string was truncated on the mbstowcs_s or - // ConvertCharStringToUtf16 code paths. - PAL_ALERT_ALWAYS(); - pDst[0] = '\0'; - } -} - -/// Performs a safe wcstombs by requiring the destination buffer size. -inline void Wcstombs( - char* pDst, ///< [out] dst string - const wchar_t* pSrc, ///< [in] src string - size_t dstSizeInBytes) ///< size of the destination buffer in bytes -{ - PAL_ASSERT(pDst != nullptr); - PAL_ASSERT(pSrc != nullptr); - - bool result = false; - // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) -#if defined(_WIN32) - size_t bytesConverted = 0; - errno_t retCode = wcstombs_s(&bytesConverted, pDst, dstSizeInBytes, pSrc, (dstSizeInBytes - 1)); - - result = (retCode != 0) ? false : true; -#else - size_t retCode = wcstombs(pDst, pSrc, (dstSizeInBytes - 1)); - - result = (retCode == static_cast(-1)) ? false : true; -#endif - - if (result == false) - { - // A non-convertible character was encountered. - PAL_ASSERT_ALWAYS(); - pDst[0] = '\0'; - } - - if (wcslen(pSrc) >= dstSizeInBytes) - { - // Assert to alert the user when the string has been truncated. - PAL_ASSERT_ALWAYS(); - - // NULL terminate the string. - pDst[dstSizeInBytes - 1] = '\0'; - } -} - -/// Computes the Greatest Common Divisor of two numbers -/// -/// @returns The GCD of the two inputs. 
-template -inline typename std::common_type::type Gcd( - T1 value1, - T2 value2) -{ - static_assert((std::is_integral::value == true) && - (std::is_integral::value == true), - "GCD requires integral types"); - - static_assert((std::is_unsigned::value == true) && - (std::is_unsigned::value == true), - "GCD requires unsigned types"); - - static_assert((std::is_same::value == false) && - (std::is_same::value == false), - "GCD requires nonboolean types"); - - using T = typename std::common_type::type; - T ret = 0u; - - if (value1 == 0u) - { - ret = static_cast(value2); - } - else if (value2 == 0u) - { - ret = static_cast(value1); - } - else - { - uint32 value1TrailingZeros = 0u; - BitMaskScanForward(&value1TrailingZeros, value1); - uint32 value2TrailingZeros = 0u; - BitMaskScanForward(&value2TrailingZeros, value2); - - const uint32 shift = Min(value1TrailingZeros, value2TrailingZeros); - value1 >>= value1TrailingZeros; - value2 >>= shift; - - do - { - BitMaskScanForward(&value2TrailingZeros, value2); - value2 >>= value2TrailingZeros; - - if (value1 > value2) - { - T tmp = value1; - value1 = value2; - value2 = tmp; - } - - value2 -= value1; - } - while (value2 != 0); - - ret = static_cast(value1 << shift); - } - - return ret; -} - -/// Computes the Greatest Common Divisor of N numbers -/// -/// @returns The GCD of the all inputs. -template -inline typename std::common_type::type>::type Gcd( - T1 value1, - T2 value2, - Ts... values) -{ - return Gcd(Gcd(value1, value2), values...); -} - -/// Computes the Least Common Multiple of two numbers -/// -/// @returns The LCM of the two inputs. 
-template -constexpr typename std::common_type::type Lcm( - T1 value1, - T2 value2) -{ - static_assert((std::is_integral::value == true) && - (std::is_integral::value == true), - "LCM requires integral types"); - - static_assert((std::is_unsigned::value == true) && - (std::is_unsigned::value == true), - "LCM requires unsigned types"); - - static_assert((std::is_same::value == false) && - (std::is_same::value == false), - "LCM requires nonboolean types"); - - using T = typename std::common_type::type; - - return (value1 != 0u) && (value2 != 0u) ? static_cast((value1 / Gcd(value1, value2)) * value2) : 0u; -} - -/// Computes the Least Common Multiple of N numbers -/// -/// @returns The LCM of all the inputs. -template -constexpr typename std::common_type::type>::type Lcm( - T1 value1, - T2 value2, - Ts... values) -{ - return Lcm(Lcm(value1, value2), values...); -} - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 -/// Returns the length of a wchar_t based string. -/// -/// @returns The length of the given string in wide characters -inline size_t Wcslen( - const wchar_t* pWideStr) -{ - return wcslen(pWideStr); -} - -/// Performs a reverse string find of wide character wc. -/// -/// @returns The matching character at the end of the string or nullptr if not found. -inline wchar_t* Wcsrchr(wchar_t *pStr, wchar_t wc) -{ - return wcsrchr(pStr, wc); -} -#endif - -/// Compile-time function to report if two values from unrelated strong enums are equivalent. This is useful for -/// static asserts ensuring it is safe to cast an enum without a conversion lookup table. -template -inline constexpr bool EnumSameVal( - T1 lhs, - T2 rhs) -{ - return (static_cast(lhs) == static_cast(rhs)); -} - -/// Comparison function for Sort() below. -template int PAL_CDECL SortComparisonFunc( - const void* pLhs, - const void* pRhs) -{ - return int(*static_cast(pRhs) < *static_cast(pLhs)) - - int(*static_cast(pLhs) < *static_cast(pRhs)); -} - -/// In-place sort of an array. 
Uses C library qsort, so is probably a non-order-preserving quicksort. -/// Sorts the array given by the random iterator range [pStart,pEnd). -/// The element type (the type you get by dereferencing RandomIt) must have an operator<. -template void Sort( - RandomIt pStart, - RandomIt pEnd) -{ - using ElementTy = typename std::iterator_traits::value_type; - qsort(&pStart[0], pEnd - pStart, sizeof(ElementTy), SortComparisonFunc); -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palInlineFuncs.h + * @brief PAL utility collection inline functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palStringUtil.h" +#include +#include +#include +#include +#include +#include +#include + +namespace Util +{ + +/// Describes a value type, primarily used for loading settings values. +enum class ValueType : uint32 +{ + Boolean, ///< Boolean type. + Int8, ///< 8-bit integer type. + Uint8, ///< 8-bit unsigned integer type. + Int16, ///< 16-bit integer type. + Uint16, ///< 16-bit unsigned integer type. + Int32, ///< 32-bit integer type. + Uint32, ///< 32-bit unsigned integer type. + Int64, ///< 64-bit integer type. + Uint64, ///< 64-bit unsigned integer type. + Float, ///< Floating point type. + Str, ///< String type. +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 905 + Int = Int32, ///< Signed integer type. + Uint = Uint32, ///< Unsigned integer type. +#endif +}; + +/// Determines the length of an array at compile-time. +/// +/// @returns The length of the array. +template +constexpr size_t ArrayLen( + const T (&array)[N]) ///< The array of arbitrary type T. +{ + return N; +} + +/// Determines the 32-bit length of an array at compile-time. +/// +/// @returns The length of the array. +template +constexpr uint32 ArrayLen32( + const T (&array)[N]) ///< The array of arbitrary type T. +{ + return N; +} + +/// Increments a const pointer by nBytes by first casting it to a const uint8*. +/// +/// @returns Incremented pointer. +constexpr const void* VoidPtrInc( + const void* p, ///< [in] Pointer to be incremented. 
+ size_t numBytes) ///< Number of bytes to increment the pointer by. +{ + return (static_cast(p) + numBytes); +} + +/// Increments a pointer by nBytes by first casting it to a uint8*. +/// +/// @returns Incremented pointer. +constexpr void* VoidPtrInc( + void* p, ///< [in] Pointer to be incremented. + size_t numBytes) ///< Number of bytes to increment the pointer by. +{ + return (static_cast(p) + numBytes); +} + +/// Decrements a const pointer by nBytes by first casting it to a const uint8*. +/// +/// @returns Decremented pointer. +constexpr const void* VoidPtrDec( + const void* p, ///< [in] Pointer to be decremented. + size_t numBytes) ///< Number of bytes to decrement the pointer by. +{ + return (static_cast(p) - numBytes); +} + +/// Decrements a pointer by nBytes by first casting it to a uint8*. +/// +/// @returns Decremented pointer. +constexpr void* VoidPtrDec( + void* p, ///< [in] Pointer to be decremented. + size_t numBytes) ///< Number of bytes to decrement the pointer by. +{ + return (static_cast(p) - numBytes); +} + +/// Finds the number of bytes between two pointers by first casting them to uint8*. +/// +/// This function expects the first pointer to not be smaller than the second. +/// +/// @returns Number of bytes between the two pointers. +constexpr size_t VoidPtrDiff( + const void* p1, ///< [in] First pointer (higher address). + const void* p2) ///< [in] Second pointer (lower address). +{ + PAL_CONSTEXPR_ASSERT(p1 >= p2); + return (static_cast(p1) - static_cast(p2)); +} + +/// Returns the high 32 bits of a 64-bit integer. +/// +/// @returns Returns the high 32 bits of a 64-bit integer. +constexpr uint32 HighPart( + uint64 value) ///< 64-bit input value. +{ + return (value & 0xFFFFFFFF00000000) >> 32; +} + +/// Returns the low 32 bits of a 64-bit integer. +/// +/// @returns Returns the low 32 bits of a 64-bit integer. +constexpr uint32 LowPart( + uint64 value) ///< 64-bit input value. 
+{ + return (value & 0x00000000FFFFFFFF); +} + +/// Returns the high 32 bits of a 64-bit integer as a 64-bit integer. +/// +/// @returns Returns the high 32 bits of a 64-bit integer as a 64-bit integer +/// without shifting +constexpr uint64 HighPart64( + uint64 value) ///< 64-bit input value. +{ + return (value & 0xFFFFFFFF00000000); +} + +/// Combines the low and high 32 bits of a 64-bit integer. +/// +/// @returns Returns the 64-bit integer. +constexpr uint64 Uint64CombineParts( + uint32 lowPart, + uint32 highPart) +{ + return (uint64(highPart) << 32) | uint64(lowPart); +} + +/// Returns a larger value from repeating a single byte +constexpr uint32 ReplicateByteAcrossDword( + uint8 value) ///< 8-bit input value. +{ + return (value | (value << 8) | (value << 16) | (value << 24)); +} + +/// Returns a larger value from repeating a single byte +constexpr uint64 ReplicateByteAcrossQword( + uint8 value) ///< 8-bit input value. +{ + return ((static_cast(ReplicateByteAcrossDword(value)) << 32) | ReplicateByteAcrossDword(value)); +} + +/// Combines four characters into a uint32-based four-character-code "string". There's no null terminator so it's not a +/// real c-string, it just looks like there's a string if you view the uint in a hex editor or memcmp against a string. +/// +/// For example, FourCC('A', 'B', 'C', 'D') turns into 0x44434241. 'A' is 0x41 and it ends up in the first byte. +/// This function assumes we're running on a little endian platform (PAL only supports little-endian platforms). +/// +/// @returns Returns a uin32 four-character-code made from the given chars. +constexpr uint32 FourCc( + char c1, ///< The 1st character (lowest byte). + char c2, ///< The 2nd character. + char c3, ///< The 3rd character. + char c4) ///< The 4th character (highest byte). +{ + return (uint32(c4) << 24) | (uint32(c3) << 16) | (uint32(c2) << 8) | uint32(c1); +} + +/// Returns a bitfield from within some value. +/// +/// @returns Returns a bitfield from within some value. 
+template +constexpr T BitExtract( + T value, ///< Extract a bitfield from here. + uint32 firstBit, ///< The zero-based index of the first bit to extract. + uint32 lastBit) ///< The zero-based index of the last bit to extract. +{ + return (value >> firstBit) & ((1 << (lastBit - firstBit + 1)) - 1); +} + +/// Determines if any of the bits set in "test" are also set in "src". +/// +/// @returns True if any bits in "test" are set in "src", false otherwise. +constexpr bool TestAnyFlagSet( + uint32 src, ///< Source pattern. + uint32 test) ///< Test pattern. +{ + return ((src & test) != 0); +} + +/// Determines if all of the bits set in "test" are also set in "src". +/// +/// @returns True if all bits set in "test" are also set in "src", false otherwise. +constexpr bool TestAllFlagsSet( + uint32 src, ///< Source pattern. + uint32 test) ///< Test pattern. +{ + return ((src & test) == test); +} + +/// Determines if any of the bits set in "test" are also set in "src". +/// +/// @returns True if any bits in "test" are set in "src", false otherwise. +constexpr bool TestAnyFlagSet64( + uint64 src, ///< Source pattern. + uint64 test) ///< Test pattern. +{ + return ((src & test) != 0); +} + +/// Determines if all of the bits set in "test" are also set in "src". +/// +/// @returns True if all bits set in "test" are also set in "src", false otherwise. +constexpr bool TestAllFlagsSet64( + uint64 src, ///< Source pattern. + uint64 test) ///< Test pattern. +{ + return ((src & test) == test); +} + +/// Tests if a single bit in a bitfield is set. +/// +/// @param [in] bitfield Bitfield being tested +/// @param [in] bit Bit index to test +/// +/// @returns True if the flag is set. +template +constexpr bool BitfieldIsSet( + const T bitfield, + uint32 bit) +{ + PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); + return (bitfield & (static_cast(1) << bit)); +} + +/// Sets a single bit in a bitfield to one. 
+/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the bit to set +template +void BitfieldSetBit( + T &bitfield, + uint32 bit) +{ + PAL_CONSTEXPR_ASSERT(bit < (sizeof(T) * 8)); + bitfield |= (static_cast(1) << bit); +} + +///@{ +/// Counts the number of one bits (population count) in an unsigned integer using some bitwise magic explained in the +/// Software Optimization Guide for AMD64 Processors. +/// +/// @param [in] value The value need to be counted. +/// +/// @returns Number of one bits in the input +template +constexpr uint32 CountSetBits( + T value) +{ + uint32 x = static_cast(value); + + x = x - ((x >> 1) & 0x55555555); + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + x = (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> ((sizeof(uint32) - 1) << 3); + + return x; +} + +constexpr uint32 CountSetBits( + uint64 value) +{ + uint64 x = value; + + x = x - ((x >> 1) & 0x5555555555555555ull); + x = (x & 0x3333333333333333ull) + ((x >> 2) & 0x3333333333333333ull); + x = (((x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Full) * 0x0101010101010101ull) >> ((sizeof(uint64) - 1) << 3); + + return static_cast(x); +} +///@} + +/// Update a subfield of a bitfield. +/// +/// @param [in] bitFieldToUpdate Bitfield being updated +/// @param [in] updateValue Source value to update +/// @param [in] updateMask Bitmask to update +/// +/// @returns True if the flag is set. +template +void BitfieldUpdateSubfield( + T* pBitFieldToUpdate, + const T updateValue, + const T updateMask) +{ + *pBitFieldToUpdate = ((*pBitFieldToUpdate) & ~updateMask) | + (updateValue & updateMask); +} + +/// Tests if a single bit in a "wide bitfield" is set. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being tested +/// @param [in] bit Index of the flag to test +/// +/// @returns True if the flag is set. 
+template +constexpr bool WideBitfieldIsSet( + const T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + return (0 != (bitfield[index] & mask)); +} + +/// Checks if any bit is set in a wide bitfield. A "wide bitfield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Wide bitfield to count. +/// +/// @returns True if the wide bitfield is non-zero; false otherwise. +template +bool WideBitfieldIsAnyBitSet( + const T(&bitfield)[N]) +{ + bool isBitSet = false; + for (uint32 i = 0; i < N; i++) + { + isBitSet |= (bitfield[i] != 0); + } + + return isBitSet; +} + +/// Sets a single bit in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the flag to set +template +void WideBitfieldSetBit( + T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + bitfield[index] |= mask; +} + +/// Clears a single bit in a "wide bitfield" to zero. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] bit Index of the flag to set +template +void WideBitfieldClearBit( + T (&bitfield)[N], + uint32 bit) +{ + const uint32 index = (bit / (sizeof(T) << 3)); + const T mask = (static_cast(1) << (bit & ((sizeof(T) << 3) - 1))); + + bitfield[index] &= ~mask; +} + +/// Sets consecutive bits in a "wide bitfield" to one. A "wide bifield" is a bitfield which spans an array of +/// integers because there are more flags than bits in one integer. 
+/// +/// @param [in] bitfield Reference to the bitfield being modified +/// @param [in] startingBit Index of the first flag to set +/// @param [in] numBits Count of consecutive flags to set +template +void WideBitfieldSetRange( + T (&bitfield)[N], + uint32 startingBit, + uint32 numBits) +{ + constexpr uint32 SizeInBits = (sizeof(T) << 3); + + PAL_ASSERT((startingBit + numBits) <= (SizeInBits * N)); + + uint32 index = (startingBit / SizeInBits); + + startingBit &= (SizeInBits - 1); + + while (numBits > 0) + { + const uint32 maxNumBits = SizeInBits - startingBit; + const uint32 curNumBits = (maxNumBits < numBits) ? maxNumBits : numBits; + const T bitMask = (curNumBits == SizeInBits) ? -1 : ((static_cast(1) << curNumBits) - 1); + + bitfield[index++] |= (bitMask << startingBit); + + startingBit = 0; + numBits -= curNumBits; + } +} + +/// XORs all of the bits in two "wide bitfields". A "wide bifield" is a bitfield which spans an array of integers +/// because there are more flags than bits in one integer. +/// +/// @param [in] bitfield1 Reference to the first bitfield. +/// @param [in] bitfield2 Reference to the second bitfield. +/// @param [out] pOut Result of (bitfield1 ^ bitfield2) +template +void WideBitfieldXorBits( + const T (&bitfield1)[N], + const T (&bitfield2)[N], + T* pOut) +{ + for (uint32 i = 0; i < N; i++) + { + pOut[i] = (bitfield1[i] ^ bitfield2[i]); + } +} + +/// ANDs all of the bits in two "wide bitfields". A "wide bifield" is a bitfield which spans an array of integers +/// because there are more flags than bits in one integer. +/// +/// @param [in] bitfield1 Reference to the first bitfield. +/// @param [in] bitfield2 Reference to the second bitfield. 
+/// @param [out] pOut Result of (bitfield1 & bitfield2) +template +void WideBitfieldAndBits( + const T (&bitfield1)[N], + const T (&bitfield2)[N], + T* pOut) +{ + for (uint32 i = 0; i < N; i++) + { + pOut[i] = (bitfield1[i] & bitfield2[i]); + } +} + +/// Counts the number of one bits (population count) in a wide bitfield. A "wide bitfield" is a bitfield which spans +/// an array of integers because there are more flags than bits in one integer. +/// +/// @param [in] bitfield Wide bitfield to count. +/// +/// @returns Number of one bits in the input +template +uint32 WideBitfieldCountSetBits( + const T(&bitfield)[N]) +{ + uint32 count = 0; + for (uint32 i = 0; i < N; i++) + { + count += CountSetBits(bitfield[i]); + } + + return count; +} + +/// Unsets the least-significant '1' bit in the given number. +/// Usually used in conjunction with BitMaskScanForward +/// +/// @param [in] value The value to be modified +/// +/// @returns A copy of value with the lowest '1' bit unset. +template +T UnsetLeastBit( + T val) +{ + static_assert(std::is_unsigned::value, "Must use unsigned ints here"); + return val & (val - 1); +} + +/// Scans the specified bit-mask for the least-significant '1' bit. +/// +/// @returns True if the input was nonzero; false otherwise. +template +bool BitMaskScanForward( + uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. + T mask) ///< Bit-mask to scan. +{ + // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key + // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. + bool result = false; + + if (mask != 0) + { +#if defined(_WIN64) && defined(_M_X64) + *pIndex = (sizeof(T) > 4) ? static_cast(::_tzcnt_u64(mask)) : (::_tzcnt_u32(static_cast(mask))); +#elif defined(_WIN64) + auto*const pOut = reinterpret_cast(pIndex); + (sizeof(T) > 4) ? 
(::_BitScanForward64(pOut, mask)) : (::_BitScanForward(pOut, static_cast(mask))); +#elif defined(_WIN32) && defined(_M_IX86) + const uint32 lowPart = LowPart(mask); + uint32 index = (::_tzcnt_u32(((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); + *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; +#elif defined(_WIN32) + unsigned long index; + const uint32 lowPart = LowPart(mask); + (::_BitScanForward(&index, ((sizeof(T) > 4) && (lowPart == 0)) ? HighPart(mask) : lowPart)); + *pIndex = ((sizeof(T) > 4) && (lowPart == 0)) ? (index + 32u) : index; +#elif defined(__GNUC__) + *pIndex = (sizeof(T) > 4) ? __builtin_ctzll(mask) : __builtin_ctz(static_cast(mask)); +#else + uint32 index = 0; + for (; ((mask & 0x1) == 0); mask >>= 1, ++index); + *pIndex = index; +#endif + + result = true; + } + return result; +} + +/// Scans the specified bit-mask for the most-significant '1' bit. +/// +/// @returns True if the input was nonzero; false otherwise. +template +bool BitMaskScanReverse( + uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. + T mask) ///< Bit-mask to scan. +{ + // Bitscan intrinsics may compile to flaky code in certain situations. Discarding bitscan flags avoids this. The key + // is to forward declare result, and set it in a conditional branch after the bitscan. Be careful if modifying this. + bool result = false; + + if (mask != 0) + { +#if defined(_WIN64) + auto*const pOut = reinterpret_cast(pIndex); + (sizeof(T) > 4) ? (::_BitScanReverse64(pOut, mask)) : (::_BitScanReverse(pOut, static_cast(mask))); +#elif defined(_WIN32) + unsigned long index; + const uint32 highPart = (sizeof(T) > 4) ? HighPart(mask) : 0; + (::_BitScanReverse(&index, ((sizeof(T) > 4) && (highPart != 0)) ? highPart : LowPart(mask))); + *pIndex = ((sizeof(T) > 4) && (highPart != 0)) ? (index + 32u) : index; +#elif defined(__GNUC__) + *pIndex = (sizeof(T) > 4) ? 
(63u - __builtin_clzll(mask)) : (31u - __builtin_clz(static_cast(mask))); +#else + uint32 index = 31u; + for (; (((mask >> index) & 0x1) == 0); --index); + *pIndex = index; +#endif + + result = true; + } + return result; +} + +/// Scans the specified wide bit-mask for the least-significant '1' bit. +/// +/// @returns True if input was nonzero; false otherwise. +template +bool WideBitMaskScanForward( + uint32* pIndex, ///< [out] Index of least-significant '1' bit. Undefined if input is zero. + const T (&mask)[N]) ///< Bit-mask to scan. +{ + uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); + + // Check to see if the wide bitmask has some bits set. + uint32 index = 0; + while ((mask[index] == 0) && (++index < N)); + bool result = (index < N); + + while (result == true) + { + result = BitMaskScanForward(pIndex, mask[maskIndex]); + + if (result == false) + { + ++maskIndex; + result = (maskIndex < N); + } + else + { + (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); + break; + } + } + + return result; +} + +/// Scans the specified wide bit-mask for the most-significant '1' bit. +/// +/// @returns True if input was nonzero; false otherwise. +template +bool WideBitMaskScanReverse( + uint32* pIndex, ///< [out] Index of most-significant '1' bit. Undefined if input is zero. + const T (&mask)[N]) ///< Bit-mask to scan. +{ + uint32 maskIndex = ((*pIndex) / (sizeof(T) << 3)); + + // Check to see if the wide bitmask has some bits set. + uint32 index = N - 1; + while ((mask[index] == 0) && (--index > 0)); + bool result = (mask[index] != 0); + + while (result == true) + { + result = BitMaskScanReverse(pIndex, mask[maskIndex]); + + if (result == false) + { + const uint32 oldIndex = maskIndex--; + result = (oldIndex != 0); + } + else + { + (*pIndex) = (*pIndex) + (maskIndex * (sizeof(T) << 3)); + break; + } + } + + return result; +} + +/// Generates a bitmask. 
+/// +/// @param [in] numBits Number of bits to set (starting at 0) +/// +/// @returns Bitmask in storage of type T with bits [0:numBits-1] set. +template +constexpr T BitfieldGenMask( + T numBits) +{ + PAL_CONSTEXPR_ASSERT(numBits <= (sizeof(T) * 8)); + + const T mask = (numBits < (sizeof(T) * 8)) ? ((static_cast(1) << (numBits)) - static_cast(1)) : static_cast(-1); + return mask; +} + +/// Determines if a value is a power of two. +/// +/// @returns True if it is a power of two, false otherwise. +constexpr bool IsPowerOfTwo( + uint64 value) ///< Value to check. +{ + return (value == 0) ? false : ((value & (value - 1)) == 0); +} + +/// Determines if 'value' is at least aligned to the specified power-of-2 alignment. +/// +/// @returns True if aligned, false otherwise. +constexpr bool IsPow2Aligned( + uint64 value, ///< Value to check. + uint64 alignment) ///< Desired alignment. +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return ((value & (alignment - 1)) == 0); +} + +/// Determines if 'ptr' is at least aligned to the specified power-of-2 alignment. +/// +/// @returns True if aligned, false otherwise. +inline bool VoidPtrIsPow2Aligned( + const void* ptr, ///< Pointer to check. + uint64 alignment) ///< Desired alignment. +{ + PAL_ASSERT(IsPowerOfTwo(alignment)); + return ((reinterpret_cast(ptr) & (alignment - 1)) == 0); +} + +/// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 +/// alignments are supported by this function. +/// +/// @returns Aligned value. +template +constexpr T Pow2Align( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); +} + +/// Rounds the specified uint 'value' up to the nearest power of 2 +/// +/// @param [in] value The value to pad. +/// +/// @returns Power of 2 padded value. 
+template +T Pow2Pad( + T value) +{ + T ret = value; + + if ((value & (value - 1)) != 0) + { + uint32 lastBitIndex = 0; + BitMaskScanReverse(&lastBitIndex, value); + ret = (static_cast(0x2) << lastBitIndex); + } + + return ret; +} + +/// Computes the base-2 logarithm of an unsigned integer. +/// +/// If the given integer is not a power of 2, this function will not provide an exact answer. +/// +/// @param [in] u Value to compute the logarithm of. +/// +/// @returns log_2(u) +template +uint32 Log2( + T u) +{ + uint32 logValue = 0; + return BitMaskScanReverse(&logValue, u) ? logValue : 0; +} + +/// Computes the base-2 logarithm of an unsigned 64-bit integer based on ceiling +/// +/// If the given integer is not a power of 2, this function will not provide an exact answer. +/// +/// @returns ceilLog_2(u) +template +uint32 CeilLog2( + T u) ///< Value to compute the ceil logarithm of. +{ + const uint32 logValue = Log2(u); + return ((static_cast(0x1ul) << logValue) < u) ? (logValue + 1) : logValue; +} + +/// Implements an alternative version of integer division in which the quotient is always rounded up instead of down. +/// +/// @returns The rounded quotient. +template +constexpr T RoundUpQuotient( + T dividend, ///< Value to divide. + T divisor) ///< Value to divide by. +{ + return ((dividend + (divisor - 1)) / divisor); +} + +/// Rounds up the specified integer to the nearest multiple of the specified alignment value. +/// +/// @returns Rounded value. +template +constexpr T RoundUpToMultiple( + T operand, ///< Value to be aligned. + T alignment) ///< Alignment desired. +{ + return (((operand + (alignment - 1)) / alignment) * alignment); +} + +/// Rounds down the specified integer to the nearest multiple of the specified alignment value. +/// +/// @returns Rounded value. +template +constexpr T RoundDownToMultiple( + T operand, ///< Value to be aligned. + T alignment) ///< Alignment desired. 
+{ + return ((operand / alignment) * alignment); +} + +/// Rounds the specified 'value' down to the nearest value meeting the specified 'alignment'. Only power of 2 +/// alignments are supported by this function. +/// +/// @returns Rounded value. +template +constexpr T Pow2AlignDown( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). +{ + PAL_CONSTEXPR_ASSERT(IsPowerOfTwo(alignment)); + return (value & ~(alignment - 1)); +} + +/// Determines the maximum of two numbers. +/// +/// @returns The larger of the two inputs. +template +constexpr T Max( + T value1, ///< First value to check. + T value2) ///< Second value to check. +{ + return ((value1 > value2) ? value1 : value2); +} + +/// Determines the maximum of N numbers. +/// +/// @returns The largest of all the inputs. +template +constexpr T Max( + T value1, ///< First value to check. + T value2, ///< Second value to check. + Ts... values) ///< Additional values to check. +{ + return Max(((value1 > value2) ? value1 : value2), values...); +} + +/// Determines the minimum of two numbers. +/// +/// @returns The smaller of the two inputs. +template +constexpr T Min( + T value1, ///< First value to check. + T value2) ///< Second value to check. +{ + return ((value1 < value2) ? value1 : value2); +} + +/// Determines the minimum of N numbers. +/// +/// @returns The smallest of all the inputs. +template +constexpr T Min( + T value1, ///< First value to check. + T value2, ///< Second value to check. + Ts... values) ///< Additional values to check. +{ + return Min(((value1 < value2) ? value1 : value2), values...); +} + +/// Clamps the input number so that it falls in-between the lower and upper bounds (inclusive). +/// +/// @returns Clamped input number. +template +constexpr T Clamp( + T input, ///< Input number to clamp. + T lowBound, ///< Lower-bound to clamp to. + T highBound) ///< Upper-bound to clamp to. +{ + return ((input <= lowBound) ? lowBound : + (input >= highBound) ? 
highBound : input); +} + +/// Determines if the input is within the range specified (inclusive). +/// +/// @returns True if within range, False otherwise. +template +constexpr bool InRange( + T input, ///< Input number to range check. + T lowBound, ///< Low bound of the range to check (inclusive). + T highBound) ///< High bound of the range to check (inclusive). +{ + return (lowBound <= input) && (input <= highBound); +} + +/// Converts a byte value to the equivalent number of DWORDs (uint32) rounded up. I.e., 3 bytes will return 1 dword. +/// +/// @returns Number of dwords necessary to cover numBytes. +constexpr uint32 NumBytesToNumDwords( + uint32 numBytes) ///< Byte count to convert. +{ + return Pow2Align(numBytes, static_cast(sizeof(uint32))) / sizeof(uint32); +} + +/// Compare two strings ignoring case +inline int Strcasecmp( + const char* pSrc, ///< [in] The source string to be compared. + const char* pDst) ///< [in] The dest string to compare. +{ + PAL_ASSERT(pSrc != nullptr); + PAL_ASSERT(pDst != nullptr); + +#if defined(_WIN32) + return _stricmp(pDst, pSrc); +#else + return strcasecmp(pDst, pSrc); +#endif +} + +/// Performs a safe strcpy by requiring the destination buffer size. +inline void Strncpy( + char* pDst, ///< [out] Destination string. + const char* pSrc, ///< [in] Source string to be copied into destination. + size_t dstSize) ///< Size of the destination buffer in bytes. +{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + PAL_ALERT(strlen(pSrc) >= dstSize); + + if (dstSize > 0) + { +#if defined(_WIN32) + // Clamp the copy to the size of the dst buffer (1 char reserved for the null terminator). + strncpy_s(pDst, dstSize, pSrc, _TRUNCATE); +#else + strncpy(pDst, pSrc, (dstSize - 1)); + pDst[dstSize - 1] = '\0'; +#endif + } +} + +/// Simple wrapper for wcscpy_s or wcsncpy, which are available on Windows and Linux, respectively. +inline void Wcsncpy( + wchar_t* pDst, ///< [out] Destination string. 
+ const wchar_t* pSrc, ///< [in] Source string to copy. + size_t dstSize) ///< Length of the destination buffer, in wchar_t's. +{ +#if defined(_WIN32) + wcscpy_s(pDst, dstSize, pSrc); +#else + wcsncpy(pDst, pSrc, (dstSize - 1)); + pDst[dstSize - 1] = L'\0'; +#endif +} + +// Wrapper for wcscat or wcscat_s which provides a safe version of wcscat +inline void Wcscat( + wchar_t* pDst, + const wchar_t* pSrc, + size_t dstSize) +{ +#if defined(_WIN32) + wcsncat_s(pDst, dstSize, pSrc, _TRUNCATE); +#else + const size_t dstLen = std::wcslen(pDst); + wcsncat(pDst, pSrc, (dstSize - dstLen - 1)); + pDst[dstSize - 1] = L'\0'; +#endif +} + +/// Simple wrapper for strncat or strncat_s which provides a safe version of strncat. +inline void Strncat( + char* pDst, ///< [in,out] Destination string. + size_t sizeDst, ///< Length of the destination string, including the null terminator. + const char* pSrc) ///< [in] Source string. +{ + PAL_ASSERT((pDst != nullptr) && (pSrc != nullptr)); + +#if defined(_WIN32) + // MS compilers provide strncat_s, which will truncate the copy to prevent buffer overruns and always guarantee that + // pDst is null-terminated. + strncat_s(pDst, sizeDst, pSrc, _TRUNCATE); +#else + // Compute the length of the destination string to prevent buffer overruns. + const size_t dstLength = strlen(pDst); + strncat(pDst, pSrc, (sizeDst - dstLength - 1)); +#endif +} + +/// Simple wrapper for strtok_s or strtok_r which provides a safe version of strtok. +inline char* Strtok( + char* str, ///< [in] Token string. + const char* delim, ///< [in] Token delimit. + char** buf) ///< [in,out] Buffer to store the rest of the string. +{ + PAL_ASSERT((delim != nullptr) && (buf != nullptr)); + + char* pToken = nullptr; + +#if defined(_WIN32) + pToken = strtok_s(str, delim, buf); +#else + pToken = strtok_r(str, delim, buf); +#endif + + return pToken; +} + +/// Rounds the specified pointer up to the nearest value meeting the specified 'alignment'. 
Only power of 2 alignments +/// are supported by this function. +/// +/// @returns Aligned pointer. +inline void* VoidPtrAlign( + void* ptr, ///< Pointer to align. + size_t alignment) ///< Desired alignment. +{ + // This function only works for POW2 alignment + PAL_ASSERT(IsPowerOfTwo(alignment)); + + return reinterpret_cast( + (reinterpret_cast(ptr) + (alignment - 1)) & ~(alignment - 1)); +} + +/// Converts a raw string value to the correct data type. +inline void StringToValueType( + const char* pStrValue, ///< [in] Setting value in string form. + ValueType type, ///< Data type of the value being converted. + size_t valueSize, ///< Size of pValue buffer. + void* pValue) ///< [out] Converted setting value buffer. +{ + switch (type) + { + case ValueType::Boolean: + *(static_cast(pValue)) = ((atoi(pStrValue)) ? true : false); + break; + case ValueType::Int8: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint8: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int16: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint16: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int32: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint32: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Int64: + *(static_cast(pValue)) = static_cast(strtoll(pStrValue, nullptr, 0)); + break; + case ValueType::Uint64: + *(static_cast(pValue)) = static_cast(strtoull(pStrValue, nullptr, 0)); + break; + case ValueType::Float: + *(static_cast(pValue)) = static_cast(atof(pStrValue)); + break; + case ValueType::Str: + Strncpy(static_cast(pValue), pStrValue, valueSize); + break; + } +} + +/// Converts a raw string value to the correct data type, returning 'true' if parsed correctly. 
+/// When not parsed correctly, the value will be unchanged. +/// +/// @note: A string that is truncated returns false. +/// @note: If the destination type is integer, the string is parsed as either int64 or uint64, and the parsed value is +/// clamped to fit the range of the destination type. +[[nodiscard]] inline bool StringToValueTypeChecked( + const char* pStrValue, ///< [in] Setting value in string form. + ValueType type, ///< Data type of the value being converted. + size_t valueSize, ///< Size of pValue buffer. + void* pValue) ///< [out] Converted setting value buffer. +{ + auto CheckTrailingCharacters = [](char* pChar, const char* pEnd) -> bool { + while ((pChar < pEnd) && isspace(*pChar)) + { + // ignore trailing whitespace. strtoX handles leading whitespace + pChar++; + } + return (pChar == pEnd); + }; + + const size_t len = strlen(pStrValue); + const char* pTerminator = pStrValue + len; + char* pEndptr = nullptr; + bool valid = false; + + switch (type) + { + case ValueType::Boolean: + { + bool value = (strtol(pStrValue, &pEndptr, 0) != 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Int8: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 value = Clamp(parsedValue, + int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint8: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int16: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 
value = Clamp(parsedValue, + int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint16: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int32: + { + const int64 parsedValue = strtoll(pStrValue, &pEndptr, 0); + const int64 value = Clamp(parsedValue, + int64((std::numeric_limits::min)()), + int64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Uint32: + { + const uint64 parsedValue = strtoull(pStrValue, &pEndptr, 0); + const uint64 value = Clamp(parsedValue, + uint64((std::numeric_limits::min)()), + uint64((std::numeric_limits::max)())); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = static_cast(value); + } + } + break; + case ValueType::Int64: + { + const int64 value = strtoll(pStrValue, &pEndptr, 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Uint64: + { + const uint64 value = strtoull(pStrValue, &pEndptr, 0); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Float: + { + float value = static_cast(strtof(pStrValue, &pEndptr)); + valid = CheckTrailingCharacters(pEndptr, pTerminator); + if (valid) + { + *(static_cast(pValue)) = value; + } + } + break; + case ValueType::Str: + if (len + 1 <= valueSize) + { + valid = 
true; + Strncpy(static_cast(pValue), pStrValue, valueSize); + } + break; + } + return valid; +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// +/// @returns 32-bit hash generated from the provided string. +template +constexpr uint32 HashString( + const Char* pStr, ///< [in] String to be hashed. + size_t strSize) ///< Size of the input string. +{ + PAL_CONSTEXPR_ASSERT((pStr != nullptr) && (strSize > 0)); + + constexpr uint32 FnvPrime = 16777619u; + constexpr uint32 FnvOffset = 2166136261u; + + uint32 hash = FnvOffset; + + for (size_t i = 0; i < strSize; i++) + { + Char c = pStr[i]; + for (uint32 j = 0; j < sizeof(Char); ++j) + { + hash ^= uint8(c); + hash *= FnvPrime; + c = (c >> 8); + } + } + + return hash; +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// +/// @returns 32-bit hash generated from the provided string. +template +constexpr uint32 HashString( + const Char* pString) +{ + return HashString(pString, StringLength(pString)); +} + +/// Hashes the provided string using FNV1a hashing (http://www.isthe.com/chongo/tech/comp/fnv/) algorithm. +/// Same as HashString() except consteval enforces that this can only be called at compile-time. +/// +/// @returns 32-bit hash generated from the provided string. +template +#if defined(__cpp_consteval) +consteval +#else +constexpr +#endif +uint32 CompileTimeHashString( + const Char* pString) +{ + return HashString(pString); +} + +/// Indicates that an object may be moved from. +/// Can be understood as preparation for possible move operation. +/// +/// @warning Do not read object after it has been moved from! +/// +/// @param [in] object Universal reference to an object that may be moved from. +/// +/// @returns Rvalue reference to the parameter object. +template +constexpr typename std::remove_reference::type&& Move(T&& object) +{ + // Cast universal reference to rvalue reference. 
+ return static_cast::type&&>(object); +} + +/// Exchanges values between two variables. +/// +/// @param [in] left First variable used in swap operation. +/// @param [in] right Second variable used in swap operation. +template +constexpr void Swap(T& left, T& right) +{ + T tmp = Move(left); + left = Move(right); + right = Move(tmp); +} + +/// Convenient alias for C style arrays. +template +using Array = Element[Size]; + +/// Prevent swapping arrays because of the cost of this operation. +template +void Swap(Array& a, Array& b); + +/// Compacts an array by moving all empty slots to the end of the array. +/// +---+---+---+---+---+---+---+---+---+---+ +/// Input: | A | | C | D | | E | | A | X | J | +/// +---+---+---+---+---+---+---+---+---+---+ +/// +---+---+---+---+---+---+---+---+---+---+ +/// Output: | A | C | D | E | A | X | J | | | | +/// +---+---+---+---+---+---+---+---+---+---+ +template +void PackArray(Array& array, const Element& emptySlot) +{ + int lastOccupiedSlot = -1; + + for (size_t i = 0; i < Size; ++i) + { + if (array[i] != emptySlot) + { + Swap(array[i], array[lastOccupiedSlot + 1]); + ++lastOccupiedSlot; + } + } +} + +/// Performs a safe mbstowcs by requiring the destination buffer size. +inline void Mbstowcs( + wchar_t* pDst, ///< [out] dst string + const char* pSrc, ///< [in] src string + size_t dstSizeInWords) ///< size of the destination buffer in words +{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + + bool result = false; + // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) +#if defined(_WIN32) + size_t bytesConverted = 0; + errno_t retCode = mbstowcs_s(&bytesConverted, pDst, dstSizeInWords, pSrc, (dstSizeInWords - 1)); + + result = (retCode != 0) ? false : true; +#else + size_t retCode = mbstowcs(pDst, pSrc, dstSizeInWords); + + result = (retCode == static_cast(-1)) ? false : true; + + if (retCode == dstSizeInWords) + { + // Alert the user when the string has been truncated. 
+ PAL_ALERT_ALWAYS(); + + // NULL terminate the string. + pDst[dstSizeInWords - 1] = '\0'; + } +#endif + + if (result == false) + { + // A non-convertible character was encountered or the string was truncated on the mbstowcs_s or + // ConvertCharStringToUtf16 code paths. + PAL_ALERT_ALWAYS(); + pDst[0] = '\0'; + } +} + +/// Performs a safe wcstombs by requiring the destination buffer size. +inline void Wcstombs( + char* pDst, ///< [out] dst string + const wchar_t* pSrc, ///< [in] src string + size_t dstSizeInBytes) ///< size of the destination buffer in bytes +{ + PAL_ASSERT(pDst != nullptr); + PAL_ASSERT(pSrc != nullptr); + + bool result = false; + // clamp the conversion to the size of the dst buffer (1 char reserved for the NULL terminator) +#if defined(_WIN32) + size_t bytesConverted = 0; + errno_t retCode = wcstombs_s(&bytesConverted, pDst, dstSizeInBytes, pSrc, (dstSizeInBytes - 1)); + + result = (retCode != 0) ? false : true; +#else + size_t retCode = wcstombs(pDst, pSrc, (dstSizeInBytes - 1)); + + result = (retCode == static_cast(-1)) ? false : true; +#endif + + if (result == false) + { + // A non-convertible character was encountered. + PAL_ASSERT_ALWAYS(); + pDst[0] = '\0'; + } + + if (wcslen(pSrc) >= dstSizeInBytes) + { + // Assert to alert the user when the string has been truncated. + PAL_ASSERT_ALWAYS(); + + // NULL terminate the string. + pDst[dstSizeInBytes - 1] = '\0'; + } +} + +/// Computes the Greatest Common Divisor of two numbers +/// +/// @returns The GCD of the two inputs. 
+template +inline typename std::common_type::type Gcd( + T1 value1, + T2 value2) +{ + static_assert((std::is_integral::value == true) && + (std::is_integral::value == true), + "GCD requires integral types"); + + static_assert((std::is_unsigned::value == true) && + (std::is_unsigned::value == true), + "GCD requires unsigned types"); + + static_assert((std::is_same::value == false) && + (std::is_same::value == false), + "GCD requires nonboolean types"); + + using T = typename std::common_type::type; + T ret = 0u; + + if (value1 == 0u) + { + ret = static_cast(value2); + } + else if (value2 == 0u) + { + ret = static_cast(value1); + } + else + { + uint32 value1TrailingZeros = 0u; + BitMaskScanForward(&value1TrailingZeros, value1); + uint32 value2TrailingZeros = 0u; + BitMaskScanForward(&value2TrailingZeros, value2); + + const uint32 shift = Min(value1TrailingZeros, value2TrailingZeros); + value1 >>= value1TrailingZeros; + value2 >>= shift; + + do + { + BitMaskScanForward(&value2TrailingZeros, value2); + value2 >>= value2TrailingZeros; + + if (value1 > value2) + { + T tmp = value1; + value1 = value2; + value2 = tmp; + } + + value2 -= value1; + } + while (value2 != 0); + + ret = static_cast(value1 << shift); + } + + return ret; +} + +/// Computes the Greatest Common Divisor of N numbers +/// +/// @returns The GCD of the all inputs. +template +inline typename std::common_type::type>::type Gcd( + T1 value1, + T2 value2, + Ts... values) +{ + return Gcd(Gcd(value1, value2), values...); +} + +/// Computes the Least Common Multiple of two numbers +/// +/// @returns The LCM of the two inputs. 
+template +constexpr typename std::common_type::type Lcm( + T1 value1, + T2 value2) +{ + static_assert((std::is_integral::value == true) && + (std::is_integral::value == true), + "LCM requires integral types"); + + static_assert((std::is_unsigned::value == true) && + (std::is_unsigned::value == true), + "LCM requires unsigned types"); + + static_assert((std::is_same::value == false) && + (std::is_same::value == false), + "LCM requires nonboolean types"); + + using T = typename std::common_type::type; + + return (value1 != 0u) && (value2 != 0u) ? static_cast((value1 / Gcd(value1, value2)) * value2) : 0u; +} + +/// Computes the Least Common Multiple of N numbers +/// +/// @returns The LCM of all the inputs. +template +constexpr typename std::common_type::type>::type Lcm( + T1 value1, + T2 value2, + Ts... values) +{ + return Lcm(Lcm(value1, value2), values...); +} + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 +/// Returns the length of a wchar_t based string. +/// +/// @returns The length of the given string in wide characters +inline size_t Wcslen( + const wchar_t* pWideStr) +{ + return wcslen(pWideStr); +} + +/// Performs a reverse string find of wide character wc. +/// +/// @returns The matching character at the end of the string or nullptr if not found. +inline wchar_t* Wcsrchr(wchar_t *pStr, wchar_t wc) +{ + return wcsrchr(pStr, wc); +} +#endif + +/// Compile-time function to report if two values from unrelated strong enums are equivalent. This is useful for +/// static asserts ensuring it is safe to cast an enum without a conversion lookup table. +template +inline constexpr bool EnumSameVal( + T1 lhs, + T2 rhs) +{ + return (static_cast(lhs) == static_cast(rhs)); +} + +/// Comparison function for Sort() below. +template int PAL_CDECL SortComparisonFunc( + const void* pLhs, + const void* pRhs) +{ + return int(*static_cast(pRhs) < *static_cast(pLhs)) - + int(*static_cast(pLhs) < *static_cast(pRhs)); +} + +/// In-place sort of an array. 
Uses C library qsort, so is probably a non-order-preserving quicksort. +/// Sorts the array given by the random iterator range [pStart,pEnd). +/// The element type (the type you get by dereferencing RandomIt) must have an operator<. +template void Sort( + RandomIt pStart, + RandomIt pEnd) +{ + using ElementTy = typename std::iterator_traits::value_type; + qsort(&pStart[0], pEnd - pStart, sizeof(ElementTy), SortComparisonFunc); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h b/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h index bf95e58553..c441ecda3b 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palIntrusiveList.h @@ -1,293 +1,293 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palIntrusiveList.h - * @brief PAL utility collection IntrusiveList and IntrusiveListIterator class declarations. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palUtil.h" -#include "palAssert.h" - -namespace Util -{ - -// Forward declarations. -template class IntrusiveList; -template class IntrusiveListIterator; - -/** - *********************************************************************************************************************** - * @brief Encapsulates one node of an intrusive double-linked-list. - * - * A node is associated with one data pointer at construction. The data pointer cannot be changed and must be non-null. - * - * Note that InList() allows intrusive list users to verify if a given value has been stored in a list without iterating - * over the list provided that each node object has been designated for a particular list. - *********************************************************************************************************************** - */ -template -class IntrusiveListNode -{ -public: - /// @param [in,out] pData Address of the data element which contains this intrusive node. - explicit IntrusiveListNode(T* pData); - - /// Returns true if this node is present in an intrusive list. 
- bool InList() const; - -private: - // This special constructor is provided for IntrusiveList's sentinel node which must have a null data pointer. - IntrusiveListNode(); - - T*const m_pData; // The data object that contains this node. - IntrusiveListNode* m_pPrev; // Previous node in the list or null if this node is not in a list. - IntrusiveListNode* m_pNext; // Next node in the list or null if this node is not in a list. - - PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveListNode); - - // Although this is a transgression of coding standards, it prevents IntrusiveListNode from requiring public - // accessor functions. The added encapsulation this provides is worthwhile. - friend class IntrusiveList; - friend class IntrusiveListIterator; -}; - -/** - *********************************************************************************************************************** - * @brief Iterator for traversal of elements in a List collection. - * - * Allows traversal of all elements in a List going either forwards or backwards. - *********************************************************************************************************************** - */ -template -class IntrusiveListIterator -{ -public: - ~IntrusiveListIterator() { } - - /// Returns true unless the iterator has advanced past the end of the list. - bool IsValid() const { return m_pCurrent != m_pSentinel; } - - /// Returns a pointer to the current element. Returns null if the iterator is invalid. - T* Get() const { return m_pCurrent->m_pData; } - - /// Advances the iterator to the previous position (move backward). - void Prev() { m_pCurrent = m_pCurrent->m_pPrev; } - - /// Advances the iterator to the next position (move forward). - void Next() { m_pCurrent = m_pCurrent->m_pNext; } - - /// Moves the iterator back to the start of the list. 
- void Restart() { m_pCurrent = m_pSentinel->m_pNext; } - -private: - IntrusiveListIterator(const IntrusiveListNode*const pSentinel, IntrusiveListNode* pStart); - - const IntrusiveListNode*const m_pSentinel; // We need the sentinel to locate the list boundaries. - IntrusiveListNode* m_pCurrent; // Pointer to the current node. - - PAL_DISALLOW_DEFAULT_CTOR(IntrusiveListIterator); - - // Although this is a transgression of coding standards, it means that List does not need to have a public interface - // specifically to implement this class. The added encapsulation this provides is worthwhile. - friend class IntrusiveList; -}; - -/** - *********************************************************************************************************************** - * @brief Templated, doubly-linked, intrusive, list container. - * - * This is meant for storing non-null pointers to elements of an arbitrary type using externally managed nodes. - * Operations which this class supports are: - * - * - Insertion at any point - * - Deletion at any point - * - Forwards and reverse iteration - * - * @warning This class is not thread-safe. - *********************************************************************************************************************** - */ -template -class IntrusiveList -{ -public: - /// A convenient shorthand for IntrusiveListNode. - typedef IntrusiveListNode Node; - - /// A convenient shorthand for IntrusiveListIterator. - typedef IntrusiveListIterator Iter; - - IntrusiveList(); - - /// Returns the number of elements in the list, not counting the sentinel. - size_t NumElements() const { return m_numElements; } - - /// Returns true if the list is empty. - bool IsEmpty() const { return m_sentinel.m_pNext == &m_sentinel; } - - /// Returns an iterator pointing to the first element in the list. If the list is empty, the iterator starts out - /// pointing at the permanent sentinel node. - /// - /// @returns An iterator pointing at the front end of the list. 
- Iter Begin() const { return Iter(&m_sentinel, m_sentinel.m_pNext); } - - /// Returns an iterator pointing to the last element in the list. If the list is empty, the iterator starts out - /// pointing at the permanent sentinel node. - /// - /// @returns An iterator pointing at the back end of the list. - Iter End() const { return Iter(&m_sentinel, m_sentinel.m_pPrev); } - - /// Returns the data pointer at the front of the list. - /// - /// @returns The data pointer at the front of the list or null if the list is empty. - T* Front() const { return m_sentinel.m_pNext->m_pData; } - - /// Returns the data pointer at the back of the list. - /// - /// @returns The data pointer at the back of the list or null if the list is empty. - T* Back() const { return m_sentinel.m_pPrev->m_pData; } - - /// Pushes the specified node onto the front of the list. - /// - /// @param [in] pNode Externally-owned list node to link into the list. - void PushFront(Node* pNode) { InsertBefore(m_sentinel.m_pNext, pNode); } - - /// Pushes the specified node onto the back of the list. - /// - /// @param [in] pNode Externally-owned list node to link into the list. - void PushBack(Node* pNode) { InsertBefore(&m_sentinel, pNode); } - - /// Pushes the contents of pSource onto the front of this list. The ordering of pSource is preserved, meaning that - /// the front of pSource will be the new front of this list. Note that pSource will be left entirely empty. - /// - /// It is illegal to call this function with an empty pSource. - /// - /// @param [in] pSource The contents of pSource will be pushed in-order onto the front of this list. - void PushFrontList(IntrusiveList* pSource); - - /// Pushes the contents of pSource onto the back of this list. The ordering of pSource is preserved, meaning that - /// the end of pSource will be the new end of this list. Note that pSource will be left entirely empty. - /// - /// It is illegal to call this function with an empty pSource. 
- /// - /// @param [in] pSource The contents of pSource will be pushed in-order onto the back of this list. - void PushBackList(IntrusiveList* pSource); - - /// Inserts the specified node before a particular node in a list. - /// - /// If the iterator has advanced off the end of the list (i.e., the iterator is invalid), the added node will be the - /// new tail node. - /// - /// @param [in] iter Identifies a node where the insertion should take place. The iterator will point to the same - /// spot in the list after insertion. - /// @param [in] pNode Externally-owned list node to link into the list. - void InsertBefore(const Iter& iter, Node* pNode); - - /// Removes the node at the specified position from the list. - /// - /// It is illegal to call this function with an iterator that has already advanced off the end of the list. - /// - /// @param [in,out] pIter Iterator identifying the node to be removed. After the node is removed, this iterator - /// will be advanced to the next node. If this call removes the final remaining node in the - /// list then the iterator will point at the sentinel and will be invalid. - void Erase(Iter* pIter); - - /// Removes the node at the specified position from the list. It is illegal to call this function with a Node that - /// not in this list. - /// - /// @param [in] pNode Node to be removed. - void Erase(Node* pNode); - - /// Removes all nodes from the list. - void EraseAll(); - - /// Truncates the list without touching the elements - void InvalidateList() - { - m_sentinel.m_pNext = &m_sentinel; - m_sentinel.m_pPrev = &m_sentinel; - m_numElements = 0; - } - -private: - void InsertBefore(Node* pBeforeMe, Node* pNode); - void Unlink(Node* pNode); - - Node m_sentinel; // Ties the head to the tail and signifies the boundary of the list. - size_t m_numElements; // Number of elements. 
- - PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveList); -}; - -// ===================================================================================================================== -// This is the public node constructor; it must be given a non-null data pointer. -template -IntrusiveListNode::IntrusiveListNode( - T* pData) - : - m_pData(pData), - m_pPrev(nullptr), - m_pNext(nullptr) -{ - PAL_ASSERT(pData != nullptr); -} - -// ===================================================================================================================== -// This is the private node constructor which is used exclusively for sentinel nodes. -template -IntrusiveListNode::IntrusiveListNode() - : - m_pData(nullptr), - m_pPrev(nullptr), - m_pNext(nullptr) -{ -} - -// ===================================================================================================================== -// Returns true if this node is present in an intrusive list. -template -bool IntrusiveListNode::InList() const -{ - // The node pointers should always be null or non-null together. - PAL_DEBUG_BUILD_ONLY_ASSERT((m_pPrev == nullptr) == (m_pNext == nullptr)); - - return (m_pNext != nullptr); -} - -// ===================================================================================================================== -template -IntrusiveListIterator::IntrusiveListIterator( - const IntrusiveListNode*const pSentinel, - IntrusiveListNode* pStart) - : - m_pSentinel(pSentinel), - m_pCurrent(pStart) -{ -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palIntrusiveList.h + * @brief PAL utility collection IntrusiveList and IntrusiveListIterator class declarations. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" + +namespace Util +{ + +// Forward declarations. +template class IntrusiveList; +template class IntrusiveListIterator; + +/** + *********************************************************************************************************************** + * @brief Encapsulates one node of an intrusive double-linked-list. 
+ * + * A node is associated with one data pointer at construction. The data pointer cannot be changed and must be non-null. + * + * Note that InList() allows intrusive list users to verify if a given value has been stored in a list without iterating + * over the list provided that each node object has been designated for a particular list. + *********************************************************************************************************************** + */ +template +class IntrusiveListNode +{ +public: + /// @param [in,out] pData Address of the data element which contains this intrusive node. + explicit IntrusiveListNode(T* pData); + + /// Returns true if this node is present in an intrusive list. + bool InList() const; + +private: + // This special constructor is provided for IntrusiveList's sentinel node which must have a null data pointer. + IntrusiveListNode(); + + T*const m_pData; // The data object that contains this node. + IntrusiveListNode* m_pPrev; // Previous node in the list or null if this node is not in a list. + IntrusiveListNode* m_pNext; // Next node in the list or null if this node is not in a list. + + PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveListNode); + + // Although this is a transgression of coding standards, it prevents IntrusiveListNode from requiring public + // accessor functions. The added encapsulation this provides is worthwhile. + friend class IntrusiveList; + friend class IntrusiveListIterator; +}; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in a List collection. + * + * Allows traversal of all elements in a List going either forwards or backwards. 
+ *********************************************************************************************************************** + */ +template +class IntrusiveListIterator +{ +public: + ~IntrusiveListIterator() { } + + /// Returns true unless the iterator has advanced past the end of the list. + bool IsValid() const { return m_pCurrent != m_pSentinel; } + + /// Returns a pointer to the current element. Returns null if the iterator is invalid. + T* Get() const { return m_pCurrent->m_pData; } + + /// Advances the iterator to the previous position (move backward). + void Prev() { m_pCurrent = m_pCurrent->m_pPrev; } + + /// Advances the iterator to the next position (move forward). + void Next() { m_pCurrent = m_pCurrent->m_pNext; } + + /// Moves the iterator back to the start of the list. + void Restart() { m_pCurrent = m_pSentinel->m_pNext; } + +private: + IntrusiveListIterator(const IntrusiveListNode*const pSentinel, IntrusiveListNode* pStart); + + const IntrusiveListNode*const m_pSentinel; // We need the sentinel to locate the list boundaries. + IntrusiveListNode* m_pCurrent; // Pointer to the current node. + + PAL_DISALLOW_DEFAULT_CTOR(IntrusiveListIterator); + + // Although this is a transgression of coding standards, it means that List does not need to have a public interface + // specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class IntrusiveList; +}; + +/** + *********************************************************************************************************************** + * @brief Templated, doubly-linked, intrusive, list container. + * + * This is meant for storing non-null pointers to elements of an arbitrary type using externally managed nodes. + * Operations which this class supports are: + * + * - Insertion at any point + * - Deletion at any point + * - Forwards and reverse iteration + * + * @warning This class is not thread-safe. 
+ *********************************************************************************************************************** + */ +template +class IntrusiveList +{ +public: + /// A convenient shorthand for IntrusiveListNode. + typedef IntrusiveListNode Node; + + /// A convenient shorthand for IntrusiveListIterator. + typedef IntrusiveListIterator Iter; + + IntrusiveList(); + + /// Returns the number of elements in the list, not counting the sentinel. + size_t NumElements() const { return m_numElements; } + + /// Returns true if the list is empty. + bool IsEmpty() const { return m_sentinel.m_pNext == &m_sentinel; } + + /// Returns an iterator pointing to the first element in the list. If the list is empty, the iterator starts out + /// pointing at the permanent sentinel node. + /// + /// @returns An iterator pointing at the front end of the list. + Iter Begin() const { return Iter(&m_sentinel, m_sentinel.m_pNext); } + + /// Returns an iterator pointing to the last element in the list. If the list is empty, the iterator starts out + /// pointing at the permanent sentinel node. + /// + /// @returns An iterator pointing at the back end of the list. + Iter End() const { return Iter(&m_sentinel, m_sentinel.m_pPrev); } + + /// Returns the data pointer at the front of the list. + /// + /// @returns The data pointer at the front of the list or null if the list is empty. + T* Front() const { return m_sentinel.m_pNext->m_pData; } + + /// Returns the data pointer at the back of the list. + /// + /// @returns The data pointer at the back of the list or null if the list is empty. + T* Back() const { return m_sentinel.m_pPrev->m_pData; } + + /// Pushes the specified node onto the front of the list. + /// + /// @param [in] pNode Externally-owned list node to link into the list. + void PushFront(Node* pNode) { InsertBefore(m_sentinel.m_pNext, pNode); } + + /// Pushes the specified node onto the back of the list. 
+ /// + /// @param [in] pNode Externally-owned list node to link into the list. + void PushBack(Node* pNode) { InsertBefore(&m_sentinel, pNode); } + + /// Pushes the contents of pSource onto the front of this list. The ordering of pSource is preserved, meaning that + /// the front of pSource will be the new front of this list. Note that pSource will be left entirely empty. + /// + /// It is illegal to call this function with an empty pSource. + /// + /// @param [in] pSource The contents of pSource will be pushed in-order onto the front of this list. + void PushFrontList(IntrusiveList* pSource); + + /// Pushes the contents of pSource onto the back of this list. The ordering of pSource is preserved, meaning that + /// the end of pSource will be the new end of this list. Note that pSource will be left entirely empty. + /// + /// It is illegal to call this function with an empty pSource. + /// + /// @param [in] pSource The contents of pSource will be pushed in-order onto the back of this list. + void PushBackList(IntrusiveList* pSource); + + /// Inserts the specified node before a particular node in a list. + /// + /// If the iterator has advanced off the end of the list (i.e., the iterator is invalid), the added node will be the + /// new tail node. + /// + /// @param [in] iter Identifies a node where the insertion should take place. The iterator will point to the same + /// spot in the list after insertion. + /// @param [in] pNode Externally-owned list node to link into the list. + void InsertBefore(const Iter& iter, Node* pNode); + + /// Removes the node at the specified position from the list. + /// + /// It is illegal to call this function with an iterator that has already advanced off the end of the list. + /// + /// @param [in,out] pIter Iterator identifying the node to be removed. After the node is removed, this iterator + /// will be advanced to the next node. 
If this call removes the final remaining node in the + /// list then the iterator will point at the sentinel and will be invalid. + void Erase(Iter* pIter); + + /// Removes the node at the specified position from the list. It is illegal to call this function with a Node that + /// not in this list. + /// + /// @param [in] pNode Node to be removed. + void Erase(Node* pNode); + + /// Removes all nodes from the list. + void EraseAll(); + + /// Truncates the list without touching the elements + void InvalidateList() + { + m_sentinel.m_pNext = &m_sentinel; + m_sentinel.m_pPrev = &m_sentinel; + m_numElements = 0; + } + +private: + void InsertBefore(Node* pBeforeMe, Node* pNode); + void Unlink(Node* pNode); + + Node m_sentinel; // Ties the head to the tail and signifies the boundary of the list. + size_t m_numElements; // Number of elements. + + PAL_DISALLOW_COPY_AND_ASSIGN(IntrusiveList); +}; + +// ===================================================================================================================== +// This is the public node constructor; it must be given a non-null data pointer. +template +IntrusiveListNode::IntrusiveListNode( + T* pData) + : + m_pData(pData), + m_pPrev(nullptr), + m_pNext(nullptr) +{ + PAL_ASSERT(pData != nullptr); +} + +// ===================================================================================================================== +// This is the private node constructor which is used exclusively for sentinel nodes. +template +IntrusiveListNode::IntrusiveListNode() + : + m_pData(nullptr), + m_pPrev(nullptr), + m_pNext(nullptr) +{ +} + +// ===================================================================================================================== +// Returns true if this node is present in an intrusive list. +template +bool IntrusiveListNode::InList() const +{ + // The node pointers should always be null or non-null together. 
+ PAL_DEBUG_BUILD_ONLY_ASSERT((m_pPrev == nullptr) == (m_pNext == nullptr)); + + return (m_pNext != nullptr); +} + +// ===================================================================================================================== +template +IntrusiveListIterator::IntrusiveListIterator( + const IntrusiveListNode*const pSentinel, + IntrusiveListNode* pStart) + : + m_pSentinel(pSentinel), + m_pCurrent(pStart) +{ +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h b/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h index 3876d7714b..052c4e0f66 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palLinearAllocator.h @@ -1,346 +1,346 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palLinearAllocator.h - * @brief * @brief PAL utility allocator LinearAllocator class. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palIntrusiveList.h" -#include "palSysMemory.h" - -namespace Util -{ - -/** - *********************************************************************************************************************** - * @brief A linear allocator that allocates virtual memory. - * - * To improve performance, a linear allocator can be used in performance-critical areas to avoid unnecessary heap - * allocations. The VirtualLinearAllocator will instead reserve a specified amount of virtual address space and will - * incrementally back it with real memory as necessary. - * - * As clients reach a steady state, allocations from this allocator will become "free," essentially just costing a - * pointer increment. - * - * This allocator can be used with any of the memory management macros. @see Allocators for more information about the - * Allocation pattern. - *********************************************************************************************************************** - */ -class VirtualLinearAllocator -{ -public: - /// Constructor. - /// - /// @param [in] size Maximum size, in bytes, of virtual memory that this allocator should reserve. - /// Does not need to be aligned to page size. 
- VirtualLinearAllocator(size_t size) : - m_pStart(nullptr), - m_pCurrent(nullptr), - m_size(size), - m_pageSize(0) {} - - /// Destructor. - virtual ~VirtualLinearAllocator() - { - if (m_pStart != nullptr) - { - // Free all of the pages. - Result result = VirtualRelease(m_pStart, m_size); - PAL_ASSERT(result == Result::_Success); - } - } - - /// Initializes the linear allocator by reserving the requested number of pages. - /// - /// @returns Result::Success if memory reservation and committing of the first page is successful. - Result Init() - { - m_pageSize = VirtualPageSize(); - m_size = Pow2Align(m_size, m_pageSize); - - Result result = VirtualReserve(m_size, &m_pStart); - - if (result == Result::_Success) - { - result = VirtualCommit(m_pStart, m_pageSize); - } - - if (result == Result::_Success) - { - m_pCurrent = m_pStart; - m_pCommittedToPage = VoidPtrInc(m_pCurrent, m_pageSize); - } - - return result; - } - - /// Allocates a block of memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) - { - void* pAlignedCurrent = VoidPtrAlign(m_pCurrent, allocInfo.alignment); - void* pNextCurrent = VoidPtrInc(pAlignedCurrent, allocInfo.bytes); - void* pAlignedEnd = VoidPtrAlign(pNextCurrent, m_pageSize); - - if (allocInfo.bytes > Remaining()) - { - pAlignedCurrent = nullptr; - } - else if (pAlignedEnd > m_pCommittedToPage) - { - const size_t commitBytes = VoidPtrDiff(pAlignedEnd, m_pCommittedToPage); - - const Result result = VirtualCommit(m_pCommittedToPage, commitBytes); - - if (result == Result::_Success) - { - m_pCommittedToPage = VoidPtrInc(m_pCommittedToPage, commitBytes); - m_pCurrent = pNextCurrent; - } - else - { - // Return nullptr if allocation fails. 
- pAlignedCurrent = nullptr; - } - } - else - { - m_pCurrent = pNextCurrent; - } - - return pAlignedCurrent; - } - - /// Frees a block of memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) {} - - /// Rewinds the current pointer to the specified location to reuse already allocated memory. - /// - /// @param pStart Where to reset the m_pCurrent to. - /// @param decommit If true, pages that are rewound are freed/decommitted. - void Rewind(void* pStart, bool decommit) - { - PAL_ASSERT((m_pStart <= pStart) && (pStart <= m_pCurrent)); - - if (pStart != m_pCurrent) - { - if (decommit) - { - void* pStartPage = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize); - void* pCurrentPage = VoidPtrAlign(m_pCurrent, m_pageSize); - const size_t numPages = VoidPtrDiff(pCurrentPage, pStartPage) / m_pageSize; - - if (numPages > 0) - { - Result result = VirtualDecommit(pStartPage, m_pageSize * numPages); - PAL_ASSERT(result == Result::_Success); - - m_pCommittedToPage = pStartPage; - } - } -#if DEBUG - else - { - void* pStartPage = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize); - void* pCurrentPage = VoidPtrAlign(m_pCurrent, m_pageSize); - const size_t numDwords = VoidPtrDiff(pCurrentPage, pStartPage) / sizeof(uint32); - uint32* pNewCurrent = static_cast(pStartPage); - - for (size_t dword = 0; dword < numDwords; dword++) - { - pNewCurrent[dword] = 0xDEADBEEF; - } - } -#endif - - m_pCurrent = pStart; - } - } - - /// Returns the current pointer to backing memory. - /// - /// @returns Current pointer to backing memory. - void* Current() { return m_pCurrent; } - - /// Returns the starting pointer to backing memory. - /// - /// @returns Pointer to the start of backing memory. - void* Start() { return m_pStart; } - - /// Returns the number of bytes that have been allocated. - /// - /// @returns Number of bytes allocated through this allocator. 
- size_t BytesAllocated() { return VoidPtrDiff(m_pCurrent, m_pStart); } - - /// Compute remaining unallocated space in the allocator; once this space is exhausted allocations will fail. - /// - /// @returns The size of the remaining unallocated space in bytes. - size_t Remaining() const { return m_size - VoidPtrDiff(m_pCurrent, m_pStart); } - -private: - void* m_pStart; ///< Pointer to where the backing allocation starts. - void* m_pCurrent; ///< Pointer to the current position of backing memory. - void* m_pCommittedToPage; ///< Pointer to the end of the last committed page. - - size_t m_size; ///< Size of the allocation. - size_t m_pageSize; ///< OS' defined page size. - - PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocator); - PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocator); -}; - -/** - *********************************************************************************************************************** - * @brief A "resource acquisition is initialization" (RAII) wrapper for the LinearAllocator classes. - * - * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and - * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use - * this class for stack-allocated objects. - * - * This object will ensure that anything allocated the object is allocated on the stack and when it goes out of scope - * will be properly "rewound" by the allocator. See the below example. - * - * - * { - * [Current pointer = 0x10] - * LinearAllocatorAuto allocator(pPtrToAllocator); - * Allocations occur ... - * [Current pointer = 0x80] - * } - * [Current pointer rewinds = 0x10] - *********************************************************************************************************************** - */ -template -class LinearAllocatorAuto -{ -public: - /// Tracks the current start pointer. - /// - /// @param pAllocator The allocator to wrap. 
- /// @param decommit Whether to decommit any pages of memory allocated when this goes out of scope. - LinearAllocatorAuto(LinearAllocator* pAllocator, bool decommit) - : - m_pAllocator(pAllocator), -#if PAL_MEMTRACK - m_memTracker(pAllocator), -#endif - m_pStart(nullptr), - m_decommit(decommit) - { - PAL_ASSERT(pAllocator != nullptr); - m_pStart = m_pAllocator->Current(); - -#if PAL_MEMTRACK - Result result = m_memTracker.Init(); - PAL_ASSERT(result == Result::_Success); -#endif - } - - /// Rewinds any allocations made when this goes out of scope. - ~LinearAllocatorAuto() - { - m_pAllocator->Rewind(m_pStart, m_decommit); - } - - /// Allocates a block of memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) - { - void* pMemory = nullptr; -#if PAL_MEMTRACK - pMemory = m_memTracker.Alloc(allocInfo); -#else - pMemory = m_pAllocator->Alloc(allocInfo); -#endif - - return pMemory; - } - - /// Frees a block of memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) - { -#if PAL_MEMTRACK - m_memTracker.Free(freeInfo); -#else - m_pAllocator->Free(freeInfo); -#endif - } - -private: - LinearAllocator*const m_pAllocator; ///< The LinearAllocator which this object wraps. - -#if PAL_MEMTRACK - MemTracker m_memTracker; ///< Memory tracker for this LinearAllocatorAuto. -#endif - - void* m_pStart; ///< Where the LinearAllocator started when wrapped by this. - const bool m_decommit; ///< Whether to decommit any pages of memory allocated on destruction. 
- - PAL_DISALLOW_DEFAULT_CTOR(LinearAllocatorAuto); - PAL_DISALLOW_COPY_AND_ASSIGN(LinearAllocatorAuto); -}; - -/** - *********************************************************************************************************************** - * @brief A simple extension of VirtualLinearAllocator that contains an IntrusiveListNode pointing at itself. - * This makes it very easy to create and manage IntrusiveLists of VirtualLinearAllocators. - *********************************************************************************************************************** - */ -class VirtualLinearAllocatorWithNode : public VirtualLinearAllocator -{ -public: - /// Constructor. - VirtualLinearAllocatorWithNode(size_t size) : VirtualLinearAllocator(size), m_node(this) {} - - /// Destructor. - virtual ~VirtualLinearAllocatorWithNode() {} - - /// Gets this linear allocator's associated IntrusiveListNode. - /// - /// @returns Pointer to this allocator's associated IntrusiveListNode. - IntrusiveListNode* GetNode() { return &m_node; } - -private: - IntrusiveListNode m_node; - - PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocatorWithNode); - PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocatorWithNode); -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palLinearAllocator.h + * @brief * @brief PAL utility allocator LinearAllocator class. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palIntrusiveList.h" +#include "palSysMemory.h" + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief A linear allocator that allocates virtual memory. + * + * To improve performance, a linear allocator can be used in performance-critical areas to avoid unnecessary heap + * allocations. 
The VirtualLinearAllocator will instead reserve a specified amount of virtual address space and will + * incrementally back it with real memory as necessary. + * + * As clients reach a steady state, allocations from this allocator will become "free," essentially just costing a + * pointer increment. + * + * This allocator can be used with any of the memory management macros. @see Allocators for more information about the + * Allocation pattern. + *********************************************************************************************************************** + */ +class VirtualLinearAllocator +{ +public: + /// Constructor. + /// + /// @param [in] size Maximum size, in bytes, of virtual memory that this allocator should reserve. + /// Does not need to be aligned to page size. + VirtualLinearAllocator(size_t size) : + m_pStart(nullptr), + m_pCurrent(nullptr), + m_size(size), + m_pageSize(0) {} + + /// Destructor. + virtual ~VirtualLinearAllocator() + { + if (m_pStart != nullptr) + { + // Free all of the pages. + Result result = VirtualRelease(m_pStart, m_size); + PAL_ASSERT(result == Result::_Success); + } + } + + /// Initializes the linear allocator by reserving the requested number of pages. + /// + /// @returns Result::Success if memory reservation and committing of the first page is successful. + Result Init() + { + m_pageSize = VirtualPageSize(); + m_size = Pow2Align(m_size, m_pageSize); + + Result result = VirtualReserve(m_size, &m_pStart); + + if (result == Result::_Success) + { + result = VirtualCommit(m_pStart, m_pageSize); + } + + if (result == Result::_Success) + { + m_pCurrent = m_pStart; + m_pCommittedToPage = VoidPtrInc(m_pCurrent, m_pageSize); + } + + return result; + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. 
+ void* Alloc(const AllocInfo& allocInfo) + { + void* pAlignedCurrent = VoidPtrAlign(m_pCurrent, allocInfo.alignment); + void* pNextCurrent = VoidPtrInc(pAlignedCurrent, allocInfo.bytes); + void* pAlignedEnd = VoidPtrAlign(pNextCurrent, m_pageSize); + + if (allocInfo.bytes > Remaining()) + { + pAlignedCurrent = nullptr; + } + else if (pAlignedEnd > m_pCommittedToPage) + { + const size_t commitBytes = VoidPtrDiff(pAlignedEnd, m_pCommittedToPage); + + const Result result = VirtualCommit(m_pCommittedToPage, commitBytes); + + if (result == Result::_Success) + { + m_pCommittedToPage = VoidPtrInc(m_pCommittedToPage, commitBytes); + m_pCurrent = pNextCurrent; + } + else + { + // Return nullptr if allocation fails. + pAlignedCurrent = nullptr; + } + } + else + { + m_pCurrent = pNextCurrent; + } + + return pAlignedCurrent; + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) {} + + /// Rewinds the current pointer to the specified location to reuse already allocated memory. + /// + /// @param pStart Where to reset the m_pCurrent to. + /// @param decommit If true, pages that are rewound are freed/decommitted. 
+ void Rewind(void* pStart, bool decommit) + { + PAL_ASSERT((m_pStart <= pStart) && (pStart <= m_pCurrent)); + + if (pStart != m_pCurrent) + { + if (decommit) + { + void* pStartPage = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize); + void* pCurrentPage = VoidPtrAlign(m_pCurrent, m_pageSize); + const size_t numPages = VoidPtrDiff(pCurrentPage, pStartPage) / m_pageSize; + + if (numPages > 0) + { + Result result = VirtualDecommit(pStartPage, m_pageSize * numPages); + PAL_ASSERT(result == Result::_Success); + + m_pCommittedToPage = pStartPage; + } + } +#if DEBUG + else + { + void* pStartPage = VoidPtrAlign(VoidPtrInc(pStart, 1), m_pageSize); + void* pCurrentPage = VoidPtrAlign(m_pCurrent, m_pageSize); + const size_t numDwords = VoidPtrDiff(pCurrentPage, pStartPage) / sizeof(uint32); + uint32* pNewCurrent = static_cast(pStartPage); + + for (size_t dword = 0; dword < numDwords; dword++) + { + pNewCurrent[dword] = 0xDEADBEEF; + } + } +#endif + + m_pCurrent = pStart; + } + } + + /// Returns the current pointer to backing memory. + /// + /// @returns Current pointer to backing memory. + void* Current() { return m_pCurrent; } + + /// Returns the starting pointer to backing memory. + /// + /// @returns Pointer to the start of backing memory. + void* Start() { return m_pStart; } + + /// Returns the number of bytes that have been allocated. + /// + /// @returns Number of bytes allocated through this allocator. + size_t BytesAllocated() { return VoidPtrDiff(m_pCurrent, m_pStart); } + + /// Compute remaining unallocated space in the allocator; once this space is exhausted allocations will fail. + /// + /// @returns The size of the remaining unallocated space in bytes. + size_t Remaining() const { return m_size - VoidPtrDiff(m_pCurrent, m_pStart); } + +private: + void* m_pStart; ///< Pointer to where the backing allocation starts. + void* m_pCurrent; ///< Pointer to the current position of backing memory. 
+ void* m_pCommittedToPage; ///< Pointer to the end of the last committed page. + + size_t m_size; ///< Size of the allocation. + size_t m_pageSize; ///< OS' defined page size. + + PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocator); + PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocator); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the LinearAllocator classes. + * + * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything allocated the object is allocated on the stack and when it goes out of scope + * will be properly "rewound" by the allocator. See the below example. + * + * + * { + * [Current pointer = 0x10] + * LinearAllocatorAuto allocator(pPtrToAllocator); + * Allocations occur ... + * [Current pointer = 0x80] + * } + * [Current pointer rewinds = 0x10] + *********************************************************************************************************************** + */ +template +class LinearAllocatorAuto +{ +public: + /// Tracks the current start pointer. + /// + /// @param pAllocator The allocator to wrap. + /// @param decommit Whether to decommit any pages of memory allocated when this goes out of scope. 
+ LinearAllocatorAuto(LinearAllocator* pAllocator, bool decommit) + : + m_pAllocator(pAllocator), +#if PAL_MEMTRACK + m_memTracker(pAllocator), +#endif + m_pStart(nullptr), + m_decommit(decommit) + { + PAL_ASSERT(pAllocator != nullptr); + m_pStart = m_pAllocator->Current(); + +#if PAL_MEMTRACK + Result result = m_memTracker.Init(); + PAL_ASSERT(result == Result::_Success); +#endif + } + + /// Rewinds any allocations made when this goes out of scope. + ~LinearAllocatorAuto() + { + m_pAllocator->Rewind(m_pStart, m_decommit); + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + void* pMemory = nullptr; +#if PAL_MEMTRACK + pMemory = m_memTracker.Alloc(allocInfo); +#else + pMemory = m_pAllocator->Alloc(allocInfo); +#endif + + return pMemory; + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { +#if PAL_MEMTRACK + m_memTracker.Free(freeInfo); +#else + m_pAllocator->Free(freeInfo); +#endif + } + +private: + LinearAllocator*const m_pAllocator; ///< The LinearAllocator which this object wraps. + +#if PAL_MEMTRACK + MemTracker m_memTracker; ///< Memory tracker for this LinearAllocatorAuto. +#endif + + void* m_pStart; ///< Where the LinearAllocator started when wrapped by this. + const bool m_decommit; ///< Whether to decommit any pages of memory allocated on destruction. + + PAL_DISALLOW_DEFAULT_CTOR(LinearAllocatorAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(LinearAllocatorAuto); +}; + +/** + *********************************************************************************************************************** + * @brief A simple extension of VirtualLinearAllocator that contains an IntrusiveListNode pointing at itself. 
+ * This makes it very easy to create and manage IntrusiveLists of VirtualLinearAllocators. + *********************************************************************************************************************** + */ +class VirtualLinearAllocatorWithNode : public VirtualLinearAllocator +{ +public: + /// Constructor. + VirtualLinearAllocatorWithNode(size_t size) : VirtualLinearAllocator(size), m_node(this) {} + + /// Destructor. + virtual ~VirtualLinearAllocatorWithNode() {} + + /// Gets this linear allocator's associated IntrusiveListNode. + /// + /// @returns Pointer to this allocator's associated IntrusiveListNode. + IntrusiveListNode* GetNode() { return &m_node; } + +private: + IntrusiveListNode m_node; + + PAL_DISALLOW_DEFAULT_CTOR(VirtualLinearAllocatorWithNode); + PAL_DISALLOW_COPY_AND_ASSIGN(VirtualLinearAllocatorWithNode); +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMath.h b/shared/amdgpu-windows-interop/pal/inc/util/palMath.h index bc441a5a94..0c8937bda6 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palMath.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMath.h @@ -1,226 +1,226 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palMath.h - * @brief PAL utility collection function/constant declarations for the Math sub-namespace. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palSysMemory.h" - -#include - -namespace Util -{ - -/// Util sub-namespace defining several useful math routines and constants. -namespace Math -{ - -/// Exponent mask of a single-precision IEEE float. -constexpr uint32 FloatExponentMask = 0x7F800000; -/// Exponent bias of a single-precision IEEE float. -constexpr uint32 FloatExponentBias = 127; -/// Number of bits in the mantissa of a single-precision IEEE float. -constexpr uint32 FloatNumMantissaBits = 23; -/// Mantissa mask of a single-precision IEEE float. -constexpr uint32 FloatMantissaMask = 0x007FFFFF; -/// Sign bit mask of a single precision IEEE float. -constexpr uint32 FloatSignBitMask = 0x80000000; -/// Mask of all non-sign bits of a single-precision IEEE float. -constexpr uint32 FloatMaskOutSignBit = 0x7FFFFFFF; -/// Minimum number of float bits in a normalized IEE float. -constexpr uint32 MinNormalizedFloatBits = 0x00800000; - -/// Positive one. -constexpr float FloatOne = 1.0f; -/// Negative one. 
-constexpr float FloatNegOne = -1.0f; -/// Zero. -constexpr float FloatZero = 0.0f; -/// Positive infinity. -constexpr float FloatInfinity = std::numeric_limits::infinity(); - -/// Fraction structure. -struct Fraction -{ - uint32 num; ///< Numerator - uint32 den; ///< Denominator -}; - -/// Returns the bits of a floating point value as an unsigned integer. -inline uint32 FloatToBits(float f) -{ - return (*(reinterpret_cast(&f))); -} - -/// Assigns the bits contained in an unsigned integer to the float pointer location -inline void SetBitsToFloat(float* f, uint32 u) -{ - *(reinterpret_cast(f)) = u; -} - -/// Returns true if the specified float is denormalized. -extern bool IsDenorm(float f); -/// Returns true if the specified float is +/- infinity. -extern bool IsInf(float f); -/// Returns true if the specified float is a NaN. -extern bool IsNaN(float f); - -/// Determines if a floating-point number is either +/-Infinity or NaN. -inline bool IsInfOrNaN(float f) -{ - return (IsInf(f) || IsNaN(f)); -} - -/// @brief Converts a floating point number to a signed fixed point number with the given integer and fractional bits. -/// -/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If the -/// intBits is zero, the fracBits is assumed to include 1 sign bit, otherwise the sign bit is assumed to be part of the -/// intBits. A typical use for enableRounding would be when converting SNORM/UNORM values to fixed point. -/// -/// @param [in] f Floating point value to convert. -/// @param [in] intBits Number of integer bits (including the sign bit) in the fixed point output. -/// @param [in] fracBits Number of fractional bits in the fixed point output. -/// @param [in] enableRounding Round before conversion. -/// -/// @returns Fixed point number in a uint32. 
-extern uint32 FloatToSFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false); - -/// @brief Converts a floating point number to an unsigned fixed point number with the given integer and -/// fractional bits. -/// -/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. A typical use -/// for enableRounding would be when converting SNORM/UNORM values to fixed point. -/// -/// @param [in] f Floating point value to convert. -/// @param [in] intBits Number of integer bits (including the sign bit) in the fixed point output. -/// @param [in] fracBits Number of fractional bits in the fixed point output. -/// @param [in] enableRounding Round before conversion. -/// -/// @returns Fixed point number in a uint32. -extern uint32 FloatToUFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false); - -/// @brief Converts a signed fixed point number with the given integer and fractional bits to a floating point number. -/// -/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If numIntBits -/// is 0, numFracBits is assumed to have 1 bit for the sign, otherwise the sign bit is assumed to be part of the integer -/// bits. -/// -/// @param [in] fixedPtNum Fixed point number to convert. -/// @param [in] intBits Number of integer bits (including the sign bit). -/// @param [in] fracBits Number of fractional bits. -/// -/// @returns Converted floating point number. -extern float SFixedToFloat(int32 fixedPtNum, uint32 intBits, uint32 fracBits); - -/// @brief Converts a unsigned fixed point number with the given integer and fractional bits to a floating point number. -/// -/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [0, 1.0]. -/// -/// @param [in] fixedPtNum Fixed point number to convert. -/// @param [in] intBits Number of integer bits (including the sign bit). 
-/// @param [in] fracBits Number of fractional bits. -/// -/// @returns Converted floating point number. -extern float UFixedToFloat(uint32 fixedPtNum, uint32 intBits, uint32 fracBits); - -/// Converts a 32-bit IEEE floating point number to a 16-bit signed floating point number. -extern uint32 Float32ToFloat16(float f); - -/// Converts a 32-bit IEEE floating point number to an 11-bit signed floating point number. -extern uint32 Float32ToFloat11(float f); - -/// Converts a 32-bit IEEE floating point number to a 10-bit signed floating point number. -extern uint32 Float32ToFloat10(float f); - -/// Converts a 32-bit IEEE floating-point number to a 10-bit unsigned floating-point number. -extern uint32 Float32ToFloat10_6e4(float f); - -/// Converts a 10-bit signed floating point number to a 32-bit IEEE floating point number. -extern float Float10_6e4ToFloat32(uint32 fBits); - -/// Converts a 32-bit IEEE floating point number to a N-bit signed floating point number. -extern uint32 Float32ToNumBits(float float32, uint32 numBits); - -/// Converts a 16-bit signed floating point number to a 32-bit IEEE floating point number. -extern float Float16ToFloat32(uint32 fBits); - -/// Converts an 11-bit signed floating point number to a 32-bit IEEE floating point number. -extern float Float11ToFloat32(uint32 fBits); - -/// Converts a 10-bit signed floating point number to a 32-bit IEEE floating point number. -extern float Float10ToFloat32(uint32 fBits); - -/// Converts an N-bit signed floating point number to a 32-bit IEEE floating point number. -extern float FloatNumBitsToFloat32(uint32 input, uint32 numBits); - -/// Converts a 32-bit IEEE floating point number to a fraction. -extern Fraction Float32ToFraction(float float32); - -/// Returns the square root of the specified value. -extern float Sqrt(float f); - -/// Returns the result of an exponent operation (base^exponent). -extern float Pow(float base, float exponent); - -/// Returns the unsigned integer absolute value. 
-extern uint32 Absu(int32 number); - -/// Return sign-preserved zero if input is denorm, otherwise input value -extern float FlushDenormToZero(float input); - -/// Return value in 1.7 signed magnitude format. Valid input range is (-127, 127) -extern uint8 IntToSignedMagnitude(int8 input); - -/// @brief Performs unsigned fixed-point rounding operation. -/// -/// @param [in] value Fixed point number to convert in Qm.f format. -/// @param [in] n Number of fractional bits. -/// -/// @returns rounded fixed point number in Q0 format (unsigned integer). -constexpr uint32 UFixedRoundToUint32(uint32 value, uint8 n) -{ - PAL_CONSTEXPR_ASSERT((0 < n) && (n < 31)); - return ((value + (((1 << n) >> 1))) >> n); -} - -/// @brief Performs signed fixed-point rounding operation. -/// -/// @param [in] value Fixed point number to convert in Qm.f format. -/// @param [in] n Number of fractional bits. -/// -/// @returns rounded fixed point number in Q0 format (signed integer). -constexpr int32 SFixedRoundToInt32(int32 value, uint8 n) -{ - PAL_CONSTEXPR_ASSERT((0 < n) && (n < 30)); - return ((value + (((1 << n) >> 1))) >> n); -} - -} // Math -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMath.h + * @brief PAL utility collection function/constant declarations for the Math sub-namespace. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palSysMemory.h" + +#include + +namespace Util +{ + +/// Util sub-namespace defining several useful math routines and constants. +namespace Math +{ + +/// Exponent mask of a single-precision IEEE float. +constexpr uint32 FloatExponentMask = 0x7F800000; +/// Exponent bias of a single-precision IEEE float. +constexpr uint32 FloatExponentBias = 127; +/// Number of bits in the mantissa of a single-precision IEEE float. +constexpr uint32 FloatNumMantissaBits = 23; +/// Mantissa mask of a single-precision IEEE float. +constexpr uint32 FloatMantissaMask = 0x007FFFFF; +/// Sign bit mask of a single precision IEEE float. +constexpr uint32 FloatSignBitMask = 0x80000000; +/// Mask of all non-sign bits of a single-precision IEEE float. +constexpr uint32 FloatMaskOutSignBit = 0x7FFFFFFF; +/// Minimum number of float bits in a normalized IEE float. +constexpr uint32 MinNormalizedFloatBits = 0x00800000; + +/// Positive one. +constexpr float FloatOne = 1.0f; +/// Negative one. 
+constexpr float FloatNegOne = -1.0f; +/// Zero. +constexpr float FloatZero = 0.0f; +/// Positive infinity. +constexpr float FloatInfinity = std::numeric_limits::infinity(); + +/// Fraction structure. +struct Fraction +{ + uint32 num; ///< Numerator + uint32 den; ///< Denominator +}; + +/// Returns the bits of a floating point value as an unsigned integer. +inline uint32 FloatToBits(float f) +{ + return (*(reinterpret_cast(&f))); +} + +/// Assigns the bits contained in an unsigned integer to the float pointer location +inline void SetBitsToFloat(float* f, uint32 u) +{ + *(reinterpret_cast(f)) = u; +} + +/// Returns true if the specified float is denormalized. +extern bool IsDenorm(float f); +/// Returns true if the specified float is +/- infinity. +extern bool IsInf(float f); +/// Returns true if the specified float is a NaN. +extern bool IsNaN(float f); + +/// Determines if a floating-point number is either +/-Infinity or NaN. +inline bool IsInfOrNaN(float f) +{ + return (IsInf(f) || IsNaN(f)); +} + +/// @brief Converts a floating point number to a signed fixed point number with the given integer and fractional bits. +/// +/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If the +/// intBits is zero, the fracBits is assumed to include 1 sign bit, otherwise the sign bit is assumed to be part of the +/// intBits. A typical use for enableRounding would be when converting SNORM/UNORM values to fixed point. +/// +/// @param [in] f Floating point value to convert. +/// @param [in] intBits Number of integer bits (including the sign bit) in the fixed point output. +/// @param [in] fracBits Number of fractional bits in the fixed point output. +/// @param [in] enableRounding Round before conversion. +/// +/// @returns Fixed point number in a uint32. 
+extern uint32 FloatToSFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false); + +/// @brief Converts a floating point number to an unsigned fixed point number with the given integer and +/// fractional bits. +/// +/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. A typical use +/// for enableRounding would be when converting SNORM/UNORM values to fixed point. +/// +/// @param [in] f Floating point value to convert. +/// @param [in] intBits Number of integer bits (including the sign bit) in the fixed point output. +/// @param [in] fracBits Number of fractional bits in the fixed point output. +/// @param [in] enableRounding Round before conversion. +/// +/// @returns Fixed point number in a uint32. +extern uint32 FloatToUFixed(float f, uint32 intBits, uint32 fracBits, bool enableRounding = false); + +/// @brief Converts a signed fixed point number with the given integer and fractional bits to a floating point number. +/// +/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [-1.0, 1.0]. If numIntBits +/// is 0, numFracBits is assumed to have 1 bit for the sign, otherwise the sign bit is assumed to be part of the integer +/// bits. +/// +/// @param [in] fixedPtNum Fixed point number to convert. +/// @param [in] intBits Number of integer bits (including the sign bit). +/// @param [in] fracBits Number of fractional bits. +/// +/// @returns Converted floating point number. +extern float SFixedToFloat(int32 fixedPtNum, uint32 intBits, uint32 fracBits); + +/// @brief Converts a unsigned fixed point number with the given integer and fractional bits to a floating point number. +/// +/// If the number of integer bits is zero, the incoming value is treated as normalized, i.e. [0, 1.0]. +/// +/// @param [in] fixedPtNum Fixed point number to convert. +/// @param [in] intBits Number of integer bits (including the sign bit). 
+/// @param [in] fracBits Number of fractional bits. +/// +/// @returns Converted floating point number. +extern float UFixedToFloat(uint32 fixedPtNum, uint32 intBits, uint32 fracBits); + +/// Converts a 32-bit IEEE floating point number to a 16-bit signed floating point number. +extern uint32 Float32ToFloat16(float f); + +/// Converts a 32-bit IEEE floating point number to an 11-bit signed floating point number. +extern uint32 Float32ToFloat11(float f); + +/// Converts a 32-bit IEEE floating point number to a 10-bit signed floating point number. +extern uint32 Float32ToFloat10(float f); + +/// Converts a 32-bit IEEE floating-point number to a 10-bit unsigned floating-point number. +extern uint32 Float32ToFloat10_6e4(float f); + +/// Converts a 10-bit signed floating point number to a 32-bit IEEE floating point number. +extern float Float10_6e4ToFloat32(uint32 fBits); + +/// Converts a 32-bit IEEE floating point number to a N-bit signed floating point number. +extern uint32 Float32ToNumBits(float float32, uint32 numBits); + +/// Converts a 16-bit signed floating point number to a 32-bit IEEE floating point number. +extern float Float16ToFloat32(uint32 fBits); + +/// Converts an 11-bit signed floating point number to a 32-bit IEEE floating point number. +extern float Float11ToFloat32(uint32 fBits); + +/// Converts a 10-bit signed floating point number to a 32-bit IEEE floating point number. +extern float Float10ToFloat32(uint32 fBits); + +/// Converts an N-bit signed floating point number to a 32-bit IEEE floating point number. +extern float FloatNumBitsToFloat32(uint32 input, uint32 numBits); + +/// Converts a 32-bit IEEE floating point number to a fraction. +extern Fraction Float32ToFraction(float float32); + +/// Returns the square root of the specified value. +extern float Sqrt(float f); + +/// Returns the result of an exponent operation (base^exponent). +extern float Pow(float base, float exponent); + +/// Returns the unsigned integer absolute value. 
+extern uint32 Absu(int32 number); + +/// Return sign-preserved zero if input is denorm, otherwise input value +extern float FlushDenormToZero(float input); + +/// Return value in 1.7 signed magnitude format. Valid input range is (-127, 127) +extern uint8 IntToSignedMagnitude(int8 input); + +/// @brief Performs unsigned fixed-point rounding operation. +/// +/// @param [in] value Fixed point number to convert in Qm.f format. +/// @param [in] n Number of fractional bits. +/// +/// @returns rounded fixed point number in Q0 format (unsigned integer). +constexpr uint32 UFixedRoundToUint32(uint32 value, uint8 n) +{ + PAL_CONSTEXPR_ASSERT((0 < n) && (n < 31)); + return ((value + (((1 << n) >> 1))) >> n); +} + +/// @brief Performs signed fixed-point rounding operation. +/// +/// @param [in] value Fixed point number to convert in Qm.f format. +/// @param [in] n Number of fractional bits. +/// +/// @returns rounded fixed point number in Q0 format (signed integer). +constexpr int32 SFixedRoundToInt32(int32 value, uint8 n) +{ + PAL_CONSTEXPR_ASSERT((0 < n) && (n < 30)); + return ((value + (((1 << n) >> 1))) >> n); +} + +} // Math +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h b/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h index 060c7eee83..db4df424ba 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMemTracker.h @@ -1,158 +1,158 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palMemTracker.h - * @brief PAL utility collection MemTracker class declaration. - *********************************************************************************************************************** - */ - -#pragma once - -#if PAL_MEMTRACK - -#include "palIntrusiveList.h" -#include "palMutex.h" - -namespace Util -{ - -// Forward declarations -struct AllocInfo; -struct FreeInfo; -struct MemTrackerElem; -enum SystemAllocType : uint32; - -/// @internal -/// -/// An alloc-less list used by the MemTracker to keep track of all allocations. 
-typedef IntrusiveList MemTrackerList; - -/// @internal -/// -/// Specifies whether a particular memory block was allocated with PAL_MALLOC/PAL_CALLOC, PAL_NEW, or PAL_NEW_ARRAY. -/// Used to verify correct matching with PAL_FREE, PAL_DELETE, and PAL_DELETE_ARRAY. -enum class MemBlkType : uint32 -{ - Malloc = 0, - New, - NewArray, -}; - -/// @internal -/// -/// Internal structure used by MemTracker to store information on each allocation. -struct MemTrackerElem -{ - size_t size; ///< Size of allocation request. - MemBlkType blockType; ///< Memory block type (malloc, new, new array). - const char* pFilename; ///< File that requested allocation. - uint32 lineNumber; ///< Line number that requested allocation. - void* pClientMem; ///< Starting "client usable" data address. - void* pOrigMem; ///< Original address of the allocation returned from our underlying allocator. - size_t allocNum; ///< The number of the memory allocation. 1 based. - MemTrackerList* pList; ///< The list this struct is in. It helps check which MemTracker owns this struct. -}; - -/** - *********************************************************************************************************************** - * @brief Class responsible for tracking allocations and frees to notify the developer of memory leaks. - * - * Tracking is enabled/disabled via the PAL_MEMTRACK define. - *********************************************************************************************************************** - */ -template -class MemTracker -{ -public: - /// Constructor. - /// - /// @param [in] pAllocator The allocator that will allocate memory if required. - MemTracker(Allocator*const pAllocator); - ~MemTracker(); - - /// Performs any non-safe initialization that cannot be done in the constructor. - /// - /// @returns Result::Success if initialization is successful, otherwise an appropriate error. - Result Init(); - - /// Allocates a block of memory and track it using the memory tracker. 
- /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc( - const AllocInfo& allocInfo); - - /// Frees a block of memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free( - const FreeInfo& freeInfo); - -private: - void* AddMemElement( - void* pMem, - size_t bytes, - size_t align, - MemBlkType blockType, - const char* pFilename, - uint32 lineNumber); - - void* RemoveMemElement(void* pMem, MemBlkType blockType); - - void MemoryReport(); - void FreeLeakedMemory(); - - // Sentinel patterns used to detect memory underrun. - static constexpr uint32 UnderrunSentinel = 0xDEADBEEF; - // Sentinel patterns used to detect memory overrun. - static constexpr uint32 OverrunSentinel = 0xCAFEBABE; - - // Size of markers for underruns/overruns. Setting this to 0 disables this feature. - static constexpr size_t MarkerSizeUints = PAL_CACHE_LINE_BYTES / sizeof(uint32); - - // Size of underrun/overrun markers in bytes. - static constexpr size_t MarkerSizeBytes = MarkerSizeUints * sizeof(uint32); - - MemTrackerList m_trackerList; // The list of active allocations. - Mutex m_mutex; // Serializes access to list of active allocations. - - const size_t m_markerSizeUints; // Member variable copy of MarkerSizeUints. Only used to prevent compiler - // warnings when MarkerSizeUints is 0. - const size_t m_markerSizeBytes; // Member variable copy of MarkerSizeBytes. Only used to prevent compiler - // warnings when MarkerSizeBytes is 0. - - Allocator*const m_pAllocator; // Allocator for performing the actual allocations. - - size_t m_nextAllocNum; // The allocation number that the next allocated block will receive. - const size_t m_breakOnAllocNum; // The allocation number to trigger a debug break on. 
- - PAL_DISALLOW_COPY_AND_ASSIGN(MemTracker); -}; - -} // Util - -#endif +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMemTracker.h + * @brief PAL utility collection MemTracker class declaration. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#if PAL_MEMTRACK + +#include "palIntrusiveList.h" +#include "palMutex.h" + +namespace Util +{ + +// Forward declarations +struct AllocInfo; +struct FreeInfo; +struct MemTrackerElem; +enum SystemAllocType : uint32; + +/// @internal +/// +/// An alloc-less list used by the MemTracker to keep track of all allocations. +typedef IntrusiveList MemTrackerList; + +/// @internal +/// +/// Specifies whether a particular memory block was allocated with PAL_MALLOC/PAL_CALLOC, PAL_NEW, or PAL_NEW_ARRAY. +/// Used to verify correct matching with PAL_FREE, PAL_DELETE, and PAL_DELETE_ARRAY. +enum class MemBlkType : uint32 +{ + Malloc = 0, + New, + NewArray, +}; + +/// @internal +/// +/// Internal structure used by MemTracker to store information on each allocation. +struct MemTrackerElem +{ + size_t size; ///< Size of allocation request. + MemBlkType blockType; ///< Memory block type (malloc, new, new array). + const char* pFilename; ///< File that requested allocation. + uint32 lineNumber; ///< Line number that requested allocation. + void* pClientMem; ///< Starting "client usable" data address. + void* pOrigMem; ///< Original address of the allocation returned from our underlying allocator. + size_t allocNum; ///< The number of the memory allocation. 1 based. + MemTrackerList* pList; ///< The list this struct is in. It helps check which MemTracker owns this struct. +}; + +/** + *********************************************************************************************************************** + * @brief Class responsible for tracking allocations and frees to notify the developer of memory leaks. + * + * Tracking is enabled/disabled via the PAL_MEMTRACK define. 
+ *********************************************************************************************************************** + */ +template +class MemTracker +{ +public: + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + MemTracker(Allocator*const pAllocator); + ~MemTracker(); + + /// Performs any non-safe initialization that cannot be done in the constructor. + /// + /// @returns Result::Success if initialization is successful, otherwise an appropriate error. + Result Init(); + + /// Allocates a block of memory and track it using the memory tracker. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc( + const AllocInfo& allocInfo); + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free( + const FreeInfo& freeInfo); + +private: + void* AddMemElement( + void* pMem, + size_t bytes, + size_t align, + MemBlkType blockType, + const char* pFilename, + uint32 lineNumber); + + void* RemoveMemElement(void* pMem, MemBlkType blockType); + + void MemoryReport(); + void FreeLeakedMemory(); + + // Sentinel patterns used to detect memory underrun. + static constexpr uint32 UnderrunSentinel = 0xDEADBEEF; + // Sentinel patterns used to detect memory overrun. + static constexpr uint32 OverrunSentinel = 0xCAFEBABE; + + // Size of markers for underruns/overruns. Setting this to 0 disables this feature. + static constexpr size_t MarkerSizeUints = PAL_CACHE_LINE_BYTES / sizeof(uint32); + + // Size of underrun/overrun markers in bytes. + static constexpr size_t MarkerSizeBytes = MarkerSizeUints * sizeof(uint32); + + MemTrackerList m_trackerList; // The list of active allocations. + Mutex m_mutex; // Serializes access to list of active allocations. + + const size_t m_markerSizeUints; // Member variable copy of MarkerSizeUints. 
Only used to prevent compiler + // warnings when MarkerSizeUints is 0. + const size_t m_markerSizeBytes; // Member variable copy of MarkerSizeBytes. Only used to prevent compiler + // warnings when MarkerSizeBytes is 0. + + Allocator*const m_pAllocator; // Allocator for performing the actual allocations. + + size_t m_nextAllocNum; // The allocation number that the next allocated block will receive. + const size_t m_breakOnAllocNum; // The allocation number to trigger a debug break on. + + PAL_DISALLOW_COPY_AND_ASSIGN(MemTracker); +}; + +} // Util + +#endif diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h b/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h index 525fc63dbf..cb10bfd629 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMemTrackerImpl.h @@ -1,331 +1,331 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palMemTrackerImpl.h - * @brief PAL utility collection MemTracker class implementations. - *********************************************************************************************************************** - */ - -#pragma once - -#if PAL_MEMTRACK - -#include "palIntrusiveListImpl.h" -#include "palMemTracker.h" -#include "palSysMemory.h" - -#include - -namespace Util -{ - -/// Table to convert a blockType to a string. Used by the logging routines. -constexpr const char* MemBlkTypeStr[] = -{ - "Malloc", ///< MemBlkType::Malloc - "New", ///< MemBlkType::New - "NewArray", ///< MemBlkType::NewArray -}; - -// ===================================================================================================================== -template -MemTracker::MemTracker( - Allocator*const pAllocator) - : - m_markerSizeUints(MarkerSizeUints), - m_markerSizeBytes(MarkerSizeBytes), - m_pAllocator(pAllocator), - m_nextAllocNum(1), - m_breakOnAllocNum(0) -{ -} - -// ===================================================================================================================== -template -MemTracker::~MemTracker() -{ - // Clean-up leaked memory if needed - if (m_trackerList.IsEmpty() == false) - { - // If the list isn't empty, we have a leak. The leak could either be caused by an internal PAL leak, - // a client leak, or even the application not destroying API objects. - PAL_ALERT_ALWAYS(); - - // Dump out a list of unfreed blocks. 
- MemoryReport(); - - FreeLeakedMemory(); - } -} - -// ===================================================================================================================== -template -Result MemTracker::Init() -{ - return Result::Success; -} - -// ===================================================================================================================== -// Adds the newly allocated memory block to the list of blocks for tracking. -// -// The tracking information includes things like filename, line numbers, and type of block. Also, given a pointer, -// adds the Underrun/Overrun markers to the memory allocated, and return a pointer to the actual client usable memory. -// -// See MemTracker::Alloc() which is used to allocate memory that is being tracked. -template -void* MemTracker::AddMemElement( - void* pMem, // [in,out] Original pointer allocated by MemTracker::Alloc. - size_t bytes, // Client requested allocation size in bytes. - size_t align, // The max of the client-requested alignment or the internal alignment, in bytes. - MemBlkType blockType, // Block type based on calling allocation routine. - const char* pFilename, // Client filename that is requesting the memory. - uint32 lineNumber) // Line number in client file that is requesting the memory. -{ - // Our internal data is all relative to the client pointer so find that first. See Alloc for more details. 
- // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) - constexpr size_t InternalSize = sizeof(MemTrackerList::Node) + sizeof(MemTrackerElem); - - void*const pClientMem = VoidPtrAlign(VoidPtrInc(pMem, m_markerSizeBytes + InternalSize), align); - uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); - uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(bytes, sizeof(uint32)))); - - auto*const pNewElement = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); - void*const pNewNodeMem = VoidPtrDec(pNewElement, sizeof(MemTrackerList::Node)); - auto*const pNewNode = PAL_PLACEMENT_NEW(pNewNodeMem) MemTrackerList::Node(pNewElement); - - // Mark the memory with the underrun/overrun marker. - for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) - { - *pUnderrun++ = UnderrunSentinel; - *pOverrun++ = OverrunSentinel; - } - - pNewElement->size = bytes; - pNewElement->pFilename = pFilename; - pNewElement->lineNumber = lineNumber; - pNewElement->blockType = blockType; - pNewElement->pClientMem = pClientMem; - pNewElement->pOrigMem = pMem; - pNewElement->pList = &m_trackerList; - - MutexAuto lock(&m_mutex); - - // Trigger an assert if we're about to allocate the break-on-allocation number. - if (m_nextAllocNum == m_breakOnAllocNum) - { - PAL_ASSERT_ALWAYS(); - } - - pNewElement->allocNum = m_nextAllocNum; - ++m_nextAllocNum; - - m_trackerList.PushFront(pNewNode); - - return pClientMem; -} - -// ===================================================================================================================== -// Removes an allocated block from the list of blocks used for tracking. -// -// The routine checks for invalid frees (and duplicate frees). Also, the routine is able to detect mismatched alloc/free -// usage based on the blockType. 
The routine is called with the pointer to the client usable memory and returns the -// pointer to the allocated memory. -// -// See MemTracker::Free() which is used to free memory that is being tracked. -template -void* MemTracker::RemoveMemElement( - void* pClientMem, // Pointer to client usable memory. - MemBlkType blockType) // Block type based on calling deallocation routine. -{ - void* pOrigPtr = nullptr; - - // Recall that this is our internal memory layout. See Alloc for more details. - // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) - uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); - auto*const pCurrent = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); - auto*const pCurrentNode = static_cast(VoidPtrDec(pCurrent, sizeof(MemTrackerList::Node))); - uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(pCurrent->size, sizeof(uint32)))); - - // We should not be trying to free something twice or trying to free something which has not been allocated - // by this MemTracker. We can verify both of these things by checking that the tracker's pList is equal to the - // MemTracker's list. - if (pCurrent->pList != &m_trackerList) - { - // A free was attempted on an unrecognized pointer. - PAL_DPERROR("Invalid Free Attempted with ptr = : (%#x)", pClientMem); - } - else if (pCurrent->blockType != blockType) - { - // We have a mismatch in the alloc/free pair, e.g. PAL_NEW with PAL_FREE etc. return early here without freeing - // the memory so it shows up as a leak. - PAL_DPERROR("Trying to Free %s as %s.", - MemBlkTypeStr[static_cast(pCurrent->blockType)], - MemBlkTypeStr[static_cast(blockType)]); - } - else - { - // We should check for memory corruption due to overflow or underflow before continuing because any underflow - // might indicate that our internal state is corrupted. This could lead to a crash in the code below. 
- for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) - { - PAL_ASSERT(*pUnderrun++ == UnderrunSentinel); - PAL_ASSERT(*pOverrun++ == OverrunSentinel); - } - - // Remove our tracker from the list and set it's pList to null to detect a double-free in the future. - MutexAuto lock(&m_mutex); - - m_trackerList.Erase(pCurrentNode); - - pCurrent->pList = nullptr; - pOrigPtr = pCurrent->pOrigMem; - } - - // Return a pointer to the actual allocated block. - return pOrigPtr; -} - -// ===================================================================================================================== -// Allocates a block of memory and tracks it using the memory tracker. -template -void* MemTracker::Alloc( - const AllocInfo& allocInfo) -{ - // Allocating zero bytes of memory results in undefined behavior. - PAL_ASSERT(allocInfo.bytes > 0); - - void* pMem = nullptr; - - // We want to allocate extra memory from the caller's allocator, in this layout: - // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) - // Here's why we need each of those sections: - // 1. align1 is zero or more bytes needed to align the client allocation and our internal data. - // 2. The MemTrackerList::Node object, which is used to link this allocation into m_trackerList. - // 3. The MemTrackerElem struct contains bookkeeping data we need to report memory errors. - // 4. The underflow and overflow trackers detect out of bounds writes. They are optional. - // 5. The client allocation, which is actually returned to the caller. - // 6. align2 is zero or more bytes needed to DWORD-align the overflow tracker. 
- constexpr size_t InternalAlignment = Max(alignof(MemTrackerList::Node), alignof(MemTrackerElem)); - const size_t paddedAlignBytes = Max(allocInfo.alignment, InternalAlignment); - const size_t paddedSizeBytes = (paddedAlignBytes + // 1 - sizeof(MemTrackerList::Node) + // 2 - sizeof(MemTrackerElem) + // 3 - m_markerSizeBytes + // 4.a - Pow2Align(allocInfo.bytes, sizeof(uint32)) + // 5 & 6 - m_markerSizeBytes); // 4.b - - const AllocInfo memTrackerInfo(paddedSizeBytes, paddedAlignBytes, allocInfo.zeroMem, allocInfo.allocType, - allocInfo.blockType, allocInfo.pFilename, allocInfo.lineNumber); - - pMem = m_pAllocator->Alloc(memTrackerInfo); - - if (pMem != nullptr) - { - // Don't bother adding a failed allocation to the Memtrack list. - pMem = AddMemElement(pMem, - allocInfo.bytes, - paddedAlignBytes, - allocInfo.blockType, - allocInfo.pFilename, - allocInfo.lineNumber); - } - - return pMem; -} - -// ===================================================================================================================== -// Frees a block of memory. The routine is called with the pointer to the client usable memory. -// -// See MemTracker::RemoveMemElement() which is used to validate the free. -template -void MemTracker::Free( - const FreeInfo& freeInfo) -{ - // Don't want to call RemoveMemElement if the ptr is null. - if (freeInfo.pClientMem != nullptr) - { - void* pMem = RemoveMemElement(freeInfo.pClientMem, freeInfo.blockType); - - // If this free call is valid (RemoveMemElement doesn't return nullptr), release the memory. - if (pMem != nullptr) - { - m_pAllocator->Free(FreeInfo(pMem, freeInfo.blockType)); - } - } -} - -// ===================================================================================================================== -// Frees all memory that has not been explicitly freed (in other words, memory that has leaked). This function is only -// expected to be called when the memory tracker is being destroyed. 
-template -void MemTracker::FreeLeakedMemory() -{ - for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); ) - { - MemTrackerElem*const pCurrent = iter.Get(); - - // Free will release the memory for tracking and the actual element. This will invalidate our list iterator - // unless we advance the iterator first. - iter.Next(); - - Free(FreeInfo(pCurrent->pClientMem, pCurrent->blockType)); - } -} - -// ===================================================================================================================== -// Outputs information about leaked memory by traversing the memory tracker list. -template -void MemTracker::MemoryReport() -{ - // When this env var is set to non-zero, don't report leaks. - // Useful for crashing apps that don't give us a chance to clean up. - const char* pToggle = getenv("AMDPAL_NO_LEAK_REPORT"); - - if ((pToggle == nullptr) || (atoi(pToggle) == 0)) - { - PAL_DPWARN("================ List of Leaked Blocks ================"); - - for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); iter.Next()) - { - MemTrackerElem*const pCurrent = iter.Get(); - - PAL_DPWARN( - "ClientMem = 0x%p, AllocSize = %8d, MemBlkType = %s, File = %-15s, LineNumber = %8d, AllocNum = %8d", - pCurrent->pClientMem, - pCurrent->size, - MemBlkTypeStr[static_cast(pCurrent->blockType)], - pCurrent->pFilename, - pCurrent->lineNumber, - pCurrent->allocNum); - } - - PAL_DPWARN("================ End of List ==========================="); - } -} - -} // Util - -#endif +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMemTrackerImpl.h + * @brief PAL utility collection MemTracker class implementations. + *********************************************************************************************************************** + */ + +#pragma once + +#if PAL_MEMTRACK + +#include "palIntrusiveListImpl.h" +#include "palMemTracker.h" +#include "palSysMemory.h" + +#include + +namespace Util +{ + +/// Table to convert a blockType to a string. Used by the logging routines. 
+constexpr const char* MemBlkTypeStr[] = +{ + "Malloc", ///< MemBlkType::Malloc + "New", ///< MemBlkType::New + "NewArray", ///< MemBlkType::NewArray +}; + +// ===================================================================================================================== +template +MemTracker::MemTracker( + Allocator*const pAllocator) + : + m_markerSizeUints(MarkerSizeUints), + m_markerSizeBytes(MarkerSizeBytes), + m_pAllocator(pAllocator), + m_nextAllocNum(1), + m_breakOnAllocNum(0) +{ +} + +// ===================================================================================================================== +template +MemTracker::~MemTracker() +{ + // Clean-up leaked memory if needed + if (m_trackerList.IsEmpty() == false) + { + // If the list isn't empty, we have a leak. The leak could either be caused by an internal PAL leak, + // a client leak, or even the application not destroying API objects. + PAL_ALERT_ALWAYS(); + + // Dump out a list of unfreed blocks. + MemoryReport(); + + FreeLeakedMemory(); + } +} + +// ===================================================================================================================== +template +Result MemTracker::Init() +{ + return Result::Success; +} + +// ===================================================================================================================== +// Adds the newly allocated memory block to the list of blocks for tracking. +// +// The tracking information includes things like filename, line numbers, and type of block. Also, given a pointer, +// adds the Underrun/Overrun markers to the memory allocated, and return a pointer to the actual client usable memory. +// +// See MemTracker::Alloc() which is used to allocate memory that is being tracked. +template +void* MemTracker::AddMemElement( + void* pMem, // [in,out] Original pointer allocated by MemTracker::Alloc. + size_t bytes, // Client requested allocation size in bytes. 
+ size_t align, // The max of the client-requested alignment or the internal alignment, in bytes. + MemBlkType blockType, // Block type based on calling allocation routine. + const char* pFilename, // Client filename that is requesting the memory. + uint32 lineNumber) // Line number in client file that is requesting the memory. +{ + // Our internal data is all relative to the client pointer so find that first. See Alloc for more details. + // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + constexpr size_t InternalSize = sizeof(MemTrackerList::Node) + sizeof(MemTrackerElem); + + void*const pClientMem = VoidPtrAlign(VoidPtrInc(pMem, m_markerSizeBytes + InternalSize), align); + uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); + uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(bytes, sizeof(uint32)))); + + auto*const pNewElement = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); + void*const pNewNodeMem = VoidPtrDec(pNewElement, sizeof(MemTrackerList::Node)); + auto*const pNewNode = PAL_PLACEMENT_NEW(pNewNodeMem) MemTrackerList::Node(pNewElement); + + // Mark the memory with the underrun/overrun marker. + for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) + { + *pUnderrun++ = UnderrunSentinel; + *pOverrun++ = OverrunSentinel; + } + + pNewElement->size = bytes; + pNewElement->pFilename = pFilename; + pNewElement->lineNumber = lineNumber; + pNewElement->blockType = blockType; + pNewElement->pClientMem = pClientMem; + pNewElement->pOrigMem = pMem; + pNewElement->pList = &m_trackerList; + + MutexAuto lock(&m_mutex); + + // Trigger an assert if we're about to allocate the break-on-allocation number. 
+ if (m_nextAllocNum == m_breakOnAllocNum) + { + PAL_ASSERT_ALWAYS(); + } + + pNewElement->allocNum = m_nextAllocNum; + ++m_nextAllocNum; + + m_trackerList.PushFront(pNewNode); + + return pClientMem; +} + +// ===================================================================================================================== +// Removes an allocated block from the list of blocks used for tracking. +// +// The routine checks for invalid frees (and duplicate frees). Also, the routine is able to detect mismatched alloc/free +// usage based on the blockType. The routine is called with the pointer to the client usable memory and returns the +// pointer to the allocated memory. +// +// See MemTracker::Free() which is used to free memory that is being tracked. +template +void* MemTracker::RemoveMemElement( + void* pClientMem, // Pointer to client usable memory. + MemBlkType blockType) // Block type based on calling deallocation routine. +{ + void* pOrigPtr = nullptr; + + // Recall that this is our internal memory layout. See Alloc for more details. + // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + uint32* pUnderrun = static_cast(VoidPtrDec(pClientMem, m_markerSizeBytes)); + auto*const pCurrent = static_cast(VoidPtrDec(pUnderrun, sizeof(MemTrackerElem))); + auto*const pCurrentNode = static_cast(VoidPtrDec(pCurrent, sizeof(MemTrackerList::Node))); + uint32* pOverrun = static_cast(VoidPtrInc(pClientMem, Pow2Align(pCurrent->size, sizeof(uint32)))); + + // We should not be trying to free something twice or trying to free something which has not been allocated + // by this MemTracker. We can verify both of these things by checking that the tracker's pList is equal to the + // MemTracker's list. + if (pCurrent->pList != &m_trackerList) + { + // A free was attempted on an unrecognized pointer. 
+ PAL_DPERROR("Invalid Free Attempted with ptr = : (%#x)", pClientMem); + } + else if (pCurrent->blockType != blockType) + { + // We have a mismatch in the alloc/free pair, e.g. PAL_NEW with PAL_FREE etc. return early here without freeing + // the memory so it shows up as a leak. + PAL_DPERROR("Trying to Free %s as %s.", + MemBlkTypeStr[static_cast(pCurrent->blockType)], + MemBlkTypeStr[static_cast(blockType)]); + } + else + { + // We should check for memory corruption due to overflow or underflow before continuing because any underflow + // might indicate that our internal state is corrupted. This could lead to a crash in the code below. + for (uint32 markerUints = 0; markerUints < m_markerSizeUints; ++markerUints) + { + PAL_ASSERT(*pUnderrun++ == UnderrunSentinel); + PAL_ASSERT(*pOverrun++ == OverrunSentinel); + } + + // Remove our tracker from the list and set it's pList to null to detect a double-free in the future. + MutexAuto lock(&m_mutex); + + m_trackerList.Erase(pCurrentNode); + + pCurrent->pList = nullptr; + pOrigPtr = pCurrent->pOrigMem; + } + + // Return a pointer to the actual allocated block. + return pOrigPtr; +} + +// ===================================================================================================================== +// Allocates a block of memory and tracks it using the memory tracker. +template +void* MemTracker::Alloc( + const AllocInfo& allocInfo) +{ + // Allocating zero bytes of memory results in undefined behavior. + PAL_ASSERT(allocInfo.bytes > 0); + + void* pMem = nullptr; + + // We want to allocate extra memory from the caller's allocator, in this layout: + // (align1)(MemTrackerList::Node)(MemTrackerElem)(underflow tracker)(client allocation)(align2)(overflow tracker) + // Here's why we need each of those sections: + // 1. align1 is zero or more bytes needed to align the client allocation and our internal data. + // 2. The MemTrackerList::Node object, which is used to link this allocation into m_trackerList. + // 3. 
The MemTrackerElem struct contains bookkeeping data we need to report memory errors. + // 4. The underflow and overflow trackers detect out of bounds writes. They are optional. + // 5. The client allocation, which is actually returned to the caller. + // 6. align2 is zero or more bytes needed to DWORD-align the overflow tracker. + constexpr size_t InternalAlignment = Max(alignof(MemTrackerList::Node), alignof(MemTrackerElem)); + const size_t paddedAlignBytes = Max(allocInfo.alignment, InternalAlignment); + const size_t paddedSizeBytes = (paddedAlignBytes + // 1 + sizeof(MemTrackerList::Node) + // 2 + sizeof(MemTrackerElem) + // 3 + m_markerSizeBytes + // 4.a + Pow2Align(allocInfo.bytes, sizeof(uint32)) + // 5 & 6 + m_markerSizeBytes); // 4.b + + const AllocInfo memTrackerInfo(paddedSizeBytes, paddedAlignBytes, allocInfo.zeroMem, allocInfo.allocType, + allocInfo.blockType, allocInfo.pFilename, allocInfo.lineNumber); + + pMem = m_pAllocator->Alloc(memTrackerInfo); + + if (pMem != nullptr) + { + // Don't bother adding a failed allocation to the Memtrack list. + pMem = AddMemElement(pMem, + allocInfo.bytes, + paddedAlignBytes, + allocInfo.blockType, + allocInfo.pFilename, + allocInfo.lineNumber); + } + + return pMem; +} + +// ===================================================================================================================== +// Frees a block of memory. The routine is called with the pointer to the client usable memory. +// +// See MemTracker::RemoveMemElement() which is used to validate the free. +template +void MemTracker::Free( + const FreeInfo& freeInfo) +{ + // Don't want to call RemoveMemElement if the ptr is null. + if (freeInfo.pClientMem != nullptr) + { + void* pMem = RemoveMemElement(freeInfo.pClientMem, freeInfo.blockType); + + // If this free call is valid (RemoveMemElement doesn't return nullptr), release the memory. 
+ if (pMem != nullptr) + { + m_pAllocator->Free(FreeInfo(pMem, freeInfo.blockType)); + } + } +} + +// ===================================================================================================================== +// Frees all memory that has not been explicitly freed (in other words, memory that has leaked). This function is only +// expected to be called when the memory tracker is being destroyed. +template +void MemTracker::FreeLeakedMemory() +{ + for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); ) + { + MemTrackerElem*const pCurrent = iter.Get(); + + // Free will release the memory for tracking and the actual element. This will invalidate our list iterator + // unless we advance the iterator first. + iter.Next(); + + Free(FreeInfo(pCurrent->pClientMem, pCurrent->blockType)); + } +} + +// ===================================================================================================================== +// Outputs information about leaked memory by traversing the memory tracker list. +template +void MemTracker::MemoryReport() +{ + // When this env var is set to non-zero, don't report leaks. + // Useful for crashing apps that don't give us a chance to clean up. 
+ const char* pToggle = getenv("AMDPAL_NO_LEAK_REPORT"); + + if ((pToggle == nullptr) || (atoi(pToggle) == 0)) + { + PAL_DPWARN("================ List of Leaked Blocks ================"); + + for (MemTrackerList::Iter iter = m_trackerList.Begin(); iter.IsValid(); iter.Next()) + { + MemTrackerElem*const pCurrent = iter.Get(); + + PAL_DPWARN( + "ClientMem = 0x%p, AllocSize = %8d, MemBlkType = %s, File = %-15s, LineNumber = %8d, AllocNum = %8d", + pCurrent->pClientMem, + pCurrent->size, + MemBlkTypeStr[static_cast(pCurrent->blockType)], + pCurrent->pFilename, + pCurrent->lineNumber, + pCurrent->allocNum); + } + + PAL_DPWARN("================ End of List ==========================="); + } +} + +} // Util + +#endif diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h b/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h index 2006a366fb..147d86c234 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palMutex.h @@ -1,398 +1,398 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palMutex.h - * @brief PAL utility collection Mutex and MutexAuto class declarations. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palAssert.h" - -#if defined(_WIN32) -// WIN32_NO_STATUS prevents winnt.h from re-defining NTSTATUS macros that cause build warnings if -// ntstatus.h is also included. -#define WIN32_NO_STATUS -#include -#undef WIN32_NO_STATUS -#else -#include -#include - -#endif - -namespace Util -{ - -/** - *********************************************************************************************************************** - * @brief Platform-agnostic mutex primitive. 
- *********************************************************************************************************************** - */ -class Mutex -{ -public: -#if defined(_WIN32) - /// Defines MutexData as a Windows CRITICAL_SECTION - typedef CRITICAL_SECTION MutexData; - Mutex() noexcept : m_osMutex {} { InitializeCriticalSection(&m_osMutex); } - ~Mutex() { DeleteCriticalSection(&m_osMutex); }; -#else - /// Defines MutexData as a unix pthread_mutex_t - typedef pthread_mutex_t MutexData; - Mutex() noexcept : m_osMutex {} { pthread_mutex_init(&m_osMutex, nullptr); } - ~Mutex() { pthread_mutex_destroy(&m_osMutex); }; -#endif - - /// Enters the critical section if it is not contended. If it is contended, wait for the critical section to become - /// available, then enter it. - void Lock(); - - /// Enters the critical section if it is not contended. Does not wait for the critical section to become available - /// if it is contended. - /// - /// @returns True if the critical section was entered, false otherwise. - bool TryLock(); - - /// Leaves the critical section. - void Unlock(); - - /// Returns the OS specific mutex data. - MutexData* GetMutexData() { return &m_osMutex; } - -private: - MutexData m_osMutex; ///< Opaque structure to the OS-specific Mutex data - - PAL_DISALLOW_COPY_AND_ASSIGN(Mutex); -}; - -/** - *********************************************************************************************************************** - * @brief A "resource acquisition is initialization" (RAII) wrapper for the Mutex class. - * - * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and - * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use - * this class for stack-allocated objects. - * - * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope - * will be protected from access by multiple threads. 
See the below example. - * - * [Code not protected] - * { - * [Code not protected] - * MutexAuto lock(pPtrToMutex); - * [Code is protected] - * } - * [Code not protected] - *********************************************************************************************************************** - */ -class MutexAuto -{ -public: - /// Locks the given Mutex. - explicit MutexAuto(Mutex* pMutex) : m_pMutex(pMutex) - { - PAL_ASSERT(m_pMutex != nullptr); - m_pMutex->Lock(); - } - - /// Unlocks the Mutex we locked in the constructor. - ~MutexAuto() - { - m_pMutex->Unlock(); - } - -private: - Mutex* const m_pMutex; ///< The Mutex which this object wraps. - - PAL_DISALLOW_DEFAULT_CTOR(MutexAuto); - PAL_DISALLOW_COPY_AND_ASSIGN(MutexAuto); -}; - -/** - *********************************************************************************************************************** - * @brief Platform-agnostic rw lock primitive. - *********************************************************************************************************************** - */ -class RWLock -{ -public: -#if defined(_WIN32) - /// Defines RWLockData as a Windows RWLOCK - typedef SRWLOCK RWLockData; - RWLock() noexcept : m_osRWLock {} { InitializeSRWLock(&m_osRWLock); } - ~RWLock() noexcept { /* No Win32 destory function */ }; -#else - /// Defines RWLockData as a unix pthread_rwlock_t - typedef pthread_rwlock_t RWLockData; - /// @note pthread_rwlock_init will not fail as called - RWLock() noexcept : m_osRWLock {} { pthread_rwlock_init(&m_osRWLock, nullptr); } - ~RWLock() noexcept { pthread_rwlock_destroy(&m_osRWLock); }; -#endif - - /// Enumerates the lock type of RWLockAuto - enum LockType - { - ReadOnly = 0, ///< Lock in readonly mode, in other words shared mode. - ReadWrite ///< Lock in readwrite mode, in other words exclusive mode. - }; - - /// Acquires a rw lock in shared mode if it is not contended in exclusive mode. - /// If it is contended, wait for rw lock to become available, then enter it. 
- void LockForRead(); - - /// Acquires a rw lock in exclusive mode if it is not contended. - /// If it is contended, wait for rw lock to become available, then enter it. - void LockForWrite(); - - /// Try to acquires a rw lock in shared mode if it is not contended in exclusive mode. - /// Does not wait for the rw lock to become available. - /// @returns True if the rw lock was acquired, false otherwise. - bool TryLockForRead(); - - /// Try to acquires a rw lock in exclusive mode if it is not contended. - /// Does not wait for the rw lock to become available. - /// @returns True if the rw lock was acquired, false otherwise. - bool TryLockForWrite(); - - /// Release the rw lock which is previously contended in shared mode. - void UnlockForRead(); - - /// Release the rw lock which is previously contended in exclusive mode. - void UnlockForWrite(); - - /// Returns the OS specific RWLOCK data. - RWLockData* GetRWLockData() { return &m_osRWLock; } - -private: - RWLockData m_osRWLock; ///< Opaque structure to the OS-specific RWLock data - - PAL_DISALLOW_COPY_AND_ASSIGN(RWLock); -}; - -/** - *********************************************************************************************************************** - * @brief A "resource acquisition is initialization" (RAII) wrapper for the RWLock class. - * - * The RAII paradigm allows rw lcok to be automatically acquired during this class' constructor, and - * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use - * this class for stack-allocated objects. - * - * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope - * will be protected from access by multiple threads. See the below example. 
- * - * [Code not protected] - * { - * [Code not protected] - * RWLockAuto lock(pPtrToMutex, type); - * [Code is protected] - * } - * [Code not protected] - *********************************************************************************************************************** - */ -template -class RWLockAuto -{ -public: - /// Locks the given RWLock. - explicit RWLockAuto(RWLock* pRWLock) : m_pRWLock(pRWLock) - { - PAL_ASSERT(m_pRWLock != nullptr); - if (type == RWLock::ReadOnly) - { - m_pRWLock->LockForRead(); - } - else - { - m_pRWLock->LockForWrite(); - } - } - - /// Unlocks the RWLock we locked in the constructor. - ~RWLockAuto() - { - if (type == RWLock::ReadOnly) - { - m_pRWLock->UnlockForRead(); - } - else - { - m_pRWLock->UnlockForWrite(); - } - } - -private: - RWLock* const m_pRWLock; ///< The RWLock which this object wraps. - - PAL_DISALLOW_DEFAULT_CTOR(RWLockAuto); - PAL_DISALLOW_COPY_AND_ASSIGN(RWLockAuto); -}; - -/// Yields the current thread to another thread in the ready state (if available). -extern void YieldThread(); - -/// Atomic write of 64-bit unsigned integer, using a relaxed memory ordering policy. -/// If you need to synchronize more than just pTarget, you may need a new function. -/// -/// @param [in] pTarget Pointer to the value to be read. -/// -/// @returns The original value of *pTarget. -extern void AtomicWriteRelaxed64(volatile uint64* pTarget, uint64 newValue); - -/// Atomic read of 64-bit unsigned integer, using a relaxed memory ordering policy. -/// If you need to synchronize more than just pTarget, you may need a new function. -/// -/// @param [in] pTarget Pointer to the value to be read. -/// -/// @returns The original value of *pTarget. -extern uint64 AtomicReadRelaxed64(const volatile uint64* pTarget); - -/// Atomically increments the specified 32-bit unsigned integer. -/// -/// @param [in,out] pValue Pointer to the value to be incremented. -/// -/// @returns Result of the increment operation. 
-extern uint32 AtomicIncrement(volatile uint32* pValue); - -/// Atomically increment a 64-bit-unsigned integer -/// -/// @param [in,out] pAddend Pointer to the value to be incremented -/// -/// @returns Result of the increment operation. -extern uint64 AtomicIncrement64(volatile uint64* pAddend); - -/// Atomically decrements the specified 32-bit unsigned integer. -/// -/// @param [in,out] pValue Pointer to the value to be decremented. -/// -/// @returns Result of the decrement operation. -extern uint32 AtomicDecrement(volatile uint32* pValue); - -/// Atomically decrements the specified 64-bit unsigned integer. -/// -/// @param [in,out] pValue Pointer to the value to be decremented. -/// -/// @returns Result of the decrement operation. -extern uint32 AtomicDecrement64(volatile uint64* pValue); - -/// Performs an atomic compare and swap operation on two 32-bit unsigned integers. This operation compares *pTarget -/// with oldValue and replaces it with newValue if they match. If the values don't match, no action is taken. -/// The original value of *pTarget is returned as a result. -/// -/// @param [in,out] pTarget Pointer to the destination value of the operation. -/// @param [in] oldValue Value to compare *pTarget to. -/// @param [in] newValue Value to replace *pTarget with if *pTarget matches oldValue. -/// -/// @returns Previous value at *pTarget. -extern uint32 AtomicCompareAndSwap(volatile uint32* pTarget, uint32 oldValue, uint32 newValue); - -/// Atomically exchanges a pair of 32-bit unsigned integers. -/// -/// @param [in,out] pTarget Pointer to the destination value of the operation. -/// @param [in] value New value to be stored in *pTarget. -/// -/// @returns Previous value at *pTarget. -extern uint32 AtomicExchange(volatile uint32* pTarget, uint32 value); - -/// Atomically exchanges a pair of 64-bit unsigned integers. -/// -/// @param [in,out] pTarget Pointer to the destination value of the operation. 
-/// @param [in] value New value to be stored in *pTarget. -/// -/// @returns Previous value at *pTarget. -extern uint64 AtomicExchange64(volatile uint64* pTarget, uint64 value); - -/// Atomically exchanges a pair of pointers. -/// -/// @param [in,out] ppTarget Pointer to the address to exchange. The function sets the address pointed to by *ppTarget -/// to pValue. -/// @param [in] pValue New pointer to be stored in *ppTarget. -/// -/// @returns Previous value at *ppTarget. -extern void* AtomicExchangePointer(void*volatile* ppTarget, void* pValue); - -/// Performs an atomic compare and swap operation on a pair of pointers. This operation compares *ppTarget -/// with pOldValue and replaces it with pNewValue if they match. If the values don't match, no action is taken. -/// The original value of *ppTarget is returned as a result. -/// -/// @param [in,out] ppTarget Pointer to the destination value of the operation. - -/// @param [in] pOldValue Old pointer to compare *ppTarget to. -/// @param [in] pNewValue New pointer to replace *ppTarget with if *ppTarget matches pOldValue. -/// -/// @returns Previous value at *ppTarget. -extern void* AtomicCompareExchangePointer(void*volatile* ppTarget, void* pOldValue, void* pNewValue); - -/// Atomically add a value to the specific 32-bit unsigned integer. -/// -/// @param [in,out] pAddend Pointer to the value to be modified. -/// @param [in] value Value to add to *pAddend. -/// -/// @returns Result of the add operation. -extern uint32 AtomicAdd(volatile uint32* pAddend, uint32 value); - -/// Atomically add a value to the specified 64-bit unsigned integer. -/// -/// @param [in,out] pAddend Pointer to the value to be modified. -/// @param [in] value Value to add to *pAddend. -/// -/// @returns Result of the add operation. -extern uint64 AtomicAdd64(volatile uint64* pAddend, uint64 value); - -/// Atomically OR a value to the specific 32-bit unsigned integer. -/// -/// @param [in,out] pTarget Pointer to the value to be modified. 
-/// @param [in] value Value to OR to *pTarget. -/// -/// @returns The original value of *pTarget. -extern uint32 AtomicOr(volatile uint32* pTarget, uint32 value); - -/// Atomically OR a value to the specified 64-bit unsigned integer. -/// -/// @param [in,out] pTarget Pointer to the value to be modified. -/// @param [in] value Value to OR to *pTarget. -/// -/// @returns The original value of *pTarget. -extern uint64 AtomicOr64(volatile uint64* pTarget, uint64 value); - -/// Atomically AND a value to the specific 32-bit unsigned integer. -/// -/// @param [in,out] pTarget Pointer to the value to be modified. -/// @param [in] value Value to AND to *pTarget. -/// -/// @returns The original value of *pTarget. -extern uint32 AtomicAnd(volatile uint32* pTarget, uint32 value); - -/// Atomically AND a value to the specified 64-bit unsigned integer. -/// -/// @param [in,out] pTarget Pointer to the value to be modified. -/// @param [in] value Value to AND to *pTarget. -/// -/// @returns The original value of *pTarget. -extern uint64 AtomicAnd64(volatile uint64* pTarget, uint64 value); - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palMutex.h + * @brief PAL utility collection Mutex and MutexAuto class declarations. + *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" + +#if defined(_WIN32) +// WIN32_NO_STATUS prevents winnt.h from re-defining NTSTATUS macros that cause build warnings if +// ntstatus.h is also included. +#define WIN32_NO_STATUS +#include +#undef WIN32_NO_STATUS +#else +#include +#include + +#endif + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Platform-agnostic mutex primitive. 
+ *********************************************************************************************************************** + */ +class Mutex +{ +public: +#if defined(_WIN32) + /// Defines MutexData as a Windows CRITICAL_SECTION + typedef CRITICAL_SECTION MutexData; + Mutex() noexcept : m_osMutex {} { InitializeCriticalSection(&m_osMutex); } + ~Mutex() { DeleteCriticalSection(&m_osMutex); }; +#else + /// Defines MutexData as a unix pthread_mutex_t + typedef pthread_mutex_t MutexData; + Mutex() noexcept : m_osMutex {} { pthread_mutex_init(&m_osMutex, nullptr); } + ~Mutex() { pthread_mutex_destroy(&m_osMutex); }; +#endif + + /// Enters the critical section if it is not contended. If it is contended, wait for the critical section to become + /// available, then enter it. + void Lock(); + + /// Enters the critical section if it is not contended. Does not wait for the critical section to become available + /// if it is contended. + /// + /// @returns True if the critical section was entered, false otherwise. + bool TryLock(); + + /// Leaves the critical section. + void Unlock(); + + /// Returns the OS specific mutex data. + MutexData* GetMutexData() { return &m_osMutex; } + +private: + MutexData m_osMutex; ///< Opaque structure to the OS-specific Mutex data + + PAL_DISALLOW_COPY_AND_ASSIGN(Mutex); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the Mutex class. + * + * The RAII paradigm allows critical sections to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope + * will be protected from access by multiple threads. 
See the below example. + * + * [Code not protected] + * { + * [Code not protected] + * MutexAuto lock(pPtrToMutex); + * [Code is protected] + * } + * [Code not protected] + *********************************************************************************************************************** + */ +class MutexAuto +{ +public: + /// Locks the given Mutex. + explicit MutexAuto(Mutex* pMutex) : m_pMutex(pMutex) + { + PAL_ASSERT(m_pMutex != nullptr); + m_pMutex->Lock(); + } + + /// Unlocks the Mutex we locked in the constructor. + ~MutexAuto() + { + m_pMutex->Unlock(); + } + +private: + Mutex* const m_pMutex; ///< The Mutex which this object wraps. + + PAL_DISALLOW_DEFAULT_CTOR(MutexAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(MutexAuto); +}; + +/** + *********************************************************************************************************************** + * @brief Platform-agnostic rw lock primitive. + *********************************************************************************************************************** + */ +class RWLock +{ +public: +#if defined(_WIN32) + /// Defines RWLockData as a Windows RWLOCK + typedef SRWLOCK RWLockData; + RWLock() noexcept : m_osRWLock {} { InitializeSRWLock(&m_osRWLock); } + ~RWLock() noexcept { /* No Win32 destory function */ }; +#else + /// Defines RWLockData as a unix pthread_rwlock_t + typedef pthread_rwlock_t RWLockData; + /// @note pthread_rwlock_init will not fail as called + RWLock() noexcept : m_osRWLock {} { pthread_rwlock_init(&m_osRWLock, nullptr); } + ~RWLock() noexcept { pthread_rwlock_destroy(&m_osRWLock); }; +#endif + + /// Enumerates the lock type of RWLockAuto + enum LockType + { + ReadOnly = 0, ///< Lock in readonly mode, in other words shared mode. + ReadWrite ///< Lock in readwrite mode, in other words exclusive mode. + }; + + /// Acquires a rw lock in shared mode if it is not contended in exclusive mode. + /// If it is contended, wait for rw lock to become available, then enter it. 
+ void LockForRead(); + + /// Acquires a rw lock in exclusive mode if it is not contended. + /// If it is contended, wait for rw lock to become available, then enter it. + void LockForWrite(); + + /// Try to acquires a rw lock in shared mode if it is not contended in exclusive mode. + /// Does not wait for the rw lock to become available. + /// @returns True if the rw lock was acquired, false otherwise. + bool TryLockForRead(); + + /// Try to acquires a rw lock in exclusive mode if it is not contended. + /// Does not wait for the rw lock to become available. + /// @returns True if the rw lock was acquired, false otherwise. + bool TryLockForWrite(); + + /// Release the rw lock which is previously contended in shared mode. + void UnlockForRead(); + + /// Release the rw lock which is previously contended in exclusive mode. + void UnlockForWrite(); + + /// Returns the OS specific RWLOCK data. + RWLockData* GetRWLockData() { return &m_osRWLock; } + +private: + RWLockData m_osRWLock; ///< Opaque structure to the OS-specific RWLock data + + PAL_DISALLOW_COPY_AND_ASSIGN(RWLock); +}; + +/** + *********************************************************************************************************************** + * @brief A "resource acquisition is initialization" (RAII) wrapper for the RWLock class. + * + * The RAII paradigm allows rw lcok to be automatically acquired during this class' constructor, and + * automatically released when a stack-allocated wrapper object goes out-of-scope. As such, it only makes sense to use + * this class for stack-allocated objects. + * + * This object will ensure that anything between when the object is allocated on the stack and when it goes out of scope + * will be protected from access by multiple threads. See the below example. 
+ * + * [Code not protected] + * { + * [Code not protected] + * RWLockAuto lock(pPtrToMutex, type); + * [Code is protected] + * } + * [Code not protected] + *********************************************************************************************************************** + */ +template +class RWLockAuto +{ +public: + /// Locks the given RWLock. + explicit RWLockAuto(RWLock* pRWLock) : m_pRWLock(pRWLock) + { + PAL_ASSERT(m_pRWLock != nullptr); + if (type == RWLock::ReadOnly) + { + m_pRWLock->LockForRead(); + } + else + { + m_pRWLock->LockForWrite(); + } + } + + /// Unlocks the RWLock we locked in the constructor. + ~RWLockAuto() + { + if (type == RWLock::ReadOnly) + { + m_pRWLock->UnlockForRead(); + } + else + { + m_pRWLock->UnlockForWrite(); + } + } + +private: + RWLock* const m_pRWLock; ///< The RWLock which this object wraps. + + PAL_DISALLOW_DEFAULT_CTOR(RWLockAuto); + PAL_DISALLOW_COPY_AND_ASSIGN(RWLockAuto); +}; + +/// Yields the current thread to another thread in the ready state (if available). +extern void YieldThread(); + +/// Atomic write of 64-bit unsigned integer, using a relaxed memory ordering policy. +/// If you need to synchronize more than just pTarget, you may need a new function. +/// +/// @param [out] pTarget Pointer to the value to be written. +/// @param [in] newValue Value to be stored in *pTarget. +/// @note This function does not return a value. +extern void AtomicWriteRelaxed64(volatile uint64* pTarget, uint64 newValue); + +/// Atomic read of 64-bit unsigned integer, using a relaxed memory ordering policy. +/// If you need to synchronize more than just pTarget, you may need a new function. +/// +/// @param [in] pTarget Pointer to the value to be read. +/// +/// @returns The original value of *pTarget. +extern uint64 AtomicReadRelaxed64(const volatile uint64* pTarget); + +/// Atomically increments the specified 32-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be incremented. +/// +/// @returns Result of the increment operation.
+extern uint32 AtomicIncrement(volatile uint32* pValue); + +/// Atomically increment a 64-bit-unsigned integer +/// +/// @param [in,out] pAddend Pointer to the value to be incremented +/// +/// @returns Result of the increment operation. +extern uint64 AtomicIncrement64(volatile uint64* pAddend); + +/// Atomically decrements the specified 32-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be decremented. +/// +/// @returns Result of the decrement operation. +extern uint32 AtomicDecrement(volatile uint32* pValue); + +/// Atomically decrements the specified 64-bit unsigned integer. +/// +/// @param [in,out] pValue Pointer to the value to be decremented. +/// +/// @returns Result of the decrement operation. +extern uint32 AtomicDecrement64(volatile uint64* pValue); + +/// Performs an atomic compare and swap operation on two 32-bit unsigned integers. This operation compares *pTarget +/// with oldValue and replaces it with newValue if they match. If the values don't match, no action is taken. +/// The original value of *pTarget is returned as a result. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. +/// @param [in] oldValue Value to compare *pTarget to. +/// @param [in] newValue Value to replace *pTarget with if *pTarget matches oldValue. +/// +/// @returns Previous value at *pTarget. +extern uint32 AtomicCompareAndSwap(volatile uint32* pTarget, uint32 oldValue, uint32 newValue); + +/// Atomically exchanges a pair of 32-bit unsigned integers. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. +/// @param [in] value New value to be stored in *pTarget. +/// +/// @returns Previous value at *pTarget. +extern uint32 AtomicExchange(volatile uint32* pTarget, uint32 value); + +/// Atomically exchanges a pair of 64-bit unsigned integers. +/// +/// @param [in,out] pTarget Pointer to the destination value of the operation. 
+/// @param [in] value New value to be stored in *pTarget. +/// +/// @returns Previous value at *pTarget. +extern uint64 AtomicExchange64(volatile uint64* pTarget, uint64 value); + +/// Atomically exchanges a pair of pointers. +/// +/// @param [in,out] ppTarget Pointer to the address to exchange. The function sets the address pointed to by *ppTarget +/// to pValue. +/// @param [in] pValue New pointer to be stored in *ppTarget. +/// +/// @returns Previous value at *ppTarget. +extern void* AtomicExchangePointer(void*volatile* ppTarget, void* pValue); + +/// Performs an atomic compare and swap operation on a pair of pointers. This operation compares *ppTarget +/// with pOldValue and replaces it with pNewValue if they match. If the values don't match, no action is taken. +/// The original value of *ppTarget is returned as a result. +/// +/// @param [in,out] ppTarget Pointer to the destination value of the operation. + +/// @param [in] pOldValue Old pointer to compare *ppTarget to. +/// @param [in] pNewValue New pointer to replace *ppTarget with if *ppTarget matches pOldValue. +/// +/// @returns Previous value at *ppTarget. +extern void* AtomicCompareExchangePointer(void*volatile* ppTarget, void* pOldValue, void* pNewValue); + +/// Atomically add a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pAddend Pointer to the value to be modified. +/// @param [in] value Value to add to *pAddend. +/// +/// @returns Result of the add operation. +extern uint32 AtomicAdd(volatile uint32* pAddend, uint32 value); + +/// Atomically add a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pAddend Pointer to the value to be modified. +/// @param [in] value Value to add to *pAddend. +/// +/// @returns Result of the add operation. +extern uint64 AtomicAdd64(volatile uint64* pAddend, uint64 value); + +/// Atomically OR a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. 
+/// @param [in] value Value to OR to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint32 AtomicOr(volatile uint32* pTarget, uint32 value); + +/// Atomically OR a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to OR to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint64 AtomicOr64(volatile uint64* pTarget, uint64 value); + +/// Atomically AND a value to the specific 32-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to AND to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint32 AtomicAnd(volatile uint32* pTarget, uint32 value); + +/// Atomically AND a value to the specified 64-bit unsigned integer. +/// +/// @param [in,out] pTarget Pointer to the value to be modified. +/// @param [in] value Value to AND to *pTarget. +/// +/// @returns The original value of *pTarget. +extern uint64 AtomicAnd64(volatile uint64* pTarget, uint64 value); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h b/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h index a71b6d6f41..8bd28fa554 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSpan.h @@ -1,447 +1,447 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file palSpan.h -* @brief PAL utility collection Span class declaration. -*********************************************************************************************************************** -*/ - -#pragma once - -#include "palUtil.h" -#include "palAssert.h" -#include "palSysMemory.h" -#include "palInlineFuncs.h" -#include - -namespace Util -{ - -/** - *********************************************************************************************************************** - * @brief Span container - * - * Span is an array with a length, where the data is not owned by the Span object. 
It is similar to C++20 std::span, - * but only the dynamic extent variant. It is similar to LLVM MutableArrayRef and ArrayRef. A Span is intended to - * be passed around by value. - * - *********************************************************************************************************************** - */ -template -class Span -{ -public: - /// Constructor from nothing. This allows you to use {} to mean an empty Span. - constexpr Span() : m_pData(nullptr), m_numElements(0) {} - - /// Constructor from pointer and length - /// - /// @param [in] data Pointer to the start of the array - /// @param numElements Number of elements in the array - constexpr Span(T* pData, size_t numElements) : m_pData(pData), m_numElements(numElements) {} - - /// Copy constructor - /// - /// @param [in] src Other Span to copy from - constexpr Span(const Span& src) : m_pData(src.m_pData), m_numElements(src.m_numElements) {} - - /// Constructor from C++ array - /// - /// @param [in] src C++ array - template constexpr Span(T(& src)[NumElements]) : m_pData(&src[0]), m_numElements(NumElements) {} - - /// Constructor from single element - /// - /// @param [in] src Single element - constexpr Span(T& src) : m_pData(&src), m_numElements(1) {} - - /// Implicitly convert a Span to its const-element equivalent. - /// - /// @returns The same span, but with const element type - constexpr operator Span() const { return Span(m_pData, m_numElements); } - - /// Assignment operator - /// - /// @param [in] src Other Span to copy from - constexpr Span& operator=(const Span& src) { - m_pData = src.m_pData; - m_numElements = src.m_numElements; - return *this; - } - - ///@{ - /// Returns the element at the location specified. - /// - /// @param [in] index Integer location of the element needed. 
- /// - /// @returns The element at location specified by index by reference - constexpr T& At(size_t index) const - { - PAL_CONSTEXPR_ASSERT(index < m_numElements); - return *(m_pData + index); - } - - constexpr T& operator[](size_t index) const noexcept { return At(index); } - ///@} - - /// Returns the data at the front of the vector. - /// - /// @returns The data at the front of the vector. - constexpr T& Front() const - { - PAL_CONSTEXPR_ASSERT(IsEmpty() == false); - return *m_pData; - } - - /// Returns the data at the back of the vector. - /// - /// @returns The data at the back of the vector. - constexpr T& Back() const - { - PAL_CONSTEXPR_ASSERT(IsEmpty() == false); - return *(m_pData + (m_numElements - 1)); - } - - /// Returns an iterator to the first element of the vector. - /// - /// @returns An iterator to first element of the vector. - constexpr T* Begin() const { return m_pData; } - - /// Returns an iterator beyond the last element of the vector. (NOT at the last element like Util::Vector::End()!) - /// - /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! - /// - /// @returns VectorIterator An iterator to last element of the vector. - constexpr T* End() const { return m_pData + m_numElements; } - - /// Returns pointer to the underlying buffer serving as data storage. - /// - /// @returns Pointer to the underlying data storage. - /// For a non-empty span, the returned pointer contains address of the first element. - /// For an empty span, the returned pointer may or may not be a null pointer. - constexpr T* Data() const { return m_pData; } - - /// Returns the extent of the span. - /// - /// @returns An unsigned integer equal to the number of elements currently present in the span. - constexpr size_t NumElements() const { return m_numElements; } - - /// Returns the size in bytes the Span represents. - /// - /// @returns An unsigned integer equal to the size in bytes the entire span represents. 
- constexpr size_t SizeInBytes() const { return ElementSize() * m_numElements; } - - /// Returns true if the number of elements present in the vector is equal to zero. - /// - /// @returns True if the span is empty. - constexpr bool IsEmpty() const { return (m_numElements == 0); } - - /// Returns a "subspan", a view over a subset range of the elements. - /// - /// @warning Behavior is undefined if either - /// - offset is greater than NumElements(), or - /// - count is not size_t(-1) and is greater than NumElements()-offset. - /// - /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses - /// in the same way to mean "take the remainder of the elements from offset". - /// - /// @param offset Zero-based offset to start the subspan at - /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset - /// - /// @returns The subspan - constexpr Span Subspan( - size_t offset, - size_t count) const - { - PAL_CONSTEXPR_ASSERT((offset <= NumElements()) - && ((count == size_t(-1)) || (count <= NumElements() - offset))); - if (count == size_t(-1)) - { - count = NumElements() - offset; - } - return Span(Data() + offset, count); - } - - /// Returns a subspan dropping the specified number (default 1) of elements from the front. - /// Returns an empty Span if there were no more elements than that to start with. - /// - /// @param count Number of elements to drop from the front - /// - /// @returns The subspan - constexpr Span DropFront( - size_t count = 1) const - { - Span retVal; - if (count < NumElements()) - { - retVal = Subspan(count, size_t(-1)); - } - return retVal; - } - - /// Returns a subspan dropping the specified number (default 1) of elements from the back. - /// Returns an empty Span if there were no more elements than that to start with. 
- /// - /// @param count Number of elements to drop from the back - /// - /// @returns The subspan - constexpr Span DropBack( - size_t count = 1) const - { - Span retVal; - if (count < NumElements()) - { - retVal = Subspan(0, NumElements() - count); - } - return retVal; - } - - ///@{ - /// @internal Satisfies concept `range_expression`, using T* as `iterator` and 32-bit size and difference types - /// - /// @note - These are a convenience intended to be used by c++ language features such as `range for`. - /// These should not be called directly as they do not adhere to PAL coding standards. - using value_type = T; - using reference = T&; - using iterator = T*; - using difference_type = size_t; - using size_type = size_t; - - constexpr iterator begin() const noexcept { return m_pData; } - constexpr iterator end() const noexcept { return (m_pData + m_numElements); } - constexpr bool empty() const noexcept { return IsEmpty(); } - constexpr size_type size() const noexcept { return m_numElements; } - ///@} - -protected: - template, char, U>> - static constexpr size_t ElementSize() { return sizeof(R); } - - template - using IfConst = std::enable_if_t == Condition>; - - template - using IfPtr = std::enable_if_t == Condition>; - - T* m_pData; // Pointer to the current data. - size_t m_numElements; // Number of elements present. -}; - -/// ==================================================================================================================== -/// Span template specialization for const void byte buffers. -template<> -class Span : public Span -{ -using Byte = const char; -using Base = Span; - -public: - /// Constructor from nothing. This allows you to use {} to mean an empty Span. 
- constexpr Span() : Base() {} - - /// Template constructor from any pointer and length - /// - /// @param [in] data Pointer to the start of the buffer or array - /// @param numElements Number of bytes or elements in the buffer or array - template - Span(const T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) {} - - /// Template copy constructor - /// - /// @param [in] src Other Span to copy from - template - Span(const Span& src) : Span(src.Data(), src.NumElements()) {} - - /// Template constructor from any C++ array - /// - /// @param [in] src C++ array - template - Span(const T(& src)[NumElements]) : Span(&src[0], NumElements) {} - - /// Constructor from any single element - /// - /// @param [in] src Single element - template> - Span(const T& src) : Span(&src, 1) {} - - /// Templated conversion of this typeless Span to a typed Subspan - /// - /// @returns A subspan with typed elemeents and NumElements truncated down to the nearest sizeof(type) - template - operator Span() const - { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } - - /// Returns the element at the location specified. - /// - /// @param [in] index Integer location of the element needed. - /// - /// @returns The element at location specified by index by reference - template - const T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } - - /// Returns pointer to the underlying buffer serving as data storage. - /// - /// @returns Pointer to the underlying data storage. - /// For a non-empty span, the returned pointer contains address of the first element. - /// For an empty span, the returned pointer may or may not be a null pointer. - template - constexpr const T* Data() const { return reinterpret_cast(Base::Data()); } - - /// Returns a "subspan", a view over a subset range of the elements. 
- /// - /// @warning Behavior is undefined if either - /// - offset is greater than NumElements(), or - /// - count is not size_t(-1) and is greater than NumElements()-offset. - /// - /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses - /// in the same way to mean "take the remainder of the elements from offset". - /// - /// @param offset Zero-based offset to start the subspan at - /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset - /// - /// @returns The subspan - template - Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } - - /// Returns a subspan dropping the specified number (default 1) of elements from the front. - /// Returns an empty Span if there were no more elements than that to start with. - /// - /// @param count Number of elements to drop from the front - /// - /// @returns The subspan - template - Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } - - /// Returns a subspan dropping the specified number (default 1) of elements from the back. - /// Returns an empty Span if there were no more elements than that to start with. - /// - /// @param count Number of elements to drop from the back - /// - /// @returns The subspan - template - Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } -}; - -/// ==================================================================================================================== -/// Span template specialization for mutable void byte buffers. -template<> -class Span : public Span -{ -using Byte = char; -using Base = Span; - -public: - /// Constructor from nothing. This allows you to use {} to mean an empty Span. 
- constexpr Span() : Base() {} - - /// Template constructor from any pointer and length - /// - /// @param [in] data Pointer to the start of the buffer or array - /// @param sizeInBytes,numElements Number of bytes or elements in the buffer or array - template> - Span(T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) { } - - /// Template copy constructor - /// - /// @param [in] src Other Span to copy from - template> - Span(const Span& src) : Span(src.Data(), src.NumElements()) {} - - /// Template constructor from any C++ array - /// - /// @param [in] src C++ array - template> - Span(T(& src)[NumElements]) : Span(&src[0], NumElements) {} - - /// Constructor from any single element - /// - /// @param [in] src Single element - template, typename = IfPtr> - Span(T& src) : Span(&src, 1) {} - - /// Templated conversion of this typeless Span to a typed Subspan - /// - /// @returns A subspan with typed elemeents and NumElements truncated down to the nearest sizeof(type) - template - operator Span() const { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } - - /// Returns the element at the location specified. - /// - /// @param [in] index Integer location of the element needed. - /// - /// @returns The element at location specified by index by reference - template - T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } - - /// Returns pointer to the underlying buffer serving as data storage - /// - /// @returns Pointer to the underlying data storage. - /// For a non-empty span, the returned pointer contains address of the first element. - /// For an empty span, the returned pointer may or may not be a null pointer. 
- template - constexpr T* Data() const { return reinterpret_cast(Base::Data()); } - - /// Implicitly convert this void Span to its const void equivalent - /// - /// @returns The same span, but of const void type - operator Span() const { return Span(Data(), NumElements()); } - - /// Returns a "subspan", a view over a subset range of the elements. - /// - /// @warning Behavior is undefined if either - /// - offset is greater than NumElements(), or - /// - count is not size_t(-1) and is greater than NumElements()-offset. - /// - /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses - /// in the same way to mean "take the remainder of the elements from offset". - /// - /// @param offset Zero-based offset to start the subspan at - /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset - /// - /// @returns The subspan - template - Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } - - /// Returns a subspan dropping the specified number (default 1) of elements from the front. - /// Returns an empty Span if there were no more elements than that to start with. - /// - /// @param count Number of elements to drop from the front - /// - /// @returns The subspan - template - Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } - - /// Returns a subspan dropping the specified number (default 1) of elements from the back. - /// Returns an empty Span if there were no more elements than that to start with. - /// - /// @param count Number of elements to drop from the back - /// - /// @returns The subspan - template - Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } -}; - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2022-2025 Advanced Micro Devices, Inc. 
All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palSpan.h +* @brief PAL utility collection Span class declaration. +*********************************************************************************************************************** +*/ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" +#include "palSysMemory.h" +#include "palInlineFuncs.h" +#include + +namespace Util +{ + +/** + *********************************************************************************************************************** + * @brief Span container + * + * Span is an array with a length, where the data is not owned by the Span object. 
It is similar to C++20 std::span, + * but only the dynamic extent variant. It is similar to LLVM MutableArrayRef and ArrayRef. A Span is intended to + * be passed around by value. + * + *********************************************************************************************************************** + */ +template +class Span +{ +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span. + constexpr Span() : m_pData(nullptr), m_numElements(0) {} + + /// Constructor from pointer and length + /// + /// @param [in] data Pointer to the start of the array + /// @param numElements Number of elements in the array + constexpr Span(T* pData, size_t numElements) : m_pData(pData), m_numElements(numElements) {} + + /// Copy constructor + /// + /// @param [in] src Other Span to copy from + constexpr Span(const Span& src) : m_pData(src.m_pData), m_numElements(src.m_numElements) {} + + /// Constructor from C++ array + /// + /// @param [in] src C++ array + template constexpr Span(T(& src)[NumElements]) : m_pData(&src[0]), m_numElements(NumElements) {} + + /// Constructor from single element + /// + /// @param [in] src Single element + constexpr Span(T& src) : m_pData(&src), m_numElements(1) {} + + /// Implicitly convert a Span to its const-element equivalent. + /// + /// @returns The same span, but with const element type + constexpr operator Span() const { return Span(m_pData, m_numElements); } + + /// Assignment operator + /// + /// @param [in] src Other Span to copy from + constexpr Span& operator=(const Span& src) { + m_pData = src.m_pData; + m_numElements = src.m_numElements; + return *this; + } + + ///@{ + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. 
+ /// + /// @returns The element at location specified by index by reference + constexpr T& At(size_t index) const + { + PAL_CONSTEXPR_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + constexpr T& operator[](size_t index) const noexcept { return At(index); } + ///@} + + /// Returns the data at the front of the vector. + /// + /// @returns The data at the front of the vector. + constexpr T& Front() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return *m_pData; + } + + /// Returns the data at the back of the vector. + /// + /// @returns The data at the back of the vector. + constexpr T& Back() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return *(m_pData + (m_numElements - 1)); + } + + /// Returns an iterator to the first element of the vector. + /// + /// @returns An iterator to first element of the vector. + constexpr T* Begin() const { return m_pData; } + + /// Returns an iterator beyond the last element of the vector. (NOT at the last element like Util::Vector::End()!) + /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns VectorIterator An iterator to last element of the vector. + constexpr T* End() const { return m_pData + m_numElements; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. + /// For an empty span, the returned pointer may or may not be a null pointer. + constexpr T* Data() const { return m_pData; } + + /// Returns the extent of the span. + /// + /// @returns An unsigned integer equal to the number of elements currently present in the span. + constexpr size_t NumElements() const { return m_numElements; } + + /// Returns the size in bytes the Span represents. + /// + /// @returns An unsigned integer equal to the size in bytes the entire span represents. 
+ constexpr size_t SizeInBytes() const { return ElementSize() * m_numElements; } + + /// Returns true if the number of elements present in the vector is equal to zero. + /// + /// @returns True if the span is empty. + constexpr bool IsEmpty() const { return (m_numElements == 0); } + + /// Returns a "subspan", a view over a subset range of the elements. + /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + constexpr Span Subspan( + size_t offset, + size_t count) const + { + PAL_CONSTEXPR_ASSERT((offset <= NumElements()) + && ((count == size_t(-1)) || (count <= NumElements() - offset))); + if (count == size_t(-1)) + { + count = NumElements() - offset; + } + return Span(Data() + offset, count); + } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + constexpr Span DropFront( + size_t count = 1) const + { + Span retVal; + if (count < NumElements()) + { + retVal = Subspan(count, size_t(-1)); + } + return retVal; + } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. 
+ /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + constexpr Span DropBack( + size_t count = 1) const + { + Span retVal; + if (count < NumElements()) + { + retVal = Subspan(0, NumElements() - count); + } + return retVal; + } + + ///@{ + /// @internal Satisfies concept `range_expression`, using T* as `iterator` and size_t size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as `range for`. + /// These should not be called directly as they do not adhere to PAL coding standards. + using value_type = T; + using reference = T&; + using iterator = T*; + using difference_type = size_t; + using size_type = size_t; + + constexpr iterator begin() const noexcept { return m_pData; } + constexpr iterator end() const noexcept { return (m_pData + m_numElements); } + constexpr bool empty() const noexcept { return IsEmpty(); } + constexpr size_type size() const noexcept { return m_numElements; } + ///@} + +protected: + template, char, U>> + static constexpr size_t ElementSize() { return sizeof(R); } + + template + using IfConst = std::enable_if_t == Condition>; + + template + using IfPtr = std::enable_if_t == Condition>; + + T* m_pData; // Pointer to the current data. + size_t m_numElements; // Number of elements present. +}; + +/// ==================================================================================================================== +/// Span template specialization for const void byte buffers. +template<> +class Span : public Span +{ +using Byte = const char; +using Base = Span; + +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span.
+ constexpr Span() : Base() {} + + /// Template constructor from any pointer and length + /// + /// @param [in] data Pointer to the start of the buffer or array + /// @param numElements Number of bytes or elements in the buffer or array + template + Span(const T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) {} + + /// Template copy constructor + /// + /// @param [in] src Other Span to copy from + template + Span(const Span& src) : Span(src.Data(), src.NumElements()) {} + + /// Template constructor from any C++ array + /// + /// @param [in] src C++ array + template + Span(const T(& src)[NumElements]) : Span(&src[0], NumElements) {} + + /// Constructor from any single element + /// + /// @param [in] src Single element + template> + Span(const T& src) : Span(&src, 1) {} + + /// Templated conversion of this typeless Span to a typed Subspan + /// + /// @returns A subspan with typed elemeents and NumElements truncated down to the nearest sizeof(type) + template + operator Span() const + { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } + + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + template + const T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } + + /// Returns pointer to the underlying buffer serving as data storage. + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. + /// For an empty span, the returned pointer may or may not be a null pointer. + template + constexpr const T* Data() const { return reinterpret_cast(Base::Data()); } + + /// Returns a "subspan", a view over a subset range of the elements. 
+ /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + template + Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + template + Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + template + Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } +}; + +/// ==================================================================================================================== +/// Span template specialization for mutable void byte buffers. +template<> +class Span : public Span +{ +using Byte = char; +using Base = Span; + +public: + /// Constructor from nothing. This allows you to use {} to mean an empty Span. 
+ constexpr Span() : Base() {} + + /// Template constructor from any pointer and length + /// + /// @param [in] data Pointer to the start of the buffer or array + /// @param sizeInBytes,numElements Number of bytes or elements in the buffer or array + template> + Span(T* pData, size_t numElements) : Base(reinterpret_cast(pData), ElementSize() * numElements) { } + + /// Template copy constructor + /// + /// @param [in] src Other Span to copy from + template> + Span(const Span& src) : Span(src.Data(), src.NumElements()) {} + + /// Template constructor from any C++ array + /// + /// @param [in] src C++ array + template> + Span(T(& src)[NumElements]) : Span(&src[0], NumElements) {} + + /// Constructor from any single element + /// + /// @param [in] src Single element + template, typename = IfPtr> + Span(T& src) : Span(&src, 1) {} + + /// Templated conversion of this typeless Span to a typed Subspan + /// + /// @returns A subspan with typed elemeents and NumElements truncated down to the nearest sizeof(type) + template + operator Span() const { return Span(static_cast(Data()), SizeInBytes() / ElementSize()); } + + /// Returns the element at the location specified. + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + template + T& At(size_t index) const { return reinterpret_cast(Base::At(index)); } + + /// Returns pointer to the underlying buffer serving as data storage + /// + /// @returns Pointer to the underlying data storage. + /// For a non-empty span, the returned pointer contains address of the first element. + /// For an empty span, the returned pointer may or may not be a null pointer. 
+ template + constexpr T* Data() const { return reinterpret_cast(Base::Data()); } + + /// Implicitly convert this void Span to its const void equivalent + /// + /// @returns The same span, but of const void type + operator Span() const { return Span(Data(), NumElements()); } + + /// Returns a "subspan", a view over a subset range of the elements. + /// + /// @warning Behavior is undefined if either + /// - offset is greater than NumElements(), or + /// - count is not size_t(-1) and is greater than NumElements()-offset. + /// + /// Note that size_t(-1) is equivalent to C++20 std::dynamic_extent, which the C++20 std::span::subspan uses + /// in the same way to mean "take the remainder of the elements from offset". + /// + /// @param offset Zero-based offset to start the subspan at + /// @param count Number of elements in the subspan, or size_t(-1) for the remainder of the elements from offset + /// + /// @returns The subspan + template + Span Subspan(size_t offset, size_t count) const { return Span(Base::Subspan(offset, count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the front. + /// Returns an empty Span if there were no more elements than that to start with. + /// + /// @param count Number of elements to drop from the front + /// + /// @returns The subspan + template + Span DropFront(size_t count = 1) const { return Span(Base::DropFront(count)); } + + /// Returns a subspan dropping the specified number (default 1) of elements from the back. + /// Returns an empty Span if there were no more elements than that to start with. 
+ /// + /// @param count Number of elements to drop from the back + /// + /// @returns The subspan + template + Span DropBack(size_t count = 1) const { return Span(Base::DropBack(count)); } +}; + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h index 56ec55099f..fb6ef9b694 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palStringUtil.h @@ -1,188 +1,188 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palStringUtil.h - * @brief PAL String utility collection functions. - *********************************************************************************************************************** - */ - -#pragma once - -#include -#include -#include - -#include "palUtil.h" - -namespace Util -{ -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 -/// Returns the length of a wchar_t based string. -/// -/// @param [in] wide string to query -/// -/// @returns The length of the given string in wide characters -extern size_t PalWcslen( - const wchar_t* pWideStr); - -/// Performs a reverse string find of wide character wc. -/// -/// @param [in] wide string to scan -/// @param [in] wide character to find -/// -/// @returns The matching character at the end of the string or nullptr if not found. -extern wchar_t* PalWcsrchr( - wchar_t *pStr, - wchar_t wc); -#endif - -/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but mbstowcs still treats the dest -/// as 32 bit so we provide our own implementation. -/// -/// @param [out] dst string -/// @param [in] src string -/// @param [in] size of the destination buffer in words -/// -/// @returns Returns whether or not the conversion was successful. -extern bool ConvertCharStringToUtf16( - wchar_t* pDst, - const char* pSrc, - size_t dstSizeInWords); - -/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but wcstombs still treats the src -/// as 32 bit so we provide our own implementation. -/// -/// @param [out] dst string -/// @param [in] src string -/// @param [in] size of the destination buffer in bytes -/// -/// @returns Returns whether or not the conversion was successful. 
-extern bool ConvertUtf16StringToUtf8( - char* pDst, - const wchar_t* pSrc, - size_t dstSizeInBytes); - -/// Convert wchar_t string to UTF-8 string. Works whether wchar_t is 16 or 32 bits. -/// If wchar_t is 16 bits, this decodes UTF-16. -/// -/// @param [out] dst string -/// @param [in] src string -/// @param [in] size of the destination buffer in bytes -/// -/// @returns Returns whether or not the conversion was successful. -bool ConvertWcharStringToUtf8(char* pDst, const wchar_t* pSrc, size_t dstSizeInBytes); - -/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but wcsncpy still treats its arguments -/// as 32 bit so we provide our own implementation. -/// -/// @param [out] pDst Destination string. -/// @param [in] pSrc Source string to copy. -/// @param [in] dstSize Length of the destination buffer, in wchar_t's. -extern void CopyUtf16String( - wchar_t* pDst, - const wchar_t* pSrc, - size_t dstSize); - -/// A shared helper function which takes an arbitrary blob of data and formats it into a human readable "memory view" -/// string. This is intended to be used by logging code. -/// -/// Imagine your input buffer is: { 0xef, 0xbe, 0xad, 0xde, 0x78, 0x56, 0x34, 0x12, 0xab }, then the string looks like -/// this with a blockSize of 4: "0xdeadbeef 0x12345678 0xab". So the block size determines how many bytes are combined -/// into one "0x" character block. The whole block is effetively cast into an integer of that size and printed in big -/// endian. Trailing bytes are printed without being size-extended. If a block won't fit at the end of the string it -/// is skipped (update your buffer pointer and call again to continue). -/// -/// The return value is the number of bytes consumed from pBuffer. The idea is that you can loop until the full size -/// is consumed, printing a new line for each call. -/// -/// @param [out] pDst The caller-provided destination string. -/// @param [in] dstSize The length of pDst in bytes. 
-/// @param [in] pBuffer The arbitrary data blob to turn into a string. -/// @param [in] bufferSize The length of pBuffer in bytes. -/// @param [in] blockSize How many bytes to combine into one hexidecimal big endian string. -/// -/// @returns The number of bytes from pBuffer that were formatted into pDst. -extern size_t BytesToStr( - char* pDst, - size_t dstSize, - const void* pBuffer, - size_t bufferSize, - size_t blockSize); - -/// Returns the length of the string. -/// -/// @returns String length. -constexpr uint32 StringLength( - const char* pString) -{ - // TODO: On C++23 we can replace this with consteval-if. - // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. -#if defined(__cpp_lib_is_constant_evaluated) - if (std::is_constant_evaluated()) -#else - if (__builtin_is_constant_evaluated()) -#endif - { - uint32 length = 0; - while (pString[length] != '\0') - { - length++; - } - return length; - } - else - { - return uint32(std::strlen(pString)); - } -} -/// Returns the length of the string. -/// -/// @returns String length. -constexpr uint32 StringLength( - const wchar_t* pString) -{ - // TODO: On C++23 we can replace this with consteval-if. - // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. -#if defined(__cpp_lib_is_constant_evaluated) - if (std::is_constant_evaluated()) -#else - if (__builtin_is_constant_evaluated()) -#endif - { - uint32 length = 0; - while (pString[length] != L'\0') - { - length++; - } - return length; - } - else - { - return uint32(std::wcslen(pString)); - } -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palStringUtil.h + * @brief PAL String utility collection functions. + *********************************************************************************************************************** + */ + +#pragma once + +#include +#include +#include + +#include "palUtil.h" + +namespace Util +{ +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 919 +/// Returns the length of a wchar_t based string. +/// +/// @param [in] wide string to query +/// +/// @returns The length of the given string in wide characters +extern size_t PalWcslen( + const wchar_t* pWideStr); + +/// Performs a reverse string find of wide character wc. 
+/// +/// @param [in] wide string to scan +/// @param [in] wide character to find +/// +/// @returns The matching character at the end of the string or nullptr if not found. +extern wchar_t* PalWcsrchr( + wchar_t *pStr, + wchar_t wc); +#endif + +/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but mbstowcs still treats the dest +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in words +/// +/// @returns Returns whether or not the conversion was successful. +extern bool ConvertCharStringToUtf16( + wchar_t* pDst, + const char* pSrc, + size_t dstSizeInWords); + +/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but wcstombs still treats the src +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in bytes +/// +/// @returns Returns whether or not the conversion was successful. +extern bool ConvertUtf16StringToUtf8( + char* pDst, + const wchar_t* pSrc, + size_t dstSizeInBytes); + +/// Convert wchar_t string to UTF-8 string. Works whether wchar_t is 16 or 32 bits. +/// If wchar_t is 16 bits, this decodes UTF-16. +/// +/// @param [out] dst string +/// @param [in] src string +/// @param [in] size of the destination buffer in bytes +/// +/// @returns Returns whether or not the conversion was successful. +bool ConvertWcharStringToUtf8(char* pDst, const wchar_t* pSrc, size_t dstSizeInBytes); + +/// When the -fshort-char compiler option is specified, wchar_t is 16 bits, but wcsncpy still treats its arguments +/// as 32 bit so we provide our own implementation. +/// +/// @param [out] pDst Destination string. +/// @param [in] pSrc Source string to copy. +/// @param [in] dstSize Length of the destination buffer, in wchar_t's. 
+extern void CopyUtf16String( + wchar_t* pDst, + const wchar_t* pSrc, + size_t dstSize); + +/// A shared helper function which takes an arbitrary blob of data and formats it into a human readable "memory view" +/// string. This is intended to be used by logging code. +/// +/// Imagine your input buffer is: { 0xef, 0xbe, 0xad, 0xde, 0x78, 0x56, 0x34, 0x12, 0xab }, then the string looks like +/// this with a blockSize of 4: "0xdeadbeef 0x12345678 0xab". So the block size determines how many bytes are combined +/// into one "0x" character block. The whole block is effetively cast into an integer of that size and printed in big +/// endian. Trailing bytes are printed without being size-extended. If a block won't fit at the end of the string it +/// is skipped (update your buffer pointer and call again to continue). +/// +/// The return value is the number of bytes consumed from pBuffer. The idea is that you can loop until the full size +/// is consumed, printing a new line for each call. +/// +/// @param [out] pDst The caller-provided destination string. +/// @param [in] dstSize The length of pDst in bytes. +/// @param [in] pBuffer The arbitrary data blob to turn into a string. +/// @param [in] bufferSize The length of pBuffer in bytes. +/// @param [in] blockSize How many bytes to combine into one hexidecimal big endian string. +/// +/// @returns The number of bytes from pBuffer that were formatted into pDst. +extern size_t BytesToStr( + char* pDst, + size_t dstSize, + const void* pBuffer, + size_t bufferSize, + size_t blockSize); + +/// Returns the length of the string. +/// +/// @returns String length. +constexpr uint32 StringLength( + const char* pString) +{ + // TODO: On C++23 we can replace this with consteval-if. + // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. 
+#if defined(__cpp_lib_is_constant_evaluated) + if (std::is_constant_evaluated()) +#else + if (__builtin_is_constant_evaluated()) +#endif + { + uint32 length = 0; + while (pString[length] != '\0') + { + length++; + } + return length; + } + else + { + return uint32(std::strlen(pString)); + } +} +/// Returns the length of the string. +/// +/// @returns String length. +constexpr uint32 StringLength( + const wchar_t* pString) +{ + // TODO: On C++23 we can replace this with consteval-if. + // TODO: When we upgrade PAL_CPLUSPLUS then we can rely that std::is_constant_evaluated() is always defined. +#if defined(__cpp_lib_is_constant_evaluated) + if (std::is_constant_evaluated()) +#else + if (__builtin_is_constant_evaluated()) +#endif + { + uint32 length = 0; + while (pString[length] != L'\0') + { + length++; + } + return length; + } + else + { + return uint32(std::wcslen(pString)); + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h b/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h index e907de1d15..34057ebd60 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palStringView.h @@ -1,228 +1,228 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file palStringView.h -* @brief PAL utility collection string view declaration. -*********************************************************************************************************************** -*/ - -#pragma once - -#include "palAssert.h" -#include "palInlineFuncs.h" -#include "palStringUtil.h" -#include "palUtil.h" -#include - -namespace Util -{ - -/** -*********************************************************************************************************************** -* @brief String view. -* -* A StringView is a templated view over a constant contiguous sequence of characters. 
-* -* @warning The string view assumes that its lifetime does not extend past that of the pointed-to character sequence. -*********************************************************************************************************************** -*/ -template -class StringView -{ - static_assert((std::is_same::value || std::is_same::value), - "StringView type T must be either char or wchar_t."); -public: - /// Constructs an empty StringView. - constexpr StringView() - : - m_pData{}, - m_length{} - {} - - constexpr StringView( - const CharT* s, - uint32 count) - : - m_pData{s}, - m_length{count} - { - PAL_CONSTEXPR_ASSERT((s != nullptr) || (count == 0)); - } - - StringView( - const CharT* s) - : - StringView() - { - if (s != nullptr) - { - m_length = StringLength(s); - m_pData = s; - } - } - - constexpr StringView(std::nullptr_t) = delete; - - ///@{ - /// Returns the element at the location specified. - /// - /// @warning Calling this function with an out-of-bounds index will cause an access violation! - /// - /// @param [in] index Integer location of the element needed. - /// - /// @returns The element at location specified by index by reference - constexpr const CharT& At(uint32 index) const - { - PAL_CONSTEXPR_ASSERT(index < Length()); - return m_pData[index]; - } - - constexpr const CharT& operator[](uint32 index) const { return At(index); } - ///@} - - /// Returns pointer to the underlying string serving as data storage. - /// The returned pointer defines always valid range [Data(), Data() + Length()), - /// even if the view does not point to any data storage (Data() is not dereferenceable in that case). - /// - /// @warning Dereferencing pointer returned by Data() from a view that does not point to a data storage will cause - /// an access violation! - /// - /// @returns Pointer to the underlying data storage for read access. - /// For a view to a valid data storage, the returned pointer contains address of the first element. 
- /// For a view without a valid data storage, the returned pointer will be a @c nullptr. - constexpr const CharT* Data() const noexcept { return m_pData; } - - /// Returns the data at the front of the view. - /// - /// @warning Calling this function on an empty view will cause an access violation! - /// - /// @returns The data at the front of the view. - constexpr const CharT& Front() const - { - PAL_CONSTEXPR_ASSERT(IsEmpty() == false); - return m_pData[0]; - } - - /// Returns the data at the back of the view. - /// - /// @warning Calling this function on an empty view will cause an access violation! - /// - /// @returns The data at the back of the view. - constexpr const CharT& Back() const - { - PAL_CONSTEXPR_ASSERT(IsEmpty() == false); - return m_pData[Length() - 1]; - } - - /// Returns the length of the string. - /// - /// @returns An unsigned integer equal to the length of the string. - constexpr uint32 Length() const { return m_length; } - - /// Returns true if the number of characters the view points to is equal to zero. - /// - /// @returns True if the view points to an empty or non-existing data storage. - constexpr bool IsEmpty() const { return (m_length == 0); } - - ///@{ - /// @internal Satisfies concept `range_expression`, using CharT* as `iterator`. - /// - /// @note - These are a convenience intended to be used by C++ language features such as `range-based for`. - /// These should not be called directly as they do not adhere to PAL coding standards. 
- using const_iterator = const CharT*; - - constexpr const_iterator begin() const noexcept { return m_pData; } - constexpr const_iterator end() const noexcept { return m_pData + Length(); } - ///@} - -private: - const CharT* m_pData; - uint32 m_length; -}; - -// ===================================================================================================================== -template -constexpr bool operator==( - StringView x, - StringView y) -{ - bool equal = (x.Length() == y.Length()); - if (equal) - { - if (x.Data() != y.Data()) - { - // they are not pointing to the same storage, so we need to compare the contents - for (uint32 index = 0; equal && (index < x.Length()); ++index) - { - equal = (x[index] == y[index]); - } - } - } - return equal; -} - -// ===================================================================================================================== -template -constexpr bool operator!=(StringView x, StringView y) { return (x == y) == false; } - -// ===================================================================================================================== -template -bool operator<( - StringView x, - StringView y) -{ - const uint32 minLength = Min(x.Length(), y.Length()); - int compare = strncmp(x.Data(), y.Data(), minLength); - if (compare == 0) - { - // strings are equal up to minLength, so check which is shorter - compare = int(x.Length()) - int(y.Length()); - } - - return compare < 0; -} - -// ===================================================================================================================== -template -bool operator<=(StringView x, StringView y) { return (y < x) == false; } - -// ===================================================================================================================== -template -bool operator>(StringView x, StringView y) { return y < x; } - -// ===================================================================================================================== -template -bool 
operator>=(StringView x, StringView y) { return (x < y) == false; } - -/// Specialization of @ref HashString(const char*,size_t) for @ref StringView. -template -constexpr uint32 HashString( - StringView sv) -{ - return HashString(sv.Data(), sv.Length()); -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palStringView.h +* @brief PAL utility collection string view declaration. 
+*********************************************************************************************************************** +*/ + +#pragma once + +#include "palAssert.h" +#include "palInlineFuncs.h" +#include "palStringUtil.h" +#include "palUtil.h" +#include + +namespace Util +{ + +/** +*********************************************************************************************************************** +* @brief String view. +* +* A StringView is a templated view over a constant contiguous sequence of characters. +* +* @warning The string view assumes that its lifetime does not extend past that of the pointed-to character sequence. +*********************************************************************************************************************** +*/ +template +class StringView +{ + static_assert((std::is_same::value || std::is_same::value), + "StringView type T must be either char or wchar_t."); +public: + /// Constructs an empty StringView. + constexpr StringView() + : + m_pData{}, + m_length{} + {} + + constexpr StringView( + const CharT* s, + uint32 count) + : + m_pData{s}, + m_length{count} + { + PAL_CONSTEXPR_ASSERT((s != nullptr) || (count == 0)); + } + + StringView( + const CharT* s) + : + StringView() + { + if (s != nullptr) + { + m_length = StringLength(s); + m_pData = s; + } + } + + constexpr StringView(std::nullptr_t) = delete; + + ///@{ + /// Returns the element at the location specified. + /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + constexpr const CharT& At(uint32 index) const + { + PAL_CONSTEXPR_ASSERT(index < Length()); + return m_pData[index]; + } + + constexpr const CharT& operator[](uint32 index) const { return At(index); } + ///@} + + /// Returns pointer to the underlying string serving as data storage. 
+ /// The returned pointer defines always valid range [Data(), Data() + Length()), + /// even if the view does not point to any data storage (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from a view that does not point to a data storage will cause + /// an access violation! + /// + /// @returns Pointer to the underlying data storage for read access. + /// For a view to a valid data storage, the returned pointer contains address of the first element. + /// For a view without a valid data storage, the returned pointer will be a @c nullptr. + constexpr const CharT* Data() const noexcept { return m_pData; } + + /// Returns the data at the front of the view. + /// + /// @warning Calling this function on an empty view will cause an access violation! + /// + /// @returns The data at the front of the view. + constexpr const CharT& Front() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return m_pData[0]; + } + + /// Returns the data at the back of the view. + /// + /// @warning Calling this function on an empty view will cause an access violation! + /// + /// @returns The data at the back of the view. + constexpr const CharT& Back() const + { + PAL_CONSTEXPR_ASSERT(IsEmpty() == false); + return m_pData[Length() - 1]; + } + + /// Returns the length of the string. + /// + /// @returns An unsigned integer equal to the length of the string. + constexpr uint32 Length() const { return m_length; } + + /// Returns true if the number of characters the view points to is equal to zero. + /// + /// @returns True if the view points to an empty or non-existing data storage. + constexpr bool IsEmpty() const { return (m_length == 0); } + + ///@{ + /// @internal Satisfies concept `range_expression`, using CharT* as `iterator`. + /// + /// @note - These are a convenience intended to be used by C++ language features such as `range-based for`. 
+ /// These should not be called directly as they do not adhere to PAL coding standards. + using const_iterator = const CharT*; + + constexpr const_iterator begin() const noexcept { return m_pData; } + constexpr const_iterator end() const noexcept { return m_pData + Length(); } + ///@} + +private: + const CharT* m_pData; + uint32 m_length; +}; + +// ===================================================================================================================== +template +constexpr bool operator==( + StringView x, + StringView y) +{ + bool equal = (x.Length() == y.Length()); + if (equal) + { + if (x.Data() != y.Data()) + { + // they are not pointing to the same storage, so we need to compare the contents + for (uint32 index = 0; equal && (index < x.Length()); ++index) + { + equal = (x[index] == y[index]); + } + } + } + return equal; +} + +// ===================================================================================================================== +template +constexpr bool operator!=(StringView x, StringView y) { return (x == y) == false; } + +// ===================================================================================================================== +template +bool operator<( + StringView x, + StringView y) +{ + const uint32 minLength = Min(x.Length(), y.Length()); + int compare = strncmp(x.Data(), y.Data(), minLength); + if (compare == 0) + { + // strings are equal up to minLength, so check which is shorter + compare = int(x.Length()) - int(y.Length()); + } + + return compare < 0; +} + +// ===================================================================================================================== +template +bool operator<=(StringView x, StringView y) { return (y < x) == false; } + +// ===================================================================================================================== +template +bool operator>(StringView x, StringView y) { return y < x; } + +// 
===================================================================================================================== +template +bool operator>=(StringView x, StringView y) { return (x < y) == false; } + +/// Specialization of @ref HashString(const char*,size_t) for @ref StringView. +template +constexpr uint32 HashString( + StringView sv) +{ + return HashString(sv.Data(), sv.Length()); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h b/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h index 4b22d88616..c43ef0069e 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSysMemory.h @@ -1,820 +1,820 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palSysMemory.h - * @brief PAL utility collection system memory management macros. - *********************************************************************************************************************** - */ - -#pragma once - -#include "palAssert.h" -#include "palInlineFuncs.h" -#include "palMemTracker.h" -#include -#include - -// Forward declarations -namespace Util { struct AllocInfo; } -namespace Util { struct FreeInfo; } -namespace Util { enum SystemAllocType : uint32; } - -#if !defined(__GNUC__) || (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 8)) -/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. -#define PAL_DEFAULT_MEM_ALIGN alignof(std::max_align_t) -#else -// GCC versions prior to 4.9 break C++11 compatibility by putting max_align_t in the global namespace. -/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. -#define PAL_DEFAULT_MEM_ALIGN alignof(::max_align_t) -#endif - -namespace Util -{ - -/// Informs that @p p is aligned to at least @p Alignment. -template -constexpr T* AssumeAligned(T* p) -{ - return static_cast(__builtin_assume_aligned(p, Alignment)); -} - -} // Util - -#if PAL_MEMTRACK - -/// @internal Malloc allocation method with extra memory leak tracking arguments. 
-#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ - _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType, _memBlkType, __FILE__, __LINE__)) - -/// @internal Calloc allocation method with extra memory leak tracking arguments. -#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ - _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType, _memBlkType, __FILE__, __LINE__)) - -/// @internal Free method with extra memory leak tracking arguments. -#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ - _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)), _memBlkType)) - -#else - -/// @internal Malloc method not wrapped with memory leak tracking. -#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ - _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType)) - -/// @internal Calloc method not wrapped with memory leak tracking. -#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ - _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType)) - -/// @internal Free method not wrapped with memory leak tracking. -#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ - _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)))) - -#endif - -/// Allocates heap memory in place of malloc(). -/// -/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. -/// The client is also free to use this macro in order to take advantage of PAL's memory leak tracking. -#define PAL_MALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ - PAL_MALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) - -/// Same as @ref PAL_MALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. 
-#define PAL_MALLOC(_size, _allocator, _allocType) \ - PAL_MALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) - -/// Allocates zero-initialized heap memory in place of calloc(). See @ref PAL_MALLOC_ALIGNED. -#define PAL_CALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ - PAL_CALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) - -/// Same as @ref PAL_CALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. -#define PAL_CALLOC(_size, _allocator, _allocType) \ - PAL_CALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) - -/// Frees heap memory allocated with the @ref PAL_MALLOC* or @ref PAL_CALLOC* macros. -#define PAL_FREE(_ptr, _allocator) PAL_FREE_BASE((_ptr), (_allocator), ::Util::MemBlkType::Malloc) - -/// Safe free macro. Pointer is set to null after the free. -#define PAL_SAFE_FREE(_ptr, _allocator) { PAL_FREE((_ptr), (_allocator)); (_ptr) = nullptr; } - -/// @internal -/// -/// This type only exists to force a unique override for placement new. We need to override placement new in order to -/// call the constructor in the PAL_NEW and PAL_NEW_ARRAY implementations, but we do not want to overload global -/// placement new or include \ since either could interfere with the client. Adding a dummy parameter allows us -/// to define a PAL-only placement new implementation. -namespace Util -{ -struct Dummy -{ - explicit Dummy() { } ///< Explicit default constructor prevents this from being instantiated via unqualified "{}". -}; -} - -/// @internal -/// -/// PAL-internal placement new override. The Dummy is used to ensure there won't be a conflict if a client tries to -/// override global placement new. -/// -/// @param [in] size Size of the memory allocation. -/// @param [in] pObjMem Memory where object will be constructed. -/// @param [in] dummy Unused. 
-extern void* PAL_CDECL operator new( - size_t size, - void* pObjMem, - Util::Dummy dummy) noexcept; - -/// @internal -/// -/// Silences compiler warnings about not have a matching delete for the placement new override above. Will never be -/// called. -/// -/// @param [in] pObj Unused. -/// @param [in] pObjMem Unused. -/// @param [in] dummy Unused. -extern void PAL_CDECL operator delete( - void* pObj, - void* pObjMem, - Util::Dummy dummy) noexcept; - -/// Placement new macro. -#define PAL_PLACEMENT_NEW(_ptr) new((_ptr), ::Util::Dummy{}) - -/// Allocates heap memory and calls constructor for an object of the specified type. -/// -/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. -/// The client is also free to use this macro. -/// -/// Instead of calling "MyClass* pMyClass = new MyClass(arg1, arg2)", call -/// "MyClass* pMyClass = PAL_NEW(MyClass, AllocInternal)(arg1, arg2)". -#define PAL_NEW(_className, _allocator, _allocType) \ - PAL_PLACEMENT_NEW( \ - PAL_MALLOC_BASE(sizeof(_className), alignof(_className), (_allocator), (_allocType), ::Util::MemBlkType::New)) \ - _className - -/// Calls destructor and frees heap memory for the object allocated with PAL_NEW*. -#define PAL_DELETE(_ptr, _allocator) \ -{ \ - /* we want to evaluate the expression (_allocator) before calling - the destructor because the destructor might have side effects */ \ - auto _allocator_ = (_allocator); \ - ::Util::Destructor(_ptr); \ - PAL_FREE_BASE((_ptr), _allocator_, ::Util::MemBlkType::New); \ -} - -/// Calls destructor and frees heap memory for "this". Use this macro to delete an object without a public destructor. 
-#define PAL_DELETE_THIS(_className, _allocator) \ -{ \ - /* we want to evaluate the expression (_allocator) before calling - the destructor because the destructor might have side effects */ \ - auto _allocator_ = (_allocator); \ - this->~_className(); \ - PAL_FREE_BASE(this, _allocator_, ::Util::MemBlkType::New); \ -} - -/// Safe delete macro. Pointer is set to null after the delete. -#define PAL_SAFE_DELETE(_ptr, _allocator) { PAL_DELETE(_ptr, _allocator); (_ptr) = nullptr; } - -/// Allocates an array of the specified object type. -/// -/// For non-POD types, the default constructor will be called. Default constructor is not available for POD types -/// (i.e., PAL_NEW_ARRAY(int, 3, AllocInternal)() won't work. -#if PAL_MEMTRACK -#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ - ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType), __FILE__, __LINE__) -#else -#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ - ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType)) -#endif - -/// Destroys an array of the specified object type. -/// -/// For non-POD types, the destructor will be called. -#define PAL_DELETE_ARRAY(_ptr, _allocator) ::Util::DeleteArray(_ptr, _allocator) - -/// Safe delete array macro. Pointer is set to null after the delete. -#define PAL_SAFE_DELETE_ARRAY(_ptr, _allocator) { PAL_DELETE_ARRAY(_ptr, _allocator); (_ptr) = nullptr; } - -namespace Util -{ -/// Specifies the usage of a system memory allocation made via a client allocation callback. -/// -/// The selected type gives the client an idea of the expected lifetime of the allocation, perhaps allowing intelligent -/// selection of sub-allocation pool, etc. -/// -/// @note This is a weak uint32 enum where all PAL values set the top bit. The client is free to use PAL's memory -/// utilities for their own allocations with their own uint32 enum using the range 0 to 0x7FFFFFFF. 
The client's -/// allocation callback can then separately handle any memory allocation category, whether allocated by PAL or -/// themselves. -/// -/// @see AllocCallbacks -/// @see AllocFunc -enum SystemAllocType : uint32 -{ - /// Indicates an allocation will be attached to a client-created PAL object and will not be freed until the client - /// frees the associated object. This type will be specified when allocation callbacks are made during a PAL create - /// call (e.g., IDevice::CreateGraphicsPipeline()). - AllocObject = 0x80000000, - - /// Indicates an allocation is for internal PAL use. The client should assume such allocations have a long - /// lifetime, and may not be freed until IPlatform::Destroy() is called. - AllocInternal = 0x80000001, - - /// Indicates an allocation is for internal PAL use and that the lifetime of the allocation will be short. - /// Typically this will be specified for heap allocations that will be freed before control is returned to the - /// client. - AllocInternalTemp = 0x80000002, - - /// Indicates an allocation was requested by the shader compiler. - AllocInternalShader = 0x80000003 -}; - -/// Function pointer type defining a callback for client-controlled system memory allocation. -/// -/// @see AllocCallbacks -/// -/// @ingroup LibInit -/// -/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter -/// to CreatePlatform() will be passed back to the client on every allocation callback. -/// @param [in] size Size of the requested allocation in bytes. Must be non-zero. -/// @param [in] alignment Required alignment of the requested allocation in bytes. Must be a power of two. -/// @param [in] allocType Hint to client about expected allocation usage and lifetime. See @ref SystemAllocType. -/// -/// @returns Pointer to system memory with the specified size and alignment. nullptr means that the allocation failed. 
-typedef void* (PAL_STDCALL *AllocFunc)( - void* pClientData, - size_t size, - size_t alignment, - SystemAllocType allocType); - -/// Function pointer type defining a callback for client-controlled system memory deallocation. -/// -/// @see AllocCallbacks -/// -/// @ingroup LibInit -/// -/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter -/// to CreatePlatform() will be passed back to the client on every free callback. -/// @param [in] pMem System memory pointer to be freed. The specified pointer must have been allocated by an -/// @ref AllocFunc callback. -typedef void (PAL_STDCALL *FreeFunc)( - void* pClientData, - void* pMem); - -/// Specifies client-provided system allocation callbacks. Used as a parameter to Pal::CreatePlatform(). -/// -/// @ingroup LibInit -struct AllocCallbacks -{ - void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to - /// every @ref AllocFunc and @ref FreeFunc call made by PAL. - AllocFunc pfnAlloc; ///< System memory allocation callback. @see AllocFunc. - FreeFunc pfnFree; ///< System memory deallocation callback. @see FreeFunc. -}; - -/// Information about requested allocation. -/// -/// Contains necessary information (size, alignment, etc.) to allocate new system memory. -/// -/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator -/// does not need to use any of this information, but can if desired. -/// -/// @see Allocators -struct AllocInfo -{ - /// Constructor. - AllocInfo( - size_t bytes, ///< [in] Number of bytes to allocate. - size_t alignment, ///< [in] Required alignment of the requested allocation in bytes. - bool zeroMem, ///< [in] True for calloc, false for malloc. - SystemAllocType allocType ///< [in] Hint on type of allocation and lifetime for client callbacks. 
-#if PAL_MEMTRACK - , MemBlkType blockType, ///< [in] Type of allocation (malloc, new, or new array). - const char* pFilename, ///< [in] Source filename that requested the memory allocation. - uint32 lineNumber ///< [in] Line number in the source file that requested the memory allocation. -#endif - ) - : - bytes(bytes), - alignment(alignment), - zeroMem(zeroMem), - allocType(allocType) -#if PAL_MEMTRACK - , blockType(blockType), - pFilename(pFilename), - lineNumber(lineNumber) -#endif - {} - - size_t bytes; ///< Number of bytes to allocate. - const size_t alignment; ///< Required alignment of the requested allocation in bytes. - const bool zeroMem; ///< True for calloc, false for malloc. - const SystemAllocType allocType; ///< Hint on type of allocation and lifetime for client callbacks. -#if PAL_MEMTRACK - const MemBlkType blockType; ///< Type of allocation (malloc, new, or new array). - const char* pFilename; ///< Source filename that requested the memory allocation. - const uint32 lineNumber; ///< Line number in the source file that requested the memory allocation. -#endif -}; - -/// Information about freeing a specified allocation. -/// -/// Contains necessary information about memory that needs to be freed. -/// -/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator -/// does not need to use any of this information, but can if desired. -/// -/// @see Allocators -struct FreeInfo -{ - /// Constructor. - FreeInfo( - void* pClientMem ///< [in] Pointer to memory allocation. -#if PAL_MEMTRACK - , MemBlkType blockType ///< [in] Type of free (free, delete, or delete array). -#endif - ) - : - pClientMem(pClientMem) -#if PAL_MEMTRACK - , blockType(blockType) -#endif - {} - - void* pClientMem; ///< Pointer to memory allocation. -#if PAL_MEMTRACK - const MemBlkType blockType; ///< Type of free (free, delete, or delete array). 
-#endif -}; - -/** - *********************************************************************************************************************** - * @brief Wraps a AllocCallbacks struct into a class compatible with PAL's Allocator concept. - *********************************************************************************************************************** - */ -class ForwardAllocator -{ -public: - /// Constructor. - ForwardAllocator(const AllocCallbacks& callbacks) : m_callbacks(callbacks) { } - - /// Allocates memory using the provided pfnAlloc callback. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) - { - // Allocating zero bytes of memory results in undefined behavior. - PAL_ASSERT(allocInfo.bytes > 0); - - void* pMem = m_callbacks.pfnAlloc(m_callbacks.pClientData, - allocInfo.bytes, - allocInfo.alignment, - allocInfo.allocType); - - if ((pMem != nullptr) && allocInfo.zeroMem) - { - memset(pMem, 0, allocInfo.bytes); - } - - return pMem; - } - - /// Frees memory using the provided pfnFree callback. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) - { - if (freeInfo.pClientMem != nullptr) - { - m_callbacks.pfnFree(m_callbacks.pClientData, freeInfo.pClientMem); - } - } - -private: - const AllocCallbacks m_callbacks; -}; - -/** -************************************************************************************************************************ -* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps ForwardAllocator. 
-************************************************************************************************************************ -*/ -#if PAL_MEMTRACK -class ForwardAllocatorTracked -{ -public: - /// Constructor - ForwardAllocatorTracked(const AllocCallbacks& callbacks) - : - m_allocator(callbacks), - m_memTracker(&m_allocator) - { - } - - /// Allocates a block of memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) - { - return m_memTracker.Alloc(allocInfo); - } - - /// Frees a block of memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) - { - m_memTracker.Free(freeInfo); - } - -private: - Util::ForwardAllocator m_allocator; ///< The ForwardAllocator which this object wraps. - MemTracker m_memTracker; ///< Memory tracker for this ForwardAllocator. -}; -#else -using ForwardAllocatorTracked = ForwardAllocator; -#endif - -/** -************************************************************************************************************************ -* @brief A wrapper representing an allocator const-pointer. Can be implicitly constructed from any Allocator pointer. -* -* IndirectAllocator is a type-erasure replacement for `Allocator*const pAllocator`, to abstract around Allocator types. -* This allows classes to not need fully template on `typename Allocator`, at the cost of more pointer-indirection. -* -* Const-correctness should be treated as `Allocator*const pAllocator` - the pointed-to Allocator may be mutable. -************************************************************************************************************************ -*/ -class IndirectAllocator -{ -public: - /// Implicit conversion from any Allocator pointer. 
- template - IndirectAllocator(Allocator*const pAllocator) - : - m_pAllocator(pAllocator), - m_pfnAlloc(&DispatchAlloc), - m_pfnFree(&DispatchFree) - { } - - /// Constructor specialization for a pointer to another IndirectAllocator, which acts like a copy constructor. - IndirectAllocator(const IndirectAllocator*const pAllocator) : IndirectAllocator(*pAllocator) { } - - /// Allocates memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) const { return m_pfnAlloc(m_pAllocator, allocInfo); } - - /// Frees memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) const { return m_pfnFree(m_pAllocator, freeInfo); } - - /// Returns true if the allocator == nullptr. Used in place of `pAllocator == nullptr`. - constexpr bool operator==(std::nullptr_t) const { return m_pAllocator == nullptr; } - -private: - /// @internal Allocation dispatch function. This is what the non-template @ref m_pfnAlloc callback references. - template - static void* DispatchAlloc(void*const pAllocator, const AllocInfo& allocInfo) - { - auto*const pTypedAllocator = static_cast(pAllocator); - return pTypedAllocator->Alloc(allocInfo); - } - - /// @internal Free dispatch function. This is what the non-template @ref m_pfnFree callback references. 
- template - static void DispatchFree(void*const pAllocator, const FreeInfo& freeInfo) - { - auto*const pTypedAllocator = static_cast(pAllocator); - pTypedAllocator->Free(freeInfo); - } - - using DispatchAllocCb = void* (*)(void*const, const AllocInfo&); - using DispatchFreeCb = void (*)(void*const, const FreeInfo&); - - void*const m_pAllocator; - - const DispatchAllocCb m_pfnAlloc; - const DispatchFreeCb m_pfnFree; -}; - -/** - *********************************************************************************************************************** - * @brief A generic allocator class that allocate and free memory for general purpose use. - *********************************************************************************************************************** - */ -class GenericAllocator -{ -public: - /// Allocates memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. - /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - static void* Alloc(const AllocInfo& allocInfo); - - /// Frees memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - static void Free(const FreeInfo& freeInfo); -}; - -/** -************************************************************************************************************************ -* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps GenericAllocator. -************************************************************************************************************************ -*/ -#if PAL_MEMTRACK -class GenericAllocatorTracked -{ -public: - /// Constructor - GenericAllocatorTracked() - : - m_memTracker(&m_allocator) - { - Result result = m_memTracker.Init(); - PAL_ASSERT(result == Result::_Success); - } - - /// Allocates a block of memory. - /// - /// @param [in] allocInfo Contains information about the requested allocation. 
- /// - /// @returns Pointer to the allocated memory, nullptr if the allocation failed. - void* Alloc(const AllocInfo& allocInfo) - { - void* pMemory = m_memTracker.Alloc(allocInfo); - return pMemory; - } - - /// Frees a block of memory. - /// - /// @param [in] freeInfo Contains information about the requested free. - void Free(const FreeInfo& freeInfo) - { - m_memTracker.Free(freeInfo); - } - -private: - GenericAllocator m_allocator; ///< The GenericAllocator which this object wraps. - - MemTracker m_memTracker; ///< Memory tracker for this GenericAllocator. -}; -#else -using GenericAllocatorTracked = GenericAllocator; -#endif - -/// Returns the OS-specific page size. -/// -/// @note All virtual reservations/commits/decommits/releases must be aligned to the value returned by this function. -/// -/// @return The OS-specific size, in bytes, of a page. -extern size_t VirtualPageSize(); - -/// Reserves the specified amount of virtual address space. -/// -/// @param [in] sizeInBytes Size in bytes of the requested reservation. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// @param [out] ppOut Pointer to reserved memory. Must not be null. -/// @param [in] pMem Pointer to the starting virtual address to reserve. -/// Subject to failure "ErrorOutOfMemory" if any pages in that range have already been reserved -/// elsewhere by the process. -/// @param [in] alignment Optional parameter specifying the alignment of the reserved cpu VA. -/// -/// @returns Success if reservation is successful. -/// Otherwise: -/// - ErrorOutOfMemory if memory reservation failed. -/// - ErrorInvalidValue if sizeInBytes is zero. -/// - ErrorInvalidPointer if ppOut is null. -extern Result VirtualReserve(size_t sizeInBytes, void** ppOut, void* pMem = nullptr, size_t alignment = 1); - -/// Commits the specified amount of virtual address space, requesting backing memory from the OS. -/// -/// @param [in] pMem Pointer to the start of reserved memory to commit. 
Must be aligned to the page size -/// returned from @ref Util::VirtualPageSize(); -/// @param [in] sizeInBytes Size in bytes of the requested commit. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// @param [in] isExecutable Indicate if the committed memory is used for execution on gpu(For instance: PM4, kernel). -/// Used only on Windows. -/// -/// @returns Success if committing is successful. -/// Otherwise: -/// - ErrorOutOfMemory if memory committing failed. -/// - ErrorInvalidValue if sizeInBytes is zero. -/// - ErrorInvalidPointer if pMem is null. -extern Result VirtualCommit(void* pMem, size_t sizeInBytes, bool isExecutable = false); - -/// Decommits the specified amount of virtual address space, freeing the backing memory back to the OS. -/// -/// @param [in] pMem Pointer to the start of committed memory. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// @param [in] sizeInBytes Size in bytes of how much to decommit. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// -/// @returns Success if decommitting is successful. -/// Otherwise: -/// - ErrorInvalidValue if sizeInBytes is zero. -/// - ErrorInvalidPointer if pMem is null. -extern Result VirtualDecommit(void* pMem, size_t sizeInBytes); - -/// Releases the specified amount of virtual address space, both freeing the backing memory and virtual address space -/// back to the OS. -/// -/// @param [in] pMem Pointer to the start of reserved memory. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// @param [in] sizeInBytes Size in bytes of how much to release. Must be aligned to the page size returned from -/// @ref Util::VirtualPageSize(); -/// -/// @returns Success if decommitting is successful. -/// Otherwise: -/// - ErrorInvalidValue if sizeInBytes is zero -/// - ErrorInvalidPointer if pMem is null. 
-extern Result VirtualRelease(void* pMem, size_t sizeInBytes); - -/// @internal -/// -/// OS-specific implementation to install default allocation callbacks in the specified structure. Expected to be -/// called during CreatePlatform if the client doesn't specify their own allocation callbacks. -/// -/// @param [in,out] pAllocCb Allocation callback structure to be updated with the OS-specific default callbacks. -/// -/// @returns Success if successful, otherwise an appropriate error code. -extern Result OsInitDefaultAllocCallbacks(AllocCallbacks* pAllocCb); - -/// @internal Internal template implementation for calling a destructor from PAL_DELETE or PAL_DELETE_ARRAY. -/// -/// @param [in] p Object to be destructed. -template -void Destructor(T* p) -{ - if ((p != nullptr) && !std::is_trivial::value) - { - p->~T(); - } -} - -/// @internal -/// -/// Internal template implementation of PAL_NEW_ARRAY. -/// -/// For non-POD types, allocate extra memory and store the array count for use by the destructor. An entire extra cache -/// line is allocated in order to prevent misaligning the actual array data. In practice, we rarely allocate an array -/// of non-POD objects, so this is likely not an issue. -/// -/// @param [in] arrayCnt Number of entries in the array. -/// @param [in] pAllocator The allocator that will allocate the memory for the array. -/// @param [in] allocType Hint to client on the lifetime/type of allocation. -/// @param [in] pFilename Source filename that requested the new array. -/// @param [in] lineNumber Line number in the source file that requested the new array. -/// -/// @returns Pointer to the allocated array, nullptr if the allocation failed. 
-template -T* NewArray( - size_t arrayCnt, - Allocator* pAllocator, - SystemAllocType allocType -#if PAL_MEMTRACK - , - const char* pFilename, - uint32 lineNumber -#endif - ) -{ - size_t align = alignof(T); - size_t allocSize = sizeof(T) * arrayCnt; - size_t headerSize = 0; - - if (!std::is_trivial::value) - { - align = Max(align, alignof(size_t)); - headerSize = Max(align, sizeof(size_t)); - allocSize += headerSize; - } - -#if PAL_MEMTRACK - const Util::AllocInfo info(allocSize, align, false, allocType, MemBlkType::NewArray, pFilename, lineNumber); -#else - const Util::AllocInfo info(allocSize, align, false, allocType); -#endif - - T* pRet = static_cast(pAllocator->Alloc(info)); - - if ((!std::is_trivial::value) && (pRet != nullptr)) - { - pRet = static_cast(Util::VoidPtrInc(static_cast(pRet), headerSize)); - - size_t* pArrayCnt = static_cast(Util::VoidPtrDec(static_cast(pRet), sizeof(size_t))); - *pArrayCnt = arrayCnt; - - T* pCurObj = static_cast(pRet); - for (uint32 i = 0; i < arrayCnt; i++) - { - PAL_PLACEMENT_NEW(pCurObj) T; - pCurObj++; - } - } - - return pRet; -} - -/// @internal -/// -/// Internal template implementation for PAL_DELETE_ARRAY. -/// -/// For non-POD types, find the array count stored before the client pointer, and call the destructor on each object in -/// the array. -/// -/// @param [in] p Pointer to the memory to be deleted. -/// @param [in] pAllocator The allocator that will free the memory for the array. 
-template -void DeleteArray(T* p, Allocator* pAllocator) -{ - if ((p != nullptr) && !std::is_trivial::value) - { - const size_t headerSize = Max(Max(alignof(T), alignof(size_t)), sizeof(size_t)); - const size_t* pArrayCnt = const_cast(reinterpret_cast(p)) - 1; - - for (uint32 i = 0; i < *pArrayCnt; i++) - { - Destructor(p + i); - } - - p = const_cast(reinterpret_cast((Util::VoidPtrDec(pArrayCnt + 1, headerSize)))); - } - -#if PAL_MEMTRACK - const Util::FreeInfo info(const_cast(static_cast(p)), MemBlkType::NewArray); -#else - const Util::FreeInfo info(const_cast(static_cast(p))); -#endif - - pAllocator->Free(info); -} - -constexpr size_t FastMemCpyMaxSmallSize = 64; - -typedef void* (PAL_CDECL *FastMemCpySmallFunc)(void* pDst, const void* pSrc, size_t count); - -extern const FastMemCpySmallFunc FastMemCpySmallFuncTable[]; - -/// A version of memcpy that has fewer branches for small copies. It computes an index into -/// a table based on the size requested then jumps to a branchless memcpy for that size. -/// Note that The compiler will NOT inline this if the count is known at compile time. The -/// regular memcpy() will be inlined and should be used. Note further that it is NOT always -/// obvious that the count is not known. Consider the case: -/// void SetData(UINT count, UINT* pData) -/// { -/// ... -/// memcpy(pBuf, pData, count*sizeof(UINT)); -/// ... -/// } -/// and SetData is called like this -/// SetData(4, buf); -/// In such a case, although the memcpy itself has an unknown size, if the SetData function -/// inlines and is used with a fixed count, the memcpy count is actually known. Such cases need -/// to be carefully managed. - -PAL_FORCE_INLINE void* FastMemCpy(void* pDst, const void* pSrc, size_t count) -{ - // The last entry in the table handles all entries larger than 64 bytes, so clamping the size - // to 64 calls the correct routine. 
- const size_t index = Min(count, FastMemCpyMaxSmallSize + 1); - return (FastMemCpySmallFuncTable[index])(pDst, pSrc, count); -} - -/// Get the default allocation callback. -/// -/// @param [out] pAllocCb Pointer to the allocation callback structure. Must not be null. -void PAL_STDCALL GetDefaultAllocCb(Util::AllocCallbacks* pAllocCb); - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palSysMemory.h + * @brief PAL utility collection system memory management macros. 
+ *********************************************************************************************************************** + */ + +#pragma once + +#include "palAssert.h" +#include "palInlineFuncs.h" +#include "palMemTracker.h" +#include +#include + +// Forward declarations +namespace Util { struct AllocInfo; } +namespace Util { struct FreeInfo; } +namespace Util { enum SystemAllocType : uint32; } + +#if !defined(__GNUC__) || (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 8)) +/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. +#define PAL_DEFAULT_MEM_ALIGN alignof(std::max_align_t) +#else +// GCC versions prior to 4.9 break C++11 compatibility by putting max_align_t in the global namespace. +/// Default malloc alignment. Usually equal to 16 bytes for x64 targets. +#define PAL_DEFAULT_MEM_ALIGN alignof(::max_align_t) +#endif + +namespace Util +{ + +/// Informs that @p p is aligned to at least @p Alignment. +template +constexpr T* AssumeAligned(T* p) +{ + return static_cast(__builtin_assume_aligned(p, Alignment)); +} + +} // Util + +#if PAL_MEMTRACK + +/// @internal Malloc allocation method with extra memory leak tracking arguments. +#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType, _memBlkType, __FILE__, __LINE__)) + +/// @internal Calloc allocation method with extra memory leak tracking arguments. +#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType, _memBlkType, __FILE__, __LINE__)) + +/// @internal Free method with extra memory leak tracking arguments. +#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ + _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)), _memBlkType)) + +#else + +/// @internal Malloc method not wrapped with memory leak tracking. 
+#define PAL_MALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, false, _allocType)) + +/// @internal Calloc method not wrapped with memory leak tracking. +#define PAL_CALLOC_BASE(_size, _align, _allocator, _allocType, _memBlkType) \ + _allocator->Alloc(::Util::AllocInfo(_size, _align, true, _allocType)) + +/// @internal Free method not wrapped with memory leak tracking. +#define PAL_FREE_BASE(_ptr, _allocator, _memBlkType) \ + _allocator->Free(::Util::FreeInfo(const_cast(static_cast(_ptr)))) + +#endif + +/// Allocates heap memory in place of malloc(). +/// +/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. +/// The client is also free to use this macro in order to take advantage of PAL's memory leak tracking. +#define PAL_MALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ + PAL_MALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) + +/// Same as @ref PAL_MALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. +#define PAL_MALLOC(_size, _allocator, _allocType) \ + PAL_MALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) + +/// Allocates zero-initialized heap memory in place of calloc(). See @ref PAL_MALLOC_ALIGNED. +#define PAL_CALLOC_ALIGNED(_size, _align, _allocator, _allocType) \ + PAL_CALLOC_BASE((_size), (_align), (_allocator), (_allocType), ::Util::MemBlkType::Malloc) + +/// Same as @ref PAL_CALLOC_ALIGNED with alignment set to the alignment of the largest native scalar type. +#define PAL_CALLOC(_size, _allocator, _allocType) \ + PAL_CALLOC_ALIGNED(_size, PAL_DEFAULT_MEM_ALIGN, _allocator, _allocType) + +/// Frees heap memory allocated with the @ref PAL_MALLOC* or @ref PAL_CALLOC* macros. +#define PAL_FREE(_ptr, _allocator) PAL_FREE_BASE((_ptr), (_allocator), ::Util::MemBlkType::Malloc) + +/// Safe free macro. 
Pointer is set to null after the free. +#define PAL_SAFE_FREE(_ptr, _allocator) { PAL_FREE((_ptr), (_allocator)); (_ptr) = nullptr; } + +/// @internal +/// +/// This type only exists to force a unique override for placement new. We need to override placement new in order to +/// call the constructor in the PAL_NEW and PAL_NEW_ARRAY implementations, but we do not want to overload global +/// placement new or include \ since either could interfere with the client. Adding a dummy parameter allows us +/// to define a PAL-only placement new implementation. +namespace Util +{ +struct Dummy +{ + explicit Dummy() { } ///< Explicit default constructor prevents this from being instantiated via unqualified "{}". +}; +} + +/// @internal +/// +/// PAL-internal placement new override. The Dummy is used to ensure there won't be a conflict if a client tries to +/// override global placement new. +/// +/// @param [in] size Size of the memory allocation. +/// @param [in] pObjMem Memory where object will be constructed. +/// @param [in] dummy Unused. +extern void* PAL_CDECL operator new( + size_t size, + void* pObjMem, + Util::Dummy dummy) noexcept; + +/// @internal +/// +/// Silences compiler warnings about not have a matching delete for the placement new override above. Will never be +/// called. +/// +/// @param [in] pObj Unused. +/// @param [in] pObjMem Unused. +/// @param [in] dummy Unused. +extern void PAL_CDECL operator delete( + void* pObj, + void* pObjMem, + Util::Dummy dummy) noexcept; + +/// Placement new macro. +#define PAL_PLACEMENT_NEW(_ptr) new((_ptr), ::Util::Dummy{}) + +/// Allocates heap memory and calls constructor for an object of the specified type. +/// +/// This macro is used internally by PAL, and will potentially result in a callback to the client for actual allocation. +/// The client is also free to use this macro. 
+/// +/// Instead of calling "MyClass* pMyClass = new MyClass(arg1, arg2)", call +/// "MyClass* pMyClass = PAL_NEW(MyClass, AllocInternal)(arg1, arg2)". +#define PAL_NEW(_className, _allocator, _allocType) \ + PAL_PLACEMENT_NEW( \ + PAL_MALLOC_BASE(sizeof(_className), alignof(_className), (_allocator), (_allocType), ::Util::MemBlkType::New)) \ + _className + +/// Calls destructor and frees heap memory for the object allocated with PAL_NEW*. +#define PAL_DELETE(_ptr, _allocator) \ +{ \ + /* we want to evaluate the expression (_allocator) before calling + the destructor because the destructor might have side effects */ \ + auto _allocator_ = (_allocator); \ + ::Util::Destructor(_ptr); \ + PAL_FREE_BASE((_ptr), _allocator_, ::Util::MemBlkType::New); \ +} + +/// Calls destructor and frees heap memory for "this". Use this macro to delete an object without a public destructor. +#define PAL_DELETE_THIS(_className, _allocator) \ +{ \ + /* we want to evaluate the expression (_allocator) before calling + the destructor because the destructor might have side effects */ \ + auto _allocator_ = (_allocator); \ + this->~_className(); \ + PAL_FREE_BASE(this, _allocator_, ::Util::MemBlkType::New); \ +} + +/// Safe delete macro. Pointer is set to null after the delete. +#define PAL_SAFE_DELETE(_ptr, _allocator) { PAL_DELETE(_ptr, _allocator); (_ptr) = nullptr; } + +/// Allocates an array of the specified object type. +/// +/// For non-POD types, the default constructor will be called. Default constructor is not available for POD types +/// (i.e., PAL_NEW_ARRAY(int, 3, AllocInternal)() won't work. 
+#if PAL_MEMTRACK +#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ + ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType), __FILE__, __LINE__) +#else +#define PAL_NEW_ARRAY(_className, _arrayCnt, _allocator, _allocType) \ + ::Util::NewArray<_className>((_arrayCnt), (_allocator), (_allocType)) +#endif + +/// Destroys an array of the specified object type. +/// +/// For non-POD types, the destructor will be called. +#define PAL_DELETE_ARRAY(_ptr, _allocator) ::Util::DeleteArray(_ptr, _allocator) + +/// Safe delete array macro. Pointer is set to null after the delete. +#define PAL_SAFE_DELETE_ARRAY(_ptr, _allocator) { PAL_DELETE_ARRAY(_ptr, _allocator); (_ptr) = nullptr; } + +namespace Util +{ +/// Specifies the usage of a system memory allocation made via a client allocation callback. +/// +/// The selected type gives the client an idea of the expected lifetime of the allocation, perhaps allowing intelligent +/// selection of sub-allocation pool, etc. +/// +/// @note This is a weak uint32 enum where all PAL values set the top bit. The client is free to use PAL's memory +/// utilities for their own allocations with their own uint32 enum using the range 0 to 0x7FFFFFFF. The client's +/// allocation callback can then separately handle any memory allocation category, whether allocated by PAL or +/// themselves. +/// +/// @see AllocCallbacks +/// @see AllocFunc +enum SystemAllocType : uint32 +{ + /// Indicates an allocation will be attached to a client-created PAL object and will not be freed until the client + /// frees the associated object. This type will be specified when allocation callbacks are made during a PAL create + /// call (e.g., IDevice::CreateGraphicsPipeline()). + AllocObject = 0x80000000, + + /// Indicates an allocation is for internal PAL use. The client should assume such allocations have a long + /// lifetime, and may not be freed until IPlatform::Destroy() is called. 
+ AllocInternal = 0x80000001, + + /// Indicates an allocation is for internal PAL use and that the lifetime of the allocation will be short. + /// Typically this will be specified for heap allocations that will be freed before control is returned to the + /// client. + AllocInternalTemp = 0x80000002, + + /// Indicates an allocation was requested by the shader compiler. + AllocInternalShader = 0x80000003 +}; + +/// Function pointer type defining a callback for client-controlled system memory allocation. +/// +/// @see AllocCallbacks +/// +/// @ingroup LibInit +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter +/// to CreatePlatform() will be passed back to the client on every allocation callback. +/// @param [in] size Size of the requested allocation in bytes. Must be non-zero. +/// @param [in] alignment Required alignment of the requested allocation in bytes. Must be a power of two. +/// @param [in] allocType Hint to client about expected allocation usage and lifetime. See @ref SystemAllocType. +/// +/// @returns Pointer to system memory with the specified size and alignment. nullptr means that the allocation failed. +typedef void* (PAL_STDCALL *AllocFunc)( + void* pClientData, + size_t size, + size_t alignment, + SystemAllocType allocType); + +/// Function pointer type defining a callback for client-controlled system memory deallocation. +/// +/// @see AllocCallbacks +/// +/// @ingroup LibInit +/// +/// @param [in] pClientData Pointer to client-defined data. The pClientData value specified in the pAllocCb parameter +/// to CreatePlatform() will be passed back to the client on every free callback. +/// @param [in] pMem System memory pointer to be freed. The specified pointer must have been allocated by an +/// @ref AllocFunc callback. +typedef void (PAL_STDCALL *FreeFunc)( + void* pClientData, + void* pMem); + +/// Specifies client-provided system allocation callbacks. 
Used as a parameter to Pal::CreatePlatform(). +/// +/// @ingroup LibInit +struct AllocCallbacks +{ + void* pClientData; ///< Opaque pointer to data of client's choosing. This pointer will be passed back to + /// every @ref AllocFunc and @ref FreeFunc call made by PAL. + AllocFunc pfnAlloc; ///< System memory allocation callback. @see AllocFunc. + FreeFunc pfnFree; ///< System memory deallocation callback. @see FreeFunc. +}; + +/// Information about requested allocation. +/// +/// Contains necessary information (size, alignment, etc.) to allocate new system memory. +/// +/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator +/// does not need to use any of this information, but can if desired. +/// +/// @see Allocators +struct AllocInfo +{ + /// Constructor. + AllocInfo( + size_t bytes, ///< [in] Number of bytes to allocate. + size_t alignment, ///< [in] Required alignment of the requested allocation in bytes. + bool zeroMem, ///< [in] True for calloc, false for malloc. + SystemAllocType allocType ///< [in] Hint on type of allocation and lifetime for client callbacks. +#if PAL_MEMTRACK + , MemBlkType blockType, ///< [in] Type of allocation (malloc, new, or new array). + const char* pFilename, ///< [in] Source filename that requested the memory allocation. + uint32 lineNumber ///< [in] Line number in the source file that requested the memory allocation. +#endif + ) + : + bytes(bytes), + alignment(alignment), + zeroMem(zeroMem), + allocType(allocType) +#if PAL_MEMTRACK + , blockType(blockType), + pFilename(pFilename), + lineNumber(lineNumber) +#endif + {} + + size_t bytes; ///< Number of bytes to allocate. + const size_t alignment; ///< Required alignment of the requested allocation in bytes. + const bool zeroMem; ///< True for calloc, false for malloc. + const SystemAllocType allocType; ///< Hint on type of allocation and lifetime for client callbacks. 
+#if PAL_MEMTRACK + const MemBlkType blockType; ///< Type of allocation (malloc, new, or new array). + const char* pFilename; ///< Source filename that requested the memory allocation. + const uint32 lineNumber; ///< Line number in the source file that requested the memory allocation. +#endif +}; + +/// Information about freeing a specified allocation. +/// +/// Contains necessary information about memory that needs to be freed. +/// +/// @note If memory leak tracking is enabled, additional parameters are available from this structure. The allocator +/// does not need to use any of this information, but can if desired. +/// +/// @see Allocators +struct FreeInfo +{ + /// Constructor. + FreeInfo( + void* pClientMem ///< [in] Pointer to memory allocation. +#if PAL_MEMTRACK + , MemBlkType blockType ///< [in] Type of free (free, delete, or delete array). +#endif + ) + : + pClientMem(pClientMem) +#if PAL_MEMTRACK + , blockType(blockType) +#endif + {} + + void* pClientMem; ///< Pointer to memory allocation. +#if PAL_MEMTRACK + const MemBlkType blockType; ///< Type of free (free, delete, or delete array). +#endif +}; + +/** + *********************************************************************************************************************** + * @brief Wraps a AllocCallbacks struct into a class compatible with PAL's Allocator concept. + *********************************************************************************************************************** + */ +class ForwardAllocator +{ +public: + /// Constructor. + ForwardAllocator(const AllocCallbacks& callbacks) : m_callbacks(callbacks) { } + + /// Allocates memory using the provided pfnAlloc callback. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + // Allocating zero bytes of memory results in undefined behavior. 
+ PAL_ASSERT(allocInfo.bytes > 0); + + void* pMem = m_callbacks.pfnAlloc(m_callbacks.pClientData, + allocInfo.bytes, + allocInfo.alignment, + allocInfo.allocType); + + if ((pMem != nullptr) && allocInfo.zeroMem) + { + memset(pMem, 0, allocInfo.bytes); + } + + return pMem; + } + + /// Frees memory using the provided pfnFree callback. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + if (freeInfo.pClientMem != nullptr) + { + m_callbacks.pfnFree(m_callbacks.pClientData, freeInfo.pClientMem); + } + } + +private: + const AllocCallbacks m_callbacks; +}; + +/** +************************************************************************************************************************ +* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps ForwardAllocator. +************************************************************************************************************************ +*/ +#if PAL_MEMTRACK +class ForwardAllocatorTracked +{ +public: + /// Constructor + ForwardAllocatorTracked(const AllocCallbacks& callbacks) + : + m_allocator(callbacks), + m_memTracker(&m_allocator) + { + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + return m_memTracker.Alloc(allocInfo); + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + m_memTracker.Free(freeInfo); + } + +private: + Util::ForwardAllocator m_allocator; ///< The ForwardAllocator which this object wraps. + MemTracker m_memTracker; ///< Memory tracker for this ForwardAllocator. 
+}; +#else +using ForwardAllocatorTracked = ForwardAllocator; +#endif + +/** +************************************************************************************************************************ +* @brief A wrapper representing an allocator const-pointer. Can be implicitly constructed from any Allocator pointer. +* +* IndirectAllocator is a type-erasure replacement for `Allocator*const pAllocator`, to abstract around Allocator types. +* This allows classes to not need fully template on `typename Allocator`, at the cost of more pointer-indirection. +* +* Const-correctness should be treated as `Allocator*const pAllocator` - the pointed-to Allocator may be mutable. +************************************************************************************************************************ +*/ +class IndirectAllocator +{ +public: + /// Implicit conversion from any Allocator pointer. + template + IndirectAllocator(Allocator*const pAllocator) + : + m_pAllocator(pAllocator), + m_pfnAlloc(&DispatchAlloc), + m_pfnFree(&DispatchFree) + { } + + /// Constructor specialization for a pointer to another IndirectAllocator, which acts like a copy constructor. + IndirectAllocator(const IndirectAllocator*const pAllocator) : IndirectAllocator(*pAllocator) { } + + /// Allocates memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) const { return m_pfnAlloc(m_pAllocator, allocInfo); } + + /// Frees memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) const { return m_pfnFree(m_pAllocator, freeInfo); } + + /// Returns true if the allocator == nullptr. Used in place of `pAllocator == nullptr`. + constexpr bool operator==(std::nullptr_t) const { return m_pAllocator == nullptr; } + +private: + /// @internal Allocation dispatch function. 
This is what the non-template @ref m_pfnAlloc callback references. + template + static void* DispatchAlloc(void*const pAllocator, const AllocInfo& allocInfo) + { + auto*const pTypedAllocator = static_cast(pAllocator); + return pTypedAllocator->Alloc(allocInfo); + } + + /// @internal Free dispatch function. This is what the non-template @ref m_pfnFree callback references. + template + static void DispatchFree(void*const pAllocator, const FreeInfo& freeInfo) + { + auto*const pTypedAllocator = static_cast(pAllocator); + pTypedAllocator->Free(freeInfo); + } + + using DispatchAllocCb = void* (*)(void*const, const AllocInfo&); + using DispatchFreeCb = void (*)(void*const, const FreeInfo&); + + void*const m_pAllocator; + + const DispatchAllocCb m_pfnAlloc; + const DispatchFreeCb m_pfnFree; +}; + +/** + *********************************************************************************************************************** + * @brief A generic allocator class that allocate and free memory for general purpose use. + *********************************************************************************************************************** + */ +class GenericAllocator +{ +public: + /// Allocates memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + static void* Alloc(const AllocInfo& allocInfo); + + /// Frees memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + static void Free(const FreeInfo& freeInfo); +}; + +/** +************************************************************************************************************************ +* @brief A wrapper for Trackable (using MemTracker) memory allocator that wraps GenericAllocator. 
+************************************************************************************************************************ +*/ +#if PAL_MEMTRACK +class GenericAllocatorTracked +{ +public: + /// Constructor + GenericAllocatorTracked() + : + m_memTracker(&m_allocator) + { + Result result = m_memTracker.Init(); + PAL_ASSERT(result == Result::_Success); + } + + /// Allocates a block of memory. + /// + /// @param [in] allocInfo Contains information about the requested allocation. + /// + /// @returns Pointer to the allocated memory, nullptr if the allocation failed. + void* Alloc(const AllocInfo& allocInfo) + { + void* pMemory = m_memTracker.Alloc(allocInfo); + return pMemory; + } + + /// Frees a block of memory. + /// + /// @param [in] freeInfo Contains information about the requested free. + void Free(const FreeInfo& freeInfo) + { + m_memTracker.Free(freeInfo); + } + +private: + GenericAllocator m_allocator; ///< The GenericAllocator which this object wraps. + + MemTracker m_memTracker; ///< Memory tracker for this GenericAllocator. +}; +#else +using GenericAllocatorTracked = GenericAllocator; +#endif + +/// Returns the OS-specific page size. +/// +/// @note All virtual reservations/commits/decommits/releases must be aligned to the value returned by this function. +/// +/// @return The OS-specific size, in bytes, of a page. +extern size_t VirtualPageSize(); + +/// Reserves the specified amount of virtual address space. +/// +/// @param [in] sizeInBytes Size in bytes of the requested reservation. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [out] ppOut Pointer to reserved memory. Must not be null. +/// @param [in] pMem Pointer to the starting virtual address to reserve. +/// Subject to failure "ErrorOutOfMemory" if any pages in that range have already been reserved +/// elsewhere by the process. +/// @param [in] alignment Optional parameter specifying the alignment of the reserved cpu VA. 
+/// +/// @returns Success if reservation is successful. +/// Otherwise: +/// - ErrorOutOfMemory if memory reservation failed. +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if ppOut is null. +extern Result VirtualReserve(size_t sizeInBytes, void** ppOut, void* pMem = nullptr, size_t alignment = 1); + +/// Commits the specified amount of virtual address space, requesting backing memory from the OS. +/// +/// @param [in] pMem Pointer to the start of reserved memory to commit. Must be aligned to the page size +/// returned from @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of the requested commit. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] isExecutable Indicate if the committed memory is used for execution on gpu(For instance: PM4, kernel). +/// Used only on Windows. +/// +/// @returns Success if committing is successful. +/// Otherwise: +/// - ErrorOutOfMemory if memory committing failed. +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if pMem is null. +extern Result VirtualCommit(void* pMem, size_t sizeInBytes, bool isExecutable = false); + +/// Decommits the specified amount of virtual address space, freeing the backing memory back to the OS. +/// +/// @param [in] pMem Pointer to the start of committed memory. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of how much to decommit. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// +/// @returns Success if decommitting is successful. +/// Otherwise: +/// - ErrorInvalidValue if sizeInBytes is zero. +/// - ErrorInvalidPointer if pMem is null. +extern Result VirtualDecommit(void* pMem, size_t sizeInBytes); + +/// Releases the specified amount of virtual address space, both freeing the backing memory and virtual address space +/// back to the OS. 
+/// +/// @param [in] pMem Pointer to the start of reserved memory. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// @param [in] sizeInBytes Size in bytes of how much to release. Must be aligned to the page size returned from +/// @ref Util::VirtualPageSize(); +/// +/// @returns Success if decommitting is successful. +/// Otherwise: +/// - ErrorInvalidValue if sizeInBytes is zero +/// - ErrorInvalidPointer if pMem is null. +extern Result VirtualRelease(void* pMem, size_t sizeInBytes); + +/// @internal +/// +/// OS-specific implementation to install default allocation callbacks in the specified structure. Expected to be +/// called during CreatePlatform if the client doesn't specify their own allocation callbacks. +/// +/// @param [in,out] pAllocCb Allocation callback structure to be updated with the OS-specific default callbacks. +/// +/// @returns Success if successful, otherwise an appropriate error code. +extern Result OsInitDefaultAllocCallbacks(AllocCallbacks* pAllocCb); + +/// @internal Internal template implementation for calling a destructor from PAL_DELETE or PAL_DELETE_ARRAY. +/// +/// @param [in] p Object to be destructed. +template +void Destructor(T* p) +{ + if ((p != nullptr) && !std::is_trivial::value) + { + p->~T(); + } +} + +/// @internal +/// +/// Internal template implementation of PAL_NEW_ARRAY. +/// +/// For non-POD types, allocate extra memory and store the array count for use by the destructor. An entire extra cache +/// line is allocated in order to prevent misaligning the actual array data. In practice, we rarely allocate an array +/// of non-POD objects, so this is likely not an issue. +/// +/// @param [in] arrayCnt Number of entries in the array. +/// @param [in] pAllocator The allocator that will allocate the memory for the array. +/// @param [in] allocType Hint to client on the lifetime/type of allocation. +/// @param [in] pFilename Source filename that requested the new array. 
+/// @param [in] lineNumber Line number in the source file that requested the new array. +/// +/// @returns Pointer to the allocated array, nullptr if the allocation failed. +template +T* NewArray( + size_t arrayCnt, + Allocator* pAllocator, + SystemAllocType allocType +#if PAL_MEMTRACK + , + const char* pFilename, + uint32 lineNumber +#endif + ) +{ + size_t align = alignof(T); + size_t allocSize = sizeof(T) * arrayCnt; + size_t headerSize = 0; + + if (!std::is_trivial::value) + { + align = Max(align, alignof(size_t)); + headerSize = Max(align, sizeof(size_t)); + allocSize += headerSize; + } + +#if PAL_MEMTRACK + const Util::AllocInfo info(allocSize, align, false, allocType, MemBlkType::NewArray, pFilename, lineNumber); +#else + const Util::AllocInfo info(allocSize, align, false, allocType); +#endif + + T* pRet = static_cast(pAllocator->Alloc(info)); + + if ((!std::is_trivial::value) && (pRet != nullptr)) + { + pRet = static_cast(Util::VoidPtrInc(static_cast(pRet), headerSize)); + + size_t* pArrayCnt = static_cast(Util::VoidPtrDec(static_cast(pRet), sizeof(size_t))); + *pArrayCnt = arrayCnt; + + T* pCurObj = static_cast(pRet); + for (uint32 i = 0; i < arrayCnt; i++) + { + PAL_PLACEMENT_NEW(pCurObj) T; + pCurObj++; + } + } + + return pRet; +} + +/// @internal +/// +/// Internal template implementation for PAL_DELETE_ARRAY. +/// +/// For non-POD types, find the array count stored before the client pointer, and call the destructor on each object in +/// the array. +/// +/// @param [in] p Pointer to the memory to be deleted. +/// @param [in] pAllocator The allocator that will free the memory for the array. 
+template +void DeleteArray(T* p, Allocator* pAllocator) +{ + if ((p != nullptr) && !std::is_trivial::value) + { + const size_t headerSize = Max(Max(alignof(T), alignof(size_t)), sizeof(size_t)); + const size_t* pArrayCnt = const_cast(reinterpret_cast(p)) - 1; + + for (uint32 i = 0; i < *pArrayCnt; i++) + { + Destructor(p + i); + } + + p = const_cast(reinterpret_cast((Util::VoidPtrDec(pArrayCnt + 1, headerSize)))); + } + +#if PAL_MEMTRACK + const Util::FreeInfo info(const_cast(static_cast(p)), MemBlkType::NewArray); +#else + const Util::FreeInfo info(const_cast(static_cast(p))); +#endif + + pAllocator->Free(info); +} + +constexpr size_t FastMemCpyMaxSmallSize = 64; + +typedef void* (PAL_CDECL *FastMemCpySmallFunc)(void* pDst, const void* pSrc, size_t count); + +extern const FastMemCpySmallFunc FastMemCpySmallFuncTable[]; + +/// A version of memcpy that has fewer branches for small copies. It computes an index into +/// a table based on the size requested then jumps to a branchless memcpy for that size. +/// Note that The compiler will NOT inline this if the count is known at compile time. The +/// regular memcpy() will be inlined and should be used. Note further that it is NOT always +/// obvious that the count is not known. Consider the case: +/// void SetData(UINT count, UINT* pData) +/// { +/// ... +/// memcpy(pBuf, pData, count*sizeof(UINT)); +/// ... +/// } +/// and SetData is called like this +/// SetData(4, buf); +/// In such a case, although the memcpy itself has an unknown size, if the SetData function +/// inlines and is used with a fixed count, the memcpy count is actually known. Such cases need +/// to be carefully managed. + +PAL_FORCE_INLINE void* FastMemCpy(void* pDst, const void* pSrc, size_t count) +{ + // The last entry in the table handles all entries larger than 64 bytes, so clamping the size + // to 64 calls the correct routine. 
+ const size_t index = Min(count, FastMemCpyMaxSmallSize + 1); + return (FastMemCpySmallFuncTable[index])(pDst, pSrc, count); +} + +/// Get the default allocation callback. +/// +/// @param [out] pAllocCb Pointer to the allocation callback structure. Must not be null. +void PAL_STDCALL GetDefaultAllocCb(Util::AllocCallbacks* pAllocCb); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h index daad33e893..e17690a133 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palSysUtil.h @@ -1,816 +1,816 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palSysUtil.h - * @brief PAL utility collection system functions. - *********************************************************************************************************************** - */ - -#pragma once - -// pal -#include "palAssert.h" -#include "palAutoBuffer.h" -#include "palFile.h" -#include "palSpan.h" -#include "palStringView.h" -#include "palTime.h" -#include "palUtil.h" -#include "palVector.h" - -// stl -#include -#include - -// platform -#if defined(_WIN32) -#define PAL_HAS_CPUID (_M_IX86 || _M_X64) -#include -#include -typedef void* HANDLE; -#elif defined(__unix__) -#define PAL_HAS_CPUID (__i386__ || __x86_64__) -#if PAL_HAS_CPUID -#include -#endif -#endif - -namespace Util -{ - -static constexpr uint32 RyzenMaxCcxCount = 4; -static constexpr uint32 CpuVendorAmd = 0x01000000; -static constexpr uint32 CpuVendorIntel = 0x02000000; - -/// Specifies a keyboard key for detecting key presses. -enum class KeyCode : uint32 -{ - Esc, - F1, - F2, - F3, - F4, - F5, - F6, - F7, - F8, - F9, - F10, - F11, - F12, - F13, - F14, - F15, - F16, - Backtick, // ` ~ - Minus, // - _ - Equal, // = + - LBrace, // [ { - RBrace, // ] } - Backslash, // \ | - Semicolon, // ; : - Apostrophe, // " ' - Comma, // , < - Dot, // . > - Slash, // / ? 
- Enter, - Space, - Backspace, - Tab, - Capslock, - Shift, - LShift, - RShift, - Control, - LControl, - RControl, - Alt, - LAlt, - RAlt, - Scroll, - Insert, - Delete, - Home, - End, - PageUp, - PageDown, - ArrowUp, - ArrowDown, - ArrowLeft, - ArrowRight, - Numlock, - NumSlash, - NumAsterisk, - NumMinus, - NumPlus, - NumDot, - NumEnter, - Num0, - Num1, - Num2, - Num3, - Num4, - Num5, - Num6, - Num7, - Num8, - Num9, - Zero, - One, - Two, - Three, - Four, - Five, - Six, - Seven, - Eight, - Nine, - A, - B, - C, - D, - E, - F, - G, - H, - I, - J, - K, - L, - M, - N, - O, - P, - Q, - R, - S, - T, - U, - V, - W, - X, - Y, - Z, - Shift_F10, - Shift_F11, - Undefined // Used as value where key code is not defined in the enum -}; - -/// Enum to identify possible configurations -enum class CpuType : uint32 -{ - Unknown = 0, ///< No capabilites set - AmdK5 = (CpuVendorAmd + 0), ///< No MMX, no cmov, no 3DNow - AmdK6 = (CpuVendorAmd + 1), ///< No MMX, no cmov, 3DNow (models 6 and 7) - AmdK6_2 = (CpuVendorAmd + 2), ///< MMX, no cmov, 3DNow (model 8, no HW WC but not part of cpuid) - AmdK6_3 = (CpuVendorAmd + 3), ///< MMX, no cmov, 3DNow (model 9) - AmdK7Basic = (CpuVendorAmd + 4), ///< K7 missing one of the features of K7 - AmdK7 = (CpuVendorAmd + 5), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext - AmdK7Sse = (CpuVendorAmd + 6), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext, SSE - AmdK8 = (CpuVendorAmd + 7), ///< Athlon 64, Athlon 64 FX, and Opteron - AmdK10 = (CpuVendorAmd + 8), ///< Barcelona, Phenom, Greyhound - AmdFamily12h = (CpuVendorAmd + 9), ///< Family 12h - Llano - AmdBobcat = (CpuVendorAmd + 10), ///< Bobcat - AmdFamily15h = (CpuVendorAmd + 11), ///< Family 15h - Orochi, Trinity, Komodo, Kaveri, Basilisk - AmdFamily16h = (CpuVendorAmd + 12), ///< Family 16h - Kabini - AmdRyzen = (CpuVendorAmd + 13), ///< Ryzen - IntelOld = (CpuVendorIntel + 0), ///< Inidicate cpu type befor Intel Pentium III - IntelP3 = (CpuVendorIntel + 1), ///< Generic Pentium III - IntelP3Model7 = 
(CpuVendorIntel + 2), ///< PIII-7, PIII Xeon-7 - IntelP3Model8 = (CpuVendorIntel + 3), ///< PIII-8, PIII Xeon-8, Celeron-8 - IntelPMModel9 = (CpuVendorIntel + 4), ///< Pentium M Model 9 (Banias) - IntelXeonModelA = (CpuVendorIntel + 5), ///< Xeon-A - IntelP3ModelB = (CpuVendorIntel + 6), ///< PIII-B - IntelPMModelD = (CpuVendorIntel + 7), ///< Pentium M Model D (Dothan) - IntelP4 = (CpuVendorIntel + 8), ///< Pentium 4, Pentium 4-M, Xenon, Celeron - IntelPMModelE = (CpuVendorIntel + 9), ///< Pentium M Model E (Yonah) - IntelCoreModelF = (CpuVendorIntel + 10), ///< Core F (Conroe) -}; - -/// Specifies a struct that contains information about the system. -struct SystemInfo -{ - CpuType cpuType; ///< Cpu type - char cpuVendorString[16]; ///< Null-terminated cpu vendor string - char cpuBrandString[48]; ///< Null-terminated cpu brand string - uint32 cpuLogicalCoreCount; ///< Number of logical cores on the cpu - uint32 cpuPhysicalCoreCount; ///< Number of physical cores on the cpu - uint32 totalSysMemSize; ///< Total system memory (RAM) size in megabytes - uint32 cpuFrequency; ///< Reports CPU clock speed in MHz. - /// (From Registry for Windows, current average processor speed for Linux.) - uint32 displayFamily; ///< Display Family of cpu - uint32 displayModel; ///< Display Model of cpu - - union - { - struct - { - uint32 affinityMask[RyzenMaxCcxCount]; ///< Affinity mask for each core complex (CCX). - } amdRyzen; ///< Properties specific to AMD Ryzen CPU's. - } cpuArchInfo; ///< This member should be used only for Ryzen for now. -}; - -/// Returns an appropriate result from the given errno -/// -/// @param errno_in Value from 'errno' (or functions that return errno_t) -/// -/// @returns Relevent Result value for the given errno-- never Success. 
-inline Result ConvertErrno( - int32 errnoIn) -{ - Result result = Result::ErrorUnknown; - switch (errnoIn) - { - case EAGAIN: - case EBUSY: - case EINTR: - result = Result::NotReady; - break; - case ETIMEDOUT: - case ETIME: - result = Result::Timeout; - break; - case EEXIST: - result = Result::AlreadyExists; - break; - case ENOENT: - case ENOTDIR: - result = Result::NotFound; - break; - case EACCES: - case EPERM: - case EROFS: - result = Result::ErrorPermissionDenied; - break; - case ENOSPC: - result = Result::ErrorDiskFull; - break; - case EISDIR: - case EINVAL: - case EBADF: - case ENAMETOOLONG: - case ELOOP: - result = Result::ErrorInvalidValue; - break; - case ENOMEM: - case EOVERFLOW: - result = Result::ErrorOutOfMemory; - break; - default: - PAL_ALERT_ALWAYS_MSG("Unknown result generated from errno %d (%s)", errnoIn, strerror(errnoIn)); - break; - } - return result; -} - -#if defined(_WIN32) -/// Helper function to convert Pal::Result to HRESULT -/// -/// @param result Pal::Result to convert -/// -/// @returns the equivalent HRESULT -extern HRESULT PalToHResult(Result result); - -/// Helper function to convert HRESULT to Pal::Result -/// -/// @param hr HRESULT to convert -/// -/// @returns the equivalent Pal::Result -extern Result HResultToPal(HRESULT hr); -#endif - -#if defined(_WIN32) -/// Returns an appropriate result from the given error from 'GetLastError'. -/// Certain OS functions may also return a 'ERROR_SUCCESS' which this helper also translates. -/// -/// @param errno_in System error code from 'GetLastError' -/// -/// @returns Relevent Result value for the given system error code. 
-inline Result ConvertWinError( - uint32 winError) -{ - Result result = Result::ErrorUnknown; - switch (winError) - { - // File already exists - case ERROR_ALREADY_EXISTS: - case ERROR_FILE_EXISTS: - result = Result::AlreadyExists; - break; - - // File or path does not exist - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - case ERROR_INVALID_NAME: - result = Result::NotFound; - break; - - // File locked for specified access - case ERROR_SHARING_VIOLATION: - case ERROR_LOCK_VIOLATION: - result = Result::ErrorNotShareable; - break; - case ERROR_ACCESS_DENIED: - result = Result::ErrorPermissionDenied; - break; - - // Temporary Errors - case ERROR_NOT_READY: - case ERROR_BUSY: - result = Result::NotReady; - break; - case WAIT_TIMEOUT: - result = Result::Timeout; - break; - - // Usage errors - case ERROR_BUFFER_OVERFLOW: - case ERROR_INVALID_HANDLE: - result = Result::ErrorInvalidValue; - break; - - // Various expected but rare error cases - case ERROR_TOO_MANY_OPEN_FILES: - case ERROR_FILE_TOO_LARGE: - case ERROR_VIRUS_INFECTED: - case ERROR_VIRUS_DELETED: - result = Result::ErrorUnavailable; - break; - case ERROR_HANDLE_DISK_FULL: - case ERROR_DISK_FULL: - result = Result::ErrorDiskFull; - break; - case ERROR_NOT_ENOUGH_MEMORY: - case ERROR_OUTOFMEMORY: - result = Result::ErrorOutOfMemory; - break; - - case ERROR_SUCCESS: - result = Result::Success; - break; - default: - PAL_ALERT_ALWAYS_MSG("Unknown result generated from Windows error code %d", winError); - break; - } - return result; -} - -/// Windows Process Integrity Levels -enum class ProcessIntegrityLevel -{ - Untrusted, //< Untrusted Process (No System Resource Access) - Low, //< Restricted Process - Medium, //< Normal Process - High, //< Elevated Process - System //< System-Level or higher Process -}; - -/// Get the local storage path based on process integrity level -/// -/// @returns the Integrity Level of the current process -extern ProcessIntegrityLevel GetProcessIntegrityLevel(); - -/// Detects 
if we're inside an App Container sandbox (ie, a Windows Store or UWP app) -/// -/// @returns whether the current process is in an App Container -extern bool IsProcessInAppContainer(); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 921 -/// Tests whether the passed handle is valid or not. -/// A handle could be either nullptr or INVALID_HANDLE_VALUE. -/// INVALID_HANDLE_VALUE is the proper value to set a handle to when you intend to error out. -/// However, handles default-initialize to nullptr. So that's also a common value you see for an unset handle. -/// A common error is to check for one value but not the other. So PAL prescribes that clients use this helper. -constexpr bool IsValidHandle(HANDLE handle) -{ -#ifdef INVALID_HANDLE_VALUE - return (handle != nullptr) && (handle != INVALID_HANDLE_VALUE); -#else - return (handle != nullptr) && (handle != HANDLE(size_t(-1))); -#endif -} -#endif -#endif - -/// Queries system information. -/// -/// @param [out] pSystemInfo SystemInfo struct containing information about the system. -/// -/// @returns Success if querying the system info was successful. Otherwise, the following results will be returned: -/// + ErrorInvalidPointer returned if pSystemInfo is nullptr. -/// + ErrorOutOfMemory returned if the system ran out of memory during the function call. -/// + ErrorUnavailable returned if querying the system info is not supported. -/// + ErrorUnknown returned if an error occurs while calling OS functions. -extern Result QuerySystemInfo(SystemInfo* pSystemInfo); - -/// Query cpu type for AMD processor. -/// -/// @param [out] pSystemInfo SystemInfo struct containing information about the system. -/// -/// @returns none. -extern void QueryAMDCpuType(SystemInfo* pSystemInfo); - -/// Query cpu type for Intel processor. -/// -/// @param [out] pSystemInfo SystemInfo struct containing information about the system. -/// -/// @returns none. 
-extern void QueryIntelCpuType(SystemInfo* pSystemInfo); - -/// Gets the frequency of performance-related queries. -/// -/// @returns Current CPU performance counter frequency in Hz. -extern int64 GetPerfFrequency(); - -/// Gets the current time of a performance-related query. -/// -/// This is a high resolution time stamp that can be used in conjunction with GetPerfFrequency to measure time -/// intervals. -/// -/// @param [in] raw Whether to use a 'monotonic raw' clock which ignores smoothing. Ignored on Windows. -/// -/// @returns Current value of the CPU performance counter. -extern int64 GetPerfCpuTime(bool raw=false); - -/// Determines if a specific key is pressed down. -/// -/// @param [in] key Specified which key to check. -/// @param [in, out] pPrevState The previous state of the key. -/// -/// @returns True if the specified key is currently pressed down. -extern bool IsKeyPressed(KeyCode key, bool* pPrevState = nullptr); - -/// Determines if profiling is restricted -/// -/// @returns true if the process is not restricted for profiling, otherwise, false will be returned. -extern bool IsProfileRestricted(); - -/// Retrieves the fully resolved file name of the application binary. -/// -/// @param [out] pBuffer Character buffer to contain the application's executable and (fully-resolved) path -/// string. -/// @param [out] ppFilename Pointer to the location within the output buffer where the executable name begins. -/// @param [in] bufferLength Length of the output buffer, in bytes. -/// @returns Result::Success if GetModuleFileNameA succeeds. Otherwise, the following result codes would be returned: -/// + Result::ErrorInvalidMemorySize returned if pBuffer is not sufficiently large. -extern Result GetExecutableName( - char* pBuffer, - char** ppFilename, - size_t bufferLength); - -/// Retrieves the fully resolved wchar_t file name of the application binary. 
-/// -/// @param [out] pWcBuffer wchar_t buffer to contain the application's executable and (fully-resolved) path -/// string. -/// @param [out] ppWcFilename Pointer to the location within the wchar_t output buffer where the executable name begins. -/// @param [in] bufferLength Length of the output buffer, in bytes. -/// @returns Result::Success if GetModuleFileNameW succeeds. Otherwise, the following result codes would be returned: -/// + Result::ErrorInvalidMemorySize returned if pBuffer is not sufficiently large. -extern Result GetExecutableName( - wchar_t* pWcBuffer, - wchar_t** ppWcFilename, - size_t bufferLength); - -/// Gets the current library name. ie: the name of the library containing the function -/// 'GetCurrentLibraryName'. Optionally, it will also return the extension if the input -/// buffer for extension is valid. -/// -/// @param [out] pLibBuffer Character buffer where the library name will be stored. -/// @param [in] libBufferLength Length of the output buffer that will hold the library name, in bytes. -/// @param [out] pExtBuffer Character buffer where the extension will be stored. -/// @param [in] extBufferLength Length of the output buffer that will hold the extension, in bytes. -/// @returns Result::Success if no error. Otherwise, returns one of the following codes: -/// Result::ErrorInvalidMemorySize - if incoming buffer is too small. -/// Result::ErrorUnknown - for all other types of errors. -extern Result GetCurrentLibraryName( - char* pLibBuffer, - size_t libBufferLength, - char* pExtBuffer, - size_t extBufferLength); - -/// Opaque build ID obtained with GetCurrentLibraryBuildId -struct BuildId { - uint8 data[16]; -}; - -/// Gets build-unique identifier for the executable or shared library PAL was built into. This will be some opaque -/// hash or timestamp embedded in the code if present and falls back to random bytes (constant until exit) for a -/// 'usuable' build id to always exist. 
-/// -/// @param [out] pBuildId Will contain unique id for build -/// -/// @returns true if build id will be persistent, false if temporary -/// -/// @note We fallback to using a random build id here instead of failing so that even if this fails and nobody checks, -/// a new driver will always invalidate caches. It is better, of course, not to persist these on disk. -/// @note This may be heavy on first invocation but can be assumed to always return quickly after that. -extern bool GetCurrentLibraryBuildId( - BuildId* pBuildId); - -/// Splits a filename into its path and file components. -/// -/// @param [in] pFullPath Buffer containing the full path & file name. -/// @param [out] pPathBuf Optional. If non-null, will contain the path to the file name. On Windows, this will also -/// include the drive letter. -/// @param [in] pathLen Length of the pPathBuf buffer. Must be zero when pPathBuf is null. -/// @param [out] pFileBuf Optional. If non-null, will contain the base file name, and extension. -/// @param [in] fileLen Length of the pFileBuf buffer. Must be zero when pFileBuf is null. -extern void SplitFilePath( - const char* pFullPath, - char* pPathBuf, - size_t pathLen, - char* pFileBuf, - size_t fileLen); - -/// Creates a new directory at the specified path. -/// -/// @param [in] pPathName String specifying the new path to create. Note that this method can only create one -/// directory, if you specify "foo/bar" the "bar" directory can only be created if "foo" already -/// exists. -/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the -/// following result codes may be returned: -/// + Result::AlreadyExists if the specified directory already exists. -/// + Result::ErrorInvalidValue if the parent directory does not exist. -extern Result MkDir( - const char* pPathName); - -/// Creates a new directory at the specified path and all intermediate directories. 
-/// -/// @param [in] pPathName String specifying the new path to create.n -/// -/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the -/// following result codes may be returned: -/// + Result::AlreadyExists if the specified directory already exists. -/// + Result::ErrorInvalidValue if the parent directory does not exist. -extern Result MkDirRecursively( - const char* pPathName); - -/// A tuple containing a file's name and statistics -struct StatName -{ - File::Stat stat; - char name[MaxPathStrLen]; -}; - -/// Gets file information for the files in a directory -/// -/// @param [in] dirPath string specifying the directory -/// @param [out] pFileInfos list of information on every file in the directory -/// -/// @returns Result::ErrorInvalidPointer if any of the input pointers are null -/// @returns Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows -/// @returns Result::ErrorInvalidValue if there for all file I/O errors on Linux -/// @returns Result::Success if the dir is empty -/// @returns Result::Success otherwise -extern Result GetFileInfoInDir( - StringView dirPath, - Vector* pFileInfos); - -/// Counts the number of files found within the directory. 
-/// -/// @param [in] pDirPath string specifying the directory -/// @param [out] pFileCount the number of files in the directory -/// @param [out] pCharCount the number of characters in the names of all the files -/// -/// @returns Result::ErrorInvalidPointer if any of the input pointers are null -/// @returns Value of Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows -/// @returns Result::ErrorInvalidValue if there for all file I/O errors on Linux -/// @returns Result::Success if the dir is empty (pFileCount and pCharCount will be 0) -/// @returns Result::Success otherwise -extern Result CountFilesInDir( - Util::StringView dirPath, - size_t* pFileCount, - size_t* pCharCount); - -/// Lists the contents of the specified directory in an array of strings -/// -/// @param [in] dirPath String specifying the directory -/// @param [out] fileNames An array where pointers the file names will be written. -/// @param [out] buffer Memory where the file names can be stored. -/// -/// @returns Result::ErrorInvalidPointer if any of the inputs are null or empty -/// @returns Result::ErrorInvalidValue if there are any file I/O errors -/// @returns Result::Success otherwise -extern Result GetFileNamesInDir( - Util::StringView dirPath, - Util::Span> fileNames, - Util::Span buffer); - -/// Non-recursively delete the least-recently-accesssed files from a directory until the directory reaches size in bytes. -/// -/// @param [in] pPathName string specifying the absolute path to the directory you want to remove files from -/// @param desiredSize the size you want to shrink the directory to -/// -/// @returns Result::ErrorUnknown on File I/O error. -/// Result::Success otherwise. -Result RemoveOldestFilesOfDirUntilSize( - const char* pPathName, - uint64 desiredSize); - -/// Non-recursively delete the least-recently-accesssed files until the total reaches size in bytes. -/// -/// @param [in] pDirPath String specifying the directory. 
-/// @param [in/out] pFileInfos List of files. This span will be modified to remove the files which were deleted. -/// @param desiredSize The size you want to shrink the list of files to. -/// -/// @returns Result::ErrorUnknown on File I/O error. -/// Result::Success otherwise. -Result RemoveOldestFilesOfDirUntilSize( - StringView dirPath, - Span* pFileInfos, - uint64 desiredSize); - -/// Remove all files below threshold of a directory at the specified path. -/// -/// @param [in] pPathName String specifying the absolute path to remove. -/// @param [in] threshold The file time(from 1970/01/01 00:00:00) older(smaller) than threshold will be removed. -/// -/// @returns Result::Success if all files are successfully removed. Otherwise, the -/// following result codes may be returned: -/// + Result::ErrorUnknown if the specified directory is failed to open/remove. -/// + Result::ErrorInvalidValue if the parent directory does not exist. -Result RemoveFilesOfDirOlderThan( - const char* pPathName, - SecondsSinceEpoch threshold); - -/// Get status of a directory at the specified path. -/// -/// @param [in] pPathName String specifying the absolute path. -/// @param [out] pTotalSize Size(byte) of all files -/// @param [out] pOldestTime The oldest time(seconds from 1970/01/01 00:00:00) of all files -/// -/// @returns Result::Success if all files are successfully removed. Otherwise, the -/// following result codes may be returned: -/// + Result::ErrorUnknown if the specified directory is failed to open. -Result GetStatusOfDir( - const char* pPathName, - uint64* pTotalSize, - SecondsSinceEpoch* pOldestTime); - -/// Almost-Posix-style rename file or directory: replaces already-existing file. -/// Posix says this operation is atomic; Windows does not specify. -/// -/// @param [in] pOldName Old file or directory name -/// @param [in] pNewName Name to rename to -/// -/// @returns Result::Success if file/directory successfully moved. 
-Result Rename( - const char* pOldName, - const char* pNewName); - -/// Get the Process ID of the current process -/// -/// @returns The Process ID of the current process -extern uint32 GetIdOfCurrentProcess(); - -/// OS-specific wrapper for printing stack trace information. -/// -/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be -/// printed had a buffer with enough space been provided. -/// @param [in] bufSize Available space in pOutput. -/// @param [in] skipFrames Number of stack frames to skip. Implied skip of 1 (0 is 1). -/// -/// @returns The resultant length of the stack trace string. -extern size_t DumpStackTrace( - char* pOutput, - size_t bufSize, - uint32 skipFrames); - -/// Puts the calling thread to sleep for a specified number of milliseconds. -/// -/// @param [in] duration Amount of time to sleep for, in milliseconds. -extern void Sleep(std::chrono::milliseconds duration); - -/// Create Directory and SubDirectory of Executable's Name at pBaseDir -/// Like if pBaseDir="amdPal/" then final pLogDir will be created like this: amdpal/app.exe/ -/// -/// @param [in] pBaseDir Base Dir Path -/// @param [out] pLogDir Pointer to Memory where Final Path will be saved -/// @param [in] logDirSize Size of pLogDir in chars -/// -/// @returns Result::Success if the directories were created and pLogDir was populated. -extern Result CreateLogDir( - const char* pBaseDir, - char* pLogDir, - size_t logDirSize); - -/// Check if the requested key is combo key. -/// -/// @param [in] key The requested key value -/// @param [out] pKeys The array of keys the combo key composed of -/// -/// @returns If the requested key is a combo key. 
-inline bool IsComboKey( - KeyCode key, - KeyCode* pKeys) -{ - bool ret = false; - - if (key == KeyCode::Shift_F10) - { - ret = true; - pKeys[0] = KeyCode::Shift; - pKeys[1] = KeyCode::F10; - } - else if (key == KeyCode::Shift_F11) - { - ret = true; - pKeys[0] = KeyCode::Shift; - pKeys[1] = KeyCode::F11; - } - else - { - pKeys[0] = key; - } - - return ret; -} - -#if PAL_HAS_CPUID -/// Issue the cpuid instruction. -/// -/// @param [out] pRegValues EAX/EBX/ECX/EDX values -/// @param [in] level CpuId instruction feature level. -inline void CpuId( - uint32* pRegValues, - uint32 level) -{ -#if defined(_WIN32) - __cpuid(reinterpret_cast(pRegValues), static_cast(level)); -#elif defined(__unix__) - __get_cpuid(level, pRegValues, pRegValues + 1, pRegValues + 2, pRegValues + 3); -#else -#error "Not implemented for the current platform" -#endif -} - -/// Issue the cpuid instruction, with an additional sublevel code. -/// -/// @param [out] pRegValues EAX/EBX/ECX/EDX values -/// @param [in] level CpuId instruction feature level. -/// @param [in] sublevel CpuId instruction feature sublevel. -inline void CpuId( - uint32* pRegValues, - uint32 level, - uint32 sublevel) -{ -#if defined(_WIN32) - __cpuidex(reinterpret_cast(pRegValues), static_cast(level), static_cast(sublevel)); -#elif defined(__unix__) - __cpuid_count(level, sublevel, *pRegValues, *(pRegValues + 1), *(pRegValues + 2), *(pRegValues + 3)); -#else -#error "Not implemented for the current platform" -#endif -} -#endif - -/// Play beep sound. Currently function implemented only for WIN platform. -/// -/// @param [in] frequency Frequency in hertz of the beep sound. -/// @param [in] duration Duration in milliseconds of the beep sound. -extern void BeepSound( - uint32 frequency, - uint32 duration); - -/// Detect if a debugger is attached to the current process. -/// -/// @note This function is conservative, declaring the debugger not attached on error. -/// -/// @returns true if there is a debugger attached. 
-extern bool IsDebuggerAttached(); - -/// Set path to be accessible by everyone. -/// -/// In Linux world that is 777. For Windows it does nothing. -/// To be used for debug folders mainly. Use carefully for release paths as this will open them wide -/// and possibly create a security hole. -/// -/// @param [in] fileName Path to a file or folder, it must exist. -/// -/// @returns Returns success if permissions were set successfully, Result::ErrorUnknown otherwise -extern Result SetRwxFilePermissions(const char* pFileName); - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palSysUtil.h + * @brief PAL utility collection system functions. + *********************************************************************************************************************** + */ + +#pragma once + +// pal +#include "palAssert.h" +#include "palAutoBuffer.h" +#include "palFile.h" +#include "palSpan.h" +#include "palStringView.h" +#include "palTime.h" +#include "palUtil.h" +#include "palVector.h" + +// stl +#include +#include + +// platform +#if defined(_WIN32) +#define PAL_HAS_CPUID (_M_IX86 || _M_X64) +#include +#include +typedef void* HANDLE; +#elif defined(__unix__) +#define PAL_HAS_CPUID (__i386__ || __x86_64__) +#if PAL_HAS_CPUID +#include +#endif +#endif + +namespace Util +{ + +static constexpr uint32 RyzenMaxCcxCount = 4; +static constexpr uint32 CpuVendorAmd = 0x01000000; +static constexpr uint32 CpuVendorIntel = 0x02000000; + +/// Specifies a keyboard key for detecting key presses. +enum class KeyCode : uint32 +{ + Esc, + F1, + F2, + F3, + F4, + F5, + F6, + F7, + F8, + F9, + F10, + F11, + F12, + F13, + F14, + F15, + F16, + Backtick, // ` ~ + Minus, // - _ + Equal, // = + + LBrace, // [ { + RBrace, // ] } + Backslash, // \ | + Semicolon, // ; : + Apostrophe, // " ' + Comma, // , < + Dot, // . > + Slash, // / ? 
+ Enter, + Space, + Backspace, + Tab, + Capslock, + Shift, + LShift, + RShift, + Control, + LControl, + RControl, + Alt, + LAlt, + RAlt, + Scroll, + Insert, + Delete, + Home, + End, + PageUp, + PageDown, + ArrowUp, + ArrowDown, + ArrowLeft, + ArrowRight, + Numlock, + NumSlash, + NumAsterisk, + NumMinus, + NumPlus, + NumDot, + NumEnter, + Num0, + Num1, + Num2, + Num3, + Num4, + Num5, + Num6, + Num7, + Num8, + Num9, + Zero, + One, + Two, + Three, + Four, + Five, + Six, + Seven, + Eight, + Nine, + A, + B, + C, + D, + E, + F, + G, + H, + I, + J, + K, + L, + M, + N, + O, + P, + Q, + R, + S, + T, + U, + V, + W, + X, + Y, + Z, + Shift_F10, + Shift_F11, + Undefined // Used as value where key code is not defined in the enum +}; + +/// Enum to identify possible configurations +enum class CpuType : uint32 +{ + Unknown = 0, ///< No capabilites set + AmdK5 = (CpuVendorAmd + 0), ///< No MMX, no cmov, no 3DNow + AmdK6 = (CpuVendorAmd + 1), ///< No MMX, no cmov, 3DNow (models 6 and 7) + AmdK6_2 = (CpuVendorAmd + 2), ///< MMX, no cmov, 3DNow (model 8, no HW WC but not part of cpuid) + AmdK6_3 = (CpuVendorAmd + 3), ///< MMX, no cmov, 3DNow (model 9) + AmdK7Basic = (CpuVendorAmd + 4), ///< K7 missing one of the features of K7 + AmdK7 = (CpuVendorAmd + 5), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext + AmdK7Sse = (CpuVendorAmd + 6), ///< MMX, MMX Ext, cmov, 3DNow, 3DNow Ext, SSE + AmdK8 = (CpuVendorAmd + 7), ///< Athlon 64, Athlon 64 FX, and Opteron + AmdK10 = (CpuVendorAmd + 8), ///< Barcelona, Phenom, Greyhound + AmdFamily12h = (CpuVendorAmd + 9), ///< Family 12h - Llano + AmdBobcat = (CpuVendorAmd + 10), ///< Bobcat + AmdFamily15h = (CpuVendorAmd + 11), ///< Family 15h - Orochi, Trinity, Komodo, Kaveri, Basilisk + AmdFamily16h = (CpuVendorAmd + 12), ///< Family 16h - Kabini + AmdRyzen = (CpuVendorAmd + 13), ///< Ryzen + IntelOld = (CpuVendorIntel + 0), ///< Inidicate cpu type befor Intel Pentium III + IntelP3 = (CpuVendorIntel + 1), ///< Generic Pentium III + IntelP3Model7 = 
(CpuVendorIntel + 2), ///< PIII-7, PIII Xeon-7 + IntelP3Model8 = (CpuVendorIntel + 3), ///< PIII-8, PIII Xeon-8, Celeron-8 + IntelPMModel9 = (CpuVendorIntel + 4), ///< Pentium M Model 9 (Banias) + IntelXeonModelA = (CpuVendorIntel + 5), ///< Xeon-A + IntelP3ModelB = (CpuVendorIntel + 6), ///< PIII-B + IntelPMModelD = (CpuVendorIntel + 7), ///< Pentium M Model D (Dothan) + IntelP4 = (CpuVendorIntel + 8), ///< Pentium 4, Pentium 4-M, Xenon, Celeron + IntelPMModelE = (CpuVendorIntel + 9), ///< Pentium M Model E (Yonah) + IntelCoreModelF = (CpuVendorIntel + 10), ///< Core F (Conroe) +}; + +/// Specifies a struct that contains information about the system. +struct SystemInfo +{ + CpuType cpuType; ///< Cpu type + char cpuVendorString[16]; ///< Null-terminated cpu vendor string + char cpuBrandString[48]; ///< Null-terminated cpu brand string + uint32 cpuLogicalCoreCount; ///< Number of logical cores on the cpu + uint32 cpuPhysicalCoreCount; ///< Number of physical cores on the cpu + uint32 totalSysMemSize; ///< Total system memory (RAM) size in megabytes + uint32 cpuFrequency; ///< Reports CPU clock speed in MHz. + /// (From Registry for Windows, current average processor speed for Linux.) + uint32 displayFamily; ///< Display Family of cpu + uint32 displayModel; ///< Display Model of cpu + + union + { + struct + { + uint32 affinityMask[RyzenMaxCcxCount]; ///< Affinity mask for each core complex (CCX). + } amdRyzen; ///< Properties specific to AMD Ryzen CPU's. + } cpuArchInfo; ///< This member should be used only for Ryzen for now. +}; + +/// Returns an appropriate result from the given errno +/// +/// @param errno_in Value from 'errno' (or functions that return errno_t) +/// +/// @returns Relevent Result value for the given errno-- never Success. 
+inline Result ConvertErrno( + int32 errnoIn) +{ + Result result = Result::ErrorUnknown; + switch (errnoIn) + { + case EAGAIN: + case EBUSY: + case EINTR: + result = Result::NotReady; + break; + case ETIMEDOUT: + case ETIME: + result = Result::Timeout; + break; + case EEXIST: + result = Result::AlreadyExists; + break; + case ENOENT: + case ENOTDIR: + result = Result::NotFound; + break; + case EACCES: + case EPERM: + case EROFS: + result = Result::ErrorPermissionDenied; + break; + case ENOSPC: + result = Result::ErrorDiskFull; + break; + case EISDIR: + case EINVAL: + case EBADF: + case ENAMETOOLONG: + case ELOOP: + result = Result::ErrorInvalidValue; + break; + case ENOMEM: + case EOVERFLOW: + result = Result::ErrorOutOfMemory; + break; + default: + PAL_ALERT_ALWAYS_MSG("Unknown result generated from errno %d (%s)", errnoIn, strerror(errnoIn)); + break; + } + return result; +} + +#if defined(_WIN32) +/// Helper function to convert Pal::Result to HRESULT +/// +/// @param result Pal::Result to convert +/// +/// @returns the equivalent HRESULT +extern HRESULT PalToHResult(Result result); + +/// Helper function to convert HRESULT to Pal::Result +/// +/// @param hr HRESULT to convert +/// +/// @returns the equivalent Pal::Result +extern Result HResultToPal(HRESULT hr); +#endif + +#if defined(_WIN32) +/// Returns an appropriate result from the given error from 'GetLastError'. +/// Certain OS functions may also return a 'ERROR_SUCCESS' which this helper also translates. +/// +/// @param errno_in System error code from 'GetLastError' +/// +/// @returns Relevent Result value for the given system error code. 
+inline Result ConvertWinError( + uint32 winError) +{ + Result result = Result::ErrorUnknown; + switch (winError) + { + // File already exists + case ERROR_ALREADY_EXISTS: + case ERROR_FILE_EXISTS: + result = Result::AlreadyExists; + break; + + // File or path does not exist + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + case ERROR_INVALID_NAME: + result = Result::NotFound; + break; + + // File locked for specified access + case ERROR_SHARING_VIOLATION: + case ERROR_LOCK_VIOLATION: + result = Result::ErrorNotShareable; + break; + case ERROR_ACCESS_DENIED: + result = Result::ErrorPermissionDenied; + break; + + // Temporary Errors + case ERROR_NOT_READY: + case ERROR_BUSY: + result = Result::NotReady; + break; + case WAIT_TIMEOUT: + result = Result::Timeout; + break; + + // Usage errors + case ERROR_BUFFER_OVERFLOW: + case ERROR_INVALID_HANDLE: + result = Result::ErrorInvalidValue; + break; + + // Various expected but rare error cases + case ERROR_TOO_MANY_OPEN_FILES: + case ERROR_FILE_TOO_LARGE: + case ERROR_VIRUS_INFECTED: + case ERROR_VIRUS_DELETED: + result = Result::ErrorUnavailable; + break; + case ERROR_HANDLE_DISK_FULL: + case ERROR_DISK_FULL: + result = Result::ErrorDiskFull; + break; + case ERROR_NOT_ENOUGH_MEMORY: + case ERROR_OUTOFMEMORY: + result = Result::ErrorOutOfMemory; + break; + + case ERROR_SUCCESS: + result = Result::Success; + break; + default: + PAL_ALERT_ALWAYS_MSG("Unknown result generated from Windows error code %d", winError); + break; + } + return result; +} + +/// Windows Process Integrity Levels +enum class ProcessIntegrityLevel +{ + Untrusted, //< Untrusted Process (No System Resource Access) + Low, //< Restricted Process + Medium, //< Normal Process + High, //< Elevated Process + System //< System-Level or higher Process +}; + +/// Get the local storage path based on process integrity level +/// +/// @returns the Integrity Level of the current process +extern ProcessIntegrityLevel GetProcessIntegrityLevel(); + +/// Detects 
if we're inside an App Container sandbox (ie, a Windows Store or UWP app) +/// +/// @returns whether the current process is in an App Container +extern bool IsProcessInAppContainer(); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 921 +/// Tests whether the passed handle is valid or not. +/// A handle could be either nullptr or INVALID_HANDLE_VALUE. +/// INVALID_HANDLE_VALUE is the proper value to set a handle to when you intend to error out. +/// However, handles default-initialize to nullptr. So that's also a common value you see for an unset handle. +/// A common error is to check for one value but not the other. So PAL prescribes that clients use this helper. +constexpr bool IsValidHandle(HANDLE handle) +{ +#ifdef INVALID_HANDLE_VALUE + return (handle != nullptr) && (handle != INVALID_HANDLE_VALUE); +#else + return (handle != nullptr) && (handle != HANDLE(size_t(-1))); +#endif +} +#endif +#endif + +/// Queries system information. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns Success if querying the system info was successful. Otherwise, the following results will be returned: +/// + ErrorInvalidPointer returned if pSystemInfo is nullptr. +/// + ErrorOutOfMemory returned if the system ran out of memory during the function call. +/// + ErrorUnavailable returned if querying the system info is not supported. +/// + ErrorUnknown returned if an error occurs while calling OS functions. +extern Result QuerySystemInfo(SystemInfo* pSystemInfo); + +/// Query cpu type for AMD processor. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns none. +extern void QueryAMDCpuType(SystemInfo* pSystemInfo); + +/// Query cpu type for Intel processor. +/// +/// @param [out] pSystemInfo SystemInfo struct containing information about the system. +/// +/// @returns none. 
+extern void QueryIntelCpuType(SystemInfo* pSystemInfo); + +/// Gets the frequency of performance-related queries. +/// +/// @returns Current CPU performance counter frequency in Hz. +extern int64 GetPerfFrequency(); + +/// Gets the current time of a performance-related query. +/// +/// This is a high resolution time stamp that can be used in conjunction with GetPerfFrequency to measure time +/// intervals. +/// +/// @param [in] raw Whether to use a 'monotonic raw' clock which ignores smoothing. Ignored on Windows. +/// +/// @returns Current value of the CPU performance counter. +extern int64 GetPerfCpuTime(bool raw=false); + +/// Determines if a specific key is pressed down. +/// +/// @param [in] key Specified which key to check. +/// @param [in, out] pPrevState The previous state of the key. +/// +/// @returns True if the specified key is currently pressed down. +extern bool IsKeyPressed(KeyCode key, bool* pPrevState = nullptr); + +/// Determines if profiling is restricted +/// +/// @returns true if the process is not restricted for profiling, otherwise, false will be returned. +extern bool IsProfileRestricted(); + +/// Retrieves the fully resolved file name of the application binary. +/// +/// @param [out] pBuffer Character buffer to contain the application's executable and (fully-resolved) path +/// string. +/// @param [out] ppFilename Pointer to the location within the output buffer where the executable name begins. +/// @param [in] bufferLength Length of the output buffer, in bytes. +/// @returns Result::Success if GetModuleFileNameA succeeds. Otherwise, the following result codes would be returned: +/// + Result::ErrorInvalidMemorySize returned if pBuffer is not sufficiently large. +extern Result GetExecutableName( + char* pBuffer, + char** ppFilename, + size_t bufferLength); + +/// Retrieves the fully resolved wchar_t file name of the application binary. 
+/// +/// @param [out] pWcBuffer wchar_t buffer to contain the application's executable and (fully-resolved) path +/// string. +/// @param [out] ppWcFilename Pointer to the location within the wchar_t output buffer where the executable name begins. +/// @param [in] bufferLength Length of the output buffer, in bytes. +/// @returns Result::Success if GetModuleFileNameW succeeds. Otherwise, the following result codes would be returned: +/// + Result::ErrorInvalidMemorySize returned if pBuffer is not sufficiently large. +extern Result GetExecutableName( + wchar_t* pWcBuffer, + wchar_t** ppWcFilename, + size_t bufferLength); + +/// Gets the current library name. ie: the name of the library containing the function +/// 'GetCurrentLibraryName'. Optionally, it will also return the extension if the input +/// buffer for extension is valid. +/// +/// @param [out] pLibBuffer Character buffer where the library name will be stored. +/// @param [in] libBufferLength Length of the output buffer that will hold the library name, in bytes. +/// @param [out] pExtBuffer Character buffer where the extension will be stored. +/// @param [in] extBufferLength Length of the output buffer that will hold the extension, in bytes. +/// @returns Result::Success if no error. Otherwise, returns one of the following codes: +/// Result::ErrorInvalidMemorySize - if incoming buffer is too small. +/// Result::ErrorUnknown - for all other types of errors. +extern Result GetCurrentLibraryName( + char* pLibBuffer, + size_t libBufferLength, + char* pExtBuffer, + size_t extBufferLength); + +/// Opaque build ID obtained with GetCurrentLibraryBuildId +struct BuildId { + uint8 data[16]; +}; + +/// Gets build-unique identifier for the executable or shared library PAL was built into. This will be some opaque +/// hash or timestamp embedded in the code if present and falls back to random bytes (constant until exit) for a +/// 'usuable' build id to always exist. 
+/// +/// @param [out] pBuildId Will contain unique id for build +/// +/// @returns true if build id will be persistent, false if temporary +/// +/// @note We fallback to using a random build id here instead of failing so that even if this fails and nobody checks, +/// a new driver will always invalidate caches. It is better, of course, not to persist these on disk. +/// @note This may be heavy on first invocation but can be assumed to always return quickly after that. +extern bool GetCurrentLibraryBuildId( + BuildId* pBuildId); + +/// Splits a filename into its path and file components. +/// +/// @param [in] pFullPath Buffer containing the full path & file name. +/// @param [out] pPathBuf Optional. If non-null, will contain the path to the file name. On Windows, this will also +/// include the drive letter. +/// @param [in] pathLen Length of the pPathBuf buffer. Must be zero when pPathBuf is null. +/// @param [out] pFileBuf Optional. If non-null, will contain the base file name, and extension. +/// @param [in] fileLen Length of the pFileBuf buffer. Must be zero when pFileBuf is null. +extern void SplitFilePath( + const char* pFullPath, + char* pPathBuf, + size_t pathLen, + char* pFileBuf, + size_t fileLen); + +/// Creates a new directory at the specified path. +/// +/// @param [in] pPathName String specifying the new path to create. Note that this method can only create one +/// directory, if you specify "foo/bar" the "bar" directory can only be created if "foo" already +/// exists. +/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the +/// following result codes may be returned: +/// + Result::AlreadyExists if the specified directory already exists. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +extern Result MkDir( + const char* pPathName); + +/// Creates a new directory at the specified path and all intermediate directories. 
+/// +/// @param [in] pPathName String specifying the new path to create.n +/// +/// @returns Result::Success if the directory was successfully created, otherwise an appropriate error. Otherwise, the +/// following result codes may be returned: +/// + Result::AlreadyExists if the specified directory already exists. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +extern Result MkDirRecursively( + const char* pPathName); + +/// A tuple containing a file's name and statistics +struct StatName +{ + File::Stat stat; + char name[MaxPathStrLen]; +}; + +/// Gets file information for the files in a directory +/// +/// @param [in] dirPath string specifying the directory +/// @param [out] pFileInfos list of information on every file in the directory +/// +/// @returns Result::ErrorInvalidPointer if any of the input pointers are null +/// @returns Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows +/// @returns Result::ErrorInvalidValue if there for all file I/O errors on Linux +/// @returns Result::Success if the dir is empty +/// @returns Result::Success otherwise +extern Result GetFileInfoInDir( + StringView dirPath, + Vector* pFileInfos); + +/// Counts the number of files found within the directory. 
+/// +/// @param [in] pDirPath string specifying the directory +/// @param [out] pFileCount the number of files in the directory +/// @param [out] pCharCount the number of characters in the names of all the files +/// +/// @returns Result::ErrorInvalidPointer if any of the input pointers are null +/// @returns Value of Util::ConvertWinError(GetLastError()) if there are any file I/O errors on Windows +/// @returns Result::ErrorInvalidValue if there for all file I/O errors on Linux +/// @returns Result::Success if the dir is empty (pFileCount and pCharCount will be 0) +/// @returns Result::Success otherwise +extern Result CountFilesInDir( + Util::StringView dirPath, + size_t* pFileCount, + size_t* pCharCount); + +/// Lists the contents of the specified directory in an array of strings +/// +/// @param [in] dirPath String specifying the directory +/// @param [out] fileNames An array where pointers the file names will be written. +/// @param [out] buffer Memory where the file names can be stored. +/// +/// @returns Result::ErrorInvalidPointer if any of the inputs are null or empty +/// @returns Result::ErrorInvalidValue if there are any file I/O errors +/// @returns Result::Success otherwise +extern Result GetFileNamesInDir( + Util::StringView dirPath, + Util::Span> fileNames, + Util::Span buffer); + +/// Non-recursively delete the least-recently-accesssed files from a directory until the directory reaches size in bytes. +/// +/// @param [in] pPathName string specifying the absolute path to the directory you want to remove files from +/// @param desiredSize the size you want to shrink the directory to +/// +/// @returns Result::ErrorUnknown on File I/O error. +/// Result::Success otherwise. +Result RemoveOldestFilesOfDirUntilSize( + const char* pPathName, + uint64 desiredSize); + +/// Non-recursively delete the least-recently-accesssed files until the total reaches size in bytes. +/// +/// @param [in] pDirPath String specifying the directory. 
+/// @param [in/out] pFileInfos List of files. This span will be modified to remove the files which were deleted. +/// @param desiredSize The size you want to shrink the list of files to. +/// +/// @returns Result::ErrorUnknown on File I/O error. +/// Result::Success otherwise. +Result RemoveOldestFilesOfDirUntilSize( + StringView dirPath, + Span* pFileInfos, + uint64 desiredSize); + +/// Remove all files below threshold of a directory at the specified path. +/// +/// @param [in] pPathName String specifying the absolute path to remove. +/// @param [in] threshold The file time(from 1970/01/01 00:00:00) older(smaller) than threshold will be removed. +/// +/// @returns Result::Success if all files are successfully removed. Otherwise, the +/// following result codes may be returned: +/// + Result::ErrorUnknown if the specified directory is failed to open/remove. +/// + Result::ErrorInvalidValue if the parent directory does not exist. +Result RemoveFilesOfDirOlderThan( + const char* pPathName, + SecondsSinceEpoch threshold); + +/// Get status of a directory at the specified path. +/// +/// @param [in] pPathName String specifying the absolute path. +/// @param [out] pTotalSize Size(byte) of all files +/// @param [out] pOldestTime The oldest time(seconds from 1970/01/01 00:00:00) of all files +/// +/// @returns Result::Success if all files are successfully removed. Otherwise, the +/// following result codes may be returned: +/// + Result::ErrorUnknown if the specified directory is failed to open. +Result GetStatusOfDir( + const char* pPathName, + uint64* pTotalSize, + SecondsSinceEpoch* pOldestTime); + +/// Almost-Posix-style rename file or directory: replaces already-existing file. +/// Posix says this operation is atomic; Windows does not specify. +/// +/// @param [in] pOldName Old file or directory name +/// @param [in] pNewName Name to rename to +/// +/// @returns Result::Success if file/directory successfully moved. 
+Result Rename( + const char* pOldName, + const char* pNewName); + +/// Get the Process ID of the current process +/// +/// @returns The Process ID of the current process +extern uint32 GetIdOfCurrentProcess(); + +/// OS-specific wrapper for printing stack trace information. +/// +/// @param [out] pOutput Output string. If buffer is a nullptr it returns the length of the string that would be +/// printed had a buffer with enough space been provided. +/// @param [in] bufSize Available space in pOutput. +/// @param [in] skipFrames Number of stack frames to skip. Implied skip of 1 (0 is 1). +/// +/// @returns The resultant length of the stack trace string. +extern size_t DumpStackTrace( + char* pOutput, + size_t bufSize, + uint32 skipFrames); + +/// Puts the calling thread to sleep for a specified number of milliseconds. +/// +/// @param [in] duration Amount of time to sleep for, in milliseconds. +extern void Sleep(std::chrono::milliseconds duration); + +/// Create Directory and SubDirectory of Executable's Name at pBaseDir +/// Like if pBaseDir="amdPal/" then final pLogDir will be created like this: amdpal/app.exe/ +/// +/// @param [in] pBaseDir Base Dir Path +/// @param [out] pLogDir Pointer to Memory where Final Path will be saved +/// @param [in] logDirSize Size of pLogDir in chars +/// +/// @returns Result::Success if the directories were created and pLogDir was populated. +extern Result CreateLogDir( + const char* pBaseDir, + char* pLogDir, + size_t logDirSize); + +/// Check if the requested key is combo key. +/// +/// @param [in] key The requested key value +/// @param [out] pKeys The array of keys the combo key composed of +/// +/// @returns If the requested key is a combo key. 
+inline bool IsComboKey( + KeyCode key, + KeyCode* pKeys) +{ + bool ret = false; + + if (key == KeyCode::Shift_F10) + { + ret = true; + pKeys[0] = KeyCode::Shift; + pKeys[1] = KeyCode::F10; + } + else if (key == KeyCode::Shift_F11) + { + ret = true; + pKeys[0] = KeyCode::Shift; + pKeys[1] = KeyCode::F11; + } + else + { + pKeys[0] = key; + } + + return ret; +} + +#if PAL_HAS_CPUID +/// Issue the cpuid instruction. +/// +/// @param [out] pRegValues EAX/EBX/ECX/EDX values +/// @param [in] level CpuId instruction feature level. +inline void CpuId( + uint32* pRegValues, + uint32 level) +{ +#if defined(_WIN32) + __cpuid(reinterpret_cast(pRegValues), static_cast(level)); +#elif defined(__unix__) + __get_cpuid(level, pRegValues, pRegValues + 1, pRegValues + 2, pRegValues + 3); +#else +#error "Not implemented for the current platform" +#endif +} + +/// Issue the cpuid instruction, with an additional sublevel code. +/// +/// @param [out] pRegValues EAX/EBX/ECX/EDX values +/// @param [in] level CpuId instruction feature level. +/// @param [in] sublevel CpuId instruction feature sublevel. +inline void CpuId( + uint32* pRegValues, + uint32 level, + uint32 sublevel) +{ +#if defined(_WIN32) + __cpuidex(reinterpret_cast(pRegValues), static_cast(level), static_cast(sublevel)); +#elif defined(__unix__) + __cpuid_count(level, sublevel, *pRegValues, *(pRegValues + 1), *(pRegValues + 2), *(pRegValues + 3)); +#else +#error "Not implemented for the current platform" +#endif +} +#endif + +/// Play beep sound. Currently function implemented only for WIN platform. +/// +/// @param [in] frequency Frequency in hertz of the beep sound. +/// @param [in] duration Duration in milliseconds of the beep sound. +extern void BeepSound( + uint32 frequency, + uint32 duration); + +/// Detect if a debugger is attached to the current process. +/// +/// @note This function is conservative, declaring the debugger not attached on error. +/// +/// @returns true if there is a debugger attached. 
+extern bool IsDebuggerAttached(); + +/// Set path to be accessible by everyone. +/// +/// In Linux world that is 777. For Windows it does nothing. +/// To be used for debug folders mainly. Use carefully for release paths as this will open them wide +/// and possibly create a security hole. +/// +/// @param [in] fileName Path to a file or folder, it must exist. +/// +/// @returns Returns success if permissions were set successfully, Result::ErrorUnknown otherwise +extern Result SetRwxFilePermissions(const char* pFileName); + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palTime.h b/shared/amdgpu-windows-interop/pal/inc/util/palTime.h index 6ec202601a..69e3394966 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palTime.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palTime.h @@ -1,87 +1,87 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palTime.h - * @brief PAL time-related utility collection. - *********************************************************************************************************************** - */ - -#pragma once - -#include - -namespace Util -{ - -/// Specifies a class that implements a timestamp. -class Timestamp -{ -public: - /// Creates a new timestamp object that records the time it was created. - Timestamp(); - - /// Returns the timestamp as a C-string. - const char* CStr() const { return m_data; } - -private: - char m_data[64]; -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 873 -/// Seconds stored as a float instead of an integer. -using fseconds = std::chrono::duration; -/// Milliseconds stored as a float instead of an integer. -using fmilliseconds = std::chrono::duration; -/// Microseconds stored as a float instead of an integer. -using fmicroseconds = std::chrono::duration; -/// Nanoseconds stored as a float instead of an integer. -using fnanoseconds = std::chrono::duration; - -/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. -/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. -/// system_clock's internal duration is still implementation defined. -/// On Windows it's hundreds-of-nanoseconds and on Linux it's seconds. -/// However time_point has it's own duration type. 
-/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. -using SecondsSinceEpoch = std::chrono::time_point; - -/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. -template -constexpr DestDuration TimeoutCast( - const std::chrono::duration& d) -{ - if (d == (std::chrono::duration::max)()) - { - return (DestDuration::max)(); - } - else - { - return std::chrono::duration_cast(d); - } -} -#endif - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palTime.h + * @brief PAL time-related utility collection. + *********************************************************************************************************************** + */ + +#pragma once + +#include + +namespace Util +{ + +/// Specifies a class that implements a timestamp. +class Timestamp +{ +public: + /// Creates a new timestamp object that records the time it was created. + Timestamp(); + + /// Returns the timestamp as a C-string. + const char* CStr() const { return m_data; } + +private: + char m_data[64]; +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 873 +/// Seconds stored as a float instead of an integer. +using fseconds = std::chrono::duration; +/// Milliseconds stored as a float instead of an integer. +using fmilliseconds = std::chrono::duration; +/// Microseconds stored as a float instead of an integer. +using fmicroseconds = std::chrono::duration; +/// Nanoseconds stored as a float instead of an integer. +using fnanoseconds = std::chrono::duration; + +/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. +/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. +/// system_clock's internal duration is still implementation defined. +/// On Windows it's hundreds-of-nanoseconds and on Linux it's seconds. +/// However time_point has it's own duration type. +/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. +using SecondsSinceEpoch = std::chrono::time_point; + +/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. 
+template +constexpr DestDuration TimeoutCast( + const std::chrono::duration& d) +{ + if (d == (std::chrono::duration::max)()) + { + return (DestDuration::max)(); + } + else + { + return std::chrono::duration_cast(d); + } +} +#endif + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h b/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h index cfa6790f2d..878731e1db 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palUtil.h @@ -1,726 +1,714 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file palUtil.h - * @brief Common include for the PAL utility collection. Defines common types, macros, enums, etc. - *********************************************************************************************************************** - */ - -#pragma once - -/// Utility macro for turning another macro into a string literal. -#define _PAL_STRINGIFY(_x) #_x -#define PAL_STRINGIFY(_x) _PAL_STRINGIFY(_x) - -/// C++11 standard version. -#define PAL_CPLUSPLUS_11 201103L -/// C++14 standard version. -#define PAL_CPLUSPLUS_14 201402L -/// C++17 standard version. -#define PAL_CPLUSPLUS_17 201703L -/// C++ feature version from September 2017 contains a few C++20 features. -#define PAL_CPLUSPLUS_1709 201709L -/// C++20 standard version. -#define PAL_CPLUSPLUS_20 202002L - -/// C++ standard version used to compile PAL. -# ifdef _MSVC_LANG -// MSVC requires the /Zc:__cplusplus flag, otherwise __cplusplus is that of 199711L (C++98). _MSVC_LANG has the correct -// C++ standard version. -# define PAL_CPLUSPLUS _MSVC_LANG -# else -# define PAL_CPLUSPLUS __cplusplus -# endif - -/// Checks if PAL is compiled with C++ of at least version @p v. -#define PAL_CPLUSPLUS_AT_LEAST(v) (PAL_CPLUSPLUS >= (v)) - -static_assert( - PAL_CPLUSPLUS_AT_LEAST(PAL_CPLUSPLUS_1709), - "C++ standard version " PAL_STRINGIFY(PAL_CPLUSPLUS_1709) " is required to build PAL. " - "Found " PAL_STRINGIFY(PAL_CPLUSPLUS) "."); - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 878 -/// We already declare NOMINMAX publicly, but that won't stop clients from defining their own min/max macros. -/// These macros confuse the compiler when using functions named min/max, leading to build errors. 
-#if defined(min) || defined(max) -static_assert(false, "Clients may not define macros named \"min\" or \"max\"."); -#endif -#endif - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 -#include -#endif -#include - -/// stdint is included instead of cstdint to allow Visual Studio Intellisense to work for Linux builds. This can be -/// removed if the error caused by including cstdint is figured out. -#include "stdint.h" - -/// Include in the class declaration in order to disallow use of the copy constructor and assignment operator for that -/// class. -#define PAL_DISALLOW_COPY_AND_ASSIGN(_typename) \ - _typename(const _typename&) = delete; \ - _typename& operator=(const _typename&) = delete; - -/// Include in the declaration in order to disallow use of the default constructor for a class. -#define PAL_DISALLOW_DEFAULT_CTOR(_typename) \ - _typename() = delete; - -#if !defined(__GNUC__) - -// Equates to the [__stdcall](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/stdcall.md) convention on Windows. -#define PAL_STDCALL __stdcall -// Equates to the [__cdecl](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/cdecl.md) convention on Windows. -#define PAL_CDECL __cdecl -// Equates to [__declspec(align(__x))](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/align-cpp.md) on Windows. -#define PAL_ALIGN(__x) __declspec(align(__x)) -#define PAL_FORCE_INLINE __forceinline -#define PAL_NO_INLINE __declspec(noinline) -#else -/// Undefined on GCC platforms. -#define PAL_STDCALL -/// Undefined on GCC platforms. -#define PAL_CDECL -/// Undefined on GCC platforms. -#define PAL_ALIGN(__x) -#define PAL_FORCE_INLINE __attribute__((always_inline)) inline -#define PAL_NO_INLINE __attribute__((noinline)) -#endif - -/// Platform cache line size in bytes. -#define PAL_CACHE_LINE_BYTES 64 -/// Platform system memory page size in bytes. -#define PAL_PAGE_BYTES 4096 - -/// Force cache line alignment. 
-#define PAL_ALIGN_CACHE_LINE PAL_ALIGN(PAL_CACHE_LINE_BYTES) - -#if !defined(_WIN32) -/// Value representing an invalid file descriptor on Linux systems. -constexpr int32_t InvalidFd = -1; -#endif - -#ifdef __has_builtin -/// A macro that checks for the presence of builtin functions. Will default to false if the compiler does not have -/// support for doing this check. -#define PAL_HAS_BUILTIN(builtin) __has_builtin(builtin) -#else -#define PAL_HAS_BUILTIN(builtin) 0 -#endif - -#if defined(__has_cpp_attribute) -#define PAL_HAS_CPP_ATTR(attr) __has_cpp_attribute(attr) -#else -#define PAL_HAS_CPP_ATTR(attr) 0 -#endif - -/// Library-wide namespace encapsulating all PAL utility collection entities. -namespace Util -{ - -typedef int8_t int8; ///< 8-bit integer. -typedef int16_t int16; ///< 16-bit integer. -typedef int32_t int32; ///< 32-bit integer. -typedef int64_t int64; ///< 64-bit integer. -typedef uint8_t uint8; ///< Unsigned 8-bit integer. -typedef uint16_t uint16; ///< Unsigned 16-bit integer. -typedef uint32_t uint32; ///< Unsigned 32-bit integer. -typedef uint64_t uint64; ///< Unsigned 64-bit integer. -typedef uint64_t gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from - /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. - -/// Error and return codes indicating outcome of a requested operation. Success result codes are greater than or equal -/// to 0, and error results codes are less than 0. -enum class Result : int32 -{ - /// @internal The operation completed successfully. - _Success = 0x00000000, - - // Unfortunately for Linux clients, X.h includes a "#define Success 0" macro. Clients have their choice of either - // undefing Success before including this header or using _Success when dealing with PAL. -#ifndef Success - /// The operation completed successfully. - Success = _Success, -#endif - - /// The operation is not supported. 
- Unsupported = 0x00000001, - - /// The operation completed successfully but the result is not ready. This result code normally applies to - /// situations where results of queued GPU operations such as queries and fences have not been written to memory - /// yet. - NotReady = 0x00000002, - - /// The wait operation completed due to a client-specified timeout condition. - Timeout = 0x00000003, - - /// The event is in the "set" state. @see IGpuEvent::GetStatus. - EventSet = 0x00000004, - - /// The event is in the "reset" state. @see IGpuEvent::GetStatus. - EventReset = 0x00000005, - - /// The operation was successful, but the client has reached the maximum allowable number of flippable GPU memory - /// objects. Future requests to create presentable Images or flippable GPU memory objects may fail due to - /// limitations within the underlying OS. - /// @see IDevice::CreateGpuMemory. - /// @see IDevice::CreatePresentableImage. - TooManyFlippableAllocations = 0x00000006, - - /// The present was successful, but some portion of the window is currently occluded by another window. - PresentOccluded = 0x00000007, - - /// The directory/file/etc. being created already exists. - AlreadyExists = 0x00000008, - - /// A warning indicates an operation is successful (supported by H/W) but out of a certain spec (e.g. VESA). - OutOfSpec = 0x00000009, - - /// The value being searched for was not found. - NotFound = 0x0000000A, - - /// End of file reached successfully. - Eof = 0x0000000B, - - /// If ReserveEntryOnMiss was specified, the entry was not found, and the entry was successfully reserved. - Reserved = 0x0000000C, - - /// If an operation is purposefully terminated early, rather than from an error. - Aborted = 0x0000000D, - - /// The operation encountered an unknown error. - ErrorUnknown = -(0x00000001), - - /// The requested operation is unavailable at this time. - ErrorUnavailable = -(0x00000002), - - /// The initialization operation failed for unknown reasons. 
- ErrorInitializationFailed = -(0x00000003), - - /// The operation could not complete due to insufficient system memory. - ErrorOutOfMemory = -(0x00000004), - - /// The operation could not complete due to insufficient GPU memory. - ErrorOutOfGpuMemory = -(0x00000005), - - /// The device was lost due to its removal or a possible hang and recovery condition. The client should destroy all - /// devices (and objects attached to them) and re-enumerate the available devices be calling EnumerateDevices(). - ErrorDeviceLost = -(0x00000007), - - /// A required input pointer passed to the call was invalid (probably null). - ErrorInvalidPointer = -(0x00000008), - - /// An invalid value was passed to the call. - ErrorInvalidValue = -(0x00000009), - - /// An invalid ordinal was passed to the call. - ErrorInvalidOrdinal = -(0x0000000A), - - /// An invalid memory size was passed to the call. - ErrorInvalidMemorySize = -(0x0000000B), - - /// Invalid flags were passed to the call. - ErrorInvalidFlags = -(0x0000000C), - - /// An invalid alignment parameter was specified - ErrorInvalidAlignment = -(0x0000000D), - - /// An invalid resource format was specified. - ErrorInvalidFormat = -(0x0000000E), - - /// The requested operation cannot be performed on the provided @ref Pal::IImage object. - ErrorInvalidImage = -(0x0000000F), - - /// The descriptor set data is invalid or does not match the related pipeline. - ErrorInvalidDescriptorSetData = -(0x00000010), - - /// An invalid queue type was specified. - ErrorInvalidQueueType = -(0x00000011), - - /// An invalid object type was specified. - ErrorInvalidObjectType = -(0x00000012), - - /// The specified shader uses an unsupported version of AMD IL. - ErrorUnsupportedShaderIlVersion = -(0x00000013), - - /// The specified shader code is invalid or corrupt. - ErrorBadShaderCode = -(0x00000014), - - /// The specified serialized pipeline data is invalid or corrupt. 
- ErrorBadPipelineData = -(0x00000015), - - /// The queue operation specified more GPU memory references than are supported. - /// @see Pal::IQueue::Submit - /// @see Pal::IDevice::AddGpuMemoryReferences - /// @see Pal::DeviceProperties::maxGpuMemoryRefsResident - ErrorTooManyMemoryReferences = -(0x00000016), - - /// The memory object cannot be mapped because it does not reside in a CPU visible heap. - ErrorNotMappable = -(0x00000017), - - /// The map operation failed due to an unknown or system reason. - ErrorGpuMemoryMapFailed = -(0x00000018), - - /// The unmap operation failed due to an unknown or system reason. - ErrorGpuMemoryUnmapFailed = -(0x00000019), - - /// The serialized pipeline load operation failed due to an incompatible device. - ErrorIncompatibleDevice = -(0x0000001A), - - /// The serialized pipeline load operation failed due to an incompatible PAL library. - ErrorIncompatibleLibrary = -(0x0000001B), - - /// The requested operation (such as command buffer submission) can't be completed because command buffer - /// construction is not complete. - ErrorIncompleteCommandBuffer = -(0x0000001C), - - /// The specified command buffer failed to build correctly. This error can be delayed from the original source of - /// the error since the command buffer building methods do not return error codes. - ErrorBuildingCommandBuffer = -(0x0000001D), - - /// The operation cannot complete since not all objects have valid GPU memory bound to them. - ErrorGpuMemoryNotBound = -(0x0000001E), - - /// The requested operation is not supported on the specified queue type. - ErrorIncompatibleQueue = -(0x0000001F), - - /// The object cannot be created or opened for sharing between multiple GPU devices. - ErrorNotShareable = -(0x00000020), - - /// The operation failed because the specified fullscreen mode was unavailable. 
This could be a failure while - /// attempting to take fullscreen ownership, or when attempting to perform a fullscreen present and the user has - /// left fullscreen mode. - ErrorFullscreenUnavailable = -(0x00000021), - - /// The targeted screen of the operation has been removed from the system. - ErrorScreenRemoved = -(0x00000022), - - /// Present failed because the screen mode is no longer compatible with the source image. - ErrorIncompatibleScreenMode = -(0x00000023), - - /// The cross-GPU present failed, possibly due to a lack of system bus bandwidth to accommodate the transfer. - ErrorMultiDevicePresentFailed = -(0x00000024), - - /// The slave GPU(s) in an MGPU system cannot create BLTable present images. - ErrorWindowedPresentUnavailable = -(0x00000025), - - /// The attempt to enter fullscreen exclusive mode failed because the specified image doesn't properly match the - /// screen's current dimensions. - ErrorInvalidResolution = -(0x00000026), - - /// The shader specifies a thread group size that is bigger than what is supported by this device. 
- ErrorThreadGroupTooBig = -(0x00000027), - - /// Invalid image create info: Specified both color target and depth usage - ErrorInvalidImageTargetUsage = -(0x00000028), - - /// Invalid image create info: Specified a 1D type for a color target - ErrorInvalidColorTargetType = -(0x00000029), - - /// Invalid image create info: Specified a non-2D type for a depth/stencil target - ErrorInvalidDepthTargetType = -(0x0000002A), - - /// Invalid image create info: The image format supports depth/stencil but depth/stencil usage was not specified - ErrorMissingDepthStencilUsage = -(0x0000002B), - - /// Invalid image create info: Specified MSAA and multiple mip levels - ErrorInvalidMsaaMipLevels = -(0x0000002C), - - /// Invalid image create info: The image format is incompatible with MSAA - ErrorInvalidMsaaFormat = -(0x0000002D), - - /// Invalid image create info: The image type is incompatible with MSAA - ErrorInvalidMsaaType = -(0x0000002E), - - /// The sample count is invalid - ErrorInvalidSampleCount = -(0x0000002F), - - /// Invalid image create info: Invalid block compressed image type - ErrorInvalidCompressedImageType = -(0x00000030), - - /// Invalid image create info: Format is incompatible with the specified image usage - ErrorInvalidUsageForFormat = -(0x00000032), - - /// Invalid image create info: Array size is invalid - ErrorInvalidImageArraySize = -(0x00000033), - - /// Invalid image create info: Array size is invalid for a 3D image - ErrorInvalid3dImageArraySize = -(0x00000034), - - /// Invalid image create info: Image width is invalid - ErrorInvalidImageWidth = -(0x00000035), - - /// Invalid image create info: Image height is invalid - ErrorInvalidImageHeight = -(0x00000036), - - /// Invalid image create info: Image depth is invalid - ErrorInvalidImageDepth = -(0x00000037), - - /// Invalid image create info: Mip count is invalid - ErrorInvalidMipCount = -(0x00000038), - - /// Invalid image create info: Image format is incompatible with the image usage specified. 
- ErrorFormatIncompatibleWithImageUsage = -(0x00000039), - - /// Operation requested an image plane that is not available on the image. - ErrorImagePlaneUnavailable = -(0x0000003A), - - /// Another format is incompatible with an image's format. - ErrorFormatIncompatibleWithImageFormat = -(0x0000003B), - - /// Another format is incompatible with an image plane's format. - ErrorFormatIncompatibleWithImagePlane = -(0x0000003C), - - /// Operation requires a shader readable or writable image usage but the image does not support it. - ErrorImageNotShaderAccessible = -(0x0000003D), - - /// Format is paired with a channel mapping that contains invalid components. - ErrorInvalidFormatSwizzle = -(0x0000003E), - - /// A base mip level that is out of bounds or otherwise invalid was specified. - ErrorInvalidBaseMipLevel = -(0x0000003F), - - /// A view array size that was zero or otherwise invalid was specified. - ErrorInvalidViewArraySize = -(0x00000040), - - /// A view base array slice that was out of bounds or otherwise invalid was specified. - ErrorInvalidViewBaseSlice = -(0x00000041), - - /// A view image type was specified that is incompatible with the image's type. - ErrorViewTypeIncompatibleWithImageType = -(0x00000042), - - /// A view specifies an array slice range that is larger than what is supported by the image. - ErrorInsufficientImageArraySize = -(0x00000043), - - /// It is illegal to create a cubemap view into an MSAA image. - ErrorCubemapIncompatibleWithMsaa = -(0x00000044), - - /// A cubemap view was created to an image that does not have square width and height. - ErrorCubemapNonSquareFaceSize = -(0x00000045), - - /// An fmask view was created to an image that does not support an fmask. - ErrorImageFmaskUnavailable = -(0x00000046), - - /// A private screen was removed. - ErrorPrivateScreenRemoved = -(0x00000047), - - /// A private screen was already in exclusive use. 
- ErrorPrivateScreenUsed = -(0x00000048), - - /// The image count created or opened on this private display exceed maximum. - ErrorTooManyPrivateDisplayImages = -(0x00000049), - - /// The private screen is not enabled. - ErrorPrivateScreenNotEnabled = -(0x0000004A), - - /// The private screen count exceeds the maximum (including emulated and physical ones). - ErrorTooManyPrivateScreens = -(0x0000004B), - - /// Invalid image create info: Image rowPitch does not equal the image's actual row pitch. - ErrorMismatchedImageRowPitch = -(0x0000004C), - - /// Invalid image create info: Image depthPitch does not equal the image's actual depth pitch. - ErrorMismatchedImageDepthPitch = -(0x0000004D), - - /// The given swap chain cannot be associated with any more presentable images. - ErrorTooManyPresentableImages = -(0x0000004E), - - /// A fence was used in GetStatus() or WaitForFences() without being used in any submission. - ErrorFenceNeverSubmitted = -(0x0000004F), - - /// The image used on the specified private screen has an invalid format. - ErrorPrivateScreenInvalidFormat = -(0x00000050), - - /// The timing data set on the specified private screen was invalid. - ErrorPrivateScreenInvalidTiming = -(0x00000051), - - /// The resolution set on the specified private screen was invalid. - ErrorPrivateScreenInvalidResolution = -(0x00000052), - - /// The scaling parameter set on the specified private screen was invalid. - ErrorPrivateScreenInvalidScaling = -(0x00000053), - - /// Invalid image create info: Invalid YUV image type - ErrorInvalidYuvImageType = -(0x00000054), - - /// The external shader cache found a matching hash but the with different key data. - ErrorShaderCacheHashCollision = -(0x00000055), - - /// The external shader cache is full - ErrorShaderCacheFull = -(0x00000056), - - /// The operation caused a pagefault. - ErrorGpuPageFaultDetected = -(0x00000057), - - /// The provided pipeline ELF uses an unsupported ABI version. 
- ErrorUnsupportedPipelineElfAbiVersion = -(0x00000058), - - /// The provided pipeline ELF is invalid. - ErrorInvalidPipelineElf = -(0x00000059), - - /// The returned results were incomplete. - ErrorIncompleteResults = -(0x00000060), - - /// The display mode is imcompatible with framebuffer or CRTC. - ErrorIncompatibleDisplayMode = -(0x00000061), - - /// Implicit fullscreen exclusive mode is not safe because the specified window size doesn't match the - /// screen's current dimensions. - ErrorIncompatibleWindowSize = -(0x00000062), - - /// A semaphore was used in WaitForSemaphores() without being signaled. - ErrorSemaphoreNeverSignaled = -(0x00000063), - - /// Invalid image create info: specified metadataMode is invalid for the Image. - ErrorInvalidImageMetadataMode = -(0x00000064), - - /// Invalid external handle detected for the Image. - ErrorInvalidExternalHandle = -(0x00000065), - - /// The permission of operation is denied. - ErrorPermissionDenied = -(0x00000066), - - /// The operation failed because the disk is full. - ErrorDiskFull = -(0x00000067), - - /// The static VMID acquire/release operation failed. - ErrorStaticVmidOpFailed = -(0x00000068), - -}; - -///Specifies a ratio of two unsigned integers. -struct Rational -{ - uint32 numerator; ///< Numerator - uint32 denominator; ///< Denominator -}; - -/// Implements operator== for PAL's Rational struct where similar ratios like 2/3 and 4/6 are treated as equal. -/// -/// @param left Rational to be compared -/// @param right Rational to be compared -/// -/// @return true if the ratios are logically equal. -constexpr bool operator==( - const Rational& lhs, - const Rational& rhs) -{ - // Any ratio with a zero denominator is illegal/undefined, for example: "3/0 == 5/0" or "5/3 == 0/0". We must pick - // either "true" or "false" for these illegal cases. "true" seems like the most wrong option so we use "false". 
- if ((lhs.denominator == 0) || (rhs.denominator == 0)) - { - return false; - } - // Otherwise, our equality check is: lhs_n / lhs_d == rhs_n / rhs_d - // Multiply both sides by lhs_d: lhs_n == rhs_n * lhs_d / rhs_d - // Multiply both sides by rhs_d: lhs_n * rhs_d == rhs_n * lhs_d - // This trick avoids dealing with common factors or remainders and uses no slow division instructions. - return (static_cast(lhs.numerator) * static_cast(rhs.denominator) == - static_cast(rhs.numerator) * static_cast(lhs.denominator)); -} - -// Flags to be passed to store operations. -struct StoreFlags -{ - union - { - struct - { - uint32 enableFileCache : 1; ///< If we should skip the file cache layer when we get to it. - uint32 enableCompression : 1; ///< If we should skip the compression layer when we get to it. - uint32 reserved : 30; - }; - uint32 all; - }; -}; - -#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 -/// Seconds stored as a float instead of an integer. -using fseconds = std::chrono::duration; -/// Milliseconds stored as a float instead of an integer. -using fmilliseconds = std::chrono::duration; -/// Microseconds stored as a float instead of an integer. -using fmicroseconds = std::chrono::duration; -/// Nanoseconds stored as a float instead of an integer. -using fnanoseconds = std::chrono::duration; - -/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. -/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. -/// system_clock's internal duration is still implementation defined. -/// On Windows it's hundreds of nanoseconds and on Linux it's seconds. -/// However time_point has it's own duration type. -/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. -using SecondsSinceEpoch = std::chrono::time_point; - -/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. 
-template -constexpr ToDuration TimeoutCast( - const std::chrono::duration& d) -{ - if (d == (std::chrono::duration::max)()) - { - return (ToDuration::max)(); - } - else - { - return std::chrono::duration_cast(d); - } -} -#endif - -/// Inline function to determine if a Result enum is considered an error. -constexpr bool IsErrorResult(Result result) { return (static_cast(result) < 0); } - -/// Inline function to collapse two Result enums into the most useful Result code. It considers errors to be more -/// interesting than success codes and considers "Success" to be the least interesting success code. If both Results -/// are errors, the first Result is returned. -constexpr Result CollapseResults(Result lhs, Result rhs) - { return (IsErrorResult(lhs) || (static_cast(lhs) > static_cast(rhs))) ? lhs : rhs; } - -/// A simple enum-to-string helper function. Given a result like Result::ErrorOutOfMemory, it returns a pointer to a -/// global string containing "ErrorOutOfMemory". The caller must not try to free the returned string. -/// -/// @param [in] result The Result code to turn into a string. -/// -/// @returns A valid pointer to the appropriate global string or to "FixTheTables!!!" if someone forgot to update the -/// internal string tables when they added a new Result value. It's impossible for this to return nullptr. -extern const char* ResultToString( - Result result); - -/** - *********************************************************************************************************************** - * @page UtilOverview Utility Collection - * - * In addition to its GPU-specific core functionality, PAL provides a lot of generic, OS-abstracted software utilities - * in the @ref Util namespace. The PAL core relies on these utilities, but they are also available for use by its - * clients. In fact, it is possible to build and use PAL only for its utility collection by building PAL with the - * PAL_BUILD_CORE build option set to 0. 
- * - * All available PAL utilities are defined in the @ref Util namespace, and are briefly summarized below. See the - * Reference topics for more detailed information on specific classes, enums, etc. - * - * ### System Memory Management - * palSysMemory.h defines a handful of macros that can be used for allocating and freeing system heap memory. These - * macros will use the client-specified allocation callbacks specified by the client at CreatePlatform() if specified. - * These macros are: - * - * - PAL_MALLOC: Equivalent to malloc(). - * - PAL_CALLOC: Equivalent to calloc(). - * - PAL_FREE: Equivalent to free(). - * - PAL_SAFE_FREE: Equivalent to free(), then nulls out the specified pointer. - * - PAL_NEW: Equivalent to C++ new. - * - PAL_NEW_ARRAY: Equivalent to C++ new[]. - * - PAL_PLACEMENT_NEW: Equivalent to C++ placement new. - * - PAL_DELETE: Equivalent to C++ delete. - * - PAL_DELETE_THIS: Special version of PAL_DELETE that effectively does "delete this;" This is necessary for - * classes that have non-public destructors. - * - PAL_DELETE_ARRAY: Equivalent to C++ delete[]. - * - PAL_SAFE_DELETE_ARRAY: Equivalent to C++ delete, then nulls out the specified pointer. - * - PAL_SAFE_DELETE: Equivalent to C++ delete[], then nulls out the specified pointer. - * - * ### Allocators - * All of the memory management macros take in a templated allocator, which is required to have the following two - * functions defined: - * - * void* Alloc(const Util::AllocInfo) - * void Free(const Util::FreeInfo) - * - * It is expected that clients that specify their own allocators will handle cases that require specific alignments - * and/or zeroing the returned memory. - * - * Some allocators can be created for use by clients: - * - VirtualLinearAllocator: A linear allocator that allocates virtual memory and backs it with physical memory - * when needed. 
- * - * ### Debug Prints and Asserts - * palDbgPrint.h and palAssert.h provide a number of macros used widely by the PAL core and also available for use - * by clients. - * - * The PAL_DPF, PAL_DPINFO, PAL_DPERROR, and PAL_DPWARN can be used to issue debug prints. These macros will be nulled - * out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. SetDbgPrintMode() can be called to configure how the - * different categories of debug prints will be handled (e.g., print to the debugger, print to file, etc.). - * - * The PAL_ASSERT and PAL_ALERT macros can be used to verify expected states of the program at runtime. PAL_ASSERT - * should be used for verifying expected invariants and assumptions, while PAL_ALERT should be used to alert the - * developer of a condition that is allowed, but not typically expected (i.e., failure of a system memory allocation). - * Note that the polarity of the condition check is different between assert and alert. Asserts "assert" that the - * specified condition is true (and complain if it's not), while alerts "alert" a developer if an unexpected condition - * is true. These macros will be nulled out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. EnableAssertMode() - * can be called to enable/disable asserts or alerts at runtime. - * - * ### Generic Containers - * Util includes a number of generic container data structure implementations. Note that most of these are broken up - * into two header files - for example, list.h and listImpl.h. The intention is that list.h will be included from - * other header files that need a full list definition, while listImpl.h will be included by .cpp files that actually - * interact with the list. This should keep build times down versus putting all implementations directly in list.h. - * - AutoBuffer: Allows dynamic arrays to be placed on the stack without a heap allocation in situations where a - * maximum reasonable expected size is known. - * - Deque: Double ended queue. 
- * - HashMap: Fast map implementation. Note that this implementation has some non-standard restrictions on the key - * (can't be 0) and value size (must fit in a cache line). - * - HashSet: Fast set implementation. Note the similar restrictions to HashMap. - * - IntervalTree: [Interval tree] implementation. - * - RingBuffer: A ringed buffer of variable length and size. - * - * ### Multithreading and Synchronization - * Util includes a number of OS-abstracted multithreading and CPU synchronization constructs: - * - * - Thread - * - Mutex - * - Semaphore - * - ConditionVariable - * - Event - * - * ### Files - * The File class provides an OS-abstracted interface for opening files and reading/writing data in those files. - * Further, the ElfReadContext and ElfWriteContext classes provide functionality for reading and writing buffers in the - * [Executable and Linkable Format (ELF)] - * The ELF utilities can be used in conjunction with File in order to read/write ELF files on disk. - * - * ### Inline Functions - * palInlineFuncs.h defines a bunch of simple inline functions that are used throughout PAL and might be useful to - * clients. Some examples include VoidPtrInc(), Pow2Pad(), Min(), Max(), Strncpy(), etc. - * - * palMath.h defines a Math namespace with various constants and functions related to floating point conversions and - * basic math rouintes like Sqrt(). - * - * Additionally, palInlineFuncs.h defines a template metaprogramming string hash implementation that can produce - * a FNV1A hash for a string specified in the source code without the string showing up in a compiled release build. - * - * ### System Utilities - * palSysUtil.h defines a few functions providing abstracted system-specific functionality: - * - Access to the high resolution CPU performance counters with GetPerfFrequency() and GetPerfCpuTime(). - * - Support for asynchronously querying if a particular keyboard key is currently pressed with IsKeyPressed(). 
- * - * ### Cryptographic Algorithm Implementations - * Util provides the crypto algorithm Md5 - * - * Next: @ref GpuUtilOverview - *********************************************************************************************************************** - */ - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file palUtil.h + * @brief Common include for the PAL utility collection. Defines common types, macros, enums, etc. 
+ *********************************************************************************************************************** + */ + +#pragma once + +/// Utility macro for turning another macro into a string literal. +#define _PAL_STRINGIFY(_x) #_x +#define PAL_STRINGIFY(_x) _PAL_STRINGIFY(_x) + +/// C++11 standard version. +#define PAL_CPLUSPLUS_11 201103L +/// C++14 standard version. +#define PAL_CPLUSPLUS_14 201402L +/// C++17 standard version. +#define PAL_CPLUSPLUS_17 201703L +/// C++ feature version from September 2017 contains a few C++20 features. +#define PAL_CPLUSPLUS_1709 201709L +/// C++20 standard version. +#define PAL_CPLUSPLUS_20 202002L + +/// C++ standard version used to compile PAL. +# ifdef _MSVC_LANG +// MSVC requires the /Zc:__cplusplus flag, otherwise __cplusplus is that of 199711L (C++98). _MSVC_LANG has the correct +// C++ standard version. +# define PAL_CPLUSPLUS _MSVC_LANG +# else +# define PAL_CPLUSPLUS __cplusplus +# endif + +/// Checks if PAL is compiled with C++ of at least version @p v. +#define PAL_CPLUSPLUS_AT_LEAST(v) (PAL_CPLUSPLUS >= (v)) + +static_assert( + PAL_CPLUSPLUS_AT_LEAST(PAL_CPLUSPLUS_1709), + "C++ standard version " PAL_STRINGIFY(PAL_CPLUSPLUS_1709) " is required to build PAL. " + "Found " PAL_STRINGIFY(PAL_CPLUSPLUS) "."); + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 878 +/// We already declare NOMINMAX publicly, but that won't stop clients from defining their own min/max macros. +/// These macros confuse the compiler when using functions named min/max, leading to build errors. +#if defined(min) || defined(max) +static_assert(false, "Clients may not define macros named \"min\" or \"max\"."); +#endif +#endif + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 +#include +#endif +#include + +/// stdint is included instead of cstdint to allow Visual Studio Intellisense to work for Linux builds. This can be +/// removed if the error caused by including cstdint is figured out. 
+#include "stdint.h" + +/// Include in the class declaration in order to disallow use of the copy constructor and assignment operator for that +/// class. +#define PAL_DISALLOW_COPY_AND_ASSIGN(_typename) \ + _typename(const _typename&) = delete; \ + _typename& operator=(const _typename&) = delete; + +/// Include in the declaration in order to disallow use of the default constructor for a class. +#define PAL_DISALLOW_DEFAULT_CTOR(_typename) \ + _typename() = delete; + +#if !defined(__GNUC__) + +// Equates to the [__stdcall](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/stdcall.md) convention on Windows. +#define PAL_STDCALL __stdcall +// Equates to the [__cdecl](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/cdecl.md) convention on Windows. +#define PAL_CDECL __cdecl +// Equates to [__declspec(align(__x))](https://github.com/MicrosoftDocs/cpp-docs/blob/master/docs/cpp/align-cpp.md) on Windows. +#define PAL_ALIGN(__x) __declspec(align(__x)) +#define PAL_FORCE_INLINE __forceinline +#else +/// Undefined on GCC platforms. +#define PAL_STDCALL +/// Undefined on GCC platforms. +#define PAL_CDECL +/// Undefined on GCC platforms. +#define PAL_ALIGN(__x) +#define PAL_FORCE_INLINE __attribute__((always_inline)) inline +#endif + +/// Platform cache line size in bytes. +#define PAL_CACHE_LINE_BYTES 64 +/// Platform system memory page size in bytes. +#define PAL_PAGE_BYTES 4096 + +/// Force cache line alignment. +#define PAL_ALIGN_CACHE_LINE PAL_ALIGN(PAL_CACHE_LINE_BYTES) + +#if !defined(_WIN32) +/// Value representing an invalid file descriptor on Linux systems. +constexpr int32_t InvalidFd = -1; +#endif + +#ifdef __has_builtin +/// A macro that checks for the presence of builtin functions. Will default to false if the compiler does not have +/// support for doing this check. 
+#define PAL_HAS_BUILTIN(builtin) __has_builtin(builtin) +#else +#define PAL_HAS_BUILTIN(builtin) 0 +#endif + +#if defined(__has_cpp_attribute) +#define PAL_HAS_CPP_ATTR(attr) __has_cpp_attribute(attr) +#else +#define PAL_HAS_CPP_ATTR(attr) 0 +#endif + +/// Library-wide namespace encapsulating all PAL utility collection entities. +namespace Util +{ + +typedef int8_t int8; ///< 8-bit integer. +typedef int16_t int16; ///< 16-bit integer. +typedef int32_t int32; ///< 32-bit integer. +typedef int64_t int64; ///< 64-bit integer. +typedef uint8_t uint8; ///< Unsigned 8-bit integer. +typedef uint16_t uint16; ///< Unsigned 16-bit integer. +typedef uint32_t uint32; ///< Unsigned 32-bit integer. +typedef uint64_t uint64; ///< Unsigned 64-bit integer. +typedef uint64_t gpusize; ///< Used to specify GPU addresses and sizes of GPU allocations. This differs from + /// size_t since the GPU still uses 64-bit addresses on a 32-bit OS. + +/// Error and return codes indicating outcome of a requested operation. Success result codes are greater than or equal +/// to 0, and error results codes are less than 0. +enum class Result : int32 +{ + /// @internal The operation completed successfully. + _Success = 0x00000000, + + // Unfortunately for Linux clients, X.h includes a "#define Success 0" macro. Clients have their choice of either + // undefing Success before including this header or using _Success when dealing with PAL. +#ifndef Success + /// The operation completed successfully. + Success = _Success, +#endif + + /// The operation is not supported. + Unsupported = 0x00000001, + + /// The operation completed successfully but the result is not ready. This result code normally applies to + /// situations where results of queued GPU operations such as queries and fences have not been written to memory + /// yet. + NotReady = 0x00000002, + + /// The wait operation completed due to a client-specified timeout condition. + Timeout = 0x00000003, + + /// The event is in the "set" state. 
@see IGpuEvent::GetStatus. + EventSet = 0x00000004, + + /// The event is in the "reset" state. @see IGpuEvent::GetStatus. + EventReset = 0x00000005, + + /// The operation was successful, but the client has reached the maximum allowable number of flippable GPU memory + /// objects. Future requests to create presentable Images or flippable GPU memory objects may fail due to + /// limitations within the underlying OS. + /// @see IDevice::CreateGpuMemory. + /// @see IDevice::CreatePresentableImage. + TooManyFlippableAllocations = 0x00000006, + + /// The present was successful, but some portion of the window is currently occluded by another window. + PresentOccluded = 0x00000007, + + /// The directory/file/etc. being created already exists. + AlreadyExists = 0x00000008, + + /// A warning indicates an operation is successful (supported by H/W) but out of a certain spec (e.g. VESA). + OutOfSpec = 0x00000009, + + /// The value being searched for was not found. + NotFound = 0x0000000A, + + /// End of file reached successfully. + Eof = 0x0000000B, + + /// If ReserveEntryOnMiss was specified, the entry was not found, and the entry was successfully reserved. + Reserved = 0x0000000C, + + /// If an operation is purposefully terminated early, rather than from an error. + Aborted = 0x0000000D, + + /// The operation encountered an unknown error. + ErrorUnknown = -(0x00000001), + + /// The requested operation is unavailable at this time. + ErrorUnavailable = -(0x00000002), + + /// The initialization operation failed for unknown reasons. + ErrorInitializationFailed = -(0x00000003), + + /// The operation could not complete due to insufficient system memory. + ErrorOutOfMemory = -(0x00000004), + + /// The operation could not complete due to insufficient GPU memory. + ErrorOutOfGpuMemory = -(0x00000005), + + /// The device was lost due to its removal or a possible hang and recovery condition. 
The client should destroy all + /// devices (and objects attached to them) and re-enumerate the available devices be calling EnumerateDevices(). + ErrorDeviceLost = -(0x00000007), + + /// A required input pointer passed to the call was invalid (probably null). + ErrorInvalidPointer = -(0x00000008), + + /// An invalid value was passed to the call. + ErrorInvalidValue = -(0x00000009), + + /// An invalid ordinal was passed to the call. + ErrorInvalidOrdinal = -(0x0000000A), + + /// An invalid memory size was passed to the call. + ErrorInvalidMemorySize = -(0x0000000B), + + /// Invalid flags were passed to the call. + ErrorInvalidFlags = -(0x0000000C), + + /// An invalid alignment parameter was specified + ErrorInvalidAlignment = -(0x0000000D), + + /// An invalid resource format was specified. + ErrorInvalidFormat = -(0x0000000E), + + /// The requested operation cannot be performed on the provided @ref Pal::IImage object. + ErrorInvalidImage = -(0x0000000F), + + /// The descriptor set data is invalid or does not match the related pipeline. + ErrorInvalidDescriptorSetData = -(0x00000010), + + /// An invalid queue type was specified. + ErrorInvalidQueueType = -(0x00000011), + + /// An invalid object type was specified. + ErrorInvalidObjectType = -(0x00000012), + + /// The specified shader uses an unsupported version of AMD IL. + ErrorUnsupportedShaderIlVersion = -(0x00000013), + + /// The specified shader code is invalid or corrupt. + ErrorBadShaderCode = -(0x00000014), + + /// The specified serialized pipeline data is invalid or corrupt. + ErrorBadPipelineData = -(0x00000015), + + /// The queue operation specified more GPU memory references than are supported. + /// @see Pal::IQueue::Submit + /// @see Pal::IDevice::AddGpuMemoryReferences + /// @see Pal::DeviceProperties::maxGpuMemoryRefsResident + ErrorTooManyMemoryReferences = -(0x00000016), + + /// The memory object cannot be mapped because it does not reside in a CPU visible heap. 
+ ErrorNotMappable = -(0x00000017), + + /// The map operation failed due to an unknown or system reason. + ErrorGpuMemoryMapFailed = -(0x00000018), + + /// The unmap operation failed due to an unknown or system reason. + ErrorGpuMemoryUnmapFailed = -(0x00000019), + + /// The serialized pipeline load operation failed due to an incompatible device. + ErrorIncompatibleDevice = -(0x0000001A), + + /// The serialized pipeline load operation failed due to an incompatible PAL library. + ErrorIncompatibleLibrary = -(0x0000001B), + + /// The requested operation (such as command buffer submission) can't be completed because command buffer + /// construction is not complete. + ErrorIncompleteCommandBuffer = -(0x0000001C), + + /// The specified command buffer failed to build correctly. This error can be delayed from the original source of + /// the error since the command buffer building methods do not return error codes. + ErrorBuildingCommandBuffer = -(0x0000001D), + + /// The operation cannot complete since not all objects have valid GPU memory bound to them. + ErrorGpuMemoryNotBound = -(0x0000001E), + + /// The requested operation is not supported on the specified queue type. + ErrorIncompatibleQueue = -(0x0000001F), + + /// The object cannot be created or opened for sharing between multiple GPU devices. + ErrorNotShareable = -(0x00000020), + + /// The operation failed because the specified fullscreen mode was unavailable. This could be a failure while + /// attempting to take fullscreen ownership, or when attempting to perform a fullscreen present and the user has + /// left fullscreen mode. + ErrorFullscreenUnavailable = -(0x00000021), + + /// The targeted screen of the operation has been removed from the system. + ErrorScreenRemoved = -(0x00000022), + + /// Present failed because the screen mode is no longer compatible with the source image. 
+ ErrorIncompatibleScreenMode = -(0x00000023), + + /// The cross-GPU present failed, possibly due to a lack of system bus bandwidth to accommodate the transfer. + ErrorMultiDevicePresentFailed = -(0x00000024), + + /// The slave GPU(s) in an MGPU system cannot create BLTable present images. + ErrorWindowedPresentUnavailable = -(0x00000025), + + /// The attempt to enter fullscreen exclusive mode failed because the specified image doesn't properly match the + /// screen's current dimensions. + ErrorInvalidResolution = -(0x00000026), + + /// The shader specifies a thread group size that is bigger than what is supported by this device. + ErrorThreadGroupTooBig = -(0x00000027), + + /// Invalid image create info: Specified both color target and depth usage + ErrorInvalidImageTargetUsage = -(0x00000028), + + /// Invalid image create info: Specified a 1D type for a color target + ErrorInvalidColorTargetType = -(0x00000029), + + /// Invalid image create info: Specified a non-2D type for a depth/stencil target + ErrorInvalidDepthTargetType = -(0x0000002A), + + /// Invalid image create info: The image format supports depth/stencil but depth/stencil usage was not specified + ErrorMissingDepthStencilUsage = -(0x0000002B), + + /// Invalid image create info: Specified MSAA and multiple mip levels + ErrorInvalidMsaaMipLevels = -(0x0000002C), + + /// Invalid image create info: The image format is incompatible with MSAA + ErrorInvalidMsaaFormat = -(0x0000002D), + + /// Invalid image create info: The image type is incompatible with MSAA + ErrorInvalidMsaaType = -(0x0000002E), + + /// The sample count is invalid + ErrorInvalidSampleCount = -(0x0000002F), + + /// Invalid image create info: Invalid block compressed image type + ErrorInvalidCompressedImageType = -(0x00000030), + + /// Invalid image create info: Format is incompatible with the specified image usage + ErrorInvalidUsageForFormat = -(0x00000032), + + /// Invalid image create info: Array size is invalid + 
ErrorInvalidImageArraySize = -(0x00000033), + + /// Invalid image create info: Array size is invalid for a 3D image + ErrorInvalid3dImageArraySize = -(0x00000034), + + /// Invalid image create info: Image width is invalid + ErrorInvalidImageWidth = -(0x00000035), + + /// Invalid image create info: Image height is invalid + ErrorInvalidImageHeight = -(0x00000036), + + /// Invalid image create info: Image depth is invalid + ErrorInvalidImageDepth = -(0x00000037), + + /// Invalid image create info: Mip count is invalid + ErrorInvalidMipCount = -(0x00000038), + + /// Invalid image create info: Image format is incompatible with the image usage specified. + ErrorFormatIncompatibleWithImageUsage = -(0x00000039), + + /// Operation requested an image plane that is not available on the image. + ErrorImagePlaneUnavailable = -(0x0000003A), + + /// Another format is incompatible with an image's format. + ErrorFormatIncompatibleWithImageFormat = -(0x0000003B), + + /// Another format is incompatible with an image plane's format. + ErrorFormatIncompatibleWithImagePlane = -(0x0000003C), + + /// Operation requires a shader readable or writable image usage but the image does not support it. + ErrorImageNotShaderAccessible = -(0x0000003D), + + /// Format is paired with a channel mapping that contains invalid components. + ErrorInvalidFormatSwizzle = -(0x0000003E), + + /// A base mip level that is out of bounds or otherwise invalid was specified. + ErrorInvalidBaseMipLevel = -(0x0000003F), + + /// A view array size that was zero or otherwise invalid was specified. + ErrorInvalidViewArraySize = -(0x00000040), + + /// A view base array slice that was out of bounds or otherwise invalid was specified. + ErrorInvalidViewBaseSlice = -(0x00000041), + + /// A view image type was specified that is incompatible with the image's type. + ErrorViewTypeIncompatibleWithImageType = -(0x00000042), + + /// A view specifies an array slice range that is larger than what is supported by the image. 
+ ErrorInsufficientImageArraySize = -(0x00000043), + + /// It is illegal to create a cubemap view into an MSAA image. + ErrorCubemapIncompatibleWithMsaa = -(0x00000044), + + /// A cubemap view was created to an image that does not have square width and height. + ErrorCubemapNonSquareFaceSize = -(0x00000045), + + /// An fmask view was created to an image that does not support an fmask. + ErrorImageFmaskUnavailable = -(0x00000046), + + /// A private screen was removed. + ErrorPrivateScreenRemoved = -(0x00000047), + + /// A private screen was already in exclusive use. + ErrorPrivateScreenUsed = -(0x00000048), + + /// The image count created or opened on this private display exceed maximum. + ErrorTooManyPrivateDisplayImages = -(0x00000049), + + /// The private screen is not enabled. + ErrorPrivateScreenNotEnabled = -(0x0000004A), + + /// The private screen count exceeds the maximum (including emulated and physical ones). + ErrorTooManyPrivateScreens = -(0x0000004B), + + /// Invalid image create info: Image rowPitch does not equal the image's actual row pitch. + ErrorMismatchedImageRowPitch = -(0x0000004C), + + /// Invalid image create info: Image depthPitch does not equal the image's actual depth pitch. + ErrorMismatchedImageDepthPitch = -(0x0000004D), + + /// The given swap chain cannot be associated with any more presentable images. + ErrorTooManyPresentableImages = -(0x0000004E), + + /// A fence was used in GetStatus() or WaitForFences() without being used in any submission. + ErrorFenceNeverSubmitted = -(0x0000004F), + + /// The image used on the specified private screen has an invalid format. + ErrorPrivateScreenInvalidFormat = -(0x00000050), + + /// The timing data set on the specified private screen was invalid. + ErrorPrivateScreenInvalidTiming = -(0x00000051), + + /// The resolution set on the specified private screen was invalid. 
+ ErrorPrivateScreenInvalidResolution = -(0x00000052), + + /// The scaling parameter set on the specified private screen was invalid. + ErrorPrivateScreenInvalidScaling = -(0x00000053), + + /// Invalid image create info: Invalid YUV image type + ErrorInvalidYuvImageType = -(0x00000054), + + /// The external shader cache found a matching hash but the with different key data. + ErrorShaderCacheHashCollision = -(0x00000055), + + /// The external shader cache is full + ErrorShaderCacheFull = -(0x00000056), + + /// The operation caused a pagefault. + ErrorGpuPageFaultDetected = -(0x00000057), + + /// The provided pipeline ELF uses an unsupported ABI version. + ErrorUnsupportedPipelineElfAbiVersion = -(0x00000058), + + /// The provided pipeline ELF is invalid. + ErrorInvalidPipelineElf = -(0x00000059), + + /// The returned results were incomplete. + ErrorIncompleteResults = -(0x00000060), + + /// The display mode is imcompatible with framebuffer or CRTC. + ErrorIncompatibleDisplayMode = -(0x00000061), + + /// Implicit fullscreen exclusive mode is not safe because the specified window size doesn't match the + /// screen's current dimensions. + ErrorIncompatibleWindowSize = -(0x00000062), + + /// A semaphore was used in WaitForSemaphores() without being signaled. + ErrorSemaphoreNeverSignaled = -(0x00000063), + + /// Invalid image create info: specified metadataMode is invalid for the Image. + ErrorInvalidImageMetadataMode = -(0x00000064), + + /// Invalid external handle detected for the Image. + ErrorInvalidExternalHandle = -(0x00000065), + + /// The permission of operation is denied. + ErrorPermissionDenied = -(0x00000066), + + /// The operation failed because the disk is full. + ErrorDiskFull = -(0x00000067), + + /// The static VMID acquire/release operation failed. + ErrorStaticVmidOpFailed = -(0x00000068), + +}; + +///Specifies a ratio of two unsigned integers. 
+struct Rational +{ + uint32 numerator; ///< Numerator + uint32 denominator; ///< Denominator +}; + +/// Implements operator== for PAL's Rational struct where similar ratios like 2/3 and 4/6 are treated as equal. +/// +/// @param left Rational to be compared +/// @param right Rational to be compared +/// +/// @return true if the ratios are logically equal. +constexpr bool operator==( + const Rational& lhs, + const Rational& rhs) +{ + // Any ratio with a zero denominator is illegal/undefined, for example: "3/0 == 5/0" or "5/3 == 0/0". We must pick + // either "true" or "false" for these illegal cases. "true" seems like the most wrong option so we use "false". + if ((lhs.denominator == 0) || (rhs.denominator == 0)) + { + return false; + } + // Otherwise, our equality check is: lhs_n / lhs_d == rhs_n / rhs_d + // Multiply both sides by lhs_d: lhs_n == rhs_n * lhs_d / rhs_d + // Multiply both sides by rhs_d: lhs_n * rhs_d == rhs_n * lhs_d + // This trick avoids dealing with common factors or remainders and uses no slow division instructions. + return (static_cast(lhs.numerator) * static_cast(rhs.denominator) == + static_cast(rhs.numerator) * static_cast(lhs.denominator)); +} + +// Flags to be passed to store operations. +struct StoreFlags +{ + union + { + struct + { + uint32 enableFileCache : 1; ///< If we should skip the file cache layer when we get to it. + uint32 enableCompression : 1; ///< If we should skip the compression layer when we get to it. + uint32 reserved : 30; + }; + uint32 all; + }; +}; + +#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 873 +/// Seconds stored as a float instead of an integer. +using fseconds = std::chrono::duration; +/// Milliseconds stored as a float instead of an integer. +using fmilliseconds = std::chrono::duration; +/// Microseconds stored as a float instead of an integer. +using fmicroseconds = std::chrono::duration; +/// Nanoseconds stored as a float instead of an integer. 
+using fnanoseconds = std::chrono::duration; + +/// A time_point who's epoch is January 1st 1970 and uses seconds for the duration. +/// C++20 guarantees us that system_clock's epoch is always January 1st 1970 on all platforms. +/// system_clock's internal duration is still implementation defined. +/// On Windows it's hundreds of nanoseconds and on Linux it's seconds. +/// However time_point has it's own duration type. +/// As long as we go through the time_point to interpret the duration then everything should be in terms of seconds. +using SecondsSinceEpoch = std::chrono::time_point; + +/// Like std::chrono::duration_cast, but it preserves the special 'infinite' value used in timeouts. +template +constexpr ToDuration TimeoutCast( + const std::chrono::duration& d) +{ + if (d == (std::chrono::duration::max)()) + { + return (ToDuration::max)(); + } + else + { + return std::chrono::duration_cast(d); + } +} +#endif + +/// Inline function to determine if a Result enum is considered an error. +constexpr bool IsErrorResult(Result result) { return (static_cast(result) < 0); } + +/// Inline function to collapse two Result enums into the most useful Result code. It considers errors to be more +/// interesting than success codes and considers "Success" to be the least interesting success code. If both Results +/// are errors, the first Result is returned. +constexpr Result CollapseResults(Result lhs, Result rhs) + { return (IsErrorResult(lhs) || (static_cast(lhs) > static_cast(rhs))) ? lhs : rhs; } + +/** + *********************************************************************************************************************** + * @page UtilOverview Utility Collection + * + * In addition to its GPU-specific core functionality, PAL provides a lot of generic, OS-abstracted software utilities + * in the @ref Util namespace. The PAL core relies on these utilities, but they are also available for use by its + * clients. 
In fact, it is possible to build and use PAL only for its utility collection by building PAL with the + * PAL_BUILD_CORE build option set to 0. + * + * All available PAL utilities are defined in the @ref Util namespace, and are briefly summarized below. See the + * Reference topics for more detailed information on specific classes, enums, etc. + * + * ### System Memory Management + * palSysMemory.h defines a handful of macros that can be used for allocating and freeing system heap memory. These + * macros will use the client-specified allocation callbacks specified by the client at CreatePlatform() if specified. + * These macros are: + * + * - PAL_MALLOC: Equivalent to malloc(). + * - PAL_CALLOC: Equivalent to calloc(). + * - PAL_FREE: Equivalent to free(). + * - PAL_SAFE_FREE: Equivalent to free(), then nulls out the specified pointer. + * - PAL_NEW: Equivalent to C++ new. + * - PAL_NEW_ARRAY: Equivalent to C++ new[]. + * - PAL_PLACEMENT_NEW: Equivalent to C++ placement new. + * - PAL_DELETE: Equivalent to C++ delete. + * - PAL_DELETE_THIS: Special version of PAL_DELETE that effectively does "delete this;" This is necessary for + * classes that have non-public destructors. + * - PAL_DELETE_ARRAY: Equivalent to C++ delete[]. + * - PAL_SAFE_DELETE_ARRAY: Equivalent to C++ delete, then nulls out the specified pointer. + * - PAL_SAFE_DELETE: Equivalent to C++ delete[], then nulls out the specified pointer. + * + * ### Allocators + * All of the memory management macros take in a templated allocator, which is required to have the following two + * functions defined: + * + * void* Alloc(const Util::AllocInfo) + * void Free(const Util::FreeInfo) + * + * It is expected that clients that specify their own allocators will handle cases that require specific alignments + * and/or zeroing the returned memory. 
+ * + * Some allocators can be created for use by clients: + * - VirtualLinearAllocator: A linear allocator that allocates virtual memory and backs it with physical memory + * when needed. + * + * ### Debug Prints and Asserts + * palDbgPrint.h and palAssert.h provide a number of macros used widely by the PAL core and also available for use + * by clients. + * + * The PAL_DPF, PAL_DPINFO, PAL_DPERROR, and PAL_DPWARN can be used to issue debug prints. These macros will be nulled + * out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. SetDbgPrintMode() can be called to configure how the + * different categories of debug prints will be handled (e.g., print to the debugger, print to file, etc.). + * + * The PAL_ASSERT and PAL_ALERT macros can be used to verify expected states of the program at runtime. PAL_ASSERT + * should be used for verifying expected invariants and assumptions, while PAL_ALERT should be used to alert the + * developer of a condition that is allowed, but not typically expected (i.e., failure of a system memory allocation). + * Note that the polarity of the condition check is different between assert and alert. Asserts "assert" that the + * specified condition is true (and complain if it's not), while alerts "alert" a developer if an unexpected condition + * is true. These macros will be nulled out if PAL_ENABLE_PRINTS_ASSERTS is not defined to be 1. EnableAssertMode() + * can be called to enable/disable asserts or alerts at runtime. + * + * ### Generic Containers + * Util includes a number of generic container data structure implementations. Note that most of these are broken up + * into two header files - for example, list.h and listImpl.h. The intention is that list.h will be included from + * other header files that need a full list definition, while listImpl.h will be included by .cpp files that actually + * interact with the list. This should keep build times down versus putting all implementations directly in list.h. 
+ * - AutoBuffer: Allows dynamic arrays to be placed on the stack without a heap allocation in situations where a + * maximum reasonable expected size is known. + * - Deque: Double ended queue. + * - HashMap: Fast map implementation. Note that this implementation has some non-standard restrictions on the key + * (can't be 0) and value size (must fit in a cache line). + * - HashSet: Fast set implementation. Note the similar restrictions to HashMap. + * - IntervalTree: [Interval tree] implementation. + * - RingBuffer: A ringed buffer of variable length and size. + * + * ### Multithreading and Synchronization + * Util includes a number of OS-abstracted multithreading and CPU synchronization constructs: + * + * - Thread + * - Mutex + * - Semaphore + * - ConditionVariable + * - Event + * + * ### Files + * The File class provides an OS-abstracted interface for opening files and reading/writing data in those files. + * Further, the ElfReadContext and ElfWriteContext classes provide functionality for reading and writing buffers in the + * [Executable and Linkable Format (ELF)] + * The ELF utilities can be used in conjunction with File in order to read/write ELF files on disk. + * + * ### Inline Functions + * palInlineFuncs.h defines a bunch of simple inline functions that are used throughout PAL and might be useful to + * clients. Some examples include VoidPtrInc(), Pow2Pad(), Min(), Max(), Strncpy(), etc. + * + * palMath.h defines a Math namespace with various constants and functions related to floating point conversions and + * basic math rouintes like Sqrt(). + * + * Additionally, palInlineFuncs.h defines a template metaprogramming string hash implementation that can produce + * a FNV1A hash for a string specified in the source code without the string showing up in a compiled release build. 
+ * + * ### System Utilities + * palSysUtil.h defines a few functions providing abstracted system-specific functionality: + * - Access to the high resolution CPU performance counters with GetPerfFrequency() and GetPerfCpuTime(). + * - Support for asynchronously querying if a particular keyboard key is currently pressed with IsKeyPressed(). + * + * ### Cryptographic Algorithm Implementations + * Util provides the crypto algorithm Md5 + * + * Next: @ref GpuUtilOverview + *********************************************************************************************************************** + */ + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/inc/util/palVector.h b/shared/amdgpu-windows-interop/pal/inc/util/palVector.h index 7d13981669..633b00d71d 100644 --- a/shared/amdgpu-windows-interop/pal/inc/util/palVector.h +++ b/shared/amdgpu-windows-interop/pal/inc/util/palVector.h @@ -1,455 +1,455 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -* @file palVector.h -* @brief PAL utility collection Vector and VectorIterator class declarations. -*********************************************************************************************************************** -*/ - -#pragma once - -#include "palUtil.h" -#include "palAssert.h" -#include "palSpan.h" -#include "palSysMemory.h" -#include - -namespace Util -{ - -// Forward declarations. -template class Vector; - -/** - *********************************************************************************************************************** - * @brief Iterator for traversal of elements in Vector. - * - * Supports forward traversal. - *********************************************************************************************************************** - */ -template -class VectorIterator -{ -public: - /// Checks if the current index is within bounds of the number of elements in the vector. - /// - /// @returns True if the current element this iterator is pointing to is within the permitted range. - bool IsValid() const { return (m_curIndex < m_srcVector.m_numElements); } - - /// Returns the element the iterator is currently pointing to as a reference. - /// - /// @warning This may cause an access violation if the iterator is not valid. - /// - /// @returns The element the iterator is currently pointing to. 
- T& Get() const - { - PAL_ASSERT(IsValid()); - return (*(m_srcVector.m_pData + m_curIndex)); - } - - /// Advances the iterator to point to the next element. - /// - /// @warning Does not do bounds checking. - void Next() { ++m_curIndex; } - - /// Retrieves the current vector position of this iterator. - /// - /// @returns The location in the vector of the element the iterator is currently pointing to. - uint32 Position() const { return m_curIndex; } - -private: - VectorIterator(uint32 index, const Vector& srcVec); - - uint32 m_curIndex; // The current index of the vector iterator. - const Vector& m_srcVector; // The vector container this iterator is used for. - - PAL_DISALLOW_DEFAULT_CTOR(VectorIterator); - - // Although this is a transgression of coding standards, it means that Vector does not need to have a public - // interface specifically to implement this class. The added encapsulation this provides is worthwhile. - friend class Vector; -}; - -/** - *********************************************************************************************************************** - * @brief Vector container. - * - * Vector is a templated array based storage that starts with a default-size allocation in the stack. If more space is - * needed it then resorts to dynamic allocation by doubling the size every time the capacity is exceeded. - * Operations which this class supports are: - * - * - Insertion at the end of the array. - * - Forward iteration. - * - Random access. - * - * @warning This class is not thread-safe. - *********************************************************************************************************************** - */ -template -class Vector -{ -public: - /// A convenient shorthand for VectorIterator. - typedef VectorIterator Iter; - - /// When this allocates, it doubles the old size of memory - static constexpr uint32 GrowthFactor = 2; - - /// Constructor. - /// - /// @param [in] pAllocator The allocator that will allocate memory if required. 
- Vector(Allocator*const pAllocator); - - /// Destructor. - ~Vector(); - - /// Move constructor. - /// - /// @param [in] vector Reference to a dying vector, from which resources will be stolen. - Vector(Vector&& vector); - - /// Increases maximal capacity to value greater or equal to the newCapacity. - /// If newCapacity is greater than the maximal capacity, new storage is allocated, - /// otherwise the method does nothing. - /// - /// @note All existing iterators will not get invalidated, even in case new storage is allocated, - /// because iterators are referencing vector, rather than elements of that vector. - /// - /// @warning All pointers and references to elements of a vector will be invalidated, - /// in case new storage is allocated. - /// - /// @param [in] newCapacity The new capacity of a vector, which is lower limit of the maximal capacity. - /// - /// @returns Result ErrorOutOfMemory if the operation failed. - Result Reserve(uint32 newCapacity); - - /// Increases maximum capacity to the number of elements in the vector, plus the specified increment amount. - /// Equivalent to this->Reserve(this->NumElements() + amount); - /// - /// @param [in] amount Number of items beyond the current element count to increas the capacity to. - /// - /// @returns Result ErrorOutOfMemory if the operation failed. - Result Grow(uint32 amount) { return Reserve(NumElements() + amount); } - - /// Set size to newSize. - /// If size is decreased, elements at the end of the vector will be removed. - /// If size is increased, new elements will be set to newVal. - /// If newSize requires a higher capacity, a new allocation is made. See notes on Reserve. - /// - /// @note If size is decreased, any iterators referencing removed elements will become invalid. All other - /// iterators will remain valid. Otherwise, all iterators will still be valid. - /// - /// @warning All pointers and references to elements of a vector will be invalidated, - /// in case new storage is allocated. 
- /// - /// @param [in] newSize The new size of a vector. - /// - /// @returns Result ErrorOutOfMemory if the operation failed. - Result Resize(uint32 newSize, const T& newVal = T()); - - /// Copy/Move an element to end of the vector. If not enough space is available, new space will be allocated and - /// the old data will be copied to the new space. - /// - /// @param [in] data The element to be pushed to the vector. The element will become the last element. - /// - /// @returns Result ErrorOutOfMemory if the operation failed. - Result PushBack(const T& data); - Result PushBack(T&& data); - - /// Constructs an object in-place at the end of the vector. If not enough space is available, new space will be - /// allocated and the old data will be copied to the new space. - /// - /// @param [in] args... The arguments passed to the constructor - /// - /// @returns Result ErrorOutOfMemory if the operation failed. - template - Result EmplaceBack(Args&&... args); - - /// Returns the element at the end of the vector and destroys it. - /// - /// @param [out] pData The element at the end of the vector. - /// It is expected that pData is uninitialized as it will be overwritten and not destructed. - void PopBack(T* pData); - - /// Destroys all elements stored in the vector. All dynamically allocated memory will be saved for reuse. - void Clear(); - - ///@{ - /// Returns the element at the location specified. - /// - /// @warning Calling this function with an out-of-bounds index will cause an access violation! - /// - /// @param [in] index Integer location of the element needed. 
- /// - /// @returns The element at location specified by index by reference - T& At(uint32 index) - { - PAL_ASSERT(index < m_numElements); - return *(m_pData + index); - } - - const T& At(uint32 index) const - { - PAL_ASSERT(index < m_numElements); - return *(m_pData + index); - } - - T& operator[](uint32 index) noexcept { return At(index); } - const T& operator[](uint32 index) const noexcept { return At(index); } - ///@} - - /// Returns the data at the front of the vector. - /// - /// @warning Calling this function on an empty vector will cause an access violation! - /// - /// @returns The data at the front of the vector. - T& Front() const - { - PAL_ASSERT(IsEmpty() == false); - return *m_pData; - } - - /// Returns the data at the back of the vector. - /// - /// @warning Calling this function on an empty vector will cause an access violation! - /// - /// @returns The data at the back of the vector. - T& Back() const - { - PAL_ASSERT(IsEmpty() == false); - return *(m_pData + (m_numElements - 1)); - } - - /// Returns an iterator to the first element of the vector. - /// - /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! - /// - /// @returns An iterator to first element of the vector. - Iter Begin() const { return Iter(0, *this); } - - /// Returns an iterator to the last element of the vector. - /// - /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! - /// - /// @returns VectorIterator An iterator to last element of the vector. - Iter End() const { return Iter((m_numElements - 1), *this); } - - ///@{ - /// Implicitly gets the current contents of the vector as a Span. - /// - /// @returns The contents of the vector as a Span; same as Span(Data(), NumElements()). 
- operator Span() { return Span(Data(), NumElements()); } - operator Span() const { return Span(Data(), NumElements()); } - ///@} - - /// Returns pointer to the underlying buffer serving as data storage. - /// The returned pointer defines always valid range [Data(), Data() + NumElements()), - /// even if the container is empty (Data() is not dereferenceable in that case). - /// - /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! - /// - /// @returns Pointer to the underlying data storage for read & write access. - /// For a non-empty vector, the returned pointer contains address of the first element. - /// For an empty vector, the returned pointer may or may not be a null pointer. - T* Data() { return m_pData; } - - /// Returns pointer to the underlying buffer serving as data storage. - /// The returned pointer defines always valid range [Data(), Data() + NumElements()), - /// even if the container is empty (Data() is not dereferenceable in that case). - /// - /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! - /// - /// @returns Pointer to the underlying data storage for read only access. - /// For a non-empty vector, the returned pointer contains address of the first element. - /// For an empty vector, the returned pointer may or may not be a null pointer. - const T* Data() const { return m_pData; } - - /// Returns the size of the vector. - /// - /// @returns An unsigned integer equal to the number of elements currently present in the vector. - uint32 NumElements() const { return m_numElements; } - - /// Returns true if the number of elements present in the vector is equal to zero. - /// - /// @returns True if the vector is empty. - bool IsEmpty() const { return (m_numElements == 0); } - - /// Returns a pointer to the allocator used for this container's memory management. - /// - /// @returns Allocator pointer. 
- Allocator* GetAllocator() const { return m_pAllocator; } - - ///@{ - /// @internal Satisfies concept `range_expression`, using T* as `iterator` and 32-bit size and difference types - /// - /// @note - These are a convenience intended to be used by c++ language features such as `range for`. - /// These should not be called directly as they do not adhere to PAL coding standards. - using value_type = T; - using reference = T&; - using const_reference = const T&; - using iterator = T*; - using const_iterator = const T*; - using difference_type = int32; - using size_type = uint32; - - iterator begin() noexcept { return m_pData; } - iterator end() noexcept { return (m_pData + m_numElements); } - const_iterator begin() const noexcept { return m_pData; } - const_iterator end() const noexcept { return (m_pData + m_numElements); } - const_iterator cbegin() const noexcept { return m_pData; } - const_iterator cend() const noexcept { return (m_pData + m_numElements); } - [[nodiscard]] bool empty() const noexcept { return IsEmpty(); } - size_type size() const noexcept { return m_numElements; } - ///@} - - /// Erases the element at the specified iterator. - void Erase(Iter it); - - /// Erases the element at the specified iterator. - void Erase(iterator it); - - /// Erases the element at the specified index. - void Erase(uint32 index); - - /// Erase the element at the specified iterator, and swap last element to that position. - /// If the element to erase is the last element, erase directly and no swap operation. - void EraseAndSwapLast(Iter it); - - /// Erase the element at the specified iterator, and swap last element to that position. - /// If the element to erase is the last element, erase directly and no swap operation. - void EraseAndSwapLast(iterator it); - - /// Erases the element at the specified index, and swap last element to that position. - /// If the element to erase is the last element, erase directly and no swap operation. 
- void EraseAndSwapLast(uint32 index); - -private: - // This is a POD-type that exactly fits one T value. - typedef typename std::aligned_storage::type ValueStorage; - - ValueStorage m_data[defaultCapacity]; // The initial data buffer stored within the vector object. - T* m_pData; // Pointer to the current data buffer. - uint32 m_numElements; // Number of elements present. - uint32 m_maxCapacity; // Maximum size it can hold. - Allocator*const m_pAllocator; // Allocator for this Vector. - - PAL_DISALLOW_COPY_AND_ASSIGN(Vector); - - // Although this is a transgression of coding standards, it prevents VectorIterator requiring a public constructor; - // constructing a 'bare' VectorIterator (i.e. without calling Vector::GetIterator) can never be a legal operation, - // so this means that these two classes are much safer to use. - friend class VectorIterator; -}; - -// ===================================================================================================================== -template -VectorIterator::VectorIterator( - uint32 index, - const Vector& srcVec) - : - m_curIndex(index), - m_srcVector(srcVec) - { - } - -// ===================================================================================================================== -template -Vector::Vector( - Allocator*const pAllocator) - : - m_pData(reinterpret_cast(m_data)), - m_numElements(0), - m_maxCapacity(defaultCapacity), - m_pAllocator(pAllocator) - { - } - -// ===================================================================================================================== -template -Vector::~Vector() -{ - // Explicitly destroy all non-trivial types. - if (!std::is_trivial::value) - { - for (uint32 idx = 0; idx < m_numElements; ++idx) - { - m_pData[idx].~T(); - } - } - - // Check if we have dynamically allocated memory. - if (m_pData != reinterpret_cast(m_data)) - { - // Free the memory that was allocated dynamically. 
- PAL_FREE(m_pData, m_pAllocator); - } -} - -// ===================================================================================================================== -// Steals allocation from a dying vector, if data buffer uses storage from heap allocation. -// Moves objects between local buffers of new and dying vectors (for non-trivial types) or -// copies local buffer from a dying vector to a new vector (for trivial types), -// if data buffer uses storage from local buffer. -template -Vector::Vector( - Vector&& vector) - : - m_numElements(vector.m_numElements), - m_maxCapacity(vector.m_maxCapacity), - m_pAllocator(vector.m_pAllocator) -{ - if (vector.m_pData == reinterpret_cast(vector.m_data)) // Local buffer - { - // Data buffer will be using storage from local buffer. - m_pData = reinterpret_cast(m_data); - - if (std::is_trivial::value) - { - // Optimize trivial types by copying local buffer. - std::memcpy(m_pData, vector.m_pData, sizeof(T) * m_numElements); - } - else - { - // Move objects from local buffer of a dying vector to local buffer of a new vector. - for (uint32 idx = 0; idx < m_numElements; ++idx) - { - PAL_PLACEMENT_NEW(m_pData + idx) T(Move(vector.m_pData[idx])); - } - } - } - else // Heap allocation - { - // Steal heap allocation from dying vector. - m_pData = vector.m_pData; - - // After the allocation has been stolen, dying vector is just an empty shell. - vector.m_pData = nullptr; - vector.m_numElements = 0; - vector.m_maxCapacity = 0; - } -} - -} // Util +/* + *********************************************************************************************************************** + * + * Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file palVector.h +* @brief PAL utility collection Vector and VectorIterator class declarations. +*********************************************************************************************************************** +*/ + +#pragma once + +#include "palUtil.h" +#include "palAssert.h" +#include "palSpan.h" +#include "palSysMemory.h" +#include + +namespace Util +{ + +// Forward declarations. +template class Vector; + +/** + *********************************************************************************************************************** + * @brief Iterator for traversal of elements in Vector. + * + * Supports forward traversal. 
+ *********************************************************************************************************************** + */ +template +class VectorIterator +{ +public: + /// Checks if the current index is within bounds of the number of elements in the vector. + /// + /// @returns True if the current element this iterator is pointing to is within the permitted range. + bool IsValid() const { return (m_curIndex < m_srcVector.m_numElements); } + + /// Returns the element the iterator is currently pointing to as a reference. + /// + /// @warning This may cause an access violation if the iterator is not valid. + /// + /// @returns The element the iterator is currently pointing to. + T& Get() const + { + PAL_ASSERT(IsValid()); + return (*(m_srcVector.m_pData + m_curIndex)); + } + + /// Advances the iterator to point to the next element. + /// + /// @warning Does not do bounds checking. + void Next() { ++m_curIndex; } + + /// Retrieves the current vector position of this iterator. + /// + /// @returns The location in the vector of the element the iterator is currently pointing to. + uint32 Position() const { return m_curIndex; } + +private: + VectorIterator(uint32 index, const Vector& srcVec); + + uint32 m_curIndex; // The current index of the vector iterator. + const Vector& m_srcVector; // The vector container this iterator is used for. + + PAL_DISALLOW_DEFAULT_CTOR(VectorIterator); + + // Although this is a transgression of coding standards, it means that Vector does not need to have a public + // interface specifically to implement this class. The added encapsulation this provides is worthwhile. + friend class Vector; +}; + +/** + *********************************************************************************************************************** + * @brief Vector container. + * + * Vector is a templated array based storage that starts with a default-size allocation in the stack. 
If more space is + * needed it then resorts to dynamic allocation by doubling the size every time the capacity is exceeded. + * Operations which this class supports are: + * + * - Insertion at the end of the array. + * - Forward iteration. + * - Random access. + * + * @warning This class is not thread-safe. + *********************************************************************************************************************** + */ +template +class Vector +{ +public: + /// A convenient shorthand for VectorIterator. + typedef VectorIterator Iter; + + /// When this allocates, it doubles the old size of memory + static constexpr uint32 GrowthFactor = 2; + + /// Constructor. + /// + /// @param [in] pAllocator The allocator that will allocate memory if required. + Vector(Allocator*const pAllocator); + + /// Destructor. + ~Vector(); + + /// Move constructor. + /// + /// @param [in] vector Reference to a dying vector, from which resources will be stolen. + Vector(Vector&& vector); + + /// Increases maximal capacity to value greater or equal to the newCapacity. + /// If newCapacity is greater than the maximal capacity, new storage is allocated, + /// otherwise the method does nothing. + /// + /// @note All existing iterators will not get invalidated, even in case new storage is allocated, + /// because iterators are referencing vector, rather than elements of that vector. + /// + /// @warning All pointers and references to elements of a vector will be invalidated, + /// in case new storage is allocated. + /// + /// @param [in] newCapacity The new capacity of a vector, which is lower limit of the maximal capacity. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Reserve(uint32 newCapacity); + + /// Increases maximum capacity to the number of elements in the vector, plus the specified increment amount. 
+ /// Equivalent to this->Reserve(this->NumElements() + amount); + /// + /// @param [in] amount Number of items beyond the current element count to increas the capacity to. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Grow(uint32 amount) { return Reserve(NumElements() + amount); } + + /// Set size to newSize. + /// If size is decreased, elements at the end of the vector will be removed. + /// If size is increased, new elements will be set to newVal. + /// If newSize requires a higher capacity, a new allocation is made. See notes on Reserve. + /// + /// @note If size is decreased, any iterators referencing removed elements will become invalid. All other + /// iterators will remain valid. Otherwise, all iterators will still be valid. + /// + /// @warning All pointers and references to elements of a vector will be invalidated, + /// in case new storage is allocated. + /// + /// @param [in] newSize The new size of a vector. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result Resize(uint32 newSize, const T& newVal = T()); + + /// Copy/Move an element to end of the vector. If not enough space is available, new space will be allocated and + /// the old data will be copied to the new space. + /// + /// @param [in] data The element to be pushed to the vector. The element will become the last element. + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + Result PushBack(const T& data); + Result PushBack(T&& data); + + /// Constructs an object in-place at the end of the vector. If not enough space is available, new space will be + /// allocated and the old data will be copied to the new space. + /// + /// @param [in] args... The arguments passed to the constructor + /// + /// @returns Result ErrorOutOfMemory if the operation failed. + template + Result EmplaceBack(Args&&... args); + + /// Returns the element at the end of the vector and destroys it. 
+ /// + /// @param [out] pData The element at the end of the vector. + /// It is expected that pData is uninitialized as it will be overwritten and not destructed. + void PopBack(T* pData); + + /// Destroys all elements stored in the vector. All dynamically allocated memory will be saved for reuse. + void Clear(); + + ///@{ + /// Returns the element at the location specified. + /// + /// @warning Calling this function with an out-of-bounds index will cause an access violation! + /// + /// @param [in] index Integer location of the element needed. + /// + /// @returns The element at location specified by index by reference + T& At(uint32 index) + { + PAL_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + const T& At(uint32 index) const + { + PAL_ASSERT(index < m_numElements); + return *(m_pData + index); + } + + T& operator[](uint32 index) noexcept { return At(index); } + const T& operator[](uint32 index) const noexcept { return At(index); } + ///@} + + /// Returns the data at the front of the vector. + /// + /// @warning Calling this function on an empty vector will cause an access violation! + /// + /// @returns The data at the front of the vector. + T& Front() const + { + PAL_ASSERT(IsEmpty() == false); + return *m_pData; + } + + /// Returns the data at the back of the vector. + /// + /// @warning Calling this function on an empty vector will cause an access violation! + /// + /// @returns The data at the back of the vector. + T& Back() const + { + PAL_ASSERT(IsEmpty() == false); + return *(m_pData + (m_numElements - 1)); + } + + /// Returns an iterator to the first element of the vector. + /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns An iterator to first element of the vector. + Iter Begin() const { return Iter(0, *this); } + + /// Returns an iterator to the last element of the vector. 
+ /// + /// @warning Accessing an element using an iterator of an empty vector will cause an access violation! + /// + /// @returns VectorIterator An iterator to last element of the vector. + Iter End() const { return Iter((m_numElements - 1), *this); } + + ///@{ + /// Implicitly gets the current contents of the vector as a Span. + /// + /// @returns The contents of the vector as a Span; same as Span(Data(), NumElements()). + operator Span() { return Span(Data(), NumElements()); } + operator Span() const { return Span(Data(), NumElements()); } + ///@} + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + NumElements()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! + /// + /// @returns Pointer to the underlying data storage for read & write access. + /// For a non-empty vector, the returned pointer contains address of the first element. + /// For an empty vector, the returned pointer may or may not be a null pointer. + T* Data() { return m_pData; } + + /// Returns pointer to the underlying buffer serving as data storage. + /// The returned pointer defines always valid range [Data(), Data() + NumElements()), + /// even if the container is empty (Data() is not dereferenceable in that case). + /// + /// @warning Dereferencing pointer returned by Data() from an empty vector will cause an access violation! + /// + /// @returns Pointer to the underlying data storage for read only access. + /// For a non-empty vector, the returned pointer contains address of the first element. + /// For an empty vector, the returned pointer may or may not be a null pointer. + const T* Data() const { return m_pData; } + + /// Returns the size of the vector. 
+ /// + /// @returns An unsigned integer equal to the number of elements currently present in the vector. + uint32 NumElements() const { return m_numElements; } + + /// Returns true if the number of elements present in the vector is equal to zero. + /// + /// @returns True if the vector is empty. + bool IsEmpty() const { return (m_numElements == 0); } + + /// Returns a pointer to the allocator used for this container's memory management. + /// + /// @returns Allocator pointer. + Allocator* GetAllocator() const { return m_pAllocator; } + + ///@{ + /// @internal Satisfies concept `range_expression`, using T* as `iterator` and 32-bit size and difference types + /// + /// @note - These are a convenience intended to be used by c++ language features such as `range for`. + /// These should not be called directly as they do not adhere to PAL coding standards. + using value_type = T; + using reference = T&; + using const_reference = const T&; + using iterator = T*; + using const_iterator = const T*; + using difference_type = int32; + using size_type = uint32; + + iterator begin() noexcept { return m_pData; } + iterator end() noexcept { return (m_pData + m_numElements); } + const_iterator begin() const noexcept { return m_pData; } + const_iterator end() const noexcept { return (m_pData + m_numElements); } + const_iterator cbegin() const noexcept { return m_pData; } + const_iterator cend() const noexcept { return (m_pData + m_numElements); } + [[nodiscard]] bool empty() const noexcept { return IsEmpty(); } + size_type size() const noexcept { return m_numElements; } + ///@} + + /// Erases the element at the specified iterator. + void Erase(Iter it); + + /// Erases the element at the specified iterator. + void Erase(iterator it); + + /// Erases the element at the specified index. + void Erase(uint32 index); + + /// Erase the element at the specified iterator, and swap last element to that position. 
+ /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(Iter it); + + /// Erase the element at the specified iterator, and swap last element to that position. + /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(iterator it); + + /// Erases the element at the specified index, and swap last element to that position. + /// If the element to erase is the last element, erase directly and no swap operation. + void EraseAndSwapLast(uint32 index); + +private: + // This is a POD-type that exactly fits one T value. + typedef typename std::aligned_storage::type ValueStorage; + + ValueStorage m_data[defaultCapacity]; // The initial data buffer stored within the vector object. + T* m_pData; // Pointer to the current data buffer. + uint32 m_numElements; // Number of elements present. + uint32 m_maxCapacity; // Maximum size it can hold. + Allocator*const m_pAllocator; // Allocator for this Vector. + + PAL_DISALLOW_COPY_AND_ASSIGN(Vector); + + // Although this is a transgression of coding standards, it prevents VectorIterator requiring a public constructor; + // constructing a 'bare' VectorIterator (i.e. without calling Vector::GetIterator) can never be a legal operation, + // so this means that these two classes are much safer to use. 
+ friend class VectorIterator; +}; + +// ===================================================================================================================== +template +VectorIterator::VectorIterator( + uint32 index, + const Vector& srcVec) + : + m_curIndex(index), + m_srcVector(srcVec) + { + } + +// ===================================================================================================================== +template +Vector::Vector( + Allocator*const pAllocator) + : + m_pData(reinterpret_cast(m_data)), + m_numElements(0), + m_maxCapacity(defaultCapacity), + m_pAllocator(pAllocator) + { + } + +// ===================================================================================================================== +template +Vector::~Vector() +{ + // Explicitly destroy all non-trivial types. + if (!std::is_trivial::value) + { + for (uint32 idx = 0; idx < m_numElements; ++idx) + { + m_pData[idx].~T(); + } + } + + // Check if we have dynamically allocated memory. + if (m_pData != reinterpret_cast(m_data)) + { + // Free the memory that was allocated dynamically. + PAL_FREE(m_pData, m_pAllocator); + } +} + +// ===================================================================================================================== +// Steals allocation from a dying vector, if data buffer uses storage from heap allocation. +// Moves objects between local buffers of new and dying vectors (for non-trivial types) or +// copies local buffer from a dying vector to a new vector (for trivial types), +// if data buffer uses storage from local buffer. +template +Vector::Vector( + Vector&& vector) + : + m_numElements(vector.m_numElements), + m_maxCapacity(vector.m_maxCapacity), + m_pAllocator(vector.m_pAllocator) +{ + if (vector.m_pData == reinterpret_cast(vector.m_data)) // Local buffer + { + // Data buffer will be using storage from local buffer. + m_pData = reinterpret_cast(m_data); + + if (std::is_trivial::value) + { + // Optimize trivial types by copying local buffer. 
+ std::memcpy(m_pData, vector.m_pData, sizeof(T) * m_numElements); + } + else + { + // Move objects from local buffer of a dying vector to local buffer of a new vector. + for (uint32 idx = 0; idx < m_numElements; ++idx) + { + PAL_PLACEMENT_NEW(m_pData + idx) T(Move(vector.m_pData[idx])); + } + } + } + else // Heap allocation + { + // Steal heap allocation from dying vector. + m_pData = vector.m_pData; + + // After the allocation has been stolen, dying vector is just an empty shell. + vector.m_pData = nullptr; + vector.m_numElements = 0; + vector.m_maxCapacity = 0; + } +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc index 20c4c58f7b..9ce4af7fc2 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/DriverUtilsService.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 95d96350d29e5d7ee8249b13f8344bfa +- md5: fd5f7481a122f40f73d1f638e3b9b027 size: 16738 hash: md5 path: DriverUtilsService.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc index ba52ca570a..9d853eaa31 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/SettingsRpcService2.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 456e1346e62388d873836eee241c2ecc +- md5: e09dbb1896128ac2b2bcac2b35878a40 size: 9460 hash: md5 path: SettingsRpcService2.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc index 251f9c0982..8ed81148d4 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc +++ 
b/shared/amdgpu-windows-interop/pal/lib/Release/x64/UberTraceService.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 26144175ebf644e9c406a84cac291898 +- md5: 364bc94b5b81ef5bb337e6afb0060c55 size: 13912 hash: md5 path: UberTraceService.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc index a5cd855411..7fe0b48324 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/addrlib.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 82c883995b5833b7c1e3456da645f1c7 - size: 976846 +- md5: 56362998d9feb9b0ce6ccad8441bf1c8 + size: 820446 hash: md5 path: addrlib.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc index 8860814ef3..603a536a3d 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/amdrdf.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: b752c646510e1b854e86d2180ce91cbb +- md5: 35af646710d883bfe6184113cb88e96a size: 702568 hash: md5 path: amdrdf.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc index 590a97151c..21eab68dfb 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/cwpack.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 25f322e041c71e95504e49333ba711c4 +- md5: 06c7697ce380a8127e7478041aed7fc8 size: 27894 hash: md5 path: cwpack.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc index f964474a12..c9b8c6bfd3 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCommon.lib.dvc @@ -1,5 +1,5 @@ 
outs: -- md5: 8638cb376e4098e11bbd3a96d9de126d +- md5: 7c6ba83c44ee8bd70397a1458dbea7e0 size: 82210 hash: md5 path: ddCommon.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc index 47f6172b21..8312dd1b09 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddCore.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 429ad9f4c0eb7a231c97fff83ed3aac9 +- md5: 32f4aa9943ab5fde0da6f09bcbacf9be size: 72778 hash: md5 path: ddCore.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc index 5f8ec6b1dd..2b15e06af2 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventClient.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 1c256136b2c4b0ac352156910e159df9 - size: 138010 +- md5: 09fc5ce2eb8653cbd8ddda5d983ce836 + size: 137794 hash: md5 path: ddEventClient.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc index 5075446d90..dbdd525e3b 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventParser.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 1370a8b4f3697241d7dac7d43aa9dd79 +- md5: 2f5e7eb06485bebea0ec2779d8df9f97 size: 46862 hash: md5 path: ddEventParser.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc index 6aee11c420..8bf234218c 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventServer.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 
b8622eb38d6c1468301489f98fef8e4d +- md5: bdb3738c5bbd4ac6abd3a805930b89fe size: 30582 hash: md5 path: ddEventServer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc index 597b9c415e..aebfdd6d57 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddEventStreamer.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 9bfec5d057726ec5d03213ede38ae4a2 +- md5: 43ea2bd7328593fab054754f9fd1a7c9 size: 35768 hash: md5 path: ddEventStreamer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc index e2c62a6f71..449f6f8e2d 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddNet.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: ce4b653d66b86c8f6f8e11bbfffea54f +- md5: d91f19ed479fd51b481f6b25566dfd31 size: 13230 hash: md5 path: ddNet.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc index df8227c0d0..0f95f12baf 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcClient.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 20bba474c25d34bad7c675c0d774017c +- md5: 5b37ae9cc29dfaba3cb0d08a30bd684a size: 23224 hash: md5 path: ddRpcClient.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc index e5f3b2ad48..13273407e5 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcServer.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: a0f3d781dc7358b33f693ad6f19284ab +- md5: 
b3f63ef6d9a9d6bfb3e5934a9a34465e size: 179024 hash: md5 path: ddRpcServer.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc index be32560901..3565cd252d 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddRpcShared.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 78c64bc9c07804300f848425908f3443 +- md5: db87375bafb0d667ac054dbf7dc0dc36 size: 16268 hash: md5 path: ddRpcShared.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc index fd0db6d23e..37471b02ed 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddSocket.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 4f16385611ba26f28374e11d9421269d +- md5: f88897c7d989d95f0352cf6e1a21df99 size: 106228 hash: md5 path: ddSocket.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc index a38ef554a7..4481f54001 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/ddYaml.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: c4bed6f36417e25fdb069616b2ef7edd +- md5: 4d64cf4c3b034f09a8a3abd1b7e657b4 size: 35902 hash: md5 path: ddYaml.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc index 5a4e4fb34b..27c10a4cb0 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_common.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: cb9ece645ccf22601a53d532614422c8 - size: 663402 +- md5: 2394b7141b71f0b738dd3ad024dcbfc0 + size: 661222 hash: md5 path: 
dd_common.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc index c207492c87..58c95fb8c1 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_libyaml.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 5d4c03c414cd2661e27a9d661b2aaaa0 +- md5: 2b9f0af04b216527b49338cc1b8fa1a5 size: 264022 hash: md5 path: dd_libyaml.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc index 5bbdd9f85a..b72ed57fbc 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/dd_settings.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 5de26d6d7739cdad888688eb4937fcb1 - size: 213370 +- md5: d9a1105679db9411bf2365aae2b6d2a5 + size: 212936 hash: md5 path: dd_settings.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc index a2b631fd86..e65c97aa25 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/devdriver.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 25a99e1ae065b735a3644ece1fc9c0c4 - size: 2701484 +- md5: 18e7d04c4ecc9fb872de2e0ac9dffd61 + size: 2700190 hash: md5 path: devdriver.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc index 7758e17e61..40ca7236c1 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/metrohash.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 53cad99c6cd2848fb4e283db250e1148 +- md5: 64dbdb1c2d7c68e7ae3083ea35878a83 size: 28682 hash: md5 path: metrohash.lib diff 
--git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc index 873d9621ca..6f403fe56b 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/mpack.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 8c5d464f8f60a4285770e7994a74ba70 +- md5: 43f91cf1e53eef1411a6e4a40776cd79 size: 218874 hash: md5 path: mpack.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc index bd5e3d2e55..a732119591 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 5800da924b60abf1b7ca111ac2fb1aae - size: 20625154 +- md5: 852e161ac4115309a2591db0b80f13dd + size: 24025742 hash: md5 path: pal.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc index 4e7990af3b..35efc5ee4b 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palCompilerDeps.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 59a339d6330fa360eb7452a15851d2d4 - size: 440980 +- md5: a929ad3103021925d382e419b0e5343d + size: 433780 hash: md5 path: palCompilerDeps.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc index 3cdbcc9bc7..02e9c72251 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/palUtil.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: b35b947076fbb8821eab7252511049e2 - size: 831218 +- md5: 3cca5923fa12cf564360058254c2c6db + size: 799750 hash: md5 path: palUtil.lib diff --git 
a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc index 8343567c45..3941913ab4 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_lz4.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: d5ebea86c9821bd43006bea5e8fd1ce3 +- md5: 36731971681f4a89f4e89b5ad44473ac size: 291664 hash: md5 path: pal_lz4.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc index 6d62a7d409..a9d688ea70 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/pal_uuid.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 8551962e3709c5da05736df2959b074f +- md5: 1ad5de7ebbb13b41f7d7dc0367d7d1d8 size: 3460 hash: md5 path: pal_uuid.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc index 9b1d5c464a..596f2ad962 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/stb_sprintf.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: cb6d2ce450c3869437fc090fa06eb1c5 +- md5: b2a1cd0f59d07aaa0cf21afa9235dbda size: 25990 hash: md5 path: stb_sprintf.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc b/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc index 7894b92e57..aae38aec12 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/vam.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 709400309f890ea3e16cf2c816dead42 +- md5: 02784ea9d25a9a9c94c20acca001456c size: 215198 hash: md5 path: vam.lib diff --git a/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc 
b/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc index 42f5ab4db9..2f44e47a41 100644 --- a/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc +++ b/shared/amdgpu-windows-interop/pal/lib/Release/x64/zstd.lib.dvc @@ -1,5 +1,5 @@ outs: -- md5: 6af43c37bb2018208ba884ca155a3cf6 +- md5: 6a1ac31db298434da1573cda69d9e4d3 size: 1356642 hash: md5 path: zstd.lib diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h index 1ba33fa91f..0799878bf9 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcDefs.h @@ -1,269 +1,269 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include - -#include -#include - -// Macros for conditional language support. -#ifdef _MSVC_LANG - #define DD_CPLUSPLUS _MSVC_LANG -#else - #define DD_CPLUSPLUS __cplusplus -#endif -// Denotes versions of the C++ standard from __cplusplus. -#define CPP98 (199711L) -#define CPP11 (201103L) -#define CPP14 (201402L) -#define CPP17 (201703L) -#define CPP20 (202002L) -#define DD_CPLUSPLUS_SUPPORTS(x) (DD_CPLUSPLUS >= (x)) - -static_assert(DD_CPLUSPLUS_SUPPORTS(CPP11), "C++11 is required to build devdriver."); - -#if defined(_MSC_VER) - #define DD_ALIGNAS(x)__declspec(align(x)) - #if _MSC_VER < 1900 - #define DD_STATIC_CONST static const - #else - #define DD_STATIC_CONST static constexpr - #endif - #ifndef va_copy - #define va_copy(d,s) ((d) = (s)) - #endif -#endif - -#if !defined(DD_STATIC_CONST) - #if defined(__cplusplus) && __cplusplus >= 201103L - #define DD_STATIC_CONST static constexpr - #else - #define DD_STATIC_CONST static const - #endif -#endif - -#if DD_CPLUSPLUS_SUPPORTS(CPP14) - #define DD_CPP14_CONSTEXPR_FN constexpr - #define DD_CPP14_STATIC_ASSERT(a, b) static_assert(a, b) -#else - #define DD_CPP14_CONSTEXPR_FN inline - #define DD_CPP14_STATIC_ASSERT(a, b) -#endif - -#if !defined(DD_ALIGNAS) - #if defined(__cplusplus) && __cplusplus >= 201103L - #define DD_ALIGNAS(x) alignas(x) - #else - static_assert(false, "Error: unsupported compiler detected. 
Support is required to build."); - #endif -#endif - -/// Remove the __FILE__ macro for release builds -#ifndef DD_FILE - #ifdef NDEBUG - #define DD_FILE "" - #else - #define DD_FILE __FILE__ - #endif -#endif - -// Creates a structure with the specified name and alignment. -#define DD_ALIGNED_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name - -// Creates a structure with the specified alignment, and mark it as final to ensure it cannot be used as a parent class -#define DD_NETWORK_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name final - -#define DD_CHECK_SIZE(x, size) static_assert(sizeof(x) == size_t(size), "sizeof(" # x ") should be " # size " bytes but has changed recently") - -#define DD_UNUSED(x) (static_cast(x)) - -#define _DD_STRINGIFY(str) #str -#define DD_STRINGIFY(x) _DD_STRINGIFY(x) - -#if DD_CPLUSPLUS_SUPPORTS(CPP17) - // Require that a function's return value, or an entire type, be used. - #define DD_NODISCARD [[nodiscard]] - - // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. - // switch (x) - // { - // case 0: DD_FALLTHROUGH(); - // case 1: DD_FALLTHROUGH(); - // case 2: - // printf("0, 1, or 2"); - // break; - // } - // - #define DD_FALLTHROUGH() [[fallthrough]] -#else - // Require that a function's return value, or an entire type, be used. - // This option is aggressive enough that we do not enable it when C++17 is not enabled - #define DD_NODISCARD - - // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. 
- // switch (x) - // { - // case 0: DD_FALLTHROUGH(); - // case 1: DD_FALLTHROUGH(); - // case 2: - // printf("0, 1, or 2"); - // break; - // } - // - #if defined(__clang__) - #define DD_FALLTHROUGH() [[clang::fallthrough]] - #elif defined(__GNUC__) - #if __GNUC__ >= 7 - // gnu::fallthrough isn't supported until GCC 7+ - #define DD_FALLTHROUGH() [[gnu::fallthrough]] - #else - // Not supported on older versions of GCC - #define DD_FALLTHROUGH() - #endif - #elif defined(_MSC_VER) - // Not supported on MSVC - who doesn't warn about this issue in the first place. - #define DD_FALLTHROUGH() - #else - // We don't know what compiler this is, so just no-op the macro. - #define DD_FALLTHROUGH() - #endif -#endif - -// Include in the private section of a class declaration in order to disallow use of the copy and assignment operator -#define DD_DISALLOW_COPY_AND_ASSIGN(_typename) \ - _typename(const _typename&); \ - _typename& operator =(const _typename&); - -// Include in the private section of a class declaration in order to disallow use of the default constructor -#define DD_DISALLOW_DEFAULT_CTOR(_typename) \ - _typename(); - -// Detect the CPU architecture for the target. -// These are often evaluated during the preprocessor stage, so it's important that we don't rely on things like sizeof. -#if UINTPTR_MAX == 0xFFFFFFFF - #define DEVDRIVER_ARCHITECTURE_BITS 32 -#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFF - #define DEVDRIVER_ARCHITECTURE_BITS 64 -#else - static_assert(false, "Unknown or unsupported target architecture."); -#endif -static_assert(DEVDRIVER_ARCHITECTURE_BITS == (8 * sizeof(void*)), // Assume 8-bits-per-byte. 
- "DEVDRIVER_ARCHITECTURE_BITS does not match sizeof(void*)."); - -// Add a detailed function name macro -// These vary across platforms, so we'll just pick the first one that's defined -#if defined(__FUNCSIG__) - #define DD_FUNCTION_NAME __FUNCSIG__ -#elif defined(__PRETTY_FUNCTION__) - #define DD_FUNCTION_NAME __PRETTY_FUNCTION__ -#else - #define DD_FUNCTION_NAME __FUNCTION__ -#endif - -// Common Typedefs -// These types are shared between all platforms, -// and need to be defined before including a specific platform header. - -namespace DevDriver -{ - -typedef int8_t int8; ///< 8-bit integer. -typedef int16_t int16; ///< 16-bit integer. -typedef int32_t int32; ///< 32-bit integer. -typedef int64_t int64; ///< 64-bit integer. -typedef uint8_t uint8; ///< Unsigned 8-bit integer. -typedef uint16_t uint16; ///< Unsigned 16-bit integer. -typedef uint32_t uint32; ///< Unsigned 32-bit integer. -typedef uint64_t uint64; ///< Unsigned 64-bit integer. - -typedef uint32_t ProcessId; -typedef uint32_t Size; -typedef uint64_t Handle; - -DD_STATIC_CONST Handle kNullPtr = 0; -DD_STATIC_CONST Handle kInvalidHandle = 0; - -//////////////////////////// -// Common result codes -enum struct Result : uint32 -{ - //// Generic Result Code //// - Success = 0, - Error = 1, - NotReady = 2, - VersionMismatch = 3, - Unavailable = 4, - Rejected = 5, - EndOfStream = 6, - Aborted = 7, - InsufficientMemory = 8, - InvalidParameter = 9, - InvalidClientId = 10, - ConnectionExists = 11, - FileNotFound = 12, - FunctionNotFound = 13, - InterfaceNotFound = 14, - EntryExists = 15, - FileAccessError = 16, - FileIoError = 17, - LimitReached = 18, - MemoryOverLimit = 19, - - //// URI PROTOCOL //// - UriServiceRegistrationError = 1000, - UriStringParseError = 1001, - UriInvalidParameters = 1002, - UriInvalidPostDataBlock = 1003, - UriInvalidPostDataSize = 1004, - UriFailedToAcquirePostBlock = 1005, - UriFailedToOpenResponseBlock = 1006, - UriRequestFailed = 1007, - UriPendingRequestError = 1008, - 
UriInvalidChar = 1009, - UriInvalidJson = 1010, - - //// Settings URI Service //// - SettingsUriInvalidComponent = 2000, - SettingsUriInvalidSettingName = 2001, - SettingsUriInvalidSettingValue = 2002, - SettingsUriInvalidSettingValueSize = 2003, - - //// Info URI Service //// - InfoUriSourceNameInvalid = 3000, - InfoUriSourceCallbackInvalid = 3001, - InfoUriSourceAlreadyRegistered = 3002, - InfoUriSourceWriteFailed = 3003, - - //// Settings Service //// - SettingsInvalidComponent = 4000, - SettingsInvalidSettingName = 4001, - SettingsInvalidSettingValue = 4002, - SettingsInsufficientValueSize = 4003, - SettingsInvalidSettingValueSize = 4004, -}; - -} // namespace DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include + +#include +#include + +// Macros for conditional language support. +#ifdef _MSVC_LANG + #define DD_CPLUSPLUS _MSVC_LANG +#else + #define DD_CPLUSPLUS __cplusplus +#endif +// Denotes versions of the C++ standard from __cplusplus. +#define CPP98 (199711L) +#define CPP11 (201103L) +#define CPP14 (201402L) +#define CPP17 (201703L) +#define CPP20 (202002L) +#define DD_CPLUSPLUS_SUPPORTS(x) (DD_CPLUSPLUS >= (x)) + +static_assert(DD_CPLUSPLUS_SUPPORTS(CPP11), "C++11 is required to build devdriver."); + +#if defined(_MSC_VER) + #define DD_ALIGNAS(x)__declspec(align(x)) + #if _MSC_VER < 1900 + #define DD_STATIC_CONST static const + #else + #define DD_STATIC_CONST static constexpr + #endif + #ifndef va_copy + #define va_copy(d,s) ((d) = (s)) + #endif +#endif + +#if !defined(DD_STATIC_CONST) + #if defined(__cplusplus) && __cplusplus >= 201103L + #define DD_STATIC_CONST static constexpr + #else + #define DD_STATIC_CONST static const + #endif +#endif + +#if DD_CPLUSPLUS_SUPPORTS(CPP14) + #define DD_CPP14_CONSTEXPR_FN constexpr + #define DD_CPP14_STATIC_ASSERT(a, b) static_assert(a, b) +#else + #define DD_CPP14_CONSTEXPR_FN inline + #define DD_CPP14_STATIC_ASSERT(a, b) +#endif + +#if !defined(DD_ALIGNAS) + #if defined(__cplusplus) && __cplusplus >= 201103L + #define DD_ALIGNAS(x) alignas(x) + #else + static_assert(false, "Error: unsupported compiler detected. Support is required to build."); + #endif +#endif + +/// Remove the __FILE__ macro for release builds +#ifndef DD_FILE + #ifdef NDEBUG + #define DD_FILE "" + #else + #define DD_FILE __FILE__ + #endif +#endif + +// Creates a structure with the specified name and alignment. 
+#define DD_ALIGNED_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name + +// Creates a structure with the specified alignment, and mark it as final to ensure it cannot be used as a parent class +#define DD_NETWORK_STRUCT(name, alignment) struct DD_ALIGNAS(alignment) name final + +#define DD_CHECK_SIZE(x, size) static_assert(sizeof(x) == size_t(size), "sizeof(" # x ") should be " # size " bytes but has changed recently") + +#define DD_UNUSED(x) (static_cast(x)) + +#define _DD_STRINGIFY(str) #str +#define DD_STRINGIFY(x) _DD_STRINGIFY(x) + +#if DD_CPLUSPLUS_SUPPORTS(CPP17) + // Require that a function's return value, or an entire type, be used. + #define DD_NODISCARD [[nodiscard]] + + // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. + // switch (x) + // { + // case 0: DD_FALLTHROUGH(); + // case 1: DD_FALLTHROUGH(); + // case 2: + // printf("0, 1, or 2"); + // break; + // } + // + #define DD_FALLTHROUGH() [[fallthrough]] +#else + // Require that a function's return value, or an entire type, be used. + // This option is aggressive enough that we do not enable it when C++17 is not enabled + #define DD_NODISCARD + + // Do not warn about switch statement cases falling through. Place this macro as the case body, e.g. + // switch (x) + // { + // case 0: DD_FALLTHROUGH(); + // case 1: DD_FALLTHROUGH(); + // case 2: + // printf("0, 1, or 2"); + // break; + // } + // + #if defined(__clang__) + #define DD_FALLTHROUGH() [[clang::fallthrough]] + #elif defined(__GNUC__) + #if __GNUC__ >= 7 + // gnu::fallthrough isn't supported until GCC 7+ + #define DD_FALLTHROUGH() [[gnu::fallthrough]] + #else + // Not supported on older versions of GCC + #define DD_FALLTHROUGH() + #endif + #elif defined(_MSC_VER) + // Not supported on MSVC - who doesn't warn about this issue in the first place. + #define DD_FALLTHROUGH() + #else + // We don't know what compiler this is, so just no-op the macro. 
+ #define DD_FALLTHROUGH() + #endif +#endif + +// Include in the private section of a class declaration in order to disallow use of the copy and assignment operator +#define DD_DISALLOW_COPY_AND_ASSIGN(_typename) \ + _typename(const _typename&); \ + _typename& operator =(const _typename&); + +// Include in the private section of a class declaration in order to disallow use of the default constructor +#define DD_DISALLOW_DEFAULT_CTOR(_typename) \ + _typename(); + +// Detect the CPU architecture for the target. +// These are often evaluated during the preprocessor stage, so it's important that we don't rely on things like sizeof. +#if UINTPTR_MAX == 0xFFFFFFFF + #define DEVDRIVER_ARCHITECTURE_BITS 32 +#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFF + #define DEVDRIVER_ARCHITECTURE_BITS 64 +#else + static_assert(false, "Unknown or unsupported target architecture."); +#endif +static_assert(DEVDRIVER_ARCHITECTURE_BITS == (8 * sizeof(void*)), // Assume 8-bits-per-byte. + "DEVDRIVER_ARCHITECTURE_BITS does not match sizeof(void*)."); + +// Add a detailed function name macro +// These vary across platforms, so we'll just pick the first one that's defined +#if defined(__FUNCSIG__) + #define DD_FUNCTION_NAME __FUNCSIG__ +#elif defined(__PRETTY_FUNCTION__) + #define DD_FUNCTION_NAME __PRETTY_FUNCTION__ +#else + #define DD_FUNCTION_NAME __FUNCTION__ +#endif + +// Common Typedefs +// These types are shared between all platforms, +// and need to be defined before including a specific platform header. + +namespace DevDriver +{ + +typedef int8_t int8; ///< 8-bit integer. +typedef int16_t int16; ///< 16-bit integer. +typedef int32_t int32; ///< 32-bit integer. +typedef int64_t int64; ///< 64-bit integer. +typedef uint8_t uint8; ///< Unsigned 8-bit integer. +typedef uint16_t uint16; ///< Unsigned 16-bit integer. +typedef uint32_t uint32; ///< Unsigned 32-bit integer. +typedef uint64_t uint64; ///< Unsigned 64-bit integer. 
+ +typedef uint32_t ProcessId; +typedef uint32_t Size; +typedef uint64_t Handle; + +DD_STATIC_CONST Handle kNullPtr = 0; +DD_STATIC_CONST Handle kInvalidHandle = 0; + +//////////////////////////// +// Common result codes +enum struct Result : uint32 +{ + //// Generic Result Code //// + Success = 0, + Error = 1, + NotReady = 2, + VersionMismatch = 3, + Unavailable = 4, + Rejected = 5, + EndOfStream = 6, + Aborted = 7, + InsufficientMemory = 8, + InvalidParameter = 9, + InvalidClientId = 10, + ConnectionExists = 11, + FileNotFound = 12, + FunctionNotFound = 13, + InterfaceNotFound = 14, + EntryExists = 15, + FileAccessError = 16, + FileIoError = 17, + LimitReached = 18, + MemoryOverLimit = 19, + + //// URI PROTOCOL //// + UriServiceRegistrationError = 1000, + UriStringParseError = 1001, + UriInvalidParameters = 1002, + UriInvalidPostDataBlock = 1003, + UriInvalidPostDataSize = 1004, + UriFailedToAcquirePostBlock = 1005, + UriFailedToOpenResponseBlock = 1006, + UriRequestFailed = 1007, + UriPendingRequestError = 1008, + UriInvalidChar = 1009, + UriInvalidJson = 1010, + + //// Settings URI Service //// + SettingsUriInvalidComponent = 2000, + SettingsUriInvalidSettingName = 2001, + SettingsUriInvalidSettingValue = 2002, + SettingsUriInvalidSettingValueSize = 2003, + + //// Info URI Service //// + InfoUriSourceNameInvalid = 3000, + InfoUriSourceCallbackInvalid = 3001, + InfoUriSourceAlreadyRegistered = 3002, + InfoUriSourceWriteFailed = 3003, + + //// Settings Service //// + SettingsInvalidComponent = 4000, + SettingsInvalidSettingName = 4001, + SettingsInvalidSettingValue = 4002, + SettingsInsufficientValueSize = 4003, + SettingsInvalidSettingValueSize = 4004, +}; + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h index fe2ed5bb89..d09fa62c99 100644 --- 
a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcPlatform.h @@ -1,824 +1,824 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include -// can not be used in the kernel -#if !DD_PLATFORM_IS_KM -#include -#endif - -#include -#include - -#define DD_CACHE_LINE_BYTES 64 - -#define DD_MALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, false) -#define DD_CALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, true) -#define DD_FREE(memory, allocCb) allocCb.Free(memory) - -#define DD_NEW(className, allocCb) new(allocCb, alignof(className), true, DD_FILE, __LINE__, __FUNCTION__) className -#define DD_DELETE(memory, allocCb) DevDriver::Platform::Destructor(memory); DD_FREE(memory, allocCb) - -#define DD_NEW_ARRAY(className, numElements, allocCb) DevDriver::Platform::NewArray(numElements, allocCb) -#define DD_DELETE_ARRAY(memory, allocCb) DevDriver::Platform::DeleteArray(memory, allocCb) - -// Always enable asserts in Debug builds -#if !defined(NDEBUG) - #if !defined(DD_OPT_ASSERTS_ENABLE) - #define DD_OPT_ASSERTS_ENABLE - #endif - #if !defined(DD_OPT_ASSERTS_DEBUGBREAK) - #define DD_OPT_ASSERTS_DEBUGBREAK - #endif -#endif - -#define DD_PTR_TO_HANDLE(x) ((DevDriver::Handle)(uintptr_t)(x)) - -#define DD_SANITIZE_RESULT(x) ((x != Result::Success) ? 
Result::Error : x) - -namespace DevDriver -{ - -//////////////////////////// -// Common logging levels -enum struct LogLevel : uint8 -{ - Debug = 0, - Verbose, - Info, - Warn, - Error, - Always, - Count, - - // Backwards compatibility for old alert log level - Alert = Warn, - - Never = 0xFF -}; - -typedef void*(*AllocFunc)(void* pUserdata, size_t size, size_t alignment, bool zero); -typedef void(*FreeFunc)(void* pUserdata, void* pMemory); - -struct AllocCb -{ - void* pUserdata; - AllocFunc pfnAlloc; - FreeFunc pfnFree; - - void* Alloc(size_t size, size_t alignment, bool zero) const; - void* Alloc(size_t size, bool zero) const; - void Free(void* pMemory) const; -}; - -namespace Platform -{ - -// Used by the Platform::Thread implementation. -typedef void (*ThreadFunction)(void* pThreadParameter); - -} // namespace Platform - -} // namespace DevDriver - -#if defined(DD_PLATFORM_WINDOWS_UM) - #include -#elif defined(DD_PLATFORM_WINDOWS_KM) - #include -#elif defined(DD_PLATFORM_DARWIN_UM) - #include -#elif defined(DD_PLATFORM_LINUX_UM) - #include -#else - // Legacy system for Ati Make - #if defined(_WIN32) && !defined(_KERNEL_MODE) - #define DD_PLATFORM_WINDOWS_UM - #include - #elif defined(__linux__) - #define DD_PLATFORM_LINUX_UM - #include - #else - #error "Unknown Platform - please configure your build system" - #endif - - #if __x86_64__ - #define DD_ARCH_BITS 64 - #else - #define DD_ARCH_BITS 32 - #endif -#endif - -#if !defined(DD_RESTRICT) - #error "DD_RESTRICT not defined by platform!" -#endif - -#if !defined(DD_DEBUG_BREAK) - #error "DD_DEBUG_BREAK not defined by platform!" -#endif - -// This only exists for 32bit Windows to specificy callbacks as __stdcall. 
-#if !defined(DD_APIENTRY) - #define DD_APIENTRY -#endif - -// TODO: remove this and make kDebugLogLevel DD_STATIC_CONST when we use a version of visual studio that supports it -#ifdef DD_OPT_LOG_LEVEL - #define DD_OPT_LOG_LEVEL_VALUE static_cast(DD_OPT_LOG_LEVEL) -#else - #if defined(NDEBUG) - // In non-debug builds, default to printing asserts, Error, and Always log messages - #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Error - #else - // In debug builds, default to more messages - #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Verbose - #endif -#endif - -#define DD_WILL_PRINT(lvl) ((lvl >= DD_OPT_LOG_LEVEL_VALUE) && (lvl < DevDriver::LogLevel::Count)) -#define DD_PRINT(lvl, ...) DevDriver::LogString(__VA_ARGS__) - -#if defined(DD_OPT_ASSERTS_DEBUGBREAK) - #define DD_ASSERT_DEBUG_BREAK() DD_DEBUG_BREAK() -#else - #define DD_ASSERT_DEBUG_BREAK() -#endif - -#include - -// Calling `check_expr_is_bool(x)` when `x` is not exactly a bool will create a compile error. -// When it is a bool, it's a no-op. 
-// This allows us to enforce bool arguments to DD_ASSERT() macros -namespace DevDriver -{ - inline void check_expr_is_bool(bool) {} - - template - void check_expr_is_bool(const T&) = delete; -} - -#if !defined(DD_OPT_ASSERTS_ENABLE) - #define DD_WARN(statement) DD_UNUSED(0) - #define DD_WARN_REASON(reason) DD_UNUSED(0) - - #ifndef DD_ASSERT - #define DD_ASSERT(statement) DD_UNUSED(0) - #endif - - #define DD_ASSERT_REASON(reason) DD_UNUSED(0) -#else - #define DD_WARN(statement) do \ - { \ - DevDriver::check_expr_is_bool(statement); \ - if (!(statement)) \ - { \ - DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ - DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ - } \ - } while (0) - - #define DD_WARN_REASON(reason) do \ - { \ - DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ - DD_FILE, __LINE__, __func__, reason); \ - } while (0) - - #ifndef DD_ASSERT - #define DD_ASSERT(statement) do \ - { \ - DevDriver::check_expr_is_bool(statement); \ - if (!(statement)) \ - { \ - DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ - DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ - DD_ASSERT_DEBUG_BREAK(); \ - } \ - } while (0) - #endif - - #define DD_ASSERT_REASON(reason) do \ - { \ - DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ - DD_FILE, __LINE__, __func__, reason); \ - DD_ASSERT_DEBUG_BREAK(); \ - } while (0) -#endif - -/// Convenience macro that always warns. -#define DD_WARN_ALWAYS() DD_WARN_REASON("Unconditional Warning") - -/// Convenience macro that always asserts. -#define DD_ASSERT_ALWAYS() DD_ASSERT_REASON("Unconditional Assertion") - -/// Convenience macro that asserts if something has not been implemented. -#define DD_NOT_IMPLEMENTED() DD_ASSERT_REASON("Code not implemented!") - -/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. 
-#define DD_UNREACHABLE() DD_ASSERT_REASON("Unreachable code has been reached!") - -// Backwards compatibility for old alert macro -#define DD_ALERT(statement) DD_WARN(statement) -#define DD_ALERT_REASON(reason) DD_WARN_REASON(reason) -#define DD_ALERT_ALWAYS() DD_WARN_ALWAYS() - -// Debug utility to log an expression -// -// This works by taking the format specifier for a local variable, and an expression. -// The expression is evaluated once. -// It then prints that expression and its value: -// ```cpp -// int x = 5; -// int y = 10; -// int z = 0xf0; -// DD_DBG("0x%x", x + y + z); // Prints: foo/file.cpp:5 "x + y + z" == 0xff -// ``` -#define DD_DBG(level, fmt, expr) DD_PRINT( \ - level, \ - "%s:%d:\t\"" DD_STRINGIFY(expr) "\" == " fmt, \ - DD_FILE, \ - __LINE__, \ - (expr) \ - ) - -// Allocates memory using an AllocCb. -// This overload is declared noexcept, and will correctly handle AllocCb::pfnAlloc() returning NULL. -void* operator new( - size_t size, - const DevDriver::AllocCb& allocCb, - size_t align, - bool zero, - const char* pFilename, - int lineNumber, - const char* pFunction -) noexcept; - -#if DD_PLATFORM_IS_KM -// Provide a placement new function if is not available -inline void* operator new(size_t size, void *pMemory) -{ - return pMemory; -}; -#endif - -// Overload of operator delete that matches the previously declared operator new. -// The compiler can call this version automatically in the case of exceptions thrown in the Constructor -// ... even though we turn them off? -// Compilers are fussy. 
-void operator delete( - void* pObject, - const DevDriver::AllocCb& allocCb, - size_t align, - bool zero, - const char* pFilename, - int lineNumber, - const char* pFunction -) noexcept; - -namespace DevDriver -{ - -namespace Platform -{ - -template -inline void static Destructor(T* p) -{ - if (p != nullptr) - { - p->~T(); - } -} - -template -static T* NewArray(size_t numElements, const AllocCb& allocCb) -{ - size_t allocSize = (sizeof(T) * numElements) + DD_CACHE_LINE_BYTES; - size_t allocAlign = DD_CACHE_LINE_BYTES; - - T* pMem = reinterpret_cast(DD_MALLOC(allocSize, allocAlign, allocCb)); - if (pMem != nullptr) - { - pMem = reinterpret_cast(reinterpret_cast(pMem) + DD_CACHE_LINE_BYTES); - size_t* pNumElements = reinterpret_cast(reinterpret_cast(pMem) - sizeof(size_t)); - *pNumElements = numElements; - T* pCurrentElement = pMem; - for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) - { - new(pCurrentElement) T; - ++pCurrentElement; - } - } - - return pMem; -} - -template -static void DeleteArray(T* pElements, const AllocCb& allocCb) -{ - if (pElements != nullptr) - { - size_t numElements = *reinterpret_cast(reinterpret_cast(pElements) - sizeof(size_t)); - T* pCurrentElement = pElements; - for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) - { - pCurrentElement->~T(); - ++pCurrentElement; - } - - pElements = reinterpret_cast(reinterpret_cast(pElements) - DD_CACHE_LINE_BYTES); - } - - DD_FREE(pElements, allocCb); -} - -// Get the number of elements in a statically sized array -// Usage: -// char buffer[1024]; -// size_t size = ArraySize(buffer); // size == 1024 -// -// With a cast: -// char buffer[1024]; -// uint32 size = ArraySize(buffer); -// -template < - typename SizeT = size_t, // Type to return - typename T, // Inferred type of array elements - you should not need to supply this argument - size_t Size // Inferred length of array (in elements) - you should not need to supply this argument -> -constexpr SizeT 
ArraySize(const T(&)[Size]) -{ - return static_cast(Size); -} - -// Log to consoles and attached debuggers -void DebugPrint(LogLevel lvl, const char* pFormat, ...); - -// Platform-specific loggers, this is called from DebugPrint. -void PlatformDebugPrint(LogLevel lvl, const char* pString); - -/// Get the absolute path to a file or directory that already exists -/// If ppAbsPathFilePart is non-NULL, *ppAbsPathFilePart will point into absPath at the beginning of the Filename -/// This is recommended to do whenever you need to display a path to a user. -Result GetAbsPathName( - const char* pPath, - char (&absPath)[256] -); - -/* platform functions for performing atomic operations */ - -int32 AtomicIncrement(Atomic* pVariable); -int32 AtomicDecrement(Atomic* pVariable); -int32 AtomicAdd(Atomic* pVariable, int32 num); -int32 AtomicSubtract(Atomic* pVariable, int32 num); - -int64 AtomicIncrement(Atomic64* pVariable); -int64 AtomicDecrement(Atomic64* pVariable); -int64 AtomicAdd(Atomic64* pVariable, int64 num); -int64 AtomicSubtract(Atomic64* pVariable, int64 num); - -// A generic AllocCb that defers allocation to Platform::AllocateMemory() -// Suitable for memory allocation if you don't care about it. 
-extern AllocCb GenericAllocCb; - -void* AllocateMemory(size_t size, size_t alignment, bool zero); -void FreeMemory(void* pMemory); - -/* fast locks */ -class AtomicLock -{ -public: - AtomicLock() : m_lock(0) {}; - ~AtomicLock() {}; - void Lock(); - bool TryLock(); - void Unlock(); - bool IsLocked() { return (m_lock != 0); }; -private: - Atomic m_lock; -}; - -class Mutex -{ -public: - Mutex(); - ~Mutex(); - void Lock(); - void Unlock(); -private: - MutexStorage m_mutex; -}; - -class Semaphore -{ -public: - explicit Semaphore(uint32 initialCount, uint32 maxCount); - ~Semaphore(); - Result Signal(); - Result Wait(uint32 millisecTimeout); -private: - SemaphoreStorage m_semaphore; -}; - -class Event -{ -public: - explicit Event(bool signaled); - ~Event(); - void Clear(); - void Signal(); - Result Wait(uint32 timeoutInMs); -private: - EventStorage m_event; -}; - -class Thread -{ -public: - Thread() = default; - - Thread(Thread&& other) noexcept = default; - Thread& operator=(Thread&& other) noexcept = default; - - // Copying a thread doesn't make sense - Thread(const Thread&) = delete; - Thread& operator= (const Thread& other) = delete; - - ~Thread(); - - Result Start(ThreadFunction pFnThreadFunc, void* pThreadParameter); - - // Set the user-visible name for the thread using printf-style formatters - // This should only be called on valid thread objects. (Threads that have been started) - // This function will return Result::Error if it's called on an invalid thread. 
- // Note: This change is global to the thread and can be changed by other means - // Treat this as an aid for people - Result SetName(const char* pFmt, ...); - - Result Join(uint32 timeoutInMs); - - bool IsJoinable() const; - -private: - static ThreadReturnType DD_APIENTRY ThreadShim(void* pShimParam); - - // Reset our object to a default state - void Reset() - { - pFnFunction = nullptr; - pParameter = nullptr; - hThread = kInvalidThreadHandle; - - onExit.Clear(); - } - - // Set the thread name to a hard-coded string. - // The thread name passed to this function must be no larger than kThreadNameMaxLength including the NULL byte. - // If a larger string is passed, errors may occur on some platforms. - Result SetNameRaw(const char* pThreadName); - - ThreadFunction pFnFunction = nullptr; - void* pParameter = nullptr; - ThreadHandle hThread = kInvalidThreadHandle; - Event onExit = Event(false); // Start unsignaled -}; - -class Random -{ -public: - // Algorithm Constants - static constexpr uint64 kModulus = (uint64(1) << 48); - static constexpr uint64 kMultiplier = 0X5DEECE66Dull; - static constexpr uint16 kIncrement = 0xB; - - Random(); - Random(uint64 seed) - { - Reseed(seed); - } - ~Random() {} - - uint32 Generate(); - void Reseed(uint64 seed); -private: - uint64 m_prevState = 0; - - // Sanity checks. - static_assert(0 < kModulus, "Invalid modulus"); - static_assert(0 < kMultiplier, "Invalid multiplier"); - static_assert(kMultiplier < kModulus, "Invalid multiplier"); - static_assert(kIncrement < kModulus, "Invalid increment"); -}; - -class Library -{ -public: - Library() : m_hLib(nullptr) { } - ~Library() { Close(); } - - Result Load(const char* pLibraryName); - - void Close(); - - bool IsLoaded() const { return (m_hLib != nullptr); } - - void Swap(Library* pLibrary) - { - m_hLib = pLibrary->m_hLib; - pLibrary->m_hLib = nullptr; - } - - // Retrieve a function address from the dynamic library object. Returns true if successful, false otherwise. 
- template - bool GetFunction(const char* pName, Func_t* ppfnFunc) const - { - (*ppfnFunc) = reinterpret_cast(GetFunctionHelper(pName)); - return ((*ppfnFunc) != nullptr); - } - -private: - void* GetFunctionHelper(const char* pName) const; - - LibraryHandle m_hLib; - - DD_DISALLOW_COPY_AND_ASSIGN(Library); -}; - -enum struct MkdirStatus -{ - Unknown, - Created, - Existed, -}; - -// Create a directory with default permissions -// On Windows, this uses NULL for LPSECURITY_ATTRIBUTES -// On Unix, this uses 0777 for the mode. -// When pStatus is non-NULL, *pStatus is set to -// MkdirStatus::Created if the directory did not exist and was created -// MkdirStatus::Existed if the directory already existed -// Returns: -// - Result::Success, if the directory already exists or was created -// - Result::FileIoError, if the directory failed to be created -Result Mkdir(const char* pDir, MkdirStatus* pStatus = nullptr); - -ProcessId GetProcessId(); - -uint64 GetCurrentTimeInMs(); - -uint64 QueryTimestampFrequency(); -uint64 QueryTimestamp(); - -// Todo: Remove Sleep() entirely from our platform API. It cannot be used in the KMD and should not be used -// anywhere else either. -void Sleep(uint32 millisecTimeout); - -void GetProcessName(char* buffer, size_t bufferSize); - -void Strncpy(char* pDst, const char* pSrc, size_t dstSize); - -template -void Strncpy(char(&dst)[DstSize], const char* pSrc) -{ - Strncpy(dst, pSrc, DstSize); -} - -char* Strtok(char* pDst, const char* pDelimiter, char** ppContext); - -void Strncat(char* pDst, const char* pSrc, size_t dstSize); - -template -void Strncat(char(&dst)[DstSize], const char* pSrc) -{ - Strncat(dst, pSrc, DstSize); -} - -int32 Strcmpi(const char* pSrc1, const char* pSrc2); - -int32 Snprintf(char* pDst, size_t dstSize, const char* pFormat, ...); -int32 Vsnprintf(char* pDst, size_t dstSize, const char* pFormat, va_list args); - -template -int32 Snprintf(char(&dst)[DstSize], const char* pFormat, Args&&... 
args) -{ - return Snprintf(dst, DstSize, pFormat, args...); -} - -struct OsInfo -{ - DD_STATIC_CONST const char* kOsTypeWindows = "Windows"; - DD_STATIC_CONST const char* kOsTypeLinux = "Linux"; - DD_STATIC_CONST const char* kOsTypeDarwin = "Darwin"; - - char type[16]; /// The type of the OS, either "Windows", "Linux", or "Darwin". - - char name[32]; /// A human-readable string to identify the version of the OS running - char description[256]; /// A human-readable string to identify the detailed version of the OS running - char hostname[128]; /// The hostname for the machine - - struct UserInfo { - char name[32]; /// Username for the current user - char homeDir[128]; /// Path to the current user's home directory - //< This is typically stored in $HOME or %HOMEPATH% and looks like one of: - //< C:\Users\BobMarley - //< /home/bob_ross - //< /Users/BobTheBuilder - } user; - - uint64 physMemory; /// Total amount of memory available on host in bytes - uint64 swapMemory; /// Total amount of swap memory available on host in bytes -}; - - Result QueryOsInfo(OsInfo* pInfo); - -struct EtwSupportInfo - { - bool isSupported; ///< If true, indicates that the OS platform supports system monitoring, false otherwise. - bool hasPermission; ///< If true, indicates the account has the required permissions, false otherwise. - uint32 statusCode; ///< The status result returned when attempting to open a monitoring session. - char statusDescription[256]; ///< The textual status result returned when attempting to open a monitoring. - }; - - Result QueryEtwInfo(EtwSupportInfo* pInfo); - -} // Platform - -#ifndef DD_PRINT_FUNC -#define DD_PRINT_FUNC Platform::DebugPrint -#else -void DD_PRINT_FUNC(LogLevel logLevel, const char* format, ...); -#endif - -template -inline void LogString(const char *format, Ts&&... 
args) -{ - if (DD_WILL_PRINT(logLevel)) - { - DD_PRINT_FUNC(logLevel, format, Platform::Forward(args)...); - } -} - -// Increments a const pointer by numBytes by first casting it to a const uint8*. -DD_NODISCARD -constexpr const void* VoidPtrInc( - const void* pPtr, - size_t numBytes) -{ - return (static_cast(pPtr) + numBytes); -} - -// Increments a pointer by numBytes by first casting it to a uint8*. -DD_NODISCARD -constexpr void* VoidPtrInc( - void* pPtr, - size_t numBytes) -{ - return (static_cast(pPtr) + numBytes); -} - -// Decrements a const pointer by numBytes by first casting it to a const uint8*. -DD_NODISCARD -constexpr const void* VoidPtrDec( - const void* pPtr, - size_t numBytes) -{ - return (static_cast(pPtr) - numBytes); -} - -// Decrements a pointer by numBytes by first casting it to a uint8*. -DD_NODISCARD -constexpr void* VoidPtrDec( - void* pPtr, - size_t numBytes) -{ - return (static_cast(pPtr) - numBytes); -} - -/// Convert a `DevDriver::Result` into a human recognizable string. 
-static inline const char* ResultToString(Result result) -{ - switch (result) - { - //// Generic Result Code //// - case Result::Success: return "Success"; - case Result::Error: return "Error"; - case Result::NotReady: return "NotReady"; - case Result::VersionMismatch: return "VersionMismatch"; - case Result::Unavailable: return "Unavailable"; - case Result::Rejected: return "Rejected"; - case Result::EndOfStream: return "EndOfStream"; - case Result::Aborted: return "Aborted"; - case Result::InsufficientMemory: return "InsufficientMemory"; - case Result::InvalidParameter: return "InvalidParameter"; - case Result::InvalidClientId: return "InvalidClientId"; - case Result::ConnectionExists: return "ConnectionExists"; - case Result::FileNotFound: return "FileNotFound"; - case Result::FunctionNotFound: return "FunctionNotFound"; - case Result::InterfaceNotFound: return "InterfaceNotFound"; - case Result::EntryExists: return "EntryExists"; - case Result::FileAccessError: return "FileAccessError"; - case Result::FileIoError: return "FileIoError"; - case Result::LimitReached: return "LimitReached"; - case Result::MemoryOverLimit: return "MemoryOverLimit"; - - //// URI PROTOCOL //// - case Result::UriServiceRegistrationError: return "UriServiceRegistrationError"; - case Result::UriStringParseError: return "UriStringParseError"; - case Result::UriInvalidParameters: return "UriInvalidParameters"; - case Result::UriInvalidPostDataBlock: return "UriInvalidPostDataBlock"; - case Result::UriInvalidPostDataSize: return "UriInvalidPostDataSize"; - case Result::UriFailedToAcquirePostBlock: return "UriFailedToAcquirePostBlock"; - case Result::UriFailedToOpenResponseBlock: return "UriFailedToOpenResponseBlock"; - case Result::UriRequestFailed: return "UriRequestFailed"; - case Result::UriPendingRequestError: return "UriPendingRequestError"; - case Result::UriInvalidChar: return "UriInvalidChar"; - case Result::UriInvalidJson: return "UriInvalidJson"; - - //// Settings URI Service //// 
- case Result::SettingsUriInvalidComponent: return "SettingsUriInvalidComponent"; - case Result::SettingsUriInvalidSettingName: return "SettingsUriInvalidSettingName"; - case Result::SettingsUriInvalidSettingValue: return "SettingsUriInvalidSettingValue"; - case Result::SettingsUriInvalidSettingValueSize: return "SettingsUriInvalidSettingValueSize"; - - //// Info URI Service //// - case Result::InfoUriSourceNameInvalid: return "InfoUriSourceNameInvalid"; - case Result::InfoUriSourceCallbackInvalid: return "InfoUriSourceCallbackInvalid"; - case Result::InfoUriSourceAlreadyRegistered: return "InfoUriSourceAlreadyRegistered"; - case Result::InfoUriSourceWriteFailed: return "InfoUriSourceWriteFailed"; - - //// Settings Service //// - case Result::SettingsInvalidComponent: return "SettingsInvalidComponent"; - case Result::SettingsInvalidSettingName: return "SettingsInvalidSettingName"; - case Result::SettingsInvalidSettingValue: return "SettingsInvalidSettingValue"; - case Result::SettingsInsufficientValueSize: return "SettingsInsufficientValueSize"; - case Result::SettingsInvalidSettingValueSize: return "SettingsInvalidSettingValueSize"; - } - - DD_PRINT(LogLevel::Warn, "Result code %u is not handled", static_cast(result)); - return "Unrecognized DevDriver::Result"; -} - -// Helper function for converting bool values into Result enums -// Useful for cases where Results and bools are interleaved in logic -static inline Result BoolToResult(bool value) -{ - return (value ? Result::Success : Result::Error); -} - -// Use this macro to mark Result values that have not been or cannot be handled correctly. -#define DD_UNHANDLED_RESULT(x) DevDriver::MarkUnhandledResultImpl((x), DD_STRINGIFY(x), DD_FILE, __LINE__, __func__) - -// Implementation for DD_UNHANDLED_RESULT. -// This is a specialized assert that should be used through the macro, and not called directly. -// This is implemented in ddPlatform.h, so that it has access to DD_ASSERT. 
-static inline void MarkUnhandledResultImpl( - Result result, - const char* pExpr, - const char* pFile, - int lineNumber, - const char* pFunc) -{ -#if defined(DD_OPT_ASSERTS_ENABLE) - if (result != Result::Success) - { - DD_PRINT(DevDriver::LogLevel::Error, - "%s (%d): Unchecked Result in %s: \"%s\" == \"%s\" (0x%X)\n", - pFile, - lineNumber, - pFunc, - pExpr, - ResultToString(result), - result); - } -#else - DD_UNUSED(result); - DD_UNUSED(pExpr); - DD_UNUSED(pFile); - DD_UNUSED(lineNumber); - DD_UNUSED(pFunc); -#endif -} - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include +// can not be used in the kernel +#if !DD_PLATFORM_IS_KM +#include +#endif + +#include +#include + +#define DD_CACHE_LINE_BYTES 64 + +#define DD_MALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, false) +#define DD_CALLOC(size, alignment, allocCb) allocCb.Alloc(size, alignment, true) +#define DD_FREE(memory, allocCb) allocCb.Free(memory) + +#define DD_NEW(className, allocCb) new(allocCb, alignof(className), true, DD_FILE, __LINE__, __FUNCTION__) className +#define DD_DELETE(memory, allocCb) DevDriver::Platform::Destructor(memory); DD_FREE(memory, allocCb) + +#define DD_NEW_ARRAY(className, numElements, allocCb) DevDriver::Platform::NewArray(numElements, allocCb) +#define DD_DELETE_ARRAY(memory, allocCb) DevDriver::Platform::DeleteArray(memory, allocCb) + +// Always enable asserts in Debug builds +#if !defined(NDEBUG) + #if !defined(DD_OPT_ASSERTS_ENABLE) + #define DD_OPT_ASSERTS_ENABLE + #endif + #if !defined(DD_OPT_ASSERTS_DEBUGBREAK) + #define DD_OPT_ASSERTS_DEBUGBREAK + #endif +#endif + +#define DD_PTR_TO_HANDLE(x) ((DevDriver::Handle)(uintptr_t)(x)) + +#define DD_SANITIZE_RESULT(x) ((x != Result::Success) ? 
Result::Error : x) + +namespace DevDriver +{ + +//////////////////////////// +// Common logging levels +enum struct LogLevel : uint8 +{ + Debug = 0, + Verbose, + Info, + Warn, + Error, + Always, + Count, + + // Backwards compatibility for old alert log level + Alert = Warn, + + Never = 0xFF +}; + +typedef void*(*AllocFunc)(void* pUserdata, size_t size, size_t alignment, bool zero); +typedef void(*FreeFunc)(void* pUserdata, void* pMemory); + +struct AllocCb +{ + void* pUserdata; + AllocFunc pfnAlloc; + FreeFunc pfnFree; + + void* Alloc(size_t size, size_t alignment, bool zero) const; + void* Alloc(size_t size, bool zero) const; + void Free(void* pMemory) const; +}; + +namespace Platform +{ + +// Used by the Platform::Thread implementation. +typedef void (*ThreadFunction)(void* pThreadParameter); + +} // namespace Platform + +} // namespace DevDriver + +#if defined(DD_PLATFORM_WINDOWS_UM) + #include +#elif defined(DD_PLATFORM_WINDOWS_KM) + #include +#elif defined(DD_PLATFORM_DARWIN_UM) + #include +#elif defined(DD_PLATFORM_LINUX_UM) + #include +#else + // Legacy system for Ati Make + #if defined(_WIN32) && !defined(_KERNEL_MODE) + #define DD_PLATFORM_WINDOWS_UM + #include + #elif defined(__linux__) + #define DD_PLATFORM_LINUX_UM + #include + #else + #error "Unknown Platform - please configure your build system" + #endif + + #if __x86_64__ + #define DD_ARCH_BITS 64 + #else + #define DD_ARCH_BITS 32 + #endif +#endif + +#if !defined(DD_RESTRICT) + #error "DD_RESTRICT not defined by platform!" +#endif + +#if !defined(DD_DEBUG_BREAK) + #error "DD_DEBUG_BREAK not defined by platform!" +#endif + +// This only exists for 32bit Windows to specificy callbacks as __stdcall. 
+#if !defined(DD_APIENTRY) + #define DD_APIENTRY +#endif + +// TODO: remove this and make kDebugLogLevel DD_STATIC_CONST when we use a version of visual studio that supports it +#ifdef DD_OPT_LOG_LEVEL + #define DD_OPT_LOG_LEVEL_VALUE static_cast(DD_OPT_LOG_LEVEL) +#else + #if defined(NDEBUG) + // In non-debug builds, default to printing asserts, Error, and Always log messages + #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Error + #else + // In debug builds, default to more messages + #define DD_OPT_LOG_LEVEL_VALUE LogLevel::Verbose + #endif +#endif + +#define DD_WILL_PRINT(lvl) ((lvl >= DD_OPT_LOG_LEVEL_VALUE) && (lvl < DevDriver::LogLevel::Count)) +#define DD_PRINT(lvl, ...) DevDriver::LogString(__VA_ARGS__) + +#if defined(DD_OPT_ASSERTS_DEBUGBREAK) + #define DD_ASSERT_DEBUG_BREAK() DD_DEBUG_BREAK() +#else + #define DD_ASSERT_DEBUG_BREAK() +#endif + +#include + +// Calling `check_expr_is_bool(x)` when `x` is not exactly a bool will create a compile error. +// When it is a bool, it's a no-op. 
+// This allows us to enforce bool arguments to DD_ASSERT() macros +namespace DevDriver +{ + inline void check_expr_is_bool(bool) {} + + template + void check_expr_is_bool(const T&) = delete; +} + +#if !defined(DD_OPT_ASSERTS_ENABLE) + #define DD_WARN(statement) DD_UNUSED(0) + #define DD_WARN_REASON(reason) DD_UNUSED(0) + + #ifndef DD_ASSERT + #define DD_ASSERT(statement) DD_UNUSED(0) + #endif + + #define DD_ASSERT_REASON(reason) DD_UNUSED(0) +#else + #define DD_WARN(statement) do \ + { \ + DevDriver::check_expr_is_bool(statement); \ + if (!(statement)) \ + { \ + DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ + DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ + } \ + } while (0) + + #define DD_WARN_REASON(reason) do \ + { \ + DD_PRINT(DevDriver::LogLevel::Warn, "%s (%d): Warning triggered in %s: %s", \ + DD_FILE, __LINE__, __func__, reason); \ + } while (0) + + #ifndef DD_ASSERT + #define DD_ASSERT(statement) do \ + { \ + DevDriver::check_expr_is_bool(statement); \ + if (!(statement)) \ + { \ + DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ + DD_FILE, __LINE__, __func__, DD_STRINGIFY(statement)); \ + DD_ASSERT_DEBUG_BREAK(); \ + } \ + } while (0) + #endif + + #define DD_ASSERT_REASON(reason) do \ + { \ + DD_PRINT(DevDriver::LogLevel::Error, "%s (%d): Assertion failed in %s: %s", \ + DD_FILE, __LINE__, __func__, reason); \ + DD_ASSERT_DEBUG_BREAK(); \ + } while (0) +#endif + +/// Convenience macro that always warns. +#define DD_WARN_ALWAYS() DD_WARN_REASON("Unconditional Warning") + +/// Convenience macro that always asserts. +#define DD_ASSERT_ALWAYS() DD_ASSERT_REASON("Unconditional Assertion") + +/// Convenience macro that asserts if something has not been implemented. +#define DD_NOT_IMPLEMENTED() DD_ASSERT_REASON("Code not implemented!") + +/// Convenience macro that asserts if an area of code that shouldn't be executed is reached. 
+#define DD_UNREACHABLE() DD_ASSERT_REASON("Unreachable code has been reached!") + +// Backwards compatibility for old alert macro +#define DD_ALERT(statement) DD_WARN(statement) +#define DD_ALERT_REASON(reason) DD_WARN_REASON(reason) +#define DD_ALERT_ALWAYS() DD_WARN_ALWAYS() + +// Debug utility to log an expression +// +// This works by taking the format specifier for a local variable, and an expression. +// The expression is evaluated once. +// It then prints that expression and its value: +// ```cpp +// int x = 5; +// int y = 10; +// int z = 0xf0; +// DD_DBG("0x%x", x + y + z); // Prints: foo/file.cpp:5 "x + y + z" == 0xff +// ``` +#define DD_DBG(level, fmt, expr) DD_PRINT( \ + level, \ + "%s:%d:\t\"" DD_STRINGIFY(expr) "\" == " fmt, \ + DD_FILE, \ + __LINE__, \ + (expr) \ + ) + +// Allocates memory using an AllocCb. +// This overload is declared noexcept, and will correctly handle AllocCb::pfnAlloc() returning NULL. +void* operator new( + size_t size, + const DevDriver::AllocCb& allocCb, + size_t align, + bool zero, + const char* pFilename, + int lineNumber, + const char* pFunction +) noexcept; + +#if DD_PLATFORM_IS_KM +// Provide a placement new function if is not available +inline void* operator new(size_t size, void *pMemory) +{ + return pMemory; +}; +#endif + +// Overload of operator delete that matches the previously declared operator new. +// The compiler can call this version automatically in the case of exceptions thrown in the Constructor +// ... even though we turn them off? +// Compilers are fussy. 
+void operator delete( + void* pObject, + const DevDriver::AllocCb& allocCb, + size_t align, + bool zero, + const char* pFilename, + int lineNumber, + const char* pFunction +) noexcept; + +namespace DevDriver +{ + +namespace Platform +{ + +template +inline void static Destructor(T* p) +{ + if (p != nullptr) + { + p->~T(); + } +} + +template +static T* NewArray(size_t numElements, const AllocCb& allocCb) +{ + size_t allocSize = (sizeof(T) * numElements) + DD_CACHE_LINE_BYTES; + size_t allocAlign = DD_CACHE_LINE_BYTES; + + T* pMem = reinterpret_cast(DD_MALLOC(allocSize, allocAlign, allocCb)); + if (pMem != nullptr) + { + pMem = reinterpret_cast(reinterpret_cast(pMem) + DD_CACHE_LINE_BYTES); + size_t* pNumElements = reinterpret_cast(reinterpret_cast(pMem) - sizeof(size_t)); + *pNumElements = numElements; + T* pCurrentElement = pMem; + for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) + { + new(pCurrentElement) T; + ++pCurrentElement; + } + } + + return pMem; +} + +template +static void DeleteArray(T* pElements, const AllocCb& allocCb) +{ + if (pElements != nullptr) + { + size_t numElements = *reinterpret_cast(reinterpret_cast(pElements) - sizeof(size_t)); + T* pCurrentElement = pElements; + for (size_t elementIndex = 0; elementIndex < numElements; ++elementIndex) + { + pCurrentElement->~T(); + ++pCurrentElement; + } + + pElements = reinterpret_cast(reinterpret_cast(pElements) - DD_CACHE_LINE_BYTES); + } + + DD_FREE(pElements, allocCb); +} + +// Get the number of elements in a statically sized array +// Usage: +// char buffer[1024]; +// size_t size = ArraySize(buffer); // size == 1024 +// +// With a cast: +// char buffer[1024]; +// uint32 size = ArraySize(buffer); +// +template < + typename SizeT = size_t, // Type to return + typename T, // Inferred type of array elements - you should not need to supply this argument + size_t Size // Inferred length of array (in elements) - you should not need to supply this argument +> +constexpr SizeT 
ArraySize(const T(&)[Size]) +{ + return static_cast(Size); +} + +// Log to consoles and attached debuggers +void DebugPrint(LogLevel lvl, const char* pFormat, ...); + +// Platform-specific loggers, this is called from DebugPrint. +void PlatformDebugPrint(LogLevel lvl, const char* pString); + +/// Get the absolute path to a file or directory that already exists +/// If ppAbsPathFilePart is non-NULL, *ppAbsPathFilePart will point into absPath at the beginning of the Filename +/// This is recommended to do whenever you need to display a path to a user. +Result GetAbsPathName( + const char* pPath, + char (&absPath)[256] +); + +/* platform functions for performing atomic operations */ + +int32 AtomicIncrement(Atomic* pVariable); +int32 AtomicDecrement(Atomic* pVariable); +int32 AtomicAdd(Atomic* pVariable, int32 num); +int32 AtomicSubtract(Atomic* pVariable, int32 num); + +int64 AtomicIncrement(Atomic64* pVariable); +int64 AtomicDecrement(Atomic64* pVariable); +int64 AtomicAdd(Atomic64* pVariable, int64 num); +int64 AtomicSubtract(Atomic64* pVariable, int64 num); + +// A generic AllocCb that defers allocation to Platform::AllocateMemory() +// Suitable for memory allocation if you don't care about it. 
+extern AllocCb GenericAllocCb; + +void* AllocateMemory(size_t size, size_t alignment, bool zero); +void FreeMemory(void* pMemory); + +/* fast locks */ +class AtomicLock +{ +public: + AtomicLock() : m_lock(0) {}; + ~AtomicLock() {}; + void Lock(); + bool TryLock(); + void Unlock(); + bool IsLocked() { return (m_lock != 0); }; +private: + Atomic m_lock; +}; + +class Mutex +{ +public: + Mutex(); + ~Mutex(); + void Lock(); + void Unlock(); +private: + MutexStorage m_mutex; +}; + +class Semaphore +{ +public: + explicit Semaphore(uint32 initialCount, uint32 maxCount); + ~Semaphore(); + Result Signal(); + Result Wait(uint32 millisecTimeout); +private: + SemaphoreStorage m_semaphore; +}; + +class Event +{ +public: + explicit Event(bool signaled); + ~Event(); + void Clear(); + void Signal(); + Result Wait(uint32 timeoutInMs); +private: + EventStorage m_event; +}; + +class Thread +{ +public: + Thread() = default; + + Thread(Thread&& other) noexcept = default; + Thread& operator=(Thread&& other) noexcept = default; + + // Copying a thread doesn't make sense + Thread(const Thread&) = delete; + Thread& operator= (const Thread& other) = delete; + + ~Thread(); + + Result Start(ThreadFunction pFnThreadFunc, void* pThreadParameter); + + // Set the user-visible name for the thread using printf-style formatters + // This should only be called on valid thread objects. (Threads that have been started) + // This function will return Result::Error if it's called on an invalid thread. 
+ // Note: This change is global to the thread and can be changed by other means + // Treat this as an aid for people + Result SetName(const char* pFmt, ...); + + Result Join(uint32 timeoutInMs); + + bool IsJoinable() const; + +private: + static ThreadReturnType DD_APIENTRY ThreadShim(void* pShimParam); + + // Reset our object to a default state + void Reset() + { + pFnFunction = nullptr; + pParameter = nullptr; + hThread = kInvalidThreadHandle; + + onExit.Clear(); + } + + // Set the thread name to a hard-coded string. + // The thread name passed to this function must be no larger than kThreadNameMaxLength including the NULL byte. + // If a larger string is passed, errors may occur on some platforms. + Result SetNameRaw(const char* pThreadName); + + ThreadFunction pFnFunction = nullptr; + void* pParameter = nullptr; + ThreadHandle hThread = kInvalidThreadHandle; + Event onExit = Event(false); // Start unsignaled +}; + +class Random +{ +public: + // Algorithm Constants + static constexpr uint64 kModulus = (uint64(1) << 48); + static constexpr uint64 kMultiplier = 0X5DEECE66Dull; + static constexpr uint16 kIncrement = 0xB; + + Random(); + Random(uint64 seed) + { + Reseed(seed); + } + ~Random() {} + + uint32 Generate(); + void Reseed(uint64 seed); +private: + uint64 m_prevState = 0; + + // Sanity checks. + static_assert(0 < kModulus, "Invalid modulus"); + static_assert(0 < kMultiplier, "Invalid multiplier"); + static_assert(kMultiplier < kModulus, "Invalid multiplier"); + static_assert(kIncrement < kModulus, "Invalid increment"); +}; + +class Library +{ +public: + Library() : m_hLib(nullptr) { } + ~Library() { Close(); } + + Result Load(const char* pLibraryName); + + void Close(); + + bool IsLoaded() const { return (m_hLib != nullptr); } + + void Swap(Library* pLibrary) + { + m_hLib = pLibrary->m_hLib; + pLibrary->m_hLib = nullptr; + } + + // Retrieve a function address from the dynamic library object. Returns true if successful, false otherwise. 
+ template + bool GetFunction(const char* pName, Func_t* ppfnFunc) const + { + (*ppfnFunc) = reinterpret_cast(GetFunctionHelper(pName)); + return ((*ppfnFunc) != nullptr); + } + +private: + void* GetFunctionHelper(const char* pName) const; + + LibraryHandle m_hLib; + + DD_DISALLOW_COPY_AND_ASSIGN(Library); +}; + +enum struct MkdirStatus +{ + Unknown, + Created, + Existed, +}; + +// Create a directory with default permissions +// On Windows, this uses NULL for LPSECURITY_ATTRIBUTES +// On Unix, this uses 0777 for the mode. +// When pStatus is non-NULL, *pStatus is set to +// MkdirStatus::Created if the directory did not exist and was created +// MkdirStatus::Existed if the directory already existed +// Returns: +// - Result::Success, if the directory already exists or was created +// - Result::FileIoError, if the directory failed to be created +Result Mkdir(const char* pDir, MkdirStatus* pStatus = nullptr); + +ProcessId GetProcessId(); + +uint64 GetCurrentTimeInMs(); + +uint64 QueryTimestampFrequency(); +uint64 QueryTimestamp(); + +// Todo: Remove Sleep() entirely from our platform API. It cannot be used in the KMD and should not be used +// anywhere else either. +void Sleep(uint32 millisecTimeout); + +void GetProcessName(char* buffer, size_t bufferSize); + +void Strncpy(char* pDst, const char* pSrc, size_t dstSize); + +template +void Strncpy(char(&dst)[DstSize], const char* pSrc) +{ + Strncpy(dst, pSrc, DstSize); +} + +char* Strtok(char* pDst, const char* pDelimiter, char** ppContext); + +void Strncat(char* pDst, const char* pSrc, size_t dstSize); + +template +void Strncat(char(&dst)[DstSize], const char* pSrc) +{ + Strncat(dst, pSrc, DstSize); +} + +int32 Strcmpi(const char* pSrc1, const char* pSrc2); + +int32 Snprintf(char* pDst, size_t dstSize, const char* pFormat, ...); +int32 Vsnprintf(char* pDst, size_t dstSize, const char* pFormat, va_list args); + +template +int32 Snprintf(char(&dst)[DstSize], const char* pFormat, Args&&... 
args) +{ + return Snprintf(dst, DstSize, pFormat, args...); +} + +struct OsInfo +{ + DD_STATIC_CONST const char* kOsTypeWindows = "Windows"; + DD_STATIC_CONST const char* kOsTypeLinux = "Linux"; + DD_STATIC_CONST const char* kOsTypeDarwin = "Darwin"; + + char type[16]; /// The type of the OS, either "Windows", "Linux", or "Darwin". + + char name[32]; /// A human-readable string to identify the version of the OS running + char description[256]; /// A human-readable string to identify the detailed version of the OS running + char hostname[128]; /// The hostname for the machine + + struct UserInfo { + char name[32]; /// Username for the current user + char homeDir[128]; /// Path to the current user's home directory + //< This is typically stored in $HOME or %HOMEPATH% and looks like one of: + //< C:\Users\BobMarley + //< /home/bob_ross + //< /Users/BobTheBuilder + } user; + + uint64 physMemory; /// Total amount of memory available on host in bytes + uint64 swapMemory; /// Total amount of swap memory available on host in bytes +}; + + Result QueryOsInfo(OsInfo* pInfo); + +struct EtwSupportInfo + { + bool isSupported; ///< If true, indicates that the OS platform supports system monitoring, false otherwise. + bool hasPermission; ///< If true, indicates the account has the required permissions, false otherwise. + uint32 statusCode; ///< The status result returned when attempting to open a monitoring session. + char statusDescription[256]; ///< The textual status result returned when attempting to open a monitoring. + }; + + Result QueryEtwInfo(EtwSupportInfo* pInfo); + +} // Platform + +#ifndef DD_PRINT_FUNC +#define DD_PRINT_FUNC Platform::DebugPrint +#else +void DD_PRINT_FUNC(LogLevel logLevel, const char* format, ...); +#endif + +template +inline void LogString(const char *format, Ts&&... 
args) +{ + if (DD_WILL_PRINT(logLevel)) + { + DD_PRINT_FUNC(logLevel, format, Platform::Forward(args)...); + } +} + +// Increments a const pointer by numBytes by first casting it to a const uint8*. +DD_NODISCARD +constexpr const void* VoidPtrInc( + const void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) + numBytes); +} + +// Increments a pointer by numBytes by first casting it to a uint8*. +DD_NODISCARD +constexpr void* VoidPtrInc( + void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) + numBytes); +} + +// Decrements a const pointer by numBytes by first casting it to a const uint8*. +DD_NODISCARD +constexpr const void* VoidPtrDec( + const void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) - numBytes); +} + +// Decrements a pointer by numBytes by first casting it to a uint8*. +DD_NODISCARD +constexpr void* VoidPtrDec( + void* pPtr, + size_t numBytes) +{ + return (static_cast(pPtr) - numBytes); +} + +/// Convert a `DevDriver::Result` into a human recognizable string. 
+static inline const char* ResultToString(Result result) +{ + switch (result) + { + //// Generic Result Code //// + case Result::Success: return "Success"; + case Result::Error: return "Error"; + case Result::NotReady: return "NotReady"; + case Result::VersionMismatch: return "VersionMismatch"; + case Result::Unavailable: return "Unavailable"; + case Result::Rejected: return "Rejected"; + case Result::EndOfStream: return "EndOfStream"; + case Result::Aborted: return "Aborted"; + case Result::InsufficientMemory: return "InsufficientMemory"; + case Result::InvalidParameter: return "InvalidParameter"; + case Result::InvalidClientId: return "InvalidClientId"; + case Result::ConnectionExists: return "ConnectionExists"; + case Result::FileNotFound: return "FileNotFound"; + case Result::FunctionNotFound: return "FunctionNotFound"; + case Result::InterfaceNotFound: return "InterfaceNotFound"; + case Result::EntryExists: return "EntryExists"; + case Result::FileAccessError: return "FileAccessError"; + case Result::FileIoError: return "FileIoError"; + case Result::LimitReached: return "LimitReached"; + case Result::MemoryOverLimit: return "MemoryOverLimit"; + + //// URI PROTOCOL //// + case Result::UriServiceRegistrationError: return "UriServiceRegistrationError"; + case Result::UriStringParseError: return "UriStringParseError"; + case Result::UriInvalidParameters: return "UriInvalidParameters"; + case Result::UriInvalidPostDataBlock: return "UriInvalidPostDataBlock"; + case Result::UriInvalidPostDataSize: return "UriInvalidPostDataSize"; + case Result::UriFailedToAcquirePostBlock: return "UriFailedToAcquirePostBlock"; + case Result::UriFailedToOpenResponseBlock: return "UriFailedToOpenResponseBlock"; + case Result::UriRequestFailed: return "UriRequestFailed"; + case Result::UriPendingRequestError: return "UriPendingRequestError"; + case Result::UriInvalidChar: return "UriInvalidChar"; + case Result::UriInvalidJson: return "UriInvalidJson"; + + //// Settings URI Service //// 
+ case Result::SettingsUriInvalidComponent: return "SettingsUriInvalidComponent"; + case Result::SettingsUriInvalidSettingName: return "SettingsUriInvalidSettingName"; + case Result::SettingsUriInvalidSettingValue: return "SettingsUriInvalidSettingValue"; + case Result::SettingsUriInvalidSettingValueSize: return "SettingsUriInvalidSettingValueSize"; + + //// Info URI Service //// + case Result::InfoUriSourceNameInvalid: return "InfoUriSourceNameInvalid"; + case Result::InfoUriSourceCallbackInvalid: return "InfoUriSourceCallbackInvalid"; + case Result::InfoUriSourceAlreadyRegistered: return "InfoUriSourceAlreadyRegistered"; + case Result::InfoUriSourceWriteFailed: return "InfoUriSourceWriteFailed"; + + //// Settings Service //// + case Result::SettingsInvalidComponent: return "SettingsInvalidComponent"; + case Result::SettingsInvalidSettingName: return "SettingsInvalidSettingName"; + case Result::SettingsInvalidSettingValue: return "SettingsInvalidSettingValue"; + case Result::SettingsInsufficientValueSize: return "SettingsInsufficientValueSize"; + case Result::SettingsInvalidSettingValueSize: return "SettingsInvalidSettingValueSize"; + } + + DD_PRINT(LogLevel::Warn, "Result code %u is not handled", static_cast(result)); + return "Unrecognized DevDriver::Result"; +} + +// Helper function for converting bool values into Result enums +// Useful for cases where Results and bools are interleaved in logic +static inline Result BoolToResult(bool value) +{ + return (value ? Result::Success : Result::Error); +} + +// Use this macro to mark Result values that have not been or cannot be handled correctly. +#define DD_UNHANDLED_RESULT(x) DevDriver::MarkUnhandledResultImpl((x), DD_STRINGIFY(x), DD_FILE, __LINE__, __func__) + +// Implementation for DD_UNHANDLED_RESULT. +// This is a specialized assert that should be used through the macro, and not called directly. +// This is implemented in ddPlatform.h, so that it has access to DD_ASSERT. 
+static inline void MarkUnhandledResultImpl( + Result result, + const char* pExpr, + const char* pFile, + int lineNumber, + const char* pFunc) +{ +#if defined(DD_OPT_ASSERTS_ENABLE) + if (result != Result::Success) + { + DD_PRINT(DevDriver::LogLevel::Error, + "%s (%d): Unchecked Result in %s: \"%s\" == \"%s\" (0x%X)\n", + pFile, + lineNumber, + pFunc, + pExpr, + ResultToString(result), + result); + } +#else + DD_UNUSED(result); + DD_UNUSED(pExpr); + DD_UNUSED(pFile); + DD_UNUSED(lineNumber); + DD_UNUSED(pFunc); +#endif +} + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h index 1f5daec70e..48dede89c2 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/ddcTemplate.h @@ -1,378 +1,378 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#if !defined(_MSC_VER) -#include -#endif - -namespace DevDriver -{ - namespace Platform - { - /// Templated LockGuard class. Works with any type that implements Lock() and Unlock() - template - class LockGuard - { - public: - explicit LockGuard(T &lock) : m_lock(lock) { lock.Lock(); } - ~LockGuard() { m_lock.Unlock(); } - private: - T &m_lock; - }; - - /// Computes the base-2 logarithm of an unsigned 64-bit integer. - /// - /// If the given integer is not a power of 2, this function will not provide an exact answer. - /// - /// @returns log_2(u) - template - inline uint32 Log2(T u) ///< Value to compute the logarithm of. - { - uint32 logValue = 0; - - while (u > 1) - { - ++logValue; - u >>= 1; - } - return logValue; - } - - /// Computes the base-2 logarithm of an unsigned 64-bit integer. - /// - /// If the given integer is not a power of 2, this function will not provide an exact answer. - /// - /// @returns log_2(u) - template - inline constexpr uint32 _ConstLog2(T u, uint32 logValue) ///< Value to compute the logarithm of. - { - return (u > 1) ? _ConstLog2(u >> 1, logValue + 1) : logValue; - } - - /// Computes the base-2 logarithm of an unsigned 64-bit integer. - /// - /// If the given integer is not a power of 2, this function will not provide an exact answer. 
- /// - /// @returns log_2(u) - template - inline constexpr uint32 ConstLog2(T u) ///< Value to compute the logarithm of. - { - return _ConstLog2(u, 0); - } - - static_assert(ConstLog2(1) == 0, "ConstLog2 failure"); - static_assert(ConstLog2(2) == 1, "ConstLog2 failure"); - static_assert(ConstLog2(128) == 7, "ConstLog2 failure"); - static_assert(ConstLog2(255) == 7, "ConstLog2 failure"); - - /// Computes 2 ^ value provided - /// - /// @returns 2 ^ (u) - template - inline constexpr T Pow2(T u) - { - return ((T)1 << u); - } - - static_assert(Pow2(0) == 1, "Pow2 failure"); - static_assert(Pow2(1) == 2, "Pow2 failure"); - static_assert(Pow2(7) == 128, "Pow2 failure"); - - /// Determines if a value is a power of two. - /// - /// @returns True if it is a power of two, false otherwise. - inline constexpr bool IsPowerOfTwo(uint64 value) - { - return (value == 0) ? false : ((value & (value - 1)) == 0); - } - - /// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 - /// alignments are supported by this function. - /// - /// returns Aligned value. - template - inline constexpr T Pow2Align( - T value, ///< Value to align. - uint64 alignment) ///< Desired alignment (must be a power of 2). - { - return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); - } - - /// Rounds the specified uint 'value' up to the nearest power of 2 - /// - /// @returns Power of 2 padded value. - template - inline T Pow2Pad(T value) ///< Value to pad. - { - T ret = 1; - if (IsPowerOfTwo(value)) - { - ret = value; - } - else - { - while (ret < value) - { - ret <<= 1; - } - } - - return ret; - } - - /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr varient. - /// - /// @returns Power of 2 padded value. - template - inline constexpr T _ConstPow2Pad(T value, T padded) ///< Value to pad. - { - return (padded < value) ? 
_ConstPow2Pad(value, padded << 1) : padded; - } - - /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr varient. - /// - /// @returns Power of 2 padded value. - template - inline constexpr T ConstPow2Pad(T value) ///< Value to pad. - { - return (IsPowerOfTwo(value)) ? value : _ConstPow2Pad(value, (T)1); - } - - static_assert(ConstPow2Pad(512) == 512, "ConstPow2Pad failure"); - static_assert(ConstPow2Pad(511) == 512, "ConstPow2Pad failure"); - static_assert(ConstPow2Pad(257) == 512, "ConstPow2Pad failure"); - - /// Finds the smallest of two values - /// - /// @returns a if a < b, otherwise b. - template - inline constexpr T Min(const T &a, const T &b) - { - return ((a < b) ? a : b); - } - - /// Finds the larger of two values - /// - /// @returns a if a > b, otherwise b. - template - inline constexpr T Max(const T &a, const T &b) - { - return ((a > b) ? a : b); - } - - // Given a type T, set Type equal to T - template - struct RemoveRef - { - typedef T Type; - }; - - // Given a type T&, set Type equal to T - template - struct RemoveRef - { - typedef T Type; - }; - - // Given a type T&&, set Type equal to T - template - struct RemoveRef - { - typedef T Type; - }; - - // std::move equivalent - template - inline typename RemoveRef::Type&& Move(T&& obj) - { - return static_cast::Type&&>(obj); - } - - // std::forward equivalent - template - inline T&& Forward(typename RemoveRef::Type&& args) - { - return static_cast(args); - } - - // std::forward equivalent - template - inline T&& Forward(typename RemoveRef::Type& args) - { - return static_cast(args); - } - - // Returns the contents of Value in a new variable, and assign newValue into the memory occupied by value. 
- template - inline T Exchange(T& value, U&& newValue) - { - T oldValue = Move(value); - value = Forward(newValue); - return (oldValue); - } - - // Convenience structure that defined Value as either true or false, and Type as either TrueType or FalseType - template - struct BoolType - { - static const bool Value = value; - using Type = BoolType; - }; - - using FalseType = BoolType; - using TrueType = BoolType; - - // Struct whose ::Type member is undefined if the first condition is not true - template - struct EnableIf - { - }; - - // Struct whose ::Type member is equal to T if the first condition is true. - template - struct EnableIf - { - typedef T Type; - }; - - template - struct IsPointer : FalseType - { - }; - - template - struct IsPointer : TrueType - { - }; - -#if defined(_MSC_VER) - // If we are building with MSVC we want to use the compiler intrinsics here. This is primarily because building with - // the /kernel precludes the use of the C++ type traits library. For all other compilers we simply implement this - // using the standard C++ library. - - // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. - template - struct IsConvertible : BoolType<__is_convertible_to(T, U)> - { - }; - - // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments - // provided. - template - struct IsConstructible : BoolType<__is_constructible(T, Args...)> - { - - }; - - // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. - template - struct IsAbstract : BoolType<__is_abstract(T)> - { - - }; - - // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. - template - struct IsPod : BoolType<__is_pod(T)> - { - - }; - - // Struct whose ::Value member is equal to true if T is has a standard layout, and false otherwise. 
- template - struct IsStandardLayout : BoolType<__is_standard_layout(T)> - { - - }; - - // Struct whose ::Value member is equal to true if T is trivially destructable, and false otherwise. - template - struct IsTriviallyDestructible : BoolType<__is_trivially_destructible(T)> - { - - }; - - // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. - template - struct IsEnum : BoolType<__is_enum(T)> - { - - }; -#else - // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. - template - struct IsConvertible : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments - // provided. - template - struct IsConstructible : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. - template - struct IsAbstract : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. - template - struct IsPod : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if T is has a standard layout, and false otherwise. - template - struct IsStandardLayout : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if T is trivially destructable, and false otherwise. - template - struct IsTriviallyDestructible : BoolType::value> - { - - }; - - // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. - template - struct IsEnum : BoolType::value> - { - - }; -#endif - } -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#if !defined(_MSC_VER) +#include +#endif + +namespace DevDriver +{ + namespace Platform + { + /// Templated LockGuard class. Works with any type that implements Lock() and Unlock() + template + class LockGuard + { + public: + explicit LockGuard(T &lock) : m_lock(lock) { lock.Lock(); } + ~LockGuard() { m_lock.Unlock(); } + private: + T &m_lock; + }; + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline uint32 Log2(T u) ///< Value to compute the logarithm of. 
+ { + uint32 logValue = 0; + + while (u > 1) + { + ++logValue; + u >>= 1; + } + return logValue; + } + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline constexpr uint32 _ConstLog2(T u, uint32 logValue) ///< Value to compute the logarithm of. + { + return (u > 1) ? _ConstLog2(u >> 1, logValue + 1) : logValue; + } + + /// Computes the base-2 logarithm of an unsigned 64-bit integer. + /// + /// If the given integer is not a power of 2, this function will not provide an exact answer. + /// + /// @returns log_2(u) + template + inline constexpr uint32 ConstLog2(T u) ///< Value to compute the logarithm of. + { + return _ConstLog2(u, 0); + } + + static_assert(ConstLog2(1) == 0, "ConstLog2 failure"); + static_assert(ConstLog2(2) == 1, "ConstLog2 failure"); + static_assert(ConstLog2(128) == 7, "ConstLog2 failure"); + static_assert(ConstLog2(255) == 7, "ConstLog2 failure"); + + /// Computes 2 ^ value provided + /// + /// @returns 2 ^ (u) + template + inline constexpr T Pow2(T u) + { + return ((T)1 << u); + } + + static_assert(Pow2(0) == 1, "Pow2 failure"); + static_assert(Pow2(1) == 2, "Pow2 failure"); + static_assert(Pow2(7) == 128, "Pow2 failure"); + + /// Determines if a value is a power of two. + /// + /// @returns True if it is a power of two, false otherwise. + inline constexpr bool IsPowerOfTwo(uint64 value) + { + return (value == 0) ? false : ((value & (value - 1)) == 0); + } + + /// Rounds the specified uint 'value' up to the nearest value meeting the specified 'alignment'. Only power of 2 + /// alignments are supported by this function. + /// + /// returns Aligned value. + template + inline constexpr T Pow2Align( + T value, ///< Value to align. + uint64 alignment) ///< Desired alignment (must be a power of 2). 
+ { + return ((value + static_cast(alignment) - 1) & ~(static_cast(alignment) - 1)); + } + + /// Rounds the specified uint 'value' up to the nearest power of 2 + /// + /// @returns Power of 2 padded value. + template + inline T Pow2Pad(T value) ///< Value to pad. + { + T ret = 1; + if (IsPowerOfTwo(value)) + { + ret = value; + } + else + { + while (ret < value) + { + ret <<= 1; + } + } + + return ret; + } + + /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr varient. + /// + /// @returns Power of 2 padded value. + template + inline constexpr T _ConstPow2Pad(T value, T padded) ///< Value to pad. + { + return (padded < value) ? _ConstPow2Pad(value, padded << 1) : padded; + } + + /// Rounds the specified uint 'value' up to the nearest power of 2. Constexpr varient. + /// + /// @returns Power of 2 padded value. + template + inline constexpr T ConstPow2Pad(T value) ///< Value to pad. + { + return (IsPowerOfTwo(value)) ? value : _ConstPow2Pad(value, (T)1); + } + + static_assert(ConstPow2Pad(512) == 512, "ConstPow2Pad failure"); + static_assert(ConstPow2Pad(511) == 512, "ConstPow2Pad failure"); + static_assert(ConstPow2Pad(257) == 512, "ConstPow2Pad failure"); + + /// Finds the smallest of two values + /// + /// @returns a if a < b, otherwise b. + template + inline constexpr T Min(const T &a, const T &b) + { + return ((a < b) ? a : b); + } + + /// Finds the larger of two values + /// + /// @returns a if a > b, otherwise b. + template + inline constexpr T Max(const T &a, const T &b) + { + return ((a > b) ? 
a : b); + } + + // Given a type T, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // Given a type T&, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // Given a type T&&, set Type equal to T + template + struct RemoveRef + { + typedef T Type; + }; + + // std::move equivalent + template + inline typename RemoveRef::Type&& Move(T&& obj) + { + return static_cast::Type&&>(obj); + } + + // std::forward equivalent + template + inline T&& Forward(typename RemoveRef::Type&& args) + { + return static_cast(args); + } + + // std::forward equivalent + template + inline T&& Forward(typename RemoveRef::Type& args) + { + return static_cast(args); + } + + // Returns the contents of Value in a new variable, and assign newValue into the memory occupied by value. + template + inline T Exchange(T& value, U&& newValue) + { + T oldValue = Move(value); + value = Forward(newValue); + return (oldValue); + } + + // Convenience structure that defined Value as either true or false, and Type as either TrueType or FalseType + template + struct BoolType + { + static const bool Value = value; + using Type = BoolType; + }; + + using FalseType = BoolType; + using TrueType = BoolType; + + // Struct whose ::Type member is undefined if the first condition is not true + template + struct EnableIf + { + }; + + // Struct whose ::Type member is equal to T if the first condition is true. + template + struct EnableIf + { + typedef T Type; + }; + + template + struct IsPointer : FalseType + { + }; + + template + struct IsPointer : TrueType + { + }; + +#if defined(_MSC_VER) + // If we are building with MSVC we want to use the compiler intrinsics here. This is primarily because building with + // the /kernel precludes the use of the C++ type traits library. For all other compilers we simply implement this + // using the standard C++ library. + + // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. 
+ template + struct IsConvertible : BoolType<__is_convertible_to(T, U)> + { + }; + + // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments + // provided. + template + struct IsConstructible : BoolType<__is_constructible(T, Args...)> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsAbstract : BoolType<__is_abstract(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsPod : BoolType<__is_pod(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is has a standard layout, and false otherwise. + template + struct IsStandardLayout : BoolType<__is_standard_layout(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is trivially destructable, and false otherwise. + template + struct IsTriviallyDestructible : BoolType<__is_trivially_destructible(T)> + { + + }; + + // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. + template + struct IsEnum : BoolType<__is_enum(T)> + { + + }; +#else + // Struct whose ::Value member is equal to true if you can cast from T to U, and false otherwise. + template + struct IsConvertible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if you can construct an object of type T using the arguments + // provided. + template + struct IsConstructible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsAbstract : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is an abstract class, and false otherwise. + template + struct IsPod : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is has a standard layout, and false otherwise. 
+ template + struct IsStandardLayout : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is trivially destructable, and false otherwise. + template + struct IsTriviallyDestructible : BoolType::value> + { + + }; + + // Struct whose ::Value member is equal to true if T is an enumeration type, and false otherwise. + template + struct IsEnum : BoolType::value> + { + + }; +#endif + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h index 818f0a20e4..300bae754e 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/core/inc/platforms/ddcWinPlatform.h @@ -1,115 +1,115 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -#pragma once - -#if defined(_KERNEL_MODE) -static_assert(false, "This header is for user mode windows, and it does not work in kernel mode."); -#endif - -// Our code expects these defined before including Windows.h. -// However, we need to guard against clients defining them too. -#ifndef _CRT_RAND_S - #define _CRT_RAND_S -#endif - -#ifndef WIN32_LEAN_AND_MEAN - #define WIN32_LEAN_AND_MEAN -#endif - -#ifndef NOMINMAX - #define NOMINMAX -#endif - -// WIN32_NO_STATUS makes Windows.h not include macro definitions from winnt.h -// which collide with those from ntstatus.h. This avoids compilation errors -// when other files that include ntstatus.h also include this file. -#define WIN32_NO_STATUS -#include -#undef WIN32_NO_STATUS - -#include - -#define DD_RESTRICT __restrict - -#define DD_DEBUG_BREAK() __debugbreak() - -namespace DevDriver -{ - namespace Platform - { - /* platform functions for performing atomic operations */ - typedef volatile LONG Atomic; - DD_CHECK_SIZE(Atomic, sizeof(int32)); - - typedef volatile LONG64 Atomic64; - DD_CHECK_SIZE(Atomic64, sizeof(int64)); - - struct EmptyStruct {}; - - struct MutexStorage - { - CRITICAL_SECTION criticalSection; -#if !defined(NDEBUG) - Atomic lockCount; -#endif - }; - typedef Handle SemaphoreStorage; - typedef HANDLE EventStorage; - typedef HANDLE ThreadHandle; - typedef DWORD ThreadReturnType; - typedef HMODULE LibraryHandle; - - constexpr ThreadHandle kInvalidThreadHandle = NULL; - - // Maximum supported size for thread names, including NULL byte - // This exists because some platforms have hard limits on thread name size. 
- // Windows doesn't seem to have a thread name size limit, but we use this variable to control - // a formatting buffer as well and we want to keep it reasonably small since it's stack allocated. - static constexpr size_t kThreadNameMaxLength = 64; - - #define DD_APIENTRY APIENTRY - - namespace Windows - { - // Windows specific functions required for in-memory communication - Handle CreateSharedSemaphore(uint32 initialCount, uint32 maxCount); - Handle CopySemaphoreFromProcess(ProcessId processId, Handle hObject); - Result SignalSharedSemaphore(Handle pSemaphore); - Result WaitSharedSemaphore(Handle pSemaphore, uint32 millisecTimeout); - void CloseSharedSemaphore(Handle pSemaphore); - - Handle CreateSharedBuffer(Size bufferSizeInBytes); - void CloseSharedBuffer(Handle hSharedBuffer); - - Handle MapSystemBufferView(Handle hBuffer, Size bufferSizeInBytes); - Handle MapProcessBufferView(Handle hBuffer, ProcessId processId); - void UnmapBufferView(Handle hSharedBuffer, Handle hSharedBufferView); - - // Whether or not the user has enabled Windows Developer Mode on their system - // See: https://github.com/MicrosoftDocs/windows-uwp/blob/docs/hub/apps/get-started/enable-your-device-for-development.md - bool IsWin10DeveloperModeEnabled(); - } - } -} +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +#pragma once + +#if defined(_KERNEL_MODE) +static_assert(false, "This header is for user mode windows, and it does not work in kernel mode."); +#endif + +// Our code expects these defined before including Windows.h. +// However, we need to guard against clients defining them too. +#ifndef _CRT_RAND_S + #define _CRT_RAND_S +#endif + +#ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN +#endif + +#ifndef NOMINMAX + #define NOMINMAX +#endif + +// WIN32_NO_STATUS makes Windows.h not include macro definitions from winnt.h +// which collide with those from ntstatus.h. This avoids compilation errors +// when other files that include ntstatus.h also include this file. 
+#define WIN32_NO_STATUS +#include +#undef WIN32_NO_STATUS + +#include + +#define DD_RESTRICT __restrict + +#define DD_DEBUG_BREAK() __debugbreak() + +namespace DevDriver +{ + namespace Platform + { + /* platform functions for performing atomic operations */ + typedef volatile LONG Atomic; + DD_CHECK_SIZE(Atomic, sizeof(int32)); + + typedef volatile LONG64 Atomic64; + DD_CHECK_SIZE(Atomic64, sizeof(int64)); + + struct EmptyStruct {}; + + struct MutexStorage + { + CRITICAL_SECTION criticalSection; +#if !defined(NDEBUG) + Atomic lockCount; +#endif + }; + typedef Handle SemaphoreStorage; + typedef HANDLE EventStorage; + typedef HANDLE ThreadHandle; + typedef DWORD ThreadReturnType; + typedef HMODULE LibraryHandle; + + constexpr ThreadHandle kInvalidThreadHandle = NULL; + + // Maximum supported size for thread names, including NULL byte + // This exists because some platforms have hard limits on thread name size. + // Windows doesn't seem to have a thread name size limit, but we use this variable to control + // a formatting buffer as well and we want to keep it reasonably small since it's stack allocated. 
+ static constexpr size_t kThreadNameMaxLength = 64; + + #define DD_APIENTRY APIENTRY + + namespace Windows + { + // Windows specific functions required for in-memory communication + Handle CreateSharedSemaphore(uint32 initialCount, uint32 maxCount); + Handle CopySemaphoreFromProcess(ProcessId processId, Handle hObject); + Result SignalSharedSemaphore(Handle pSemaphore); + Result WaitSharedSemaphore(Handle pSemaphore, uint32 millisecTimeout); + void CloseSharedSemaphore(Handle pSemaphore); + + Handle CreateSharedBuffer(Size bufferSizeInBytes); + void CloseSharedBuffer(Handle hSharedBuffer); + + Handle MapSystemBufferView(Handle hBuffer, Size bufferSizeInBytes); + Handle MapProcessBufferView(Handle hBuffer, ProcessId processId); + void UnmapBufferView(Handle hSharedBuffer, Handle hSharedBufferView); + + // Whether or not the user has enabled Windows Developer Mode on their system + // See: https://github.com/MicrosoftDocs/windows-uwp/blob/docs/hub/apps/get-started/enable-your-device-for-development.md + bool IsWin10DeveloperModeEnabled(); + } + } +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h index e01ccedf10..21748a8e18 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/baseProtocolServer.h @@ -1,62 +1,62 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "protocolServer.h" - -namespace DevDriver -{ - class IMsgChannel; - - class BaseProtocolServer : public IProtocolServer - { - public: - virtual ~BaseProtocolServer(); - - Protocol GetProtocol() const override final { return m_protocol; }; - SessionType GetType() const override final { return SessionType::Server; }; - Version GetMinVersion() const override final { return m_minVersion; }; - Version GetMaxVersion() const override final { return m_maxVersion; }; - - bool GetSupportedVersion(Version minVersion, Version maxVersion, Version * version) const override final; - - virtual void Finalize() override; - protected: - BaseProtocolServer(IMsgChannel* pMsgChannel, Protocol protocol, Version minVersion, Version maxVersion); - - // Helper functions for working with SizedPayloadContainers - Result SendPayload(ISession* pSession, const SizedPayloadContainer* pPayload, uint32 timeoutInMs); - Result ReceivePayload(ISession* pSession, SizedPayloadContainer* pPayload, uint32 timeoutInMs); - - IMsgChannel* const m_pMsgChannel; - const Protocol m_protocol; - const Version m_minVersion; - const Version m_maxVersion; - - bool m_isFinalized; - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "protocolServer.h" + +namespace DevDriver +{ + class IMsgChannel; + + class BaseProtocolServer : public IProtocolServer + { + public: + virtual ~BaseProtocolServer(); + + Protocol GetProtocol() const override final { return m_protocol; }; + SessionType GetType() const override final { return SessionType::Server; }; + Version GetMinVersion() const override final { return m_minVersion; }; + Version GetMaxVersion() const override final { return m_maxVersion; }; + + bool GetSupportedVersion(Version minVersion, Version maxVersion, Version * version) const override final; + + virtual void Finalize() override; + protected: + BaseProtocolServer(IMsgChannel* pMsgChannel, Protocol protocol, Version minVersion, Version maxVersion); + + // Helper functions for working with SizedPayloadContainers + Result SendPayload(ISession* pSession, const SizedPayloadContainer* pPayload, uint32 timeoutInMs); + Result ReceivePayload(ISession* pSession, SizedPayloadContainer* pPayload, uint32 timeoutInMs); + + IMsgChannel* const m_pMsgChannel; + const Protocol m_protocol; + const Version m_minVersion; + const Version m_maxVersion; + + bool m_isFinalized; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h index 3c388ffc73..67cc6aecb9 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddDefs.h @@ -1,28 +1,28 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "../core/inc/ddcDefs.h" +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcDefs.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h index 7f5cee54f3..c34d52b362 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddLegacyDefs.h @@ -1,48 +1,48 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#ifndef DD_PLATFORM_WINDOWS_UM - #if _WIN32 && !_KERNEL_MODE - #define DD_PLATFORM_WINDOWS_UM 1 - #define DD_PLATFORM_IS_UM 1 - #endif -#endif - -#ifndef DD_PLATFORM_WINDOWS_KM - #if _WIN32 && _KERNEL_MODE - #define DD_PLATFORM_WINDOWS_KM 1 - #define DD_PLATFORM_IS_KM 1 - #endif -#endif - -#ifndef DD_PLATFORM_LINUX_UM - #ifdef __linux__ - #define DD_PLATFORM_LINUX_UM 1 - #define DD_PLATFORM_IS_UM 1 - #define DD_PLATFORM_IS_GNU 1 - #endif -#endif +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#ifndef DD_PLATFORM_WINDOWS_UM + #if _WIN32 && !_KERNEL_MODE + #define DD_PLATFORM_WINDOWS_UM 1 + #define DD_PLATFORM_IS_UM 1 + #endif +#endif + +#ifndef DD_PLATFORM_WINDOWS_KM + #if _WIN32 && _KERNEL_MODE + #define DD_PLATFORM_WINDOWS_KM 1 + #define DD_PLATFORM_IS_KM 1 + #endif +#endif + +#ifndef DD_PLATFORM_LINUX_UM + #ifdef __linux__ + #define DD_PLATFORM_LINUX_UM 1 + #define DD_PLATFORM_IS_UM 1 + #define DD_PLATFORM_IS_GNU 1 + #endif +#endif diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h index 6f98400190..481338b52b 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddPlatform.h @@ -1,28 +1,28 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "../core/inc/ddcPlatform.h" +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcPlatform.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h index 00a8e18c8e..5dd7d29db6 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddTemplate.h @@ -1,28 +1,28 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "../core/inc/ddcTemplate.h" +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "../core/inc/ddcTemplate.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h index 3daa0cfd47..2a61155c9e 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/ddUriInterface.h @@ -1,291 +1,291 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" -#include "ddPlatform.h" - -namespace DevDriver -{ - namespace TransferProtocol - { - class ServerBlock; - } - - // The maximum allowed name for a service name - DD_STATIC_CONST size_t kMaxUriServiceNameLength = 128; - - enum struct URIDataFormat : uint32 - { - Unknown = 0, - Text, - Binary, - Count - }; - - // An interface to write bytes. - class IByteWriter - { - protected: - virtual ~IByteWriter() {} - - public: - // Finish all writing and return the last error. - virtual Result End() = 0; - - // Write exactly `length` bytes. - virtual void WriteBytes(const void* pBytes, size_t length) = 0; - - // Write a value as a byte array. - // N.B.: Be mindful of your struct's implicit padding! - template - void Write(const T& value) - { - static_assert(!Platform::IsPointer::Value, "Writing a pointer is likely an error. Cast to an integer type if you mean it."); - WriteBytes(&value, sizeof(value)); - } - }; - - // An interface to write and validate text. - class ITextWriter - { - protected: - virtual ~ITextWriter() {} - - public: - // Finish all writing and return the last error. - virtual Result End() = 0; - - // Write formatted text. - // Try and only pass string literals as `pFmt`. Prefer: Write("%s", myGeneratedBuffer); - virtual void Write(const char* pFmt, ...) = 0; - - // Write specific types - virtual void Write(uint64 value) = 0; - virtual void Write(uint32 value) = 0; - virtual void Write(uint16 value) = 0; - virtual void Write(uint8 value) = 0; - virtual void Write(int64 value) = 0; - virtual void Write(int32 value) = 0; - virtual void Write(int16 value) = 0; - virtual void Write(double value) = 0; - virtual void Write(float value) = 0; - virtual void Write(bool value) = 0; - virtual void Write(char value) = 0; - }; - - // An interface to write and validate structured data - e.g. 
json or message pack - class IStructuredWriter - { - protected: - virtual ~IStructuredWriter() {} - - public: - // Finish all writing and return the last error. - virtual Result End() = 0; - - // Structured data is often nullable. - // Write a "null" value. - virtual void ValueNull() = 0; - - // ===== Collection Writers ==================================================================================== - - // Begin writing a new list collection. - virtual void BeginList() = 0; - - // End the current list collection. - virtual void EndList() = 0; - - // Begin writing a new map collection. - virtual void BeginMap() = 0; - - // End the current map collection. - virtual void EndMap() = 0; - - // Write a key into a map. - virtual void Key(const char* pKey) = 0; - - // ===== Value Writers ========================================================================================= - - virtual void Value(const char* pValue) = 0; - virtual void Value(const char* pValue, size_t length) = 0; - - virtual void Value(uint64 value) = 0; - virtual void Value(uint32 value) = 0; - virtual void Value(uint16 value) = 0; - virtual void Value(uint8 value) = 0; - virtual void Value(int64 value) = 0; - virtual void Value(int32 value) = 0; - virtual void Value(int16 value) = 0; - virtual void Value(int8 value) = 0; - virtual void Value(double value) = 0; - virtual void Value(float value) = 0; - virtual void Value(bool value) = 0; - virtual void Value(char value) = 0; - - /// Writes an enum value as a String or hex value - /// If DevDriver::ToString(Enum) returns NULL or an empty string, it will hex-encode the integer value. - /// Otherwise, it will write that string - template - void ValueEnumOrHex(Enum value) - { - const char* pString = ToString(value); - if ((pString == nullptr) || (strcmp(pString, "") != 0)) - { - Value(pString); - } - else - { - Valuef("0x%x", value); - } - } - - // Write a formatted string - template - void Valuef(const char* pFmt, Args&&... 
args) - { - char buffer[1024]; - Platform::Snprintf(buffer, pFmt, args...); - Value(buffer); - } - - // ===== Key + Value Writers =================================================================================== - - // Write a key-value pair where the value will be a list. - void KeyAndBeginList(const char* pKey) { Key(pKey); BeginList(); } - - // Write a key-value pair where the value will be a map. - void KeyAndBeginMap(const char* pKey) { Key(pKey); BeginMap(); } - - // Write a key-value pair. - void KeyAndValue(const char* pKey, const char* pValue) { Key(pKey); Value(pValue); } - void KeyAndValue(const char* pKey, const char* pValue, size_t length) { Key(pKey); Value(pValue, length); } - void KeyAndValue(const char* pKey, uint64 value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, uint32 value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, int64 value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, int32 value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, double value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, float value) { Key(pKey); Value(value); } - void KeyAndValue(const char* pKey, bool value) { Key(pKey); Value(value); } - - template - void KeyAndValueEnumOrHex(const char* pKey, Enum value) { Key(pKey); ValueEnumOrHex(value); } - - // Write a key-value pair where the value will be a "null" value. - void KeyAndValueNull(const char* pKey) { Key(pKey); ValueNull(); } - - // Write a key-value pair with a formatted value - template - void KeyAndValuef(const char* pKey, const char* pFmt, Args&&... args) { Key(pKey); Valuef(pFmt, args...); } - }; - - // An aggregate of the POST metadata for a request. - struct PostDataInfo - { - const void* pData; // Immutable view of the post data - uint32 size; // Size of the post data in bytes - URIDataFormat format; // Format of the post data - i.e. how to read it - - // Zero initialize the struct. 
- PostDataInfo() - { - memset(this, 0, sizeof(*this)); - } - }; - - // An interface that represents a unique URI request - class IURIRequestContext - { - protected: - virtual ~IURIRequestContext() {} - - public: - // Retrieve the request argument string - // N.B: This is non-const and designed to be mutated - virtual char* GetRequestArguments() = 0; - - // Retrieve information about the post data of this request - virtual const PostDataInfo& GetPostData() const = 0; - - // Creates and returns a Writer to copy bytes into the response block. - // Only a single writer is allowed per request context. - // Returns: - // - Result::Rejected if any writer of any type has already been returned - // - Result::Error if `ppWriter` is `nullptr` - virtual Result BeginByteResponse(IByteWriter** ppWriter) = 0; - - // Creates and returns a Writer to copy text into the response block. - // Only a single writer is allowed per request context. - // Returns: - // - Result::Rejected if any writer of any type has already been returned - // - Result::Error if `ppWriter` is `nullptr` - virtual Result BeginTextResponse(ITextWriter** ppWriter) = 0; - - // Creates and returns a Writer to copy json into the response block. - // Only a single writer is allowed per request context. 
- // Returns: - // - Result::Rejected if any writer of any type has already been returned - // - Result::Error if `ppWriter` is `nullptr` - virtual Result BeginJsonResponse(IStructuredWriter** ppWriter) = 0; - }; - - struct URIResponseHeader - { - // The size of the response data in bytes - size_t responseDataSizeInBytes; - - // The format of the response data - URIDataFormat responseDataFormat; - }; - - // Base class for URI services - class IService - { - public: - virtual ~IService() {} - - // Returns the name of the service - virtual const char* GetName() const = 0; - - // Returns the service version - virtual Version GetVersion() const = 0; - - // Attempts to handle a request from a client - virtual Result HandleRequest(IURIRequestContext* pContext) = 0; - - // Determines the size limit for post data requests for the client request. By default services - // will not accept any post data. The pArguments paramter must remain non-const because the - // service may need to manipulate it for further processing. - virtual size_t QueryPostSizeLimit(char* pArguments) const - { - DD_UNUSED(pArguments); - return 0; - } - - protected: - IService() {}; - }; -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "ddPlatform.h" + +namespace DevDriver +{ + namespace TransferProtocol + { + class ServerBlock; + } + + // The maximum allowed name for a service name + DD_STATIC_CONST size_t kMaxUriServiceNameLength = 128; + + enum struct URIDataFormat : uint32 + { + Unknown = 0, + Text, + Binary, + Count + }; + + // An interface to write bytes. + class IByteWriter + { + protected: + virtual ~IByteWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Write exactly `length` bytes. + virtual void WriteBytes(const void* pBytes, size_t length) = 0; + + // Write a value as a byte array. + // N.B.: Be mindful of your struct's implicit padding! + template + void Write(const T& value) + { + static_assert(!Platform::IsPointer::Value, "Writing a pointer is likely an error. Cast to an integer type if you mean it."); + WriteBytes(&value, sizeof(value)); + } + }; + + // An interface to write and validate text. + class ITextWriter + { + protected: + virtual ~ITextWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Write formatted text. + // Try and only pass string literals as `pFmt`. Prefer: Write("%s", myGeneratedBuffer); + virtual void Write(const char* pFmt, ...) 
= 0; + + // Write specific types + virtual void Write(uint64 value) = 0; + virtual void Write(uint32 value) = 0; + virtual void Write(uint16 value) = 0; + virtual void Write(uint8 value) = 0; + virtual void Write(int64 value) = 0; + virtual void Write(int32 value) = 0; + virtual void Write(int16 value) = 0; + virtual void Write(double value) = 0; + virtual void Write(float value) = 0; + virtual void Write(bool value) = 0; + virtual void Write(char value) = 0; + }; + + // An interface to write and validate structured data - e.g. json or message pack + class IStructuredWriter + { + protected: + virtual ~IStructuredWriter() {} + + public: + // Finish all writing and return the last error. + virtual Result End() = 0; + + // Structured data is often nullable. + // Write a "null" value. + virtual void ValueNull() = 0; + + // ===== Collection Writers ==================================================================================== + + // Begin writing a new list collection. + virtual void BeginList() = 0; + + // End the current list collection. + virtual void EndList() = 0; + + // Begin writing a new map collection. + virtual void BeginMap() = 0; + + // End the current map collection. + virtual void EndMap() = 0; + + // Write a key into a map. 
+        virtual void Key(const char* pKey) = 0;
+
+        // ===== Value Writers =========================================================================================
+
+        virtual void Value(const char* pValue) = 0;
+        virtual void Value(const char* pValue, size_t length) = 0;
+
+        virtual void Value(uint64 value) = 0;
+        virtual void Value(uint32 value) = 0;
+        virtual void Value(uint16 value) = 0;
+        virtual void Value(uint8  value) = 0;
+        virtual void Value(int64  value) = 0;
+        virtual void Value(int32  value) = 0;
+        virtual void Value(int16  value) = 0;
+        virtual void Value(int8   value) = 0;
+        virtual void Value(double value) = 0;
+        virtual void Value(float  value) = 0;
+        virtual void Value(bool   value) = 0;
+        virtual void Value(char   value) = 0;
+
+        /// Writes an enum value as a string or, when no usable name exists, as a hex value.
+        /// If DevDriver::ToString(Enum) returns NULL or an empty string, it will hex-encode the integer value.
+        /// Otherwise, it will write that string. A NULL name is never forwarded to Value(const char*).
+        template <typename Enum>
+        void ValueEnumOrHex(Enum value)
+        {
+            const char* pString = ToString(value);
+            if ((pString != nullptr) && (strcmp(pString, "") != 0)) // null check first: strcmp(NULL, ...) is undefined
+            {
+                Value(pString);
+            }
+            else
+            {
+                Valuef("0x%x", value);
+            }
+        }
+
+        // Write a formatted string. The text is formatted into a fixed 1024-byte stack buffer before being written.
+        template <typename... Args>
+        void Valuef(const char* pFmt, Args&&... args)
+        {
+            char buffer[1024];
+            Platform::Snprintf(buffer, pFmt, args...);
+            Value(buffer);
+        }
+
+        // ===== Key + Value Writers ===================================================================================
+
+        // Write a key-value pair where the value will be a list.
+        void KeyAndBeginList(const char* pKey) { Key(pKey); BeginList(); }
+
+        // Write a key-value pair where the value will be a map.
+        void KeyAndBeginMap(const char* pKey) { Key(pKey); BeginMap(); }
+
+        // Write a key-value pair.
+ void KeyAndValue(const char* pKey, const char* pValue) { Key(pKey); Value(pValue); } + void KeyAndValue(const char* pKey, const char* pValue, size_t length) { Key(pKey); Value(pValue, length); } + void KeyAndValue(const char* pKey, uint64 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, uint32 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, int64 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, int32 value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, double value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, float value) { Key(pKey); Value(value); } + void KeyAndValue(const char* pKey, bool value) { Key(pKey); Value(value); } + + template + void KeyAndValueEnumOrHex(const char* pKey, Enum value) { Key(pKey); ValueEnumOrHex(value); } + + // Write a key-value pair where the value will be a "null" value. + void KeyAndValueNull(const char* pKey) { Key(pKey); ValueNull(); } + + // Write a key-value pair with a formatted value + template + void KeyAndValuef(const char* pKey, const char* pFmt, Args&&... args) { Key(pKey); Valuef(pFmt, args...); } + }; + + // An aggregate of the POST metadata for a request. + struct PostDataInfo + { + const void* pData; // Immutable view of the post data + uint32 size; // Size of the post data in bytes + URIDataFormat format; // Format of the post data - i.e. how to read it + + // Zero initialize the struct. 
+ PostDataInfo() + { + memset(this, 0, sizeof(*this)); + } + }; + + // An interface that represents a unique URI request + class IURIRequestContext + { + protected: + virtual ~IURIRequestContext() {} + + public: + // Retrieve the request argument string + // N.B: This is non-const and designed to be mutated + virtual char* GetRequestArguments() = 0; + + // Retrieve information about the post data of this request + virtual const PostDataInfo& GetPostData() const = 0; + + // Creates and returns a Writer to copy bytes into the response block. + // Only a single writer is allowed per request context. + // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginByteResponse(IByteWriter** ppWriter) = 0; + + // Creates and returns a Writer to copy text into the response block. + // Only a single writer is allowed per request context. + // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginTextResponse(ITextWriter** ppWriter) = 0; + + // Creates and returns a Writer to copy json into the response block. + // Only a single writer is allowed per request context. 
+ // Returns: + // - Result::Rejected if any writer of any type has already been returned + // - Result::Error if `ppWriter` is `nullptr` + virtual Result BeginJsonResponse(IStructuredWriter** ppWriter) = 0; + }; + + struct URIResponseHeader + { + // The size of the response data in bytes + size_t responseDataSizeInBytes; + + // The format of the response data + URIDataFormat responseDataFormat; + }; + + // Base class for URI services + class IService + { + public: + virtual ~IService() {} + + // Returns the name of the service + virtual const char* GetName() const = 0; + + // Returns the service version + virtual Version GetVersion() const = 0; + + // Attempts to handle a request from a client + virtual Result HandleRequest(IURIRequestContext* pContext) = 0; + + // Determines the size limit for post data requests for the client request. By default services + // will not accept any post data. The pArguments paramter must remain non-const because the + // service may need to manipulate it for further processing. + virtual size_t QueryPostSizeLimit(char* pArguments) const + { + DD_UNUSED(pArguments); + return 0; + } + + protected: + IService() {}; + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h index 005f13515d..123e9eb7db 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/devDriverServer.h @@ -1,103 +1,103 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" -#include "msgChannel.h" -#include "msgTransport.h" -#include "protocols/systemProtocols.h" -#include "protocols/typemap.h" - -namespace DevDriver -{ - class IProtocolServer; - - // Server Creation Info - // This struct extends the MessageChannelCreateInfo struct and adds information about the destination host - // the client will connect to. It additionally allows specifying protocol servers to enable during initialization. - // See msgChannel.h for a full list of members. - struct ServerCreateInfo : public MessageChannelCreateInfo - { - HostInfo connectionInfo; // Connection information describing how the Server should connect - // to the message bus. 
- ProtocolFlags servers; // Set of boolean values indicating which servers should be created - // during initialization. - }; - - DD_STATIC_CONST uint32 kQueryStatusTimeoutInMs = 50; - - class DevDriverServer - { - public: - static bool IsConnectionAvailable(const HostInfo& hostInfo, uint32 timeout = kQueryStatusTimeoutInMs); - - explicit DevDriverServer(const AllocCb& allocCb, const ServerCreateInfo& createInfo); - ~DevDriverServer(); - - Result Initialize(); - void Finalize(); - void Destroy(); - - const AllocCb& GetAllocCb() const { return m_allocCb; } - -#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION - // Called by the driver to mark the end of Platform and the start of device initialization. - // Starting with GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION the driver should call the driver control - // functions directly. - void StartDeviceInit(); -#endif - - bool IsConnected() const; - IMsgChannel* GetMessageChannel() const; - - DriverControlProtocol::DriverControlServer* GetDriverControlServer(); - RGPProtocol::RGPServer* GetRGPServer(); - EventProtocol::EventServer* GetEventServer(); - SettingsURIService::SettingsService* GetSettingsService(); - InfoURIService::InfoService* GetInfoService(); - - bool ShouldShowOverlay(); - - private: - Result InitializeProtocols(); - void DestroyProtocols(); - - Result RegisterProtocol(Protocol protocol); - void UnregisterProtocol(Protocol protocol); - void FinalizeProtocol(Protocol protocol); - - IMsgChannel* m_pMsgChannel; - AllocCb m_allocCb; - ServerCreateInfo m_createInfo; - - template - inline Result RegisterProtocol(Args... args); - - template - inline ProtocolServerType* GetServer(); - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "msgChannel.h" +#include "msgTransport.h" +#include "protocols/systemProtocols.h" +#include "protocols/typemap.h" + +namespace DevDriver +{ + class IProtocolServer; + + // Server Creation Info + // This struct extends the MessageChannelCreateInfo struct and adds information about the destination host + // the client will connect to. It additionally allows specifying protocol servers to enable during initialization. + // See msgChannel.h for a full list of members. + struct ServerCreateInfo : public MessageChannelCreateInfo + { + HostInfo connectionInfo; // Connection information describing how the Server should connect + // to the message bus. 
+ ProtocolFlags servers; // Set of boolean values indicating which servers should be created + // during initialization. + }; + + DD_STATIC_CONST uint32 kQueryStatusTimeoutInMs = 50; + + class DevDriverServer + { + public: + static bool IsConnectionAvailable(const HostInfo& hostInfo, uint32 timeout = kQueryStatusTimeoutInMs); + + explicit DevDriverServer(const AllocCb& allocCb, const ServerCreateInfo& createInfo); + ~DevDriverServer(); + + Result Initialize(); + void Finalize(); + void Destroy(); + + const AllocCb& GetAllocCb() const { return m_allocCb; } + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + // Called by the driver to mark the end of Platform and the start of device initialization. + // Starting with GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION the driver should call the driver control + // functions directly. + void StartDeviceInit(); +#endif + + bool IsConnected() const; + IMsgChannel* GetMessageChannel() const; + + DriverControlProtocol::DriverControlServer* GetDriverControlServer(); + RGPProtocol::RGPServer* GetRGPServer(); + EventProtocol::EventServer* GetEventServer(); + SettingsURIService::SettingsService* GetSettingsService(); + InfoURIService::InfoService* GetInfoService(); + + bool ShouldShowOverlay(); + + private: + Result InitializeProtocols(); + void DestroyProtocols(); + + Result RegisterProtocol(Protocol protocol); + void UnregisterProtocol(Protocol protocol); + void FinalizeProtocol(Protocol protocol); + + IMsgChannel* m_pMsgChannel; + AllocCb m_allocCb; + ServerCreateInfo m_createInfo; + + template + inline Result RegisterProtocol(Args... 
args); + + template + inline ProtocolServerType* GetServer(); + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h index 9cb516d910..ac24de416a 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/gpuopen.h @@ -1,590 +1,590 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include - -#define GPUOPEN_INTERFACE_MAJOR_VERSION 42 - -#define GPUOPEN_INTERFACE_MINOR_VERSION 1 - -#define GPUOPEN_INTERFACE_VERSION ((GPUOPEN_INTERFACE_MAJOR_VERSION << 16) | GPUOPEN_INTERFACE_MINOR_VERSION) - -#define GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION 38 - -#ifndef GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION - static_assert(false, "Client must define GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION."); -#else - static_assert((GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION) && - (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION <= GPUOPEN_INTERFACE_MAJOR_VERSION), - "The specified GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION is not supported."); -#endif - -// Next version number for interface breaking changes -#define DD_UNRELEASED_MAJOR_VERSION 40 - -/* -*********************************************************************************************************************** -*| Version | Change Description | -*| ------- | ---------------------------------------------------------------------------------------------------------| -*| 42.1 | Move Escape Commands to the shared header for access outside of message.h | -*| 42.0 | Updates RGP Protocol to support SPM counters and SE masking. | -*| 41.0 | Updates DriverControlProtocol to allow user to query device clock frequencies for a given | -*| | clock mode without changing the clock mode. | -*| 40.0 | Moves DriverStatus enum out of DriverControlProtocol and into gpuopen.h, and renames several | -*| | DriverControlProtocol functions. | -*| 39.0 | Simplified the LoggingClient interface to remove the internal pending message requirement. | -*| | Removed kInfiniteTimeout and replaced its uses with kLogicFailureTimeout. | -*| | Decoupled RGP trace parameters from trace execution. 
| -*| 38.0 | Added support for specifying hostname in ListenerCreateInfo and renamed enableUWP flag to | -*| | enableKernelTransport. | -*| 37.0 | Added support for Querying ClientInfo from DriverControlProtocol | -*| 36.1 | Removed internal log message queue inside LoggingClient. This improves performance significantly. | -*| 36.0 | Added support for capturing the RGP trace on specific frame or dispatch. | -*| | Added bitfield to control whether driver internal code objects are included in the code object database. | -*| 35.0 | Updated Settings URI enum SettingType to avoid X11 macro name collision. | -*| 34.0 | Updated URI services to define a version number for each service. | -*| 33.0 | Abstracts URIRequestContext into an abstract interface. | -*| 32.0 | Updated RGPClient::EndTrace to support user specified timeout values. This allows tools to support | -*| | long running traces via user controlled cancellation dialogs. | -*| 31.0 | Clean up DevDriverClient and DevDriverServer create info structs. Replace TransportCreateInfo | -*| | struct with MessageChannelCreateInfo and HostInfo structs. | -*| 30.2 | Added support for RGP v6 protocol which supports trace trigger markers. | -*| 30.1 | Add Push transfer support to the transfer protocol. Added PushBlock class, added v2 of the | -*| | TransferProtocol, and did a lot of internal cleanup. Legacy interfaces will be deprecated in a future | -*| | interface version change alongside URI changes. | -*| 30.0 | Remove CloseSession and OrphanSession from the public ISession object interface, and move the | -*| | functionality into the Session class. | -*| 29.0 | Added a ResponseDataFormat enum to the URI protocol to distinguish between binary and text responses. | -*| 28.0 | Formally deprecate legacy KMD client manager support in the Listener. | -*| 27.2 | Updated FindFirstClient to support returning the matching ClientMetadata struct. | -*| 27.1 | Added PipelineDumpsEnabled status flag. 
| -*| 27.0 | Deprecate global client status flags + replace it with client metadata. | -*| 26.0 | Add new trace parameters in the RGP protocol. | -*| 25.0 | Initial refactor of LoggingProtocol. Removes Subcategories, being able to set/clear filter outside of a | -*| | trace, and significantly reduces the complexity that is involved in using it. | -*| 24.0 | Expanded driver initialization concept in driver control protocol. | -*| 23.0 | Modified RGP client API usage pattern to be uniform across protocol versions. | -*| 22.0 | Refactor RGP client interface to support calculating transfer progress. | -*| 21.1 | Added backwards compatible workaround for the session termination bug until we get the fix in mainline. | -*| 21.0 | Enable link disconnection detection for socket based transports. | -*| 20.0 | Added support for specifying the clock mode used during RGP profiling. | -*| 19.0 | Refactor platform thread functions so that they are contained in a class. This is the last part of the | -*| | platform library that needed to be refactored, so future work will be focused on migrating the message | -*| | bus components to use the main platform library again. | -*| 18.0 | Consolidate DevDriver::DebugLevel and DevDriver::Message::DebugLevel into DevDriver::LogLevel. | -*| 17.0 | Rename DD_VERSION_IS_SUPPORTED macro to DD_VERSION_SUPPORTS for conciseness. | -*| 16.1 | Rework session version negotiation to allow clients to support multiple server versions. | -*| 16.0 | Change semantics of CreateProtocolClient to AcquireProtocolClient. This aligns better with | -*| | ReleaseProtoclClient in terms of semantics. | -*| 15.0 | Added support for memory allocator callbacks via AllocCb. | -*| 14.1 | Added DisableTrace call in RGP server to allow drivers to disable future traces if necessary. | -*| 14.0 | Add TraceParameters to the RGP protocol to allow for configuration of trace behavior. 
| -*| 13.0 | Deprecate DevDriverClient::CreateProtocolClient() in favor of typesafe templated version. | -*| 12.0 | Deprecate API features tied to legacy network protocol versions: | -*| | * Replace ConnectToRemoteClient with Connect, which now returns more detailed errors on failure | -*| | * Eliminate Send and ReceiveSessionMessage functions in IMsgChannel and SessionManager | -*| | * Rename AuthenticationFailed to VersionMismatch since it is more semantically accurate | -*| | * Update IMsgChannel::Update so that it takes a default timeout value, get rid of m_receiveTimeoutInMs | -*| 11.5 | Updated server to remove GetVersion() call and pass version into AcceptSession() instead. This allows | -*| | servers to potentially implement backwards compatibility for older client versions. Additionally, | -*| | completely eliminate SessionTermination type in favor of expanding Result type. This allows propagating | -*| | more information on connection failures back to clients, as well as streamlines some code. | -*| 11.4 | Implement per-protocol versioning. Client protocol is sent as part of session request, server decides | -*| | whether or not to accept session both from client and from version. Also rearrange how certain network | -*| | operations work: Syn now stores the initial session ID in the sessionId field of the message, Rst now | -*| | includes a result code, and closing a session now implicitly flushes both the client/server. Rst | -*| | Is also sent on just about every unknown session packet received, allowing faster error detection and | -*| | recovery. Bump network version number | -*| 11.3 | Change ProcessId type from 64bit to 32bit integer and bump network protocol version. | -*| 11.2 | Update the network protocol to give external protocols values from 0-223 and system protocols 224-255. | -*| | Also clean up + deprecate some of the constants associated with protocols. | -*| 11.1 | Force alignment of all network transmitted structs, as well as pad. 
This is a breaking change for the | -*| | network protocol, but is otherwise API compatible. | -*| 11.0 | Deprecate the Protocol::ClientManangement enum, as well as ReadMessageBuffer and SendMessageBuffer in | -*| | message.lib. | -*| 10.0 | Remove callback from MessageChannel to prevent usage that can cause deadlocking. | -*| 9.0 | Formalized support for selective discard of non-session messages based on right in the message. | -*| | Implementation is that the sequence field of a message can be populated with the contents of a | -*| | ClientMetadata struct, which is then used by the receiving message channel to determine if it should | -*| | respond. Decision is based on whether or not the metadata matches the metadata of the receiving client. | -*| 8.0 | Added support for default settings values in the settings protocol. Removed support for min and max | -*| | settings values since the scripts don't actually support those anyways. | -*| 7.0 | Added a Finalize function to DevDriverServer and all protocol server objects. This function now handles | -*| | the wait on start functionality for drivers internally. Finalize should now be called instead of the old | -*| | wait on start logic in client drivers. | -*| 6.0 | Update client protocol management so that DevDriverClient no longer caches a single instance of each | -*| | client protocol, and add ability for clients to directly create more than one client protocol instance. | -*| | Additionally, make changes to underlying message channel/transport API that is not backwards compatible, | -*| | as well as rename QueryClientInfoResponse to ClientInfoResponse and QueryClientInfoResponsePayload to | -*| | ClientInfoStruct. | -*| 5.0 | Update network protocol to allow specifying status flags at registration time, and add system message. | -*| | to indicate when a driver has been halted. Additionally, this changes the format of the client | -*| | registration packets so as to better detect version mismatch. 
It also fixes the ClientManangement typo. | -*| 4.0 | Refactor interface so as to better delineate between system protcols/client protocols, as well as add | -*| | ability to query protocol availability. Requires version bump, so also formally deprecated | -*| | Result::Timeout and ClientStatusFlags::ProfilingEnabled, as well as moved entire SessionProtocol | -*| | namespace out of the public headers. | -*| 3.1 | Introduce kNumberClientProtocols to replace usage of Protocol::Count | -*| 3.0 | Rename SettingsProtocol::SettingType::Bool to Boolean to avoid conflict with Xlib macro. Additionally | -*| | formally deprecate Result::Timeout. | -*| 2.2 | Added None (0) to ClientStatusFlags enum. | -*| 2.1 | Added kNamedPipeName to global namespace. | -*| 2.0 | Added functionality for enabling and disabling traces in RGPServer. Traces must now be explicitly | -*| | enabled before remote trace requests will succeed. | -*| 1.2 | Added AbortTrace() function to RGPServer. | -*| 1.1 | Added support for RGP protocol. | -*| 1.0 | Initial versioned release. 
| -*********************************************************************************************************************** -*/ - -#define GPUOPEN_RGP_SPM_COUNTERS_VERSION 42 -#define GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION 41 -#define GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION 40 -#define GPUOPEN_DECOUPLED_RGP_PARAMETERS_VERSION 39 -#define GPUOPEN_SIMPLER_LOGGING_VERSION 39 -#define GPUOPEN_LISTENER_HOSTNAME_VERSION 38 -#define GPUOPEN_SETTINGS_URI_LINUX_BUILD 35 -#define GPUOPEN_VERSIONED_URI_SERVICES_VERSION 34 -#define GPUOPEN_URIINTERFACE_CLEANUP_VERSION 33 -#define GPUOPEN_LONG_RGP_TRACES_VERSION 32 -#define GPUOPEN_CREATE_INFO_CLEANUP_VERSION 31 -#define GPUOPEN_SESSION_INTERFACE_CLEANUP_VERSION 30 -#define GPUOPEN_URI_RESPONSE_FORMATS_VERSION 29 -#define GPUOPEN_DEPRECATE_LEGACY_KMD_VERSION 28 -#define GPUOPEN_DISTRIBUTED_STATUS_FLAGS_VERSION 27 -#define GPUOPEN_RGP_TRACE_PARAMETERS_V3_VERSION 26 -#define GPUOPEN_LOGGING_SIMPLIFICATION_VERSION 25 -#define GPUOPEN_DRIVERCONTROL_INITIALIZATION_VERSION 24 -#define GPUOPEN_RGP_UNIFORM_API_VERSION 23 -#define GPUOPEN_RGP_PROGRESS_VERSION 22 -#define GPUOPEN_KEEPALIVE_VERSION 21 -#define GPUOPEN_PROFILING_CLOCK_MODES_VERSION 20 -#define GPUOPEN_THREAD_REFACTOR_VERSION 19 -#define GPUOPEN_LOGLEVEL_CLEANUP_VERSION 18 -#define GPUOPEN_RENAME_MACRO_VERSION 17 -#define GPUOPEN_PROTOCOL_CLIENT_REUSE_VERSION 16 -#define GPUOPEN_MEMORY_ALLOCATORS_VERSION 15 -#define GPUOPEN_RGP_TRACE_PARAMETERS_VERSION 14 -#define GPUOPEN_DEPRECATE_CREATEPROTOCOLCLIENT_VERSION 13 -#define GPUOPEN_DEPRECATE_LEGACY_NETAPI_VERSION 12 -#define GPUOPEN_POST_GDC_CLEANUP_VERSION 11 -#define GPUOPEN_DEPRECATE_EXTERNAL_CALLBACK_VERSION 10 -#define GPUOPEN_SELECTIVE_RESPOND_VERSION 9 -#define GPUOPEN_DEFAULT_SETTINGS_VERSION 8 -#define GPUOPEN_SERVER_FINALIZE_VERSION 7 -#define GPUOPEN_DEPRECATE_LEGACY_VERSION 6 -#define GPUOPEN_CLIENT_REGISTRATION_VERSION 5 -#define GPUOPEN_PROTOCOL_CLEANUP_VERSION 4 -#define GPUOPEN_LINUX_BUILD_VERSION 3 
-#define GPUOPEN_EXPLICIT_ENABLE_RGP_VERSION 2 -#define GPUOPEN_INITIAL_VERSION 1 - -// This will be properly defined when RMV 1.1 features are complete, defining it now allows -// clients to code to the interface ahead of all of the work being complete. -#define GPUOPEN_RMV_1_1_VERSION 0xFFFF - -#define DD_VERSION_SUPPORTS(x) (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= x) - -namespace DevDriver -{ - typedef uint16_t ClientId; - typedef uint32_t SessionId; - typedef uint8_t MessageCode; - typedef uint16_t WindowSize; - typedef uint64_t Sequence; - typedef uint16_t Version; - typedef uint16_t StatusFlags; - -#if DD_VERSION_SUPPORTS(GPUOPEN_SIMPLER_LOGGING_VERSION) - // A common timeout in milliseconds for components to use when they do not expect timeout to fail. - // If an operation that uses this timeout returns Result::NotReady, consider it a fatal error. - DD_STATIC_CONST uint32 kLogicFailureTimeout = 1000; -#else - DD_STATIC_CONST uint32 kInfiniteTimeout = ~(0u); - DD_STATIC_CONST uint32 kLogicFailureTimeout = kInfiniteTimeout; -#endif - DD_STATIC_CONST uint32 kNoWait = (0u); - - //////////////////////////// - // Driver states - enum struct DriverStatus : uint32 - { - Running = 0, - Paused, - HaltedOnDeviceInit, - EarlyDeviceInit, - LateDeviceInit, - PlatformInit, - HaltedOnPlatformInit, - HaltedPostDeviceInit, - Count - }; - - //////////////////////////// - // Client status codes - enum struct ClientStatusFlags : StatusFlags - { - None = 0, - DeveloperModeEnabled = (1 << 0), - DeviceHaltOnConnect = (1 << 1), - GpuCrashDumpsEnabled = (1 << 2), - PipelineDumpsEnabled = (1 << 3), - PlatformHaltOnConnect = (1 << 4), - DriverInitializer = (1 << 5) - }; - - DD_CHECK_SIZE(ClientId, 2); - DD_STATIC_CONST int16 kRouterPrefixWidth = 3; - DD_STATIC_CONST int16 kRouterPrefixShift = (int16)(16 - kRouterPrefixWidth); - DD_STATIC_CONST ClientId kClientIdMask = (1 << kRouterPrefixShift) - 1; - DD_STATIC_CONST ClientId kRouterPrefixMask = static_cast(~(kClientIdMask)); - - 
union ProtocolFlags - { - struct DD_ALIGNAS(4) - { - // TODO: Replace logging, settings, and gpuCrashDump with "reserved" once all driver usage is removed. - uint32 logging : 1; - uint32 settings : 1; - uint32 driverControl : 1; - uint32 rgp : 1; - uint32 etw : 1; - uint32 gpuCrashDump : 1; - uint32 event : 1; - uint32 reserved : 25; - }; - uint32 value; - }; - - DD_CHECK_SIZE(ProtocolFlags, 4); - - //////////////////////////// - // Component definitions - enum struct Component : uint8 - { - Unknown = 0, - Server, - Tool, - Driver, - Count - }; - - struct DD_ALIGNAS(4) ClientMetadata - { - ProtocolFlags protocols; - Component clientType; - uint8 reserved; - StatusFlags status; - - // For System messages, which are not session-based, we alias the sequence field as ClientMetadata. This constructor - // is provided to help unpack the raw 64-bit sequence field into a ClientMetadata struct without needing to type-cast - explicit ClientMetadata(uint64 value) - { - // If we're going to alias as a 64-bit value, make sure the struct is still just 64-bits) - static_assert(sizeof(uint64) == sizeof(ClientMetadata), - "Size of ClientMetadata is no longer 64-bits, alias constructor needs updating"); - - // Bits 0-31 are the ProtocolFlags - protocols.value = static_cast(value & 0xFFFF); - - // Bits 32-39 are the Component - clientType = static_cast((value & 0xFF00000000) >> 32); - - // Bits 40-47 are reserved, ignore them and zero initialize - reserved = 0; - - // Bits 48-63 are the StatusFlags - status = static_cast((value & 0xFFFF000000000000) >> 48); - } - - // Default constructor, default initialize everything - ClientMetadata() = default; - - // Returns true if all values are default values - bool IsDefault() const - { - return ((protocols.value == 0) && (clientType == Component::Unknown) && (status == 0)); - } - - // Test if all non-zero fields in the ClientMetadata value are contained in the function parameter - bool Matches(const ClientMetadata &right) const - { - bool 
result = true; - - // The Matches function treats this struct as a filter, so a ClientMetadata with all default (zero) values - // by definition always matches. - if (IsDefault() == false) - { - // Component is an enum, so the comparison needs to be equality - const bool clientTypeMatches = - (clientType != Component::Unknown) - ? (clientType == right.clientType) - : true; - - // ProtocolFlags is a bit field, so we can do a bitwise comparison - const bool protocolMatches = - (protocols.value != 0) - ? (protocols.value & right.protocols.value) == protocols.value - : true; - // StatusFlags is a bit field, so we can do a bitwise comparison - const bool statusMatches = - (status != 0) - ? (status & right.status) == status - : true; - result = clientTypeMatches & protocolMatches & statusMatches; - } - - return result; - } - - // Test if any non-zero fields in the ClientMetadata value are contained in the function parameter - bool MatchesAny(const ClientMetadata &right) const - { - bool result = true; - - // The MatchesAny function treats this struct as a filter, so a ClientMetadata with all default (zero) values - // by definition always matches. 
- if (IsDefault() == false) - { - // Component is an enum, so the comparison needs to be equality - const bool clientTypeMatches = (clientType == right.clientType); - // ProtocolFlags is a bit field, so we can do a bitwise comparison - const bool protocolMatches = (protocols.value & right.protocols.value) != 0; - // StatusFlags is a bit field, so we can do a bitwise comparison - const bool statusMatches = (status & right.status) != 0; - result = clientTypeMatches | protocolMatches | statusMatches; - } - - return result; - } - }; - - DD_CHECK_SIZE(ClientMetadata, 8); - - //////////////////////////// - // Protocol definitions - enum struct Protocol : uint8 - { - DriverControl = 0, - Reserved0, - Reserved1, - RGP, - ETW, - Reserved2, - Event, - DefinedProtocolCount, - - // System enumerations - MaxUserProtocol = 223, - /* RESERVED FOR SYSTEM USE */ - Transfer = 251, - URI = 252, - Session = 253, - ClientManagement = 254, - System = 255, - }; - - // this gives you the number of pre-defined user protocols that exist - DD_STATIC_CONST uint32 kNumberClientProtocols = static_cast(Protocol::DefinedProtocolCount); - - // this gives you the maximum number of client protocols you can reserve. 
- DD_STATIC_CONST uint32 kMaxClientProtocolId = static_cast(Protocol::MaxUserProtocol); - - static_assert(kNumberClientProtocols <= (kMaxClientProtocolId + 1), "Invalid protocol definitions specified"); - - /////////////////////// - // General definitions - DD_STATIC_CONST uint32 kMessageVersion = 1011; - - // Max string size for names and messages - DD_STATIC_CONST Size kMaxStringLength = 128; - - // Broadcast client ID - DD_STATIC_CONST ClientId kBroadcastClientId = 0; - - // Invalid Session ID - DD_STATIC_CONST SessionId kInvalidSessionId = 0; - - // Default network port number - DD_STATIC_CONST uint16_t kDefaultNetworkPort = 27300; - - // Transport type enumeration - enum class TransportType : uint32 - { - Local = 0, - Remote, -#if defined(DD_PLATFORM_WINDOWS_UM) - MessageBus, -#endif - }; - - // Struct used to designate a transport type, port number, and hostname - struct HostInfo - { - TransportType type; // Transport type, as defined above - uint16_t port; // Port number if applicable - const char* pHostname; // Host address, address, or path - }; - - // Default local host information - DD_STATIC_CONST HostInfo kDefaultLocalHost = - { - TransportType::Remote, - kDefaultNetworkPort, - "localhost" - }; - - // Default named pipe information - DD_STATIC_CONST HostInfo kDefaultNamedPipe = - { - TransportType::Local, - 0, - nullptr - }; - -#if defined(DD_PLATFORM_WINDOWS_UM) - // Default message bus information - DD_STATIC_CONST HostInfo kMessageBus = - { - TransportType::MessageBus, - 0, - nullptr - }; -#endif - - //////////////////////////// - // Common definition of a message header - // - // todo: better packing of these values - // - payloadSize needs to be moved to where windowSize is currently - // - windowSize, sessionId, and sequence need to be moved into protocol specific payloads - // - minimum alignment could then be reduced to 2 bytes, and min packet size would be 8 bytes - // - downside is that pretty much every protocol would need to define some 
extra data - - DD_NETWORK_STRUCT(MessageHeader, 8) - { - // source and destination client ids - ClientId srcClientId; // 0 - 15 - ClientId dstClientId; // 16 - 31 - - // protocol and command - Protocol protocolId; // 31 - 38 - MessageCode messageId; // 39 - 47 - WindowSize windowSize; // 48 - 63 - - // payload size + current session ID - Size payloadSize; // 64 - 91 - SessionId sessionId; // 92 - 127 - - // sequence number when using a session - Sequence sequence; // 128 - 191 - }; - - DD_CHECK_SIZE(MessageHeader, 24); - - DD_STATIC_CONST Size kMaxMessageSizeInBytes = 1408; - DD_STATIC_CONST Size kMaxPayloadSizeInBytes = (kMaxMessageSizeInBytes - sizeof(MessageHeader)); - - DD_NETWORK_STRUCT(MessageBuffer, 8) - { - MessageHeader header; - char payload[kMaxPayloadSizeInBytes]; - }; - - DD_CHECK_SIZE(MessageBuffer, sizeof(MessageHeader) + kMaxPayloadSizeInBytes); - - // Helper function used to validate message buffers that arrive from an external source - // Returns Success if the message buffer is valid and Error otherwise. - inline Result ValidateMessageBuffer(const void* pMsgBuffer, size_t msgBufferSize) - { - Result result = Result::Error; - - // Ensure that we've been passed valid parameters - if ((pMsgBuffer != nullptr) && (msgBufferSize > 0)) - { - // A valid message buffer must be no larger than the full size message buffer structure - // and it must also be large enough to contain a valid header. - if ((msgBufferSize <= sizeof(MessageBuffer)) && (msgBufferSize >= sizeof(MessageHeader))) - { - // Calculate the total size of the message from the data encoded in the buffer. 
- const MessageHeader* pHeader = reinterpret_cast(pMsgBuffer); - const size_t encodedMessageSize = (sizeof(MessageHeader) + pHeader->payloadSize); - - // The encoded message size should match our expected size exactly - if (encodedMessageSize == msgBufferSize) - { - result = Result::Success; - } - } - } - else - { - result = Result::InvalidParameter; - } - - return result; - } - - // tripwire - this intentionally will break if the message version changes. Since these are breaking changes already, we need to address - // this problem when it happens. - static_assert(kMessageVersion == 1011, "ClientInfoStruct needs to be updated so that clientName is long enough to support a full path"); - // todo: shorten clientDescription to 64bytes and make clientName 320bytes to support full path - DD_NETWORK_STRUCT(ClientInfoStruct, 4) - { - char clientName[kMaxStringLength]; - char clientDescription[kMaxStringLength]; - // reserve 128bytes in case we need another string in the future - char reserved[kMaxStringLength]; - ClientMetadata metadata; - ProcessId processId; - // pad this out to 512 bytes for future expansion - char padding[116]; - }; - - DD_CHECK_SIZE(ClientInfoStruct, 512); - - /////////////////////// - // GPU Open Message codes - enum struct EscapeCommand : uint32 - { - Unknown = 0, - QueryStatus, // Will be deprecated in a future change - RegisterClient, - UnregisterClient, - RegisterExternalClient, // Will be deprecated in a future change - UnregisterExternalClient, // Will be deprecated in a future change - UpdateClientStatus, // Will be deprecated in a future change - QueryCapabilities, - EnableDeveloperMode, - DisableDeveloperMode, - QueryDeveloperModeStatus, - RegisterRouter, - UnregisterRouter, - AmdLogEvent, - Count - }; -} +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include + +#define GPUOPEN_INTERFACE_MAJOR_VERSION 42 + +#define GPUOPEN_INTERFACE_MINOR_VERSION 1 + +#define GPUOPEN_INTERFACE_VERSION ((GPUOPEN_INTERFACE_MAJOR_VERSION << 16) | GPUOPEN_INTERFACE_MINOR_VERSION) + +#define GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION 38 + +#ifndef GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION + static_assert(false, "Client must define GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION."); +#else + static_assert((GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= GPUOPEN_MINIMUM_INTERFACE_MAJOR_VERSION) && + (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION <= GPUOPEN_INTERFACE_MAJOR_VERSION), + "The specified GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION is not supported."); +#endif + +// Next version number for interface breaking changes +#define DD_UNRELEASED_MAJOR_VERSION 40 + +/* +*********************************************************************************************************************** +*| Version | Change Description | +*| ------- | ---------------------------------------------------------------------------------------------------------| +*| 42.1 | Move Escape Commands to the shared header for access outside of message.h | +*| 42.0 | Updates RGP Protocol to support SPM counters and SE masking. | +*| 41.0 | Updates DriverControlProtocol to allow user to query device clock frequencies for a given | +*| | clock mode without changing the clock mode. | +*| 40.0 | Moves DriverStatus enum out of DriverControlProtocol and into gpuopen.h, and renames several | +*| | DriverControlProtocol functions. | +*| 39.0 | Simplified the LoggingClient interface to remove the internal pending message requirement. | +*| | Removed kInfiniteTimeout and replaced its uses with kLogicFailureTimeout. | +*| | Decoupled RGP trace parameters from trace execution. 
| +*| 38.0 | Added support for specifying hostname in ListenerCreateInfo and renamed enableUWP flag to | +*| | enableKernelTransport. | +*| 37.0 | Added support for Querying ClientInfo from DriverControlProtocol | +*| 36.1 | Removed internal log message queue inside LoggingClient. This improves performance significantly. | +*| 36.0 | Added support for capturing the RGP trace on specific frame or dispatch. | +*| | Added bitfield to control whether driver internal code objects are included in the code object database. | +*| 35.0 | Updated Settings URI enum SettingType to avoid X11 macro name collision. | +*| 34.0 | Updated URI services to define a version number for each service. | +*| 33.0 | Abstracts URIRequestContext into an abstract interface. | +*| 32.0 | Updated RGPClient::EndTrace to support user specified timeout values. This allows tools to support | +*| | long running traces via user controlled cancellation dialogs. | +*| 31.0 | Clean up DevDriverClient and DevDriverServer create info structs. Replace TransportCreateInfo | +*| | struct with MessageChannelCreateInfo and HostInfo structs. | +*| 30.2 | Added support for RGP v6 protocol which supports trace trigger markers. | +*| 30.1 | Add Push transfer support to the transfer protocol. Added PushBlock class, added v2 of the | +*| | TransferProtocol, and did a lot of internal cleanup. Legacy interfaces will be deprecated in a future | +*| | interface version change alongside URI changes. | +*| 30.0 | Remove CloseSession and OrphanSession from the public ISession object interface, and move the | +*| | functionality into the Session class. | +*| 29.0 | Added a ResponseDataFormat enum to the URI protocol to distinguish between binary and text responses. | +*| 28.0 | Formally deprecate legacy KMD client manager support in the Listener. | +*| 27.2 | Updated FindFirstClient to support returning the matching ClientMetadata struct. | +*| 27.1 | Added PipelineDumpsEnabled status flag. 
| +*| 27.0 | Deprecate global client status flags + replace it with client metadata. | +*| 26.0 | Add new trace parameters in the RGP protocol. | +*| 25.0 | Initial refactor of LoggingProtocol. Removes Subcategories, being able to set/clear filter outside of a | +*| | trace, and significantly reduces the complexity that is involved in using it. | +*| 24.0 | Expanded driver initialization concept in driver control protocol. | +*| 23.0 | Modified RGP client API usage pattern to be uniform across protocol versions. | +*| 22.0 | Refactor RGP client interface to support calculating transfer progress. | +*| 21.1 | Added backwards compatible workaround for the session termination bug until we get the fix in mainline. | +*| 21.0 | Enable link disconnection detection for socket based transports. | +*| 20.0 | Added support for specifying the clock mode used during RGP profiling. | +*| 19.0 | Refactor platform thread functions so that they are contained in a class. This is the last part of the | +*| | platform library that needed to be refactored, so future work will be focused on migrating the message | +*| | bus components to use the main platform library again. | +*| 18.0 | Consolidate DevDriver::DebugLevel and DevDriver::Message::DebugLevel into DevDriver::LogLevel. | +*| 17.0 | Rename DD_VERSION_IS_SUPPORTED macro to DD_VERSION_SUPPORTS for conciseness. | +*| 16.1 | Rework session version negotiation to allow clients to support multiple server versions. | +*| 16.0 | Change semantics of CreateProtocolClient to AcquireProtocolClient. This aligns better with | +*| | ReleaseProtoclClient in terms of semantics. | +*| 15.0 | Added support for memory allocator callbacks via AllocCb. | +*| 14.1 | Added DisableTrace call in RGP server to allow drivers to disable future traces if necessary. | +*| 14.0 | Add TraceParameters to the RGP protocol to allow for configuration of trace behavior. 
| +*| 13.0 | Deprecate DevDriverClient::CreateProtocolClient() in favor of typesafe templated version. | +*| 12.0 | Deprecate API features tied to legacy network protocol versions: | +*| | * Replace ConnectToRemoteClient with Connect, which now returns more detailed errors on failure | +*| | * Eliminate Send and ReceiveSessionMessage functions in IMsgChannel and SessionManager | +*| | * Rename AuthenticationFailed to VersionMismatch since it is more semantically accurate | +*| | * Update IMsgChannel::Update so that it takes a default timeout value, get rid of m_receiveTimeoutInMs | +*| 11.5 | Updated server to remove GetVersion() call and pass version into AcceptSession() instead. This allows | +*| | servers to potentially implement backwards compatibility for older client versions. Additionally, | +*| | completely eliminate SessionTermination type in favor of expanding Result type. This allows propagating | +*| | more information on connection failures back to clients, as well as streamlines some code. | +*| 11.4 | Implement per-protocol versioning. Client protocol is sent as part of session request, server decides | +*| | whether or not to accept session both from client and from version. Also rearrange how certain network | +*| | operations work: Syn now stores the initial session ID in the sessionId field of the message, Rst now | +*| | includes a result code, and closing a session now implicitly flushes both the client/server. Rst | +*| | Is also sent on just about every unknown session packet received, allowing faster error detection and | +*| | recovery. Bump network version number | +*| 11.3 | Change ProcessId type from 64bit to 32bit integer and bump network protocol version. | +*| 11.2 | Update the network protocol to give external protocols values from 0-223 and system protocols 224-255. | +*| | Also clean up + deprecate some of the constants associated with protocols. | +*| 11.1 | Force alignment of all network transmitted structs, as well as pad. 
This is a breaking change for the | +*| | network protocol, but is otherwise API compatible. | +*| 11.0 | Deprecate the Protocol::ClientManangement enum, as well as ReadMessageBuffer and SendMessageBuffer in | +*| | message.lib. | +*| 10.0 | Remove callback from MessageChannel to prevent usage that can cause deadlocking. | +*| 9.0 | Formalized support for selective discard of non-session messages based on right in the message. | +*| | Implementation is that the sequence field of a message can be populated with the contents of a | +*| | ClientMetadata struct, which is then used by the receiving message channel to determine if it should | +*| | respond. Decision is based on whether or not the metadata matches the metadata of the receiving client. | +*| 8.0 | Added support for default settings values in the settings protocol. Removed support for min and max | +*| | settings values since the scripts don't actually support those anyways. | +*| 7.0 | Added a Finalize function to DevDriverServer and all protocol server objects. This function now handles | +*| | the wait on start functionality for drivers internally. Finalize should now be called instead of the old | +*| | wait on start logic in client drivers. | +*| 6.0 | Update client protocol management so that DevDriverClient no longer caches a single instance of each | +*| | client protocol, and add ability for clients to directly create more than one client protocol instance. | +*| | Additionally, make changes to underlying message channel/transport API that is not backwards compatible, | +*| | as well as rename QueryClientInfoResponse to ClientInfoResponse and QueryClientInfoResponsePayload to | +*| | ClientInfoStruct. | +*| 5.0 | Update network protocol to allow specifying status flags at registration time, and add system message. | +*| | to indicate when a driver has been halted. Additionally, this changes the format of the client | +*| | registration packets so as to better detect version mismatch. 
It also fixes the ClientManangement typo. | +*| 4.0 | Refactor interface so as to better delineate between system protcols/client protocols, as well as add | +*| | ability to query protocol availability. Requires version bump, so also formally deprecated | +*| | Result::Timeout and ClientStatusFlags::ProfilingEnabled, as well as moved entire SessionProtocol | +*| | namespace out of the public headers. | +*| 3.1 | Introduce kNumberClientProtocols to replace usage of Protocol::Count | +*| 3.0 | Rename SettingsProtocol::SettingType::Bool to Boolean to avoid conflict with Xlib macro. Additionally | +*| | formally deprecate Result::Timeout. | +*| 2.2 | Added None (0) to ClientStatusFlags enum. | +*| 2.1 | Added kNamedPipeName to global namespace. | +*| 2.0 | Added functionality for enabling and disabling traces in RGPServer. Traces must now be explicitly | +*| | enabled before remote trace requests will succeed. | +*| 1.2 | Added AbortTrace() function to RGPServer. | +*| 1.1 | Added support for RGP protocol. | +*| 1.0 | Initial versioned release. 
| +*********************************************************************************************************************** +*/ + +#define GPUOPEN_RGP_SPM_COUNTERS_VERSION 42 +#define GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION 41 +#define GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION 40 +#define GPUOPEN_DECOUPLED_RGP_PARAMETERS_VERSION 39 +#define GPUOPEN_SIMPLER_LOGGING_VERSION 39 +#define GPUOPEN_LISTENER_HOSTNAME_VERSION 38 +#define GPUOPEN_SETTINGS_URI_LINUX_BUILD 35 +#define GPUOPEN_VERSIONED_URI_SERVICES_VERSION 34 +#define GPUOPEN_URIINTERFACE_CLEANUP_VERSION 33 +#define GPUOPEN_LONG_RGP_TRACES_VERSION 32 +#define GPUOPEN_CREATE_INFO_CLEANUP_VERSION 31 +#define GPUOPEN_SESSION_INTERFACE_CLEANUP_VERSION 30 +#define GPUOPEN_URI_RESPONSE_FORMATS_VERSION 29 +#define GPUOPEN_DEPRECATE_LEGACY_KMD_VERSION 28 +#define GPUOPEN_DISTRIBUTED_STATUS_FLAGS_VERSION 27 +#define GPUOPEN_RGP_TRACE_PARAMETERS_V3_VERSION 26 +#define GPUOPEN_LOGGING_SIMPLIFICATION_VERSION 25 +#define GPUOPEN_DRIVERCONTROL_INITIALIZATION_VERSION 24 +#define GPUOPEN_RGP_UNIFORM_API_VERSION 23 +#define GPUOPEN_RGP_PROGRESS_VERSION 22 +#define GPUOPEN_KEEPALIVE_VERSION 21 +#define GPUOPEN_PROFILING_CLOCK_MODES_VERSION 20 +#define GPUOPEN_THREAD_REFACTOR_VERSION 19 +#define GPUOPEN_LOGLEVEL_CLEANUP_VERSION 18 +#define GPUOPEN_RENAME_MACRO_VERSION 17 +#define GPUOPEN_PROTOCOL_CLIENT_REUSE_VERSION 16 +#define GPUOPEN_MEMORY_ALLOCATORS_VERSION 15 +#define GPUOPEN_RGP_TRACE_PARAMETERS_VERSION 14 +#define GPUOPEN_DEPRECATE_CREATEPROTOCOLCLIENT_VERSION 13 +#define GPUOPEN_DEPRECATE_LEGACY_NETAPI_VERSION 12 +#define GPUOPEN_POST_GDC_CLEANUP_VERSION 11 +#define GPUOPEN_DEPRECATE_EXTERNAL_CALLBACK_VERSION 10 +#define GPUOPEN_SELECTIVE_RESPOND_VERSION 9 +#define GPUOPEN_DEFAULT_SETTINGS_VERSION 8 +#define GPUOPEN_SERVER_FINALIZE_VERSION 7 +#define GPUOPEN_DEPRECATE_LEGACY_VERSION 6 +#define GPUOPEN_CLIENT_REGISTRATION_VERSION 5 +#define GPUOPEN_PROTOCOL_CLEANUP_VERSION 4 +#define GPUOPEN_LINUX_BUILD_VERSION 3 
+#define GPUOPEN_EXPLICIT_ENABLE_RGP_VERSION 2 +#define GPUOPEN_INITIAL_VERSION 1 + +// This will be properly defined when RMV 1.1 features are complete, defining it now allows +// clients to code to the interface ahead of all of the work being complete. +#define GPUOPEN_RMV_1_1_VERSION 0xFFFF + +#define DD_VERSION_SUPPORTS(x) (GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION >= x) + +namespace DevDriver +{ + typedef uint16_t ClientId; + typedef uint32_t SessionId; + typedef uint8_t MessageCode; + typedef uint16_t WindowSize; + typedef uint64_t Sequence; + typedef uint16_t Version; + typedef uint16_t StatusFlags; + +#if DD_VERSION_SUPPORTS(GPUOPEN_SIMPLER_LOGGING_VERSION) + // A common timeout in milliseconds for components to use when they do not expect timeout to fail. + // If an operation that uses this timeout returns Result::NotReady, consider it a fatal error. + DD_STATIC_CONST uint32 kLogicFailureTimeout = 1000; +#else + DD_STATIC_CONST uint32 kInfiniteTimeout = ~(0u); + DD_STATIC_CONST uint32 kLogicFailureTimeout = kInfiniteTimeout; +#endif + DD_STATIC_CONST uint32 kNoWait = (0u); + + //////////////////////////// + // Driver states + enum struct DriverStatus : uint32 + { + Running = 0, + Paused, + HaltedOnDeviceInit, + EarlyDeviceInit, + LateDeviceInit, + PlatformInit, + HaltedOnPlatformInit, + HaltedPostDeviceInit, + Count + }; + + //////////////////////////// + // Client status codes + enum struct ClientStatusFlags : StatusFlags + { + None = 0, + DeveloperModeEnabled = (1 << 0), + DeviceHaltOnConnect = (1 << 1), + GpuCrashDumpsEnabled = (1 << 2), + PipelineDumpsEnabled = (1 << 3), + PlatformHaltOnConnect = (1 << 4), + DriverInitializer = (1 << 5) + }; + + DD_CHECK_SIZE(ClientId, 2); + DD_STATIC_CONST int16 kRouterPrefixWidth = 3; + DD_STATIC_CONST int16 kRouterPrefixShift = (int16)(16 - kRouterPrefixWidth); + DD_STATIC_CONST ClientId kClientIdMask = (1 << kRouterPrefixShift) - 1; + DD_STATIC_CONST ClientId kRouterPrefixMask = static_cast(~(kClientIdMask)); + + 
union ProtocolFlags + { + struct DD_ALIGNAS(4) + { + // TODO: Replace logging, settings, and gpuCrashDump with "reserved" once all driver usage is removed. + uint32 logging : 1; + uint32 settings : 1; + uint32 driverControl : 1; + uint32 rgp : 1; + uint32 etw : 1; + uint32 gpuCrashDump : 1; + uint32 event : 1; + uint32 reserved : 25; + }; + uint32 value; + }; + + DD_CHECK_SIZE(ProtocolFlags, 4); + + //////////////////////////// + // Component definitions + enum struct Component : uint8 + { + Unknown = 0, + Server, + Tool, + Driver, + Count + }; + + struct DD_ALIGNAS(4) ClientMetadata + { + ProtocolFlags protocols; + Component clientType; + uint8 reserved; + StatusFlags status; + + // For System messages, which are not session-based, we alias the sequence field as ClientMetadata. This constructor + // is provided to help unpack the raw 64-bit sequence field into a ClientMetadata struct without needing to type-cast + explicit ClientMetadata(uint64 value) + { + // If we're going to alias as a 64-bit value, make sure the struct is still just 64-bits) + static_assert(sizeof(uint64) == sizeof(ClientMetadata), + "Size of ClientMetadata is no longer 64-bits, alias constructor needs updating"); + + // Bits 0-31 are the ProtocolFlags + protocols.value = static_cast(value & 0xFFFF); + + // Bits 32-39 are the Component + clientType = static_cast((value & 0xFF00000000) >> 32); + + // Bits 40-47 are reserved, ignore them and zero initialize + reserved = 0; + + // Bits 48-63 are the StatusFlags + status = static_cast((value & 0xFFFF000000000000) >> 48); + } + + // Default constructor, default initialize everything + ClientMetadata() = default; + + // Returns true if all values are default values + bool IsDefault() const + { + return ((protocols.value == 0) && (clientType == Component::Unknown) && (status == 0)); + } + + // Test if all non-zero fields in the ClientMetadata value are contained in the function parameter + bool Matches(const ClientMetadata &right) const + { + bool 
result = true; + + // The Matches function treats this struct as a filter, so a ClientMetadata with all default (zero) values + // by definition always matches. + if (IsDefault() == false) + { + // Component is an enum, so the comparison needs to be equality + const bool clientTypeMatches = + (clientType != Component::Unknown) + ? (clientType == right.clientType) + : true; + + // ProtocolFlags is a bit field, so we can do a bitwise comparison + const bool protocolMatches = + (protocols.value != 0) + ? (protocols.value & right.protocols.value) == protocols.value + : true; + // StatusFlags is a bit field, so we can do a bitwise comparison + const bool statusMatches = + (status != 0) + ? (status & right.status) == status + : true; + result = clientTypeMatches & protocolMatches & statusMatches; + } + + return result; + } + + // Test if any non-zero fields in the ClientMetadata value are contained in the function parameter + bool MatchesAny(const ClientMetadata &right) const + { + bool result = true; + + // The MatchesAny function treats this struct as a filter, so a ClientMetadata with all default (zero) values + // by definition always matches. 
+ if (IsDefault() == false) + { + // Component is an enum, so the comparison needs to be equality + const bool clientTypeMatches = (clientType == right.clientType); + // ProtocolFlags is a bit field, so we can do a bitwise comparison + const bool protocolMatches = (protocols.value & right.protocols.value) != 0; + // StatusFlags is a bit field, so we can do a bitwise comparison + const bool statusMatches = (status & right.status) != 0; + result = clientTypeMatches | protocolMatches | statusMatches; + } + + return result; + } + }; + + DD_CHECK_SIZE(ClientMetadata, 8); + + //////////////////////////// + // Protocol definitions + enum struct Protocol : uint8 + { + DriverControl = 0, + Reserved0, + Reserved1, + RGP, + ETW, + Reserved2, + Event, + DefinedProtocolCount, + + // System enumerations + MaxUserProtocol = 223, + /* RESERVED FOR SYSTEM USE */ + Transfer = 251, + URI = 252, + Session = 253, + ClientManagement = 254, + System = 255, + }; + + // this gives you the number of pre-defined user protocols that exist + DD_STATIC_CONST uint32 kNumberClientProtocols = static_cast(Protocol::DefinedProtocolCount); + + // this gives you the maximum number of client protocols you can reserve. 
+ DD_STATIC_CONST uint32 kMaxClientProtocolId = static_cast(Protocol::MaxUserProtocol); + + static_assert(kNumberClientProtocols <= (kMaxClientProtocolId + 1), "Invalid protocol definitions specified"); + + /////////////////////// + // General definitions + DD_STATIC_CONST uint32 kMessageVersion = 1011; + + // Max string size for names and messages + DD_STATIC_CONST Size kMaxStringLength = 128; + + // Broadcast client ID + DD_STATIC_CONST ClientId kBroadcastClientId = 0; + + // Invalid Session ID + DD_STATIC_CONST SessionId kInvalidSessionId = 0; + + // Default network port number + DD_STATIC_CONST uint16_t kDefaultNetworkPort = 27300; + + // Transport type enumeration + enum class TransportType : uint32 + { + Local = 0, + Remote, +#if defined(DD_PLATFORM_WINDOWS_UM) + MessageBus, +#endif + }; + + // Struct used to designate a transport type, port number, and hostname + struct HostInfo + { + TransportType type; // Transport type, as defined above + uint16_t port; // Port number if applicable + const char* pHostname; // Host address, address, or path + }; + + // Default local host information + DD_STATIC_CONST HostInfo kDefaultLocalHost = + { + TransportType::Remote, + kDefaultNetworkPort, + "localhost" + }; + + // Default named pipe information + DD_STATIC_CONST HostInfo kDefaultNamedPipe = + { + TransportType::Local, + 0, + nullptr + }; + +#if defined(DD_PLATFORM_WINDOWS_UM) + // Default message bus information + DD_STATIC_CONST HostInfo kMessageBus = + { + TransportType::MessageBus, + 0, + nullptr + }; +#endif + + //////////////////////////// + // Common definition of a message header + // + // todo: better packing of these values + // - payloadSize needs to be moved to where windowSize is currently + // - windowSize, sessionId, and sequence need to be moved into protocol specific payloads + // - minimum alignment could then be reduced to 2 bytes, and min packet size would be 8 bytes + // - downside is that pretty much every protocol would need to define some 
extra data + + DD_NETWORK_STRUCT(MessageHeader, 8) + { + // source and destination client ids + ClientId srcClientId; // 0 - 15 + ClientId dstClientId; // 16 - 31 + + // protocol and command + Protocol protocolId; // 31 - 38 + MessageCode messageId; // 39 - 47 + WindowSize windowSize; // 48 - 63 + + // payload size + current session ID + Size payloadSize; // 64 - 91 + SessionId sessionId; // 92 - 127 + + // sequence number when using a session + Sequence sequence; // 128 - 191 + }; + + DD_CHECK_SIZE(MessageHeader, 24); + + DD_STATIC_CONST Size kMaxMessageSizeInBytes = 1408; + DD_STATIC_CONST Size kMaxPayloadSizeInBytes = (kMaxMessageSizeInBytes - sizeof(MessageHeader)); + + DD_NETWORK_STRUCT(MessageBuffer, 8) + { + MessageHeader header; + char payload[kMaxPayloadSizeInBytes]; + }; + + DD_CHECK_SIZE(MessageBuffer, sizeof(MessageHeader) + kMaxPayloadSizeInBytes); + + // Helper function used to validate message buffers that arrive from an external source + // Returns Success if the message buffer is valid and Error otherwise. + inline Result ValidateMessageBuffer(const void* pMsgBuffer, size_t msgBufferSize) + { + Result result = Result::Error; + + // Ensure that we've been passed valid parameters + if ((pMsgBuffer != nullptr) && (msgBufferSize > 0)) + { + // A valid message buffer must be no larger than the full size message buffer structure + // and it must also be large enough to contain a valid header. + if ((msgBufferSize <= sizeof(MessageBuffer)) && (msgBufferSize >= sizeof(MessageHeader))) + { + // Calculate the total size of the message from the data encoded in the buffer. 
+ const MessageHeader* pHeader = reinterpret_cast(pMsgBuffer); + const size_t encodedMessageSize = (sizeof(MessageHeader) + pHeader->payloadSize); + + // The encoded message size should match our expected size exactly + if (encodedMessageSize == msgBufferSize) + { + result = Result::Success; + } + } + } + else + { + result = Result::InvalidParameter; + } + + return result; + } + + // tripwire - this intentionally will break if the message version changes. Since these are breaking changes already, we need to address + // this problem when it happens. + static_assert(kMessageVersion == 1011, "ClientInfoStruct needs to be updated so that clientName is long enough to support a full path"); + // todo: shorten clientDescription to 64bytes and make clientName 320bytes to support full path + DD_NETWORK_STRUCT(ClientInfoStruct, 4) + { + char clientName[kMaxStringLength]; + char clientDescription[kMaxStringLength]; + // reserve 128bytes in case we need another string in the future + char reserved[kMaxStringLength]; + ClientMetadata metadata; + ProcessId processId; + // pad this out to 512 bytes for future expansion + char padding[116]; + }; + + DD_CHECK_SIZE(ClientInfoStruct, 512); + + /////////////////////// + // GPU Open Message codes + enum struct EscapeCommand : uint32 + { + Unknown = 0, + QueryStatus, // Will be deprecated in a future change + RegisterClient, + UnregisterClient, + RegisterExternalClient, // Will be deprecated in a future change + UnregisterExternalClient, // Will be deprecated in a future change + UpdateClientStatus, // Will be deprecated in a future change + QueryCapabilities, + EnableDeveloperMode, + DisableDeveloperMode, + QueryDeveloperModeStatus, + RegisterRouter, + UnregisterRouter, + AmdLogEvent, + Count + }; +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h index b79622e08d..fe23bd9828 100644 --- 
a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgChannel.h @@ -1,271 +1,271 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" -#include "protocolClient.h" -#include "protocolServer.h" -#include "msgTransport.h" -#include "ddUriInterface.h" -#include "util/string.h" -#include "util/vector.h" - -namespace DevDriver -{ - class IMsgChannel; - class IService; - class ISession; - - namespace TransferProtocol - { - class TransferManager; - } - - namespace InfoURIService - { - class InfoService; - } - - // Temporarily changing from 10ms to 15ms to workaround a timing issue with Windows named pipes, should change back once that - // transport is refactored/replaced. - DD_STATIC_CONST uint32 kDefaultUpdateTimeoutInMs = 15; - DD_STATIC_CONST uint32 kFindClientTimeout = 500; - - // Enumeration of events that can occur on the message bus. - enum class BusEventType : uint32 - { - Unknown = 0, - ClientHalted, - PongRequest, - }; - - /// Event data structure for the ClientHalted bus event - struct BusEventClientHalted - { - ClientId clientId; /// Id of the client that is currently halted - ClientInfoStruct clientInfo; /// Additional information about the client - }; - - /// Event data structure for the PongRequest bus event - struct BusEventPongRequest - { - ClientId clientId; /// Id of the client that is requesting a pong message - const ClientInfoStruct* pClientInfo; /// Additional information about the client - /// Note: May be nullptr for older clients - bool* pShouldRespond; /// Set this to false if a pong should not be sent - /// The default value is true. 
- /// Note: This will never be nullptr - }; - - // Callback function used to handle bus events - typedef void (*PFN_BusEventCallback)(void* pUserdata, BusEventType type, const void* pEventData, size_t eventDataSize); - - // Helper structure used to contain a bus event callback - struct BusEventCallback - { - PFN_BusEventCallback pfnEventCallback; // Message bus event callback function - void* pUserdata; // Message bus event callback userdata - - /// Returns true if this callback contains a valid function - bool IsValid() const { return (pfnEventCallback != nullptr); } - - /// Executes the function stored within the callback - void operator()(BusEventType type, const void* pEventData, size_t eventDataSize) - { - DD_ASSERT(IsValid()); - - pfnEventCallback(pUserdata, type, pEventData, eventDataSize); - } - }; - - // Struct of information required to initialize an IMsgChannel instance - struct MessageChannelCreateInfo - { - StatusFlags initialFlags; // Initial client status flags. - Component componentType; // Type of component the message channel represents. - bool createUpdateThread; // Create a background processing thread for the message - // channel. This should only be set to false if the - // owning object is able to call IMsgChannel::Update() - // at least once per frame. - char clientDescription[kMaxStringLength]; // Description of the client provided to other clients on - // the message bus. 
- }; - - // Information required to establish a new session - struct EstablishSessionInfo - { - Protocol protocol; - Version minProtocolVersion; - Version maxProtocolVersion; - ClientId remoteClientId; - const char* pSessionName; - }; - - // "Temporary" structure to pack all create info without breaking back-compat - struct MessageChannelCreateInfo2 - { - MessageChannelCreateInfo channelInfo; - HostInfo hostInfo; - AllocCb allocCb; - }; - - // Data structure that contains information about a client that has been discovered - struct DiscoveredClientInfo - { - ClientId id; /// Id of the client - ClientMetadata metadata; /// Metadata for the client - - // Structure that contains additional information about the discovered client - // This information may or may not be valid depending on the value of the "valid" field. - struct - { - bool valid; - ClientInfoStruct data; - } clientInfo; - }; - - // Callback function used to handle client discovery - // Return true from this callback to indicate that the discovery process should be continued. - typedef bool (*PFN_ClientDiscoveredCallback)(void* pUserdata, const DiscoveredClientInfo& clientInfo); - - // Data structure that describes how a client discovery operation should be performed - struct DiscoverClientsInfo - { - PFN_ClientDiscoveredCallback pfnCallback; /// Callback function pointer - void* pUserdata; /// Userdata for callback - ClientMetadata filter; /// Filters out incoming clients from the callback - uint32 timeoutInMs; /// Timeout in milliseconds - }; - - // Create a new message channel object - Result CreateMessageChannel(const MessageChannelCreateInfo2& createInfo, IMsgChannel** ppMessageChannel); - - class IMsgChannel - { - public: - virtual ~IMsgChannel() {} - - // Register, unregister, or check connected status. 
- virtual Result Register(uint32 timeoutInMs = ~(0u)) = 0; - virtual void Unregister() = 0; - virtual bool IsConnected() = 0; - - virtual void SetBusEventCallback(const BusEventCallback& callback) = 0; - - // Send, receive, and forward messages - virtual Result Send(ClientId dstClientId, - Protocol protocol, - MessageCode message, - const ClientMetadata& metadata, - uint32 payloadSizeInBytes, - const void* pPayload) = 0; - virtual Result Receive(MessageBuffer& message, uint32 timeoutInMs) = 0; - virtual Result Forward(const MessageBuffer& messageBuffer) = 0; - - // Register, unregister, and retrieve IProtocolServer objects - virtual Result RegisterProtocolServer(IProtocolServer* pServer) = 0; - virtual Result UnregisterProtocolServer(IProtocolServer* pServer) = 0; - virtual IProtocolServer* GetProtocolServer(Protocol protocol) = 0; - - // Initiates a connection to the specified destination client id - // Returns the intermediate session via ppSession - virtual Result EstablishSessionForClient(SharedPointer* ppSession, - const EstablishSessionInfo& sessionInfo) = 0; - - // Register or Unregister an IService object - virtual Result RegisterService(IService* pService) = 0; - virtual Result UnregisterService(IService* pService) = 0; - - // Get the allocator used to create this message channel - virtual const AllocCb& GetAllocCb() const = 0; - - // Attempts to discover clients on the message bus - virtual Result DiscoverClients(const DiscoverClientsInfo& info) = 0; - - // Returns client information for the first client to respond that matches the specified filter - virtual Result FindFirstClient(const ClientMetadata& filter, - ClientId* pClientId, - uint32 timeoutInMs = kFindClientTimeout, - ClientMetadata* pClientMetadata = nullptr) = 0; - - // Get the client ID, or returns kBroadcastClientId if disconnected. - virtual ClientId GetClientId() const = 0; - - // Get the client information struct for the message channel. 
- virtual const ClientInfoStruct& GetClientInfo() const = 0; - - // Get a human-readable string describing the connection type. - virtual const char* GetTransportName() const = 0; - - // Set and get all client status flags. - virtual Result SetStatusFlags(StatusFlags flags) = 0; - virtual StatusFlags GetStatusFlags() const = 0; - - // Set the specified client status flag. - template - Result SetStatusFlag(bool enable) - { - Result toggleResult = Result::Success; - StatusFlags oldFlags = GetStatusFlags(); - StatusFlags newFlags; - - if (enable) - { - // Toggle developer mode - newFlags = oldFlags | static_cast(flag); - } - else - { - // Toggle developer mode - newFlags = oldFlags & ~static_cast(flag); - } - - if (newFlags != oldFlags) - { - toggleResult = SetStatusFlags(newFlags); - } - return toggleResult; - } - - // Get the specified client status flag. - template - bool GetStatusFlag() const - { - return ((GetStatusFlags() & static_cast(flag)) != 0); - } - - virtual InfoURIService::InfoService& GetInfoService() = 0; - - // Utility functions that should probably not be publicly exposed. - // TODO: Refactor surrounding code to eliminate these. - virtual TransferProtocol::TransferManager& GetTransferManager() = 0; - virtual void Update(uint32 timeoutInMs = kDefaultUpdateTimeoutInMs) = 0; - - protected: - IMsgChannel() {}; - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolClient.h" +#include "protocolServer.h" +#include "msgTransport.h" +#include "ddUriInterface.h" +#include "util/string.h" +#include "util/vector.h" + +namespace DevDriver +{ + class IMsgChannel; + class IService; + class ISession; + + namespace TransferProtocol + { + class TransferManager; + } + + namespace InfoURIService + { + class InfoService; + } + + // Temporarily changing from 10ms to 15ms to workaround a timing issue with Windows named pipes, should change back once that + // transport is refactored/replaced. + DD_STATIC_CONST uint32 kDefaultUpdateTimeoutInMs = 15; + DD_STATIC_CONST uint32 kFindClientTimeout = 500; + + // Enumeration of events that can occur on the message bus. 
+ enum class BusEventType : uint32 + { + Unknown = 0, + ClientHalted, + PongRequest, + }; + + /// Event data structure for the ClientHalted bus event + struct BusEventClientHalted + { + ClientId clientId; /// Id of the client that is currently halted + ClientInfoStruct clientInfo; /// Additional information about the client + }; + + /// Event data structure for the PongRequest bus event + struct BusEventPongRequest + { + ClientId clientId; /// Id of the client that is requesting a pong message + const ClientInfoStruct* pClientInfo; /// Additional information about the client + /// Note: May be nullptr for older clients + bool* pShouldRespond; /// Set this to false if a pong should not be sent + /// The default value is true. + /// Note: This will never be nullptr + }; + + // Callback function used to handle bus events + typedef void (*PFN_BusEventCallback)(void* pUserdata, BusEventType type, const void* pEventData, size_t eventDataSize); + + // Helper structure used to contain a bus event callback + struct BusEventCallback + { + PFN_BusEventCallback pfnEventCallback; // Message bus event callback function + void* pUserdata; // Message bus event callback userdata + + /// Returns true if this callback contains a valid function + bool IsValid() const { return (pfnEventCallback != nullptr); } + + /// Executes the function stored within the callback + void operator()(BusEventType type, const void* pEventData, size_t eventDataSize) + { + DD_ASSERT(IsValid()); + + pfnEventCallback(pUserdata, type, pEventData, eventDataSize); + } + }; + + // Struct of information required to initialize an IMsgChannel instance + struct MessageChannelCreateInfo + { + StatusFlags initialFlags; // Initial client status flags. + Component componentType; // Type of component the message channel represents. + bool createUpdateThread; // Create a background processing thread for the message + // channel. 
This should only be set to false if the + // owning object is able to call IMsgChannel::Update() + // at least once per frame. + char clientDescription[kMaxStringLength]; // Description of the client provided to other clients on + // the message bus. + }; + + // Information required to establish a new session + struct EstablishSessionInfo + { + Protocol protocol; + Version minProtocolVersion; + Version maxProtocolVersion; + ClientId remoteClientId; + const char* pSessionName; + }; + + // "Temporary" structure to pack all create info without breaking back-compat + struct MessageChannelCreateInfo2 + { + MessageChannelCreateInfo channelInfo; + HostInfo hostInfo; + AllocCb allocCb; + }; + + // Data structure that contains information about a client that has been discovered + struct DiscoveredClientInfo + { + ClientId id; /// Id of the client + ClientMetadata metadata; /// Metadata for the client + + // Structure that contains additional information about the discovered client + // This information may or may not be valid depending on the value of the "valid" field. + struct + { + bool valid; + ClientInfoStruct data; + } clientInfo; + }; + + // Callback function used to handle client discovery + // Return true from this callback to indicate that the discovery process should be continued. 
+ typedef bool (*PFN_ClientDiscoveredCallback)(void* pUserdata, const DiscoveredClientInfo& clientInfo); + + // Data structure that describes how a client discovery operation should be performed + struct DiscoverClientsInfo + { + PFN_ClientDiscoveredCallback pfnCallback; /// Callback function pointer + void* pUserdata; /// Userdata for callback + ClientMetadata filter; /// Filters out incoming clients from the callback + uint32 timeoutInMs; /// Timeout in milliseconds + }; + + // Create a new message channel object + Result CreateMessageChannel(const MessageChannelCreateInfo2& createInfo, IMsgChannel** ppMessageChannel); + + class IMsgChannel + { + public: + virtual ~IMsgChannel() {} + + // Register, unregister, or check connected status. + virtual Result Register(uint32 timeoutInMs = ~(0u)) = 0; + virtual void Unregister() = 0; + virtual bool IsConnected() = 0; + + virtual void SetBusEventCallback(const BusEventCallback& callback) = 0; + + // Send, receive, and forward messages + virtual Result Send(ClientId dstClientId, + Protocol protocol, + MessageCode message, + const ClientMetadata& metadata, + uint32 payloadSizeInBytes, + const void* pPayload) = 0; + virtual Result Receive(MessageBuffer& message, uint32 timeoutInMs) = 0; + virtual Result Forward(const MessageBuffer& messageBuffer) = 0; + + // Register, unregister, and retrieve IProtocolServer objects + virtual Result RegisterProtocolServer(IProtocolServer* pServer) = 0; + virtual Result UnregisterProtocolServer(IProtocolServer* pServer) = 0; + virtual IProtocolServer* GetProtocolServer(Protocol protocol) = 0; + + // Initiates a connection to the specified destination client id + // Returns the intermediate session via ppSession + virtual Result EstablishSessionForClient(SharedPointer* ppSession, + const EstablishSessionInfo& sessionInfo) = 0; + + // Register or Unregister an IService object + virtual Result RegisterService(IService* pService) = 0; + virtual Result UnregisterService(IService* pService) = 0; + 
+ // Get the allocator used to create this message channel + virtual const AllocCb& GetAllocCb() const = 0; + + // Attempts to discover clients on the message bus + virtual Result DiscoverClients(const DiscoverClientsInfo& info) = 0; + + // Returns client information for the first client to respond that matches the specified filter + virtual Result FindFirstClient(const ClientMetadata& filter, + ClientId* pClientId, + uint32 timeoutInMs = kFindClientTimeout, + ClientMetadata* pClientMetadata = nullptr) = 0; + + // Get the client ID, or returns kBroadcastClientId if disconnected. + virtual ClientId GetClientId() const = 0; + + // Get the client information struct for the message channel. + virtual const ClientInfoStruct& GetClientInfo() const = 0; + + // Get a human-readable string describing the connection type. + virtual const char* GetTransportName() const = 0; + + // Set and get all client status flags. + virtual Result SetStatusFlags(StatusFlags flags) = 0; + virtual StatusFlags GetStatusFlags() const = 0; + + // Set the specified client status flag. + template + Result SetStatusFlag(bool enable) + { + Result toggleResult = Result::Success; + StatusFlags oldFlags = GetStatusFlags(); + StatusFlags newFlags; + + if (enable) + { + // Toggle developer mode + newFlags = oldFlags | static_cast(flag); + } + else + { + // Toggle developer mode + newFlags = oldFlags & ~static_cast(flag); + } + + if (newFlags != oldFlags) + { + toggleResult = SetStatusFlags(newFlags); + } + return toggleResult; + } + + // Get the specified client status flag. + template + bool GetStatusFlag() const + { + return ((GetStatusFlags() & static_cast(flag)) != 0); + } + + virtual InfoURIService::InfoService& GetInfoService() = 0; + + // Utility functions that should probably not be publicly exposed. + // TODO: Refactor surrounding code to eliminate these. 
+ virtual TransferProtocol::TransferManager& GetTransferManager() = 0; + virtual void Update(uint32 timeoutInMs = kDefaultUpdateTimeoutInMs) = 0; + + protected: + IMsgChannel() {}; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h index db3dd3f079..187109e76e 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/msgTransport.h @@ -1,70 +1,70 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" - -namespace DevDriver -{ - class IMsgTransport - { - public: - virtual ~IMsgTransport() {} - - // Connect and disconnect from the transport. - virtual Result Connect(ClientId* pClientId, uint32 timeoutInMs) = 0; - virtual Result Disconnect() = 0; - - // Read and Write messages from a connected transport - virtual Result WriteMessage(const MessageBuffer &messageBuffer) = 0; - virtual Result ReadMessage(MessageBuffer &messageBuffer, uint32 timeoutInMs) = 0; - - // Get a human-readable string describing the connection type. - virtual const char* GetTransportName() const = 0; - - // Static method to be implemented by individual transports - // true indicates that the transport is incapable of detecting - // dropped connections and some form of keep-alive is required - // false indicates that the transport can properly detect dropped - // connections - DD_STATIC_CONST bool RequiresKeepAlive() - { - return false; - } - - // Static method to be implemented by individual transports - // true indicates that Connect is expected to also negotiate a client ID - // false indicates that the MessageChannel needs to do it's own client ID - // negotiation, e.g. in the case of network connections - DD_STATIC_CONST bool RequiresClientRegistration() - { - return false; - } - protected: - IMsgTransport() {} - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" + +namespace DevDriver +{ + class IMsgTransport + { + public: + virtual ~IMsgTransport() {} + + // Connect and disconnect from the transport. + virtual Result Connect(ClientId* pClientId, uint32 timeoutInMs) = 0; + virtual Result Disconnect() = 0; + + // Read and Write messages from a connected transport + virtual Result WriteMessage(const MessageBuffer &messageBuffer) = 0; + virtual Result ReadMessage(MessageBuffer &messageBuffer, uint32 timeoutInMs) = 0; + + // Get a human-readable string describing the connection type. 
+ virtual const char* GetTransportName() const = 0; + + // Static method to be implemented by individual transports + // true indicates that the transport is incapable of detecting + // dropped connections and some form of keep-alive is required + // false indicates that the transport can properly detect dropped + // connections + DD_STATIC_CONST bool RequiresKeepAlive() + { + return false; + } + + // Static method to be implemented by individual transports + // true indicates that Connect is expected to also negotiate a client ID + // false indicates that the MessageChannel needs to do it's own client ID + // negotiation, e.g. in the case of network connections + DD_STATIC_CONST bool RequiresClientRegistration() + { + return false; + } + protected: + IMsgTransport() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h index 1b61ebdf95..bb18c9d498 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/platforms/ddWinPlatform.h @@ -1,28 +1,28 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "../../core/inc/platforms/ddcWinPlatform.h" +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "../../core/inc/platforms/ddcWinPlatform.h" diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h index 33cfde7729..1649f888e1 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolClient.h @@ -1,54 +1,54 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" -#include "protocolSession.h" - -namespace DevDriver -{ - class Session; - - class IProtocolClient : public IProtocolSession - { - public: - virtual ~IProtocolClient() {} - - virtual Version GetSessionVersion() const = 0; - - virtual Result Connect(ClientId clientId, uint32 timeoutInMs) = 0; - virtual Result Connect(ClientId clientId) = 0; - virtual void Disconnect() = 0; - - virtual bool IsConnected() const = 0; - virtual ClientId GetRemoteClientId() const = 0; - - virtual bool QueryConnectionStatus() = 0; - protected: - IProtocolClient() {} - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolSession.h" + +namespace DevDriver +{ + class Session; + + class IProtocolClient : public IProtocolSession + { + public: + virtual ~IProtocolClient() {} + + virtual Version GetSessionVersion() const = 0; + + virtual Result Connect(ClientId clientId, uint32 timeoutInMs) = 0; + virtual Result Connect(ClientId clientId) = 0; + virtual void Disconnect() = 0; + + virtual bool IsConnected() const = 0; + virtual ClientId GetRemoteClientId() const = 0; + + virtual bool QueryConnectionStatus() = 0; + protected: + IProtocolClient() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h index 812bbbcee6..d238091e18 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolServer.h @@ -1,53 +1,53 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" -#include "protocolSession.h" - -namespace DevDriver -{ - class IMsgChannel; - class Session; - - class IProtocolServer : public IProtocolSession - { - public: - virtual ~IProtocolServer() {} - - virtual void Finalize() = 0; - - virtual bool GetSupportedVersion(Version minVersion, Version maxVersion, Version *version) const = 0; - virtual bool AcceptSession(const SharedPointer& pSession) = 0; - - virtual void SessionEstablished(const SharedPointer &pSession) = 0; - virtual void UpdateSession(const SharedPointer &pSession) = 0; - virtual void SessionTerminated(const SharedPointer &pSession, Result terminationReason) = 0; - protected: - IProtocolServer() {} - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" +#include "protocolSession.h" + +namespace DevDriver +{ + class IMsgChannel; + class Session; + + class IProtocolServer : public IProtocolSession + { + public: + virtual ~IProtocolServer() {} + + virtual void Finalize() = 0; + + virtual bool GetSupportedVersion(Version minVersion, Version maxVersion, Version *version) const = 0; + virtual bool AcceptSession(const SharedPointer& pSession) = 0; + + virtual void SessionEstablished(const SharedPointer &pSession) = 0; + virtual void UpdateSession(const SharedPointer &pSession) = 0; + virtual void SessionTerminated(const SharedPointer &pSession, Result terminationReason) = 0; + protected: + IProtocolServer() {} + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h index 0fba6138af..beaf32a746 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocolSession.h @@ -1,175 +1,175 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include -#include -#include - -namespace DevDriver -{ - // A container struct that can hold any protocol's payload and keep track of its size. - // Not intended for network transport. This struct is intended to help simplify code that works with variably sized payloads. - // The struct is 8 byte aligned because the internal payload field requires 8 byte alignment. - DD_ALIGNED_STRUCT(SizedPayloadContainer, 8) - { - uint32 payloadSize; - uint32 padding; - char payload[kMaxPayloadSizeInBytes]; - - // For safety purposes we limit the CreatePayload and GetPayload methods to types that: - // - // 1. Have a standard layout, to ensure that the contents are safe to transmit across the network - // 2. 
Are trivially destructible, to ensure that a user doesn't construct an object and then overwrite it - // without destroying it - // 3. Small enough to fit inside the payload field of this struct - template - struct CanUseAsPayload - { - static_assert(Platform::IsStandardLayout::Value, "Type provided does not meet standard layout requirements"); - static_assert(Platform::IsTriviallyDestructible::Value, "Type provided is not trivially destructible"); - static_assert((sizeof(T) <= kMaxPayloadSizeInBytes), "Type provided is too large to fit in the container"); - - DD_STATIC_CONST bool Value = Platform::IsStandardLayout::Value && - Platform::IsTriviallyDestructible::Value && - (sizeof(T) <= kMaxPayloadSizeInBytes); - }; - - // We additionally only allow creation of a payload if the type is constructible using the arguments specified - template - struct CanCreatePayload - { - static_assert(CanUseAsPayload::Value, "Type specified cannot be used as a payload"); - static_assert(Platform::IsConstructible::Value, "Type provided cannot be constructed with the provided arguments"); - - DD_STATIC_CONST bool Value = CanUseAsPayload::Value && - Platform::IsConstructible::Value; - }; - - // Convenience function to allow in-place construction of a payload object using placement new. - template::Value>::Type> - void CreatePayload(Args&&... args) - { - // This is tremendously unsafe, but we use placement new to construct an object inside the buffer. - // Why do we do this? The big benefit is that it lets us skip having to create a temporary object - // and then copy it into this buffer. - // - // There are a couple of other ancillary benefits that are useful. The biggest is that if an object has - // a constexpr constructor it can initialize the memory using a memcpy/move instead of having to actually - // call the constructor. The other benefit is that if the constructor omits initializing memory (e.g., - // a giant data buffer) it will also skip re-initializing the memory here. 
This is not the case with - // when you create another instance of the object and copy it - the temporary object is almost certainly - // zero initialized, and the copy/move will result in the entire struct being copied. - - static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); - - new(reinterpret_cast(&payload[0])) T(Platform::Forward(args)...); - payloadSize = sizeof(T); - padding = 0; - } - - // Convenience function to allow accessing the payload as if it was the specified type. - template::Value>::Type> - T& GetPayload() - { - return *GetPayloadPointer(&payload[0]); - } - - private: - // Convenience function to allow accessing the payload as if it was the specified type. - template::Value>::Type> - static constexpr T* GetPayloadPointer(char* DD_RESTRICT pPointer) - { - static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); - return (T*)(pPointer); - } - }; - - DD_CHECK_SIZE(SizedPayloadContainer, 8 + kMaxPayloadSizeInBytes); - - class IMsgChannel; - class Session; - - enum struct SessionType - { - Unknown = 0, - Client, - Server - }; - - class ISession - { - public: - virtual ~ISession() {}; - - virtual Result Send(uint32 payloadSizeInBytes, const void* pPayload, uint32 timeoutInMs) = 0; - virtual Result Receive(uint32 payloadSizeInBytes, void *pPayload, uint32 *pBytesReceived, uint32 timeoutInMs) = 0; - virtual Result WaitForConnection(uint32 timeoutInMs) = 0; - virtual Result WaitForDisconnection(uint32 timeoutInMs) = 0; - - virtual bool IsClosed() const = 0; - - virtual void* SetUserData(void* pUserdata) = 0; - virtual void* GetUserData() const = 0; - virtual SessionId GetSessionId() const = 0; - virtual ClientId GetDestinationClientId() const = 0; - virtual Version GetVersion() const = 0; - virtual Protocol GetProtocol() const = 0; - - // Helper functions for working with SizedPayloadContainers and managing back-compat. 
- Result SendPayload(const SizedPayloadContainer& payload, uint32 timeoutInMs) - { - return Send(payload.payloadSize, payload.payload, timeoutInMs); - } - - Result ReceivePayload(SizedPayloadContainer* pPayload, uint32 timeoutInMs) - { - DD_ASSERT(pPayload != nullptr); - return Receive(sizeof(pPayload->payload), pPayload->payload, &pPayload->payloadSize, timeoutInMs); - } - - protected: - ISession() {} - }; - - class IProtocolSession - { - public: - virtual ~IProtocolSession() {} - - virtual Protocol GetProtocol() const = 0; - virtual SessionType GetType() const = 0; - virtual Version GetMinVersion() const = 0; - virtual Version GetMaxVersion() const = 0; - - protected: - IProtocolSession() {} - }; -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace DevDriver +{ + // A container struct that can hold any protocol's payload and keep track of its size. + // Not intended for network transport. This struct is intended to help simplify code that works with variably sized payloads. + // The struct is 8 byte aligned because the internal payload field requires 8 byte alignment. + DD_ALIGNED_STRUCT(SizedPayloadContainer, 8) + { + uint32 payloadSize; + uint32 padding; + char payload[kMaxPayloadSizeInBytes]; + + // For safety purposes we limit the CreatePayload and GetPayload methods to types that: + // + // 1. Have a standard layout, to ensure that the contents are safe to transmit across the network + // 2. Are trivially destructible, to ensure that a user doesn't construct an object and then overwrite it + // without destroying it + // 3. 
Small enough to fit inside the payload field of this struct + template + struct CanUseAsPayload + { + static_assert(Platform::IsStandardLayout::Value, "Type provided does not meet standard layout requirements"); + static_assert(Platform::IsTriviallyDestructible::Value, "Type provided is not trivially destructible"); + static_assert((sizeof(T) <= kMaxPayloadSizeInBytes), "Type provided is too large to fit in the container"); + + DD_STATIC_CONST bool Value = Platform::IsStandardLayout::Value && + Platform::IsTriviallyDestructible::Value && + (sizeof(T) <= kMaxPayloadSizeInBytes); + }; + + // We additionally only allow creation of a payload if the type is constructible using the arguments specified + template + struct CanCreatePayload + { + static_assert(CanUseAsPayload::Value, "Type specified cannot be used as a payload"); + static_assert(Platform::IsConstructible::Value, "Type provided cannot be constructed with the provided arguments"); + + DD_STATIC_CONST bool Value = CanUseAsPayload::Value && + Platform::IsConstructible::Value; + }; + + // Convenience function to allow in-place construction of a payload object using placement new. + template::Value>::Type> + void CreatePayload(Args&&... args) + { + // This is tremendously unsafe, but we use placement new to construct an object inside the buffer. + // Why do we do this? The big benefit is that it lets us skip having to create a temporary object + // and then copy it into this buffer. + // + // There are a couple of other ancillary benefits that are useful. The biggest is that if an object has + // a constexpr constructor it can initialize the memory using a memcpy/move instead of having to actually + // call the constructor. The other benefit is that if the constructor omits initializing memory (e.g., + // a giant data buffer) it will also skip re-initializing the memory here. 
This is not the case with + // when you create another instance of the object and copy it - the temporary object is almost certainly + // zero initialized, and the copy/move will result in the entire struct being copied. + + static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); + + new(reinterpret_cast(&payload[0])) T(Platform::Forward(args)...); + payloadSize = sizeof(T); + padding = 0; + } + + // Convenience function to allow accessing the payload as if it was the specified type. + template::Value>::Type> + T& GetPayload() + { + return *GetPayloadPointer(&payload[0]); + } + + private: + // Convenience function to allow accessing the payload as if it was the specified type. + template::Value>::Type> + static constexpr T* GetPayloadPointer(char* DD_RESTRICT pPointer) + { + static_assert(alignof(T) <= alignof(SizedPayloadContainer), "Type provided cannot be aligned in the container"); + return (T*)(pPointer); + } + }; + + DD_CHECK_SIZE(SizedPayloadContainer, 8 + kMaxPayloadSizeInBytes); + + class IMsgChannel; + class Session; + + enum struct SessionType + { + Unknown = 0, + Client, + Server + }; + + class ISession + { + public: + virtual ~ISession() {}; + + virtual Result Send(uint32 payloadSizeInBytes, const void* pPayload, uint32 timeoutInMs) = 0; + virtual Result Receive(uint32 payloadSizeInBytes, void *pPayload, uint32 *pBytesReceived, uint32 timeoutInMs) = 0; + virtual Result WaitForConnection(uint32 timeoutInMs) = 0; + virtual Result WaitForDisconnection(uint32 timeoutInMs) = 0; + + virtual bool IsClosed() const = 0; + + virtual void* SetUserData(void* pUserdata) = 0; + virtual void* GetUserData() const = 0; + virtual SessionId GetSessionId() const = 0; + virtual ClientId GetDestinationClientId() const = 0; + virtual Version GetVersion() const = 0; + virtual Protocol GetProtocol() const = 0; + + // Helper functions for working with SizedPayloadContainers and managing back-compat. 
+ Result SendPayload(const SizedPayloadContainer& payload, uint32 timeoutInMs) + { + return Send(payload.payloadSize, payload.payload, timeoutInMs); + } + + Result ReceivePayload(SizedPayloadContainer* pPayload, uint32 timeoutInMs) + { + DD_ASSERT(pPayload != nullptr); + return Receive(sizeof(pPayload->payload), pPayload->payload, &pPayload->payloadSize, timeoutInMs); + } + + protected: + ISession() {} + }; + + class IProtocolSession + { + public: + virtual ~IProtocolSession() {} + + virtual Protocol GetProtocol() const = 0; + virtual SessionType GetType() const = 0; + virtual Version GetMinVersion() const = 0; + virtual Version GetMaxVersion() const = 0; + + protected: + IProtocolSession() {} + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h index ac2f78cd36..afd88a00e2 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlProtocol.h @@ -1,531 +1,531 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" - -#define DRIVERCONTROL_PROTOCOL_VERSION 10 - -#define DRIVERCONTROL_PROTOCOL_MINIMUM_VERSION 1 - -/* -*********************************************************************************************************************** -*| Version | Change Description | -*| ------- | ---------------------------------------------------------------------------------------------------------| -*| 10.0 | Add ability to set clock mode on all adapters. | -*| 9.0 | Added a feature that allows tools to indicate when they will be ignoring a specific driver. | -*| 8.0 | Added a new version of the step driver response that contains the current driver status. | -*| 7.0 | Corrected a back-compat issue related to the new device clock query code. 
| -*| 6.0 | Added ability to query device clock frequencies for a given clock mode. | -*| 5.0 | Cleaned up the driver facing interface. | -*| 4.0 | Added HaltedOnPostDeviceInit state. | -*| 3.0 | Added QueryClientInfoRequest support. | -*| 2.1 | Added initialization time step functionality. | -*| 2.0 | Added initialization time driver status values and a terminate driver command. | -*| 1.0 | Initial version | -*********************************************************************************************************************** -*/ - -#define DRIVERCONTROL_SET_CLOCKS_ALL_ADAPTERS_VERSION 10 -#define DRIVERCONTROL_IGNORE_DRIVER_VERSION 9 -#define DRIVERCONTROL_STEP_RETURN_STATUS_VERSION 8 -#define DRIVERCONTROL_QUERY_BY_MODE_BACK_COMPAT_VERSION 7 -#define DRIVERCONTROL_QUERY_DEVICE_CLOCKS_BY_MODE_VERSION 6 -#define DRIVERCONTROL_DRIVER_INTERFACE_CLEANUP_VERSION 5 -#define DRIVERCONTROL_HALTEDPOSTDEVICEINIT_VERSION 4 -#define DRIVERCONTROL_QUERYCLIENTINFO_VERSION 3 -#define DRIVERCONTROL_INITIALIZATION_STATUS_VERSION 2 -#define DRIVERCONTROL_INITIAL_VERSION 1 - -namespace DevDriver -{ - namespace DriverControlProtocol - { - /////////////////////// - // DriverControl Constants - DD_STATIC_CONST uint32 kLegacyDriverControlPayloadSize = 16; - - /////////////////////// - // DriverControl Protocol - enum struct DriverControlMessage : MessageCode - { - Unknown = 0, - PauseDriverRequest, - PauseDriverResponse, - ResumeDriverRequest, - ResumeDriverResponse, - QueryNumGpusRequest, - QueryNumGpusResponse, - QueryDeviceClockModeRequest, - QueryDeviceClockModeResponse, - SetDeviceClockModeRequest, - SetDeviceClockModeResponse, - QueryDeviceClockRequest, - QueryDeviceClockResponse, - QueryMaxDeviceClockRequest, - QueryMaxDeviceClockResponse, - QueryDriverStatusRequest, - QueryDriverStatusResponse, - StepDriverRequest, - StepDriverResponse, - QueryClientInfoRequest, - QueryClientInfoResponse, - QueryDeviceClockByModeRequest, - QueryDeviceClockByModeResponse, - 
StepDriverResponseV2, - IgnoreDriverRequest, - IgnoreDriverResponse, - Count - }; - - /////////////////////// - // DriverControl Types - enum struct DeviceClockMode : uint32 - { - Unknown = 0, - Default, - Profiling, - MinimumMemory, - MinimumEngine, - Peak, - Count - }; - -#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION - typedef DevDriver::DriverStatus DriverStatus; -#endif - - /////////////////////// - // DriverControl Payloads - DD_NETWORK_STRUCT(DriverControlHeader, 4) - { - DriverControlMessage command; - char _padding[3]; - - constexpr DriverControlHeader(DriverControlMessage message) - : command(message) - , _padding() - { - } - }; - - DD_CHECK_SIZE(DriverControlHeader, 4); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Pause Driver Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(PauseDriverRequestPayload, 4) - { - DriverControlHeader header; - - constexpr PauseDriverRequestPayload() - : header(DriverControlMessage::PauseDriverRequest) - { - } - }; - - DD_CHECK_SIZE(PauseDriverRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(PauseDriverResponsePayload, 4) - { - DriverControlHeader header; - Result result; - - constexpr PauseDriverResponsePayload(Result result) - : header(DriverControlMessage::PauseDriverResponse) - , result(result) - { - } - }; - - DD_CHECK_SIZE(PauseDriverResponsePayload, sizeof(DriverControlHeader) + 4); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Resume Driver Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(ResumeDriverRequestPayload, 4) - { - DriverControlHeader header; - - constexpr ResumeDriverRequestPayload() - : 
header(DriverControlMessage::ResumeDriverRequest) - { - } - }; - - DD_CHECK_SIZE(ResumeDriverRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(ResumeDriverResponsePayload, 4) - { - DriverControlHeader header; - Result result; - - constexpr ResumeDriverResponsePayload(Result result) - : header(DriverControlMessage::ResumeDriverResponse) - , result(result) - { - } - }; - - DD_CHECK_SIZE(ResumeDriverResponsePayload, sizeof(DriverControlHeader) + 4); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Num Gpus Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryNumGpusRequestPayload, 4) - { - DriverControlHeader header; - - constexpr QueryNumGpusRequestPayload() - : header(DriverControlMessage::QueryNumGpusRequest) - { - } - }; - - DD_CHECK_SIZE(QueryNumGpusRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(QueryNumGpusResponsePayload, 4) - { - DriverControlHeader header; - Result result; - uint32 numGpus; - - constexpr QueryNumGpusResponsePayload(Result result, uint32 numGpus) - : header(DriverControlMessage::QueryNumGpusResponse) - , result(result) - , numGpus(numGpus) - { - } - }; - - DD_CHECK_SIZE(QueryNumGpusResponsePayload, sizeof(DriverControlHeader) + 8); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Device Clock Mode Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryDeviceClockModeRequestPayload, 4) - { - DriverControlHeader header; - uint32 gpuIndex; - - constexpr QueryDeviceClockModeRequestPayload(uint32 gpuIndex) - : header(DriverControlMessage::QueryDeviceClockModeRequest) - , gpuIndex(gpuIndex) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockModeRequestPayload, 
sizeof(DriverControlHeader) + 4); - - DD_NETWORK_STRUCT(QueryDeviceClockModeResponsePayload, 4) - { - DriverControlHeader header; - Result result; - DeviceClockMode mode; - - constexpr QueryDeviceClockModeResponsePayload(Result result, DeviceClockMode mode) - : header(DriverControlMessage::QueryDeviceClockModeResponse) - , result(result) - , mode(mode) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 8); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Set Device Clock Mode Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(SetDeviceClockModeRequestPayload, 4) - { - DriverControlHeader header; - uint32 gpuIndex; - DeviceClockMode mode; - - constexpr SetDeviceClockModeRequestPayload(uint32 gpuIndex, DeviceClockMode mode) - : header(DriverControlMessage::SetDeviceClockModeRequest) - , gpuIndex(gpuIndex) - , mode(mode) - { - } - }; - - DD_CHECK_SIZE(SetDeviceClockModeRequestPayload, sizeof(DriverControlHeader) + 8); - - DD_NETWORK_STRUCT(SetDeviceClockModeResponsePayload, 4) - { - DriverControlHeader header; - Result result; - - constexpr SetDeviceClockModeResponsePayload(Result result) - : header(DriverControlMessage::SetDeviceClockModeResponse) - , result(result) - { - } - }; - - DD_CHECK_SIZE(SetDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 4); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Device Clock Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryDeviceClockRequestPayload, 4) - { - DriverControlHeader header; - uint32 gpuIndex; - - constexpr QueryDeviceClockRequestPayload(uint32 gpuIndex) - : 
header(DriverControlMessage::QueryDeviceClockRequest) - , gpuIndex(gpuIndex) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); - - DD_NETWORK_STRUCT(QueryDeviceClockResponsePayload, 4) - { - DriverControlHeader header; - Result result; - float gpuClock; - float memClock; - - constexpr QueryDeviceClockResponsePayload(Result result, float gpuClock, float memClock) - : header(DriverControlMessage::QueryDeviceClockResponse) - , result(result) - , gpuClock(gpuClock) - , memClock(memClock) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Device Clock By Mode Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryDeviceClockByModeRequestPayload, 4) - { - DriverControlHeader header; - uint32 gpuIndex; - DeviceClockMode deviceClockMode; - - constexpr QueryDeviceClockByModeRequestPayload(uint32 gpuIndex, DeviceClockMode clockMode) - : header(DriverControlMessage::QueryDeviceClockByModeRequest) - , gpuIndex(gpuIndex) - , deviceClockMode(clockMode) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockByModeRequestPayload, sizeof(DriverControlHeader) + 8); - - DD_NETWORK_STRUCT(QueryDeviceClockByModeResponsePayload, 4) - { - DriverControlHeader header; - Result result; - float gpuClock; - float memClock; - - constexpr QueryDeviceClockByModeResponsePayload(Result result, float gpuClock, float memClock) - : header(DriverControlMessage::QueryDeviceClockByModeResponse) - , result(result) - , gpuClock(gpuClock) - , memClock(memClock) - { - } - }; - - DD_CHECK_SIZE(QueryDeviceClockByModeResponsePayload, sizeof(DriverControlHeader) + 12); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Max 
Device Clock Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryMaxDeviceClockRequestPayload, 4) - { - DriverControlHeader header; - uint32 gpuIndex; - - constexpr QueryMaxDeviceClockRequestPayload(uint32 gpuIndex) - : header(DriverControlMessage::QueryMaxDeviceClockRequest) - , gpuIndex(gpuIndex) - { - } - }; - - DD_CHECK_SIZE(QueryMaxDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); - - DD_NETWORK_STRUCT(QueryMaxDeviceClockResponsePayload, 4) - { - DriverControlHeader header; - Result result; - float maxGpuClock; - float maxMemClock; - - constexpr QueryMaxDeviceClockResponsePayload(Result result, float maxGpuClock, float maxMemClock) - : header(DriverControlMessage::QueryMaxDeviceClockResponse) - , result(result) - , maxGpuClock(maxGpuClock) - , maxMemClock(maxMemClock) - { - } - }; - - DD_CHECK_SIZE(QueryMaxDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Driver Status Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryDriverStatusRequestPayload, 4) - { - DriverControlHeader header; - - constexpr QueryDriverStatusRequestPayload() - : header(DriverControlMessage::QueryDriverStatusRequest) - { - } - }; - - DD_CHECK_SIZE(QueryDriverStatusRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(QueryDriverStatusResponsePayload, 4) - { - DriverControlHeader header; - DriverStatus status; - - constexpr QueryDriverStatusResponsePayload(DriverStatus status) - : header(DriverControlMessage::QueryDriverStatusResponse) - , status(status) - { - } - }; - - DD_CHECK_SIZE(QueryDriverStatusResponsePayload, sizeof(DriverControlHeader) + 4); - - 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Step Driver Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(StepDriverRequestPayload, 4) - { - DriverControlHeader header; - uint32 count; - - constexpr StepDriverRequestPayload(uint32 count) - : header(DriverControlMessage::StepDriverRequest) - , count(count) - { - } - }; - - DD_CHECK_SIZE(StepDriverRequestPayload, sizeof(DriverControlHeader) + 4); - - DD_NETWORK_STRUCT(StepDriverResponsePayload, 4) - { - DriverControlHeader header; - Result result; - - constexpr StepDriverResponsePayload(Result result) - : header(DriverControlMessage::StepDriverResponse) - , result(result) - { - } - }; - - DD_CHECK_SIZE(StepDriverResponsePayload, sizeof(DriverControlHeader) + 4); - - DD_NETWORK_STRUCT(StepDriverResponsePayloadV2, 4) - { - DriverControlHeader header; - Result result; - DriverStatus status; - - constexpr StepDriverResponsePayloadV2(Result result, DriverStatus status) - : header(DriverControlMessage::StepDriverResponseV2) - , result(result) - , status(status) - { - } - }; - - DD_CHECK_SIZE(StepDriverResponsePayloadV2, sizeof(DriverControlHeader) + 8); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Query Client Info Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(QueryClientInfoRequestPayload, 4) - { - DriverControlHeader header; - - constexpr QueryClientInfoRequestPayload() - : header(DriverControlMessage::QueryClientInfoRequest) - { - } - }; - - DD_CHECK_SIZE(QueryClientInfoRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(QueryClientInfoResponsePayload, 4) - { - DriverControlHeader header; - ClientInfoStruct clientInfo; - - constexpr 
QueryClientInfoResponsePayload(const ClientInfoStruct& clientInfo) - : header(DriverControlMessage::QueryClientInfoResponse) - , clientInfo(clientInfo) - { - } - }; - - DD_CHECK_SIZE(QueryClientInfoResponsePayload, sizeof(DriverControlHeader) + sizeof(ClientInfoStruct)); - - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Ignore Driver Request/Response - /////////////////////////////////////////////////////////////////////////////////////////////////////////////// - DD_NETWORK_STRUCT(IgnoreDriverRequestPayload, 4) - { - DriverControlHeader header; - - constexpr IgnoreDriverRequestPayload() - : header(DriverControlMessage::IgnoreDriverRequest) - { - } - }; - - DD_CHECK_SIZE(IgnoreDriverRequestPayload, sizeof(DriverControlHeader)); - - DD_NETWORK_STRUCT(IgnoreDriverResponsePayload, 4) - { - DriverControlHeader header; - - constexpr IgnoreDriverResponsePayload() - : header(DriverControlMessage::IgnoreDriverResponse) - { - } - }; - - DD_CHECK_SIZE(IgnoreDriverResponsePayload, sizeof(DriverControlHeader)); - } -} +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" + +#define DRIVERCONTROL_PROTOCOL_VERSION 10 + +#define DRIVERCONTROL_PROTOCOL_MINIMUM_VERSION 1 + +/* +*********************************************************************************************************************** +*| Version | Change Description | +*| ------- | ---------------------------------------------------------------------------------------------------------| +*| 10.0 | Add ability to set clock mode on all adapters. | +*| 9.0 | Added a feature that allows tools to indicate when they will be ignoring a specific driver. | +*| 8.0 | Added a new version of the step driver response that contains the current driver status. | +*| 7.0 | Corrected a back-compat issue related to the new device clock query code. | +*| 6.0 | Added ability to query device clock frequencies for a given clock mode. | +*| 5.0 | Cleaned up the driver facing interface. | +*| 4.0 | Added HaltedOnPostDeviceInit state. | +*| 3.0 | Added QueryClientInfoRequest support. | +*| 2.1 | Added initialization time step functionality. | +*| 2.0 | Added initialization time driver status values and a terminate driver command. 
| +*| 1.0 | Initial version | +*********************************************************************************************************************** +*/ + +#define DRIVERCONTROL_SET_CLOCKS_ALL_ADAPTERS_VERSION 10 +#define DRIVERCONTROL_IGNORE_DRIVER_VERSION 9 +#define DRIVERCONTROL_STEP_RETURN_STATUS_VERSION 8 +#define DRIVERCONTROL_QUERY_BY_MODE_BACK_COMPAT_VERSION 7 +#define DRIVERCONTROL_QUERY_DEVICE_CLOCKS_BY_MODE_VERSION 6 +#define DRIVERCONTROL_DRIVER_INTERFACE_CLEANUP_VERSION 5 +#define DRIVERCONTROL_HALTEDPOSTDEVICEINIT_VERSION 4 +#define DRIVERCONTROL_QUERYCLIENTINFO_VERSION 3 +#define DRIVERCONTROL_INITIALIZATION_STATUS_VERSION 2 +#define DRIVERCONTROL_INITIAL_VERSION 1 + +namespace DevDriver +{ + namespace DriverControlProtocol + { + /////////////////////// + // DriverControl Constants + DD_STATIC_CONST uint32 kLegacyDriverControlPayloadSize = 16; + + /////////////////////// + // DriverControl Protocol + enum struct DriverControlMessage : MessageCode + { + Unknown = 0, + PauseDriverRequest, + PauseDriverResponse, + ResumeDriverRequest, + ResumeDriverResponse, + QueryNumGpusRequest, + QueryNumGpusResponse, + QueryDeviceClockModeRequest, + QueryDeviceClockModeResponse, + SetDeviceClockModeRequest, + SetDeviceClockModeResponse, + QueryDeviceClockRequest, + QueryDeviceClockResponse, + QueryMaxDeviceClockRequest, + QueryMaxDeviceClockResponse, + QueryDriverStatusRequest, + QueryDriverStatusResponse, + StepDriverRequest, + StepDriverResponse, + QueryClientInfoRequest, + QueryClientInfoResponse, + QueryDeviceClockByModeRequest, + QueryDeviceClockByModeResponse, + StepDriverResponseV2, + IgnoreDriverRequest, + IgnoreDriverResponse, + Count + }; + + /////////////////////// + // DriverControl Types + enum struct DeviceClockMode : uint32 + { + Unknown = 0, + Default, + Profiling, + MinimumMemory, + MinimumEngine, + Peak, + Count + }; + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + typedef DevDriver::DriverStatus 
DriverStatus; +#endif + + /////////////////////// + // DriverControl Payloads + DD_NETWORK_STRUCT(DriverControlHeader, 4) + { + DriverControlMessage command; + char _padding[3]; + + constexpr DriverControlHeader(DriverControlMessage message) + : command(message) + , _padding() + { + } + }; + + DD_CHECK_SIZE(DriverControlHeader, 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Pause Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(PauseDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr PauseDriverRequestPayload() + : header(DriverControlMessage::PauseDriverRequest) + { + } + }; + + DD_CHECK_SIZE(PauseDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(PauseDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr PauseDriverResponsePayload(Result result) + : header(DriverControlMessage::PauseDriverResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(PauseDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Resume Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(ResumeDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr ResumeDriverRequestPayload() + : header(DriverControlMessage::ResumeDriverRequest) + { + } + }; + + DD_CHECK_SIZE(ResumeDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(ResumeDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr ResumeDriverResponsePayload(Result result) + : header(DriverControlMessage::ResumeDriverResponse) + , result(result) + { + } + }; + + 
DD_CHECK_SIZE(ResumeDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Num Gpus Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryNumGpusRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryNumGpusRequestPayload() + : header(DriverControlMessage::QueryNumGpusRequest) + { + } + }; + + DD_CHECK_SIZE(QueryNumGpusRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryNumGpusResponsePayload, 4) + { + DriverControlHeader header; + Result result; + uint32 numGpus; + + constexpr QueryNumGpusResponsePayload(Result result, uint32 numGpus) + : header(DriverControlMessage::QueryNumGpusResponse) + , result(result) + , numGpus(numGpus) + { + } + }; + + DD_CHECK_SIZE(QueryNumGpusResponsePayload, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryDeviceClockModeRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryDeviceClockModeRequest) + , gpuIndex(gpuIndex) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockModeRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryDeviceClockModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + DeviceClockMode mode; + + constexpr QueryDeviceClockModeResponsePayload(Result result, DeviceClockMode mode) + : header(DriverControlMessage::QueryDeviceClockModeResponse) + , result(result) + , mode(mode) + { + } + }; + + 
DD_CHECK_SIZE(QueryDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Set Device Clock Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(SetDeviceClockModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + DeviceClockMode mode; + + constexpr SetDeviceClockModeRequestPayload(uint32 gpuIndex, DeviceClockMode mode) + : header(DriverControlMessage::SetDeviceClockModeRequest) + , gpuIndex(gpuIndex) + , mode(mode) + { + } + }; + + DD_CHECK_SIZE(SetDeviceClockModeRequestPayload, sizeof(DriverControlHeader) + 8); + + DD_NETWORK_STRUCT(SetDeviceClockModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr SetDeviceClockModeResponsePayload(Result result) + : header(DriverControlMessage::SetDeviceClockModeResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(SetDeviceClockModeResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryDeviceClockRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryDeviceClockRequest) + , gpuIndex(gpuIndex) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryDeviceClockResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float gpuClock; + float memClock; + + constexpr QueryDeviceClockResponsePayload(Result result, float gpuClock, float memClock) + : 
header(DriverControlMessage::QueryDeviceClockResponse) + , result(result) + , gpuClock(gpuClock) + , memClock(memClock) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Device Clock By Mode Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDeviceClockByModeRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + DeviceClockMode deviceClockMode; + + constexpr QueryDeviceClockByModeRequestPayload(uint32 gpuIndex, DeviceClockMode clockMode) + : header(DriverControlMessage::QueryDeviceClockByModeRequest) + , gpuIndex(gpuIndex) + , deviceClockMode(clockMode) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockByModeRequestPayload, sizeof(DriverControlHeader) + 8); + + DD_NETWORK_STRUCT(QueryDeviceClockByModeResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float gpuClock; + float memClock; + + constexpr QueryDeviceClockByModeResponsePayload(Result result, float gpuClock, float memClock) + : header(DriverControlMessage::QueryDeviceClockByModeResponse) + , result(result) + , gpuClock(gpuClock) + , memClock(memClock) + { + } + }; + + DD_CHECK_SIZE(QueryDeviceClockByModeResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Max Device Clock Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryMaxDeviceClockRequestPayload, 4) + { + DriverControlHeader header; + uint32 gpuIndex; + + constexpr QueryMaxDeviceClockRequestPayload(uint32 gpuIndex) + : header(DriverControlMessage::QueryMaxDeviceClockRequest) + , gpuIndex(gpuIndex) + { + } + }; + + 
DD_CHECK_SIZE(QueryMaxDeviceClockRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(QueryMaxDeviceClockResponsePayload, 4) + { + DriverControlHeader header; + Result result; + float maxGpuClock; + float maxMemClock; + + constexpr QueryMaxDeviceClockResponsePayload(Result result, float maxGpuClock, float maxMemClock) + : header(DriverControlMessage::QueryMaxDeviceClockResponse) + , result(result) + , maxGpuClock(maxGpuClock) + , maxMemClock(maxMemClock) + { + } + }; + + DD_CHECK_SIZE(QueryMaxDeviceClockResponsePayload, sizeof(DriverControlHeader) + 12); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Driver Status Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryDriverStatusRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryDriverStatusRequestPayload() + : header(DriverControlMessage::QueryDriverStatusRequest) + { + } + }; + + DD_CHECK_SIZE(QueryDriverStatusRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryDriverStatusResponsePayload, 4) + { + DriverControlHeader header; + DriverStatus status; + + constexpr QueryDriverStatusResponsePayload(DriverStatus status) + : header(DriverControlMessage::QueryDriverStatusResponse) + , status(status) + { + } + }; + + DD_CHECK_SIZE(QueryDriverStatusResponsePayload, sizeof(DriverControlHeader) + 4); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Step Driver Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(StepDriverRequestPayload, 4) + { + DriverControlHeader header; + uint32 count; + + constexpr StepDriverRequestPayload(uint32 count) + : header(DriverControlMessage::StepDriverRequest) + , count(count) + { + } + 
}; + + DD_CHECK_SIZE(StepDriverRequestPayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(StepDriverResponsePayload, 4) + { + DriverControlHeader header; + Result result; + + constexpr StepDriverResponsePayload(Result result) + : header(DriverControlMessage::StepDriverResponse) + , result(result) + { + } + }; + + DD_CHECK_SIZE(StepDriverResponsePayload, sizeof(DriverControlHeader) + 4); + + DD_NETWORK_STRUCT(StepDriverResponsePayloadV2, 4) + { + DriverControlHeader header; + Result result; + DriverStatus status; + + constexpr StepDriverResponsePayloadV2(Result result, DriverStatus status) + : header(DriverControlMessage::StepDriverResponseV2) + , result(result) + , status(status) + { + } + }; + + DD_CHECK_SIZE(StepDriverResponsePayloadV2, sizeof(DriverControlHeader) + 8); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Query Client Info Request/Response + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(QueryClientInfoRequestPayload, 4) + { + DriverControlHeader header; + + constexpr QueryClientInfoRequestPayload() + : header(DriverControlMessage::QueryClientInfoRequest) + { + } + }; + + DD_CHECK_SIZE(QueryClientInfoRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(QueryClientInfoResponsePayload, 4) + { + DriverControlHeader header; + ClientInfoStruct clientInfo; + + constexpr QueryClientInfoResponsePayload(const ClientInfoStruct& clientInfo) + : header(DriverControlMessage::QueryClientInfoResponse) + , clientInfo(clientInfo) + { + } + }; + + DD_CHECK_SIZE(QueryClientInfoResponsePayload, sizeof(DriverControlHeader) + sizeof(ClientInfoStruct)); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Ignore Driver Request/Response + 
/////////////////////////////////////////////////////////////////////////////////////////////////////////////// + DD_NETWORK_STRUCT(IgnoreDriverRequestPayload, 4) + { + DriverControlHeader header; + + constexpr IgnoreDriverRequestPayload() + : header(DriverControlMessage::IgnoreDriverRequest) + { + } + }; + + DD_CHECK_SIZE(IgnoreDriverRequestPayload, sizeof(DriverControlHeader)); + + DD_NETWORK_STRUCT(IgnoreDriverResponsePayload, 4) + { + DriverControlHeader header; + + constexpr IgnoreDriverResponsePayload() + : header(DriverControlMessage::IgnoreDriverResponse) + { + } + }; + + DD_CHECK_SIZE(IgnoreDriverResponsePayload, sizeof(DriverControlHeader)); + } +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h index 77e3e589a6..428ff76b1b 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/driverControlServer.h @@ -1,157 +1,157 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include "baseProtocolServer.h" -#include "driverControlProtocol.h" - -namespace DevDriver -{ - namespace DriverControlProtocol - { - DD_STATIC_CONST uint32 kMaxNumGpus = 16; - - typedef Result(*SetDeviceClockModeCallback)(uint32 gpuIndex, DeviceClockMode clockMode, void* pUserdata); - -#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION - typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, float* pGpuClock, float* pMemClock, void* pUserdata); - typedef Result(*QueryMaxDeviceClockCallback)(uint32 gpuIndex, float* pMaxGpuClock, float* pMaxMemClock, void* pUserdata); - - struct DeviceClockCallbackInfo - { - QueryDeviceClockCallback queryClockCallback; - QueryMaxDeviceClockCallback queryMaxClockCallback; - SetDeviceClockModeCallback setCallback; - void* pUserdata; - }; -#else - typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, DevDriver::DriverControlProtocol::DeviceClockMode clockMode, float* pGpuClock, float* pMemClock, void* pUserdata); - - struct DeviceClockCallbackInfo - { - QueryDeviceClockCallback queryClockCallback; - SetDeviceClockModeCallback setCallback; - void* pUserdata; - }; -#endif - - enum class SessionState; - - class DriverControlServer : public BaseProtocolServer - { - public: - explicit DriverControlServer(IMsgChannel* pMsgChannel); - ~DriverControlServer(); - - 
void Finalize() override; - - // Session handling functions - bool AcceptSession(const SharedPointer& pSession) override; - void SessionEstablished(const SharedPointer& pSession) override; - void UpdateSession(const SharedPointer& pSession) override; - void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; - - // Driver state functions -#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION - // These functions just pass through to the new renamed variants to preserve backward compatibility - void WaitForDriverResume() { DriverTick(); } - void StartDeviceInit() { StartEarlyDeviceInit(); } - void FinishDriverInitialization() { FinishDeviceInit(); } -#endif - void StartEarlyDeviceInit(); - void StartLateDeviceInit(); - void FinishDeviceInit(); - void PauseDriver(); - void ResumeDriver(); - void DriverTick(); - - // Other public functions - bool IsDriverInitialized() const; - DriverStatus QueryDriverStatus(); - void SetNumGpus(uint32 numGpus); - void SetDeviceClockCallback(const DeviceClockCallbackInfo& deviceClockCallbackInfo); - uint32 GetNumGpus(); - DeviceClockMode GetDeviceClockMode(uint32 gpuIndex); - - // Sets the client id that's expected to walk us through the driver initialization process. - // If this isn't set, the server will attempt to find a suitable client itself via broadcast + discovery. 
- void SetDriverInitClientId(ClientId clientId) { m_driverInitClientId = clientId; } - - /// Returns true if this driver will be ignored by tools - bool IsDriverIgnored() const { return m_isIgnored; } - - private: - void LockData(); - void UnlockData(); - - // Private driver state functions - void AdvanceDriverInitState(); - void WaitForResume(); - bool DiscoverHaltRequests(); - void HandleDriverHalt(); - bool IsHalted() const - { - return ((m_driverStatus == DriverStatus::HaltedOnPlatformInit) || - (m_driverStatus == DriverStatus::HaltedOnDeviceInit) || - (m_driverStatus == DriverStatus::HaltedPostDeviceInit)); - } - - // Protocol message handlers - SessionState HandlePauseDriverRequest(SizedPayloadContainer& container); - SessionState HandleResumeDriverRequest(SizedPayloadContainer& container); - SessionState HandleQueryDeviceClockModeRequest(SizedPayloadContainer& container); - SessionState HandleSetDeviceClockModeRequest(SizedPayloadContainer& container); - SessionState HandleQueryDeviceClockRequest(SizedPayloadContainer& container); - SessionState HandleQueryDeviceClockByModeRequest(SizedPayloadContainer& container); - SessionState HandleQueryMaxDeviceClockRequest(SizedPayloadContainer& container); - SessionState HandleQueryNumGpusRequest(SizedPayloadContainer& container); - SessionState HandleQueryDriverStatusRequest(SizedPayloadContainer& container, const Version sessionVersion); - SessionState HandleStepDriverRequest(SizedPayloadContainer& container, const Version sessionVersion); - SessionState HandleIgnoreDriverRequest(SizedPayloadContainer& container); - - Platform::Mutex m_mutex; - DriverStatus m_driverStatus; - Platform::Event m_driverResumedEvent; - - uint32 m_numGpus; - DeviceClockMode m_deviceClockModes[kMaxNumGpus]; - DeviceClockCallbackInfo m_deviceClockCallbackInfo; - Platform::Atomic m_numSessions; - Platform::Atomic m_stepCounter; - bool m_initStepRequested; - - // The client id of the remote client who's responsible for walking us through the 
driver initialization - // process. - ClientId m_driverInitClientId; - - // This value is set to true if a remote tool has indicated that this driver will be ignored - bool m_isIgnored; - - DD_STATIC_CONST uint32 kBroadcastIntervalInMs = 100; - DD_STATIC_CONST uint32 kDefaultDriverStartTimeoutMs = 1000; - }; - } -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "baseProtocolServer.h" +#include "driverControlProtocol.h" + +namespace DevDriver +{ + namespace DriverControlProtocol + { + DD_STATIC_CONST uint32 kMaxNumGpus = 16; + + typedef Result(*SetDeviceClockModeCallback)(uint32 gpuIndex, DeviceClockMode clockMode, void* pUserdata); + +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_QUERY_CLOCKS_BY_MODE_VERSION + typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, float* pGpuClock, float* pMemClock, void* pUserdata); + typedef Result(*QueryMaxDeviceClockCallback)(uint32 gpuIndex, float* pMaxGpuClock, float* pMaxMemClock, void* pUserdata); + + struct DeviceClockCallbackInfo + { + QueryDeviceClockCallback queryClockCallback; + QueryMaxDeviceClockCallback queryMaxClockCallback; + SetDeviceClockModeCallback setCallback; + void* pUserdata; + }; +#else + typedef Result(*QueryDeviceClockCallback)(uint32 gpuIndex, DevDriver::DriverControlProtocol::DeviceClockMode clockMode, float* pGpuClock, float* pMemClock, void* pUserdata); + + struct DeviceClockCallbackInfo + { + QueryDeviceClockCallback queryClockCallback; + SetDeviceClockModeCallback setCallback; + void* pUserdata; + }; +#endif + + enum class SessionState; + + class DriverControlServer : public BaseProtocolServer + { + public: + explicit DriverControlServer(IMsgChannel* pMsgChannel); + ~DriverControlServer(); + + void Finalize() override; + + // Session handling functions + bool AcceptSession(const SharedPointer& pSession) override; + void SessionEstablished(const SharedPointer& pSession) override; + void UpdateSession(const SharedPointer& pSession) override; + void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; + + // Driver state functions +#if GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION < GPUOPEN_DRIVER_CONTROL_CLEANUP_VERSION + // These functions just pass 
through to the new renamed variants to preserve backward compatibility + void WaitForDriverResume() { DriverTick(); } + void StartDeviceInit() { StartEarlyDeviceInit(); } + void FinishDriverInitialization() { FinishDeviceInit(); } +#endif + void StartEarlyDeviceInit(); + void StartLateDeviceInit(); + void FinishDeviceInit(); + void PauseDriver(); + void ResumeDriver(); + void DriverTick(); + + // Other public functions + bool IsDriverInitialized() const; + DriverStatus QueryDriverStatus(); + void SetNumGpus(uint32 numGpus); + void SetDeviceClockCallback(const DeviceClockCallbackInfo& deviceClockCallbackInfo); + uint32 GetNumGpus(); + DeviceClockMode GetDeviceClockMode(uint32 gpuIndex); + + // Sets the client id that's expected to walk us through the driver initialization process. + // If this isn't set, the server will attempt to find a suitable client itself via broadcast + discovery. + void SetDriverInitClientId(ClientId clientId) { m_driverInitClientId = clientId; } + + /// Returns true if this driver will be ignored by tools + bool IsDriverIgnored() const { return m_isIgnored; } + + private: + void LockData(); + void UnlockData(); + + // Private driver state functions + void AdvanceDriverInitState(); + void WaitForResume(); + bool DiscoverHaltRequests(); + void HandleDriverHalt(); + bool IsHalted() const + { + return ((m_driverStatus == DriverStatus::HaltedOnPlatformInit) || + (m_driverStatus == DriverStatus::HaltedOnDeviceInit) || + (m_driverStatus == DriverStatus::HaltedPostDeviceInit)); + } + + // Protocol message handlers + SessionState HandlePauseDriverRequest(SizedPayloadContainer& container); + SessionState HandleResumeDriverRequest(SizedPayloadContainer& container); + SessionState HandleQueryDeviceClockModeRequest(SizedPayloadContainer& container); + SessionState HandleSetDeviceClockModeRequest(SizedPayloadContainer& container); + SessionState HandleQueryDeviceClockRequest(SizedPayloadContainer& container); + SessionState 
HandleQueryDeviceClockByModeRequest(SizedPayloadContainer& container); + SessionState HandleQueryMaxDeviceClockRequest(SizedPayloadContainer& container); + SessionState HandleQueryNumGpusRequest(SizedPayloadContainer& container); + SessionState HandleQueryDriverStatusRequest(SizedPayloadContainer& container, const Version sessionVersion); + SessionState HandleStepDriverRequest(SizedPayloadContainer& container, const Version sessionVersion); + SessionState HandleIgnoreDriverRequest(SizedPayloadContainer& container); + + Platform::Mutex m_mutex; + DriverStatus m_driverStatus; + Platform::Event m_driverResumedEvent; + + uint32 m_numGpus; + DeviceClockMode m_deviceClockModes[kMaxNumGpus]; + DeviceClockCallbackInfo m_deviceClockCallbackInfo; + Platform::Atomic m_numSessions; + Platform::Atomic m_stepCounter; + bool m_initStepRequested; + + // The client id of the remote client who's responsible for walking us through the driver initialization + // process. + ClientId m_driverInitClientId; + + // This value is set to true if a remote tool has indicated that this driver will be ignored + bool m_isIgnored; + + DD_STATIC_CONST uint32 kBroadcastIntervalInMs = 100; + DD_STATIC_CONST uint32 kDefaultDriverStartTimeoutMs = 1000; + }; + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h index 7de472f132..027b3568b5 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpProtocol.h @@ -1,537 +1,537 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#pragma pack(push) - -#include "gpuopen.h" - -#define RGP_PROTOCOL_VERSION 11 - -#define RGP_PROTOCOL_MINIMUM_VERSION 2 - -/* -*********************************************************************************************************************** -*| Version | Change Description | -*| ------- | ---------------------------------------------------------------------------------------------------------| -*| 11.0 | Version bumped to indicate driver support for SE mask only applying to detailed instruction tracing | -*| 10.0 | Added support for SPM counters and SE masking. | -*| 9.0 | Decoupled trace parameters from execute trace request. 
| -*| 8.0 | Added support for capturing the RGP trace on specific frame or dispatch | -*| | Added bitfield to control whether driver internal code objects are included in the code object database | -*| 7.0 | Added support for aborting traces that are still in the pending state on the server. | -*| 6.0 | Added support for trace trigger markers. | -*| 5.0 | Added support for allow compute presents trace parameter and removed unused clock mode parameter. | -*| 4.0 | Added support for reporting trace transfer progress. | -*| 3.0 | Updated TraceParameters struct to allow for specifying profiling clock mode. | -*| 2.0 | Add TraceParameters struct and ExecuteTraceRequestPayload so a client can specify trace options. | -*| 1.0 | Initial version | -*********************************************************************************************************************** -*/ - -#define RGP_DETAILED_SEMASK_VERSION 11 -#define RGP_SPM_COUNTERS_VERSION 10 -#define RGP_DECOUPLED_TRACE_PARAMETERS 9 -#define RGP_FRAME_CAPTURE_VERSION 8 -#define RGP_PENDING_ABORT_VERSION 7 -#define RGP_TRIGGER_MARKERS_VERSION 6 -#define RGP_COMPUTE_PRESENTS_VERSION 5 -#define RGP_TRACE_PROGRESS_VERSION 4 -#define RGP_PROFILING_CLOCK_MODES_VERSION 3 -#define RGP_TRACE_PARAMETERS_VERSION 2 -#define RGP_INITIAL_VERSION 1 - -namespace DevDriver -{ - namespace RGPProtocol - { - /////////////////////// - // RGP Protocol - enum struct RGPMessage : MessageCode - { - Unknown = 0, - ExecuteTraceRequest, - TraceDataChunk, - TraceDataSentinel, - QueryProfilingStatusRequest, - QueryProfilingStatusResponse, - EnableProfilingRequest, - EnableProfilingResponse, - TraceDataHeader, - AbortTrace, - QueryTraceParametersRequest, - QueryTraceParametersResponse, - UpdateTraceParametersRequest, - UpdateTraceParametersResponse, - UpdateSpmConfigRequest, - UpdateSpmConfigData, - UpdateSpmConfigResponse, - Count - }; - - // @note: We currently subtract sizeof(uint32) instead of sizeof(RGPMessage) to work around struct packing 
issues. - // The compiler pads out RGPMessage to 4 bytes when it's included in the payload struct. It also pads out - // the TraceDataChunk data field to 1000 bytes. This causes the total payload size to be 1004 bytes which is - // 4 bytes larger than the maximum size allowed. - DD_STATIC_CONST Size kMaxTraceDataChunkSize = (kMaxPayloadSizeInBytes - sizeof(uint32) - sizeof(uint32)); - - /////////////////////// - // RGP Constants - const uint32 kMarkerStringLength = 256; - const uint32 kMaxSpmCountersPerUpdate = 320; - - // Define the number of bits per SPM id value - constexpr uint32 kSpmBlockIdBits = 8; - constexpr uint32 kSpmInstanceIdBits = 12; - constexpr uint32 kSpmEventIdBits = 12; - - // Define the max SPM id values based on the number of bits we allocate for them in the network packet - constexpr uint32 kMaxSpmBlockId = (1 << kSpmBlockIdBits); - constexpr uint32 kMaxSpmInstanceId = (1 << kSpmInstanceIdBits); - constexpr uint32 kMaxSpmEventId = (1 << kSpmEventIdBits); - - // The application can specify this value for the instance id and it will be expanded into - // all available instances on the driver side. - // The counter fields are bit packed when transferred over the network so we need to account for that here - // rather than simply setting all bits. 
- constexpr uint32 kSpmAllInstancesId = (kMaxSpmInstanceId - 1); - - /////////////////////// - // RGP Types - DD_NETWORK_STRUCT(TraceDataChunk, 4) - { - uint32 dataSize; - uint8 data[kMaxTraceDataChunkSize]; - }; - - DD_CHECK_SIZE(TraceDataChunk, kMaxTraceDataChunkSize + sizeof(int32)); - - DD_NETWORK_STRUCT(TraceParameters, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 reserved : 31; - }; - uint32 u32All; - } flags; - }; - - DD_CHECK_SIZE(TraceParameters, 12); - - enum struct ProfilingClockMode : uint32 - { - Stable = 0, - Max, - Normal, - Count - }; - - DD_NETWORK_STRUCT(TraceParametersV2, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - ProfilingClockMode clockMode; - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 reserved : 31; - }; - uint32 u32All; - } flags; - }; - - DD_CHECK_SIZE(TraceParametersV2, 16); - - DD_NETWORK_STRUCT(TraceParametersV3, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 reserved : 30; - }; - uint32 u32All; - } flags; - }; - - DD_CHECK_SIZE(TraceParametersV3, 12); - - DD_NETWORK_STRUCT(TraceParametersV4, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 reserved : 30; - }; - uint32 u32All; - } flags; - - // Begin Tag - uint32 beginTagHigh; - uint32 beginTagLow; - - // End Tag - uint32 endTagHigh; - uint32 endTagLow; - - // Begin/End Marker Strings - char beginMarker[kMarkerStringLength]; - char endMarker[kMarkerStringLength]; - }; - - DD_CHECK_SIZE(TraceParametersV4, 540); - - enum struct CaptureTriggerMode : uint32 - { - Present = 0, - Markers, - Index, - Count - }; - - DD_NETWORK_STRUCT(TraceParametersV5, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 
numPreparationFrames; - uint32 captureStartIndex; - uint32 captureStopIndex; - CaptureTriggerMode captureMode; - - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 captureDriverCodeObjects : 1; - uint32 reserved : 29; - }; - uint32 u32All; - } flags; - - // Begin Tag - uint32 beginTagHigh; - uint32 beginTagLow; - - // End Tag - uint32 endTagHigh; - uint32 endTagLow; - - // Begin/End Marker Strings - char beginMarker[kMarkerStringLength]; - char endMarker[kMarkerStringLength]; - }; - - DD_CHECK_SIZE(TraceParametersV5, 552); - - DD_NETWORK_STRUCT(TraceParametersV6, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - uint32 captureStartIndex; - uint32 captureStopIndex; - CaptureTriggerMode captureMode; - - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 captureDriverCodeObjects : 1; - uint32 reserved : 29; - }; - uint32 u32All; - } flags; - - // Begin Tag - uint32 beginTagHigh; - uint32 beginTagLow; - - // End Tag - uint32 endTagHigh; - uint32 endTagLow; - - // Begin/End Marker Strings - char beginMarker[kMarkerStringLength]; - char endMarker[kMarkerStringLength]; - - // Target pipeline hash - uint32 pipelineHashHi; - uint32 pipelineHashLo; - }; - - DD_CHECK_SIZE(TraceParametersV6, 560); - - DD_NETWORK_STRUCT(TraceParametersV7, 4) - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - uint32 captureStartIndex; - uint32 captureStopIndex; - CaptureTriggerMode captureMode; - - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 captureDriverCodeObjects : 1; - uint32 enableSpm : 1; - uint32 reserved : 28; - }; - uint32 u32All; - } flags; - - // Begin Tag - uint32 beginTagHigh; - uint32 beginTagLow; - - // End Tag - uint32 endTagHigh; - uint32 endTagLow; - - // Begin/End Marker Strings - char beginMarker[kMarkerStringLength]; - char endMarker[kMarkerStringLength]; - - // Target 
pipeline hash - uint32 pipelineHashHi; - uint32 pipelineHashLo; - - // Shader Engine Mask - uint32 seMask; - }; - - DD_CHECK_SIZE(TraceParametersV7, 564); - - DD_NETWORK_STRUCT(SpmCounterId, 4) - { - uint32 blockId : kSpmBlockIdBits; - uint32 instanceId : kSpmInstanceIdBits; - uint32 eventId : kSpmEventIdBits; - }; - - DD_CHECK_SIZE(SpmCounterId, 4); - - static_assert( - kSpmBlockIdBits + kSpmInstanceIdBits + kSpmEventIdBits == 8 * sizeof(SpmCounterId), - "SpmCounterId is wasting bits"); - - enum struct ProfilingStatus : uint32 - { - NotAvailable = 0, - Available, - Enabled, - Count - }; - - /////////////////////// - // RGP Payloads - - DD_NETWORK_STRUCT(ExecuteTraceRequestPayload, 4) - { - TraceParameters parameters; - }; - - DD_CHECK_SIZE(ExecuteTraceRequestPayload, 12); - - DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV2, 4) - { - TraceParametersV2 parameters; - }; - - DD_CHECK_SIZE(ExecuteTraceRequestPayloadV2, 16); - - DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV3, 4) - { - TraceParametersV3 parameters; - }; - - DD_CHECK_SIZE(ExecuteTraceRequestPayloadV3, 12); - - DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV4, 4) - { - TraceParametersV4 parameters; - }; - - DD_CHECK_SIZE(ExecuteTraceRequestPayloadV4, 540); - - DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV5, 4) - { - TraceParametersV5 parameters; - }; - - DD_CHECK_SIZE(ExecuteTraceRequestPayloadV5, 552); - - DD_NETWORK_STRUCT(TraceDataChunkPayload, 4) - { - TraceDataChunk chunk; - }; - - DD_CHECK_SIZE(TraceDataChunkPayload, kMaxTraceDataChunkSize + sizeof(int32)); - - DD_NETWORK_STRUCT(TraceDataSentinelPayload, 4) - { - Result result; - }; - - DD_CHECK_SIZE(TraceDataSentinelPayload, 4); - - DD_NETWORK_STRUCT(TraceDataHeaderPayload, 4) - { - Result result; - uint32 numChunks; - uint32 sizeInBytes; - }; - - DD_CHECK_SIZE(TraceDataHeaderPayload, 12); - - DD_NETWORK_STRUCT(QueryProfilingStatusResponsePayload, 4) - { - ProfilingStatus status; - }; - - DD_CHECK_SIZE(QueryProfilingStatusResponsePayload, 4); - - 
DD_NETWORK_STRUCT(EnableProfilingResponsePayload, 4) - { - Result result; - }; - - DD_CHECK_SIZE(EnableProfilingResponsePayload, 4); - - DD_NETWORK_STRUCT(QueryTraceParametersResponsePayload, 4) - { - Result result; - TraceParametersV6 parameters; - }; - - DD_CHECK_SIZE(QueryTraceParametersResponsePayload, 564); - - DD_NETWORK_STRUCT(QueryTraceParametersResponsePayloadV2, 4) - { - Result result; - TraceParametersV7 parameters; - }; - - DD_CHECK_SIZE(QueryTraceParametersResponsePayloadV2, 568); - - DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayload, 4) - { - TraceParametersV6 parameters; - }; - - DD_CHECK_SIZE(UpdateTraceParametersRequestPayload, 560); - - DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayloadV2, 4) - { - TraceParametersV7 parameters; - }; - - DD_CHECK_SIZE(UpdateTraceParametersRequestPayloadV2, 564); - - DD_NETWORK_STRUCT(UpdateTraceParametersResponsePayload, 4) - { - Result result; - }; - - DD_CHECK_SIZE(UpdateTraceParametersResponsePayload, 4); - - DD_NETWORK_STRUCT(UpdateSpmConfigRequestPayload, 4) - { - uint32 sampleFrequency; - uint32 memoryLimitInMb; - uint32 numDataPayloads; - }; - - DD_CHECK_SIZE(UpdateSpmConfigRequestPayload, 12); - - DD_NETWORK_STRUCT(UpdateSpmConfigDataPayload, 4) - { - uint32 numCounters; - SpmCounterId counters[kMaxSpmCountersPerUpdate]; - }; - - DD_CHECK_SIZE(UpdateSpmConfigDataPayload, 1284); - - DD_NETWORK_STRUCT(UpdateSpmConfigResponsePayload, 4) - { - Result result; - }; - - DD_CHECK_SIZE(UpdateSpmConfigResponsePayload, 4); - - DD_NETWORK_STRUCT(RGPPayload, 4) - { - RGPMessage command; - // pad out to 4 bytes for alignment requirements - char padding[3]; - union - { - ExecuteTraceRequestPayload executeTraceRequest; - ExecuteTraceRequestPayloadV2 executeTraceRequestV2; - ExecuteTraceRequestPayloadV3 executeTraceRequestV3; - ExecuteTraceRequestPayloadV4 executeTraceRequestV4; - ExecuteTraceRequestPayloadV5 executeTraceRequestV5; - TraceDataChunkPayload traceDataChunk; - TraceDataSentinelPayload traceDataSentinel; 
- TraceDataHeaderPayload traceDataHeader; - QueryProfilingStatusResponsePayload queryProfilingStatusResponse; - EnableProfilingResponsePayload enableProfilingStatusResponse; - QueryTraceParametersResponsePayload queryTraceParametersResponse; - QueryTraceParametersResponsePayloadV2 queryTraceParametersResponseV2; - UpdateTraceParametersRequestPayload updateTraceParametersRequest; - UpdateTraceParametersRequestPayloadV2 updateTraceParametersRequestV2; - UpdateTraceParametersResponsePayload updateTraceParametersResponse; - UpdateSpmConfigRequestPayload updateSpmConfigRequest; - UpdateSpmConfigDataPayload updateSpmConfigData; - UpdateSpmConfigResponsePayload updateSpmConfigResponse; - }; - }; - - DD_CHECK_SIZE(RGPPayload, kMaxPayloadSizeInBytes); - } -} - -#pragma pack(pop) +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#pragma pack(push) + +#include "gpuopen.h" + +#define RGP_PROTOCOL_VERSION 11 + +#define RGP_PROTOCOL_MINIMUM_VERSION 2 + +/* +*********************************************************************************************************************** +*| Version | Change Description | +*| ------- | ---------------------------------------------------------------------------------------------------------| +*| 11.0 | Version bumped to indicate driver support for SE mask only applying to detailed instruction tracing | +*| 10.0 | Added support for SPM counters and SE masking. | +*| 9.0 | Decoupled trace parameters from execute trace request. | +*| 8.0 | Added support for capturing the RGP trace on specific frame or dispatch | +*| | Added bitfield to control whether driver internal code objects are included in the code object database | +*| 7.0 | Added support for aborting traces that are still in the pending state on the server. | +*| 6.0 | Added support for trace trigger markers. | +*| 5.0 | Added support for allow compute presents trace parameter and removed unused clock mode parameter. | +*| 4.0 | Added support for reporting trace transfer progress. | +*| 3.0 | Updated TraceParameters struct to allow for specifying profiling clock mode. | +*| 2.0 | Add TraceParameters struct and ExecuteTraceRequestPayload so a client can specify trace options. 
| +*| 1.0 | Initial version | +*********************************************************************************************************************** +*/ + +#define RGP_DETAILED_SEMASK_VERSION 11 +#define RGP_SPM_COUNTERS_VERSION 10 +#define RGP_DECOUPLED_TRACE_PARAMETERS 9 +#define RGP_FRAME_CAPTURE_VERSION 8 +#define RGP_PENDING_ABORT_VERSION 7 +#define RGP_TRIGGER_MARKERS_VERSION 6 +#define RGP_COMPUTE_PRESENTS_VERSION 5 +#define RGP_TRACE_PROGRESS_VERSION 4 +#define RGP_PROFILING_CLOCK_MODES_VERSION 3 +#define RGP_TRACE_PARAMETERS_VERSION 2 +#define RGP_INITIAL_VERSION 1 + +namespace DevDriver +{ + namespace RGPProtocol + { + /////////////////////// + // RGP Protocol + enum struct RGPMessage : MessageCode + { + Unknown = 0, + ExecuteTraceRequest, + TraceDataChunk, + TraceDataSentinel, + QueryProfilingStatusRequest, + QueryProfilingStatusResponse, + EnableProfilingRequest, + EnableProfilingResponse, + TraceDataHeader, + AbortTrace, + QueryTraceParametersRequest, + QueryTraceParametersResponse, + UpdateTraceParametersRequest, + UpdateTraceParametersResponse, + UpdateSpmConfigRequest, + UpdateSpmConfigData, + UpdateSpmConfigResponse, + Count + }; + + // @note: We currently subtract sizeof(uint32) instead of sizeof(RGPMessage) to work around struct packing issues. + // The compiler pads out RGPMessage to 4 bytes when it's included in the payload struct. It also pads out + // the TraceDataChunk data field to 1000 bytes. This causes the total payload size to be 1004 bytes which is + // 4 bytes larger than the maximum size allowed. 
+ DD_STATIC_CONST Size kMaxTraceDataChunkSize = (kMaxPayloadSizeInBytes - sizeof(uint32) - sizeof(uint32)); + + /////////////////////// + // RGP Constants + const uint32 kMarkerStringLength = 256; + const uint32 kMaxSpmCountersPerUpdate = 320; + + // Define the number of bits per SPM id value + constexpr uint32 kSpmBlockIdBits = 8; + constexpr uint32 kSpmInstanceIdBits = 12; + constexpr uint32 kSpmEventIdBits = 12; + + // Define the max SPM id values based on the number of bits we allocate for them in the network packet + constexpr uint32 kMaxSpmBlockId = (1 << kSpmBlockIdBits); + constexpr uint32 kMaxSpmInstanceId = (1 << kSpmInstanceIdBits); + constexpr uint32 kMaxSpmEventId = (1 << kSpmEventIdBits); + + // The application can specify this value for the instance id and it will be expanded into + // all available instances on the driver side. + // The counter fields are bit packed when transferred over the network so we need to account for that here + // rather than simply setting all bits. 
+ constexpr uint32 kSpmAllInstancesId = (kMaxSpmInstanceId - 1); + + /////////////////////// + // RGP Types + DD_NETWORK_STRUCT(TraceDataChunk, 4) + { + uint32 dataSize; + uint8 data[kMaxTraceDataChunkSize]; + }; + + DD_CHECK_SIZE(TraceDataChunk, kMaxTraceDataChunkSize + sizeof(int32)); + + DD_NETWORK_STRUCT(TraceParameters, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 reserved : 31; + }; + uint32 u32All; + } flags; + }; + + DD_CHECK_SIZE(TraceParameters, 12); + + enum struct ProfilingClockMode : uint32 + { + Stable = 0, + Max, + Normal, + Count + }; + + DD_NETWORK_STRUCT(TraceParametersV2, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + ProfilingClockMode clockMode; + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 reserved : 31; + }; + uint32 u32All; + } flags; + }; + + DD_CHECK_SIZE(TraceParametersV2, 16); + + DD_NETWORK_STRUCT(TraceParametersV3, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 reserved : 30; + }; + uint32 u32All; + } flags; + }; + + DD_CHECK_SIZE(TraceParametersV3, 12); + + DD_NETWORK_STRUCT(TraceParametersV4, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 reserved : 30; + }; + uint32 u32All; + } flags; + + // Begin Tag + uint32 beginTagHigh; + uint32 beginTagLow; + + // End Tag + uint32 endTagHigh; + uint32 endTagLow; + + // Begin/End Marker Strings + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + }; + + DD_CHECK_SIZE(TraceParametersV4, 540); + + enum struct CaptureTriggerMode : uint32 + { + Present = 0, + Markers, + Index, + Count + }; + + DD_NETWORK_STRUCT(TraceParametersV5, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 
numPreparationFrames; + uint32 captureStartIndex; + uint32 captureStopIndex; + CaptureTriggerMode captureMode; + + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 captureDriverCodeObjects : 1; + uint32 reserved : 29; + }; + uint32 u32All; + } flags; + + // Begin Tag + uint32 beginTagHigh; + uint32 beginTagLow; + + // End Tag + uint32 endTagHigh; + uint32 endTagLow; + + // Begin/End Marker Strings + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + }; + + DD_CHECK_SIZE(TraceParametersV5, 552); + + DD_NETWORK_STRUCT(TraceParametersV6, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + uint32 captureStartIndex; + uint32 captureStopIndex; + CaptureTriggerMode captureMode; + + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 captureDriverCodeObjects : 1; + uint32 reserved : 29; + }; + uint32 u32All; + } flags; + + // Begin Tag + uint32 beginTagHigh; + uint32 beginTagLow; + + // End Tag + uint32 endTagHigh; + uint32 endTagLow; + + // Begin/End Marker Strings + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + + // Target pipeline hash + uint32 pipelineHashHi; + uint32 pipelineHashLo; + }; + + DD_CHECK_SIZE(TraceParametersV6, 560); + + DD_NETWORK_STRUCT(TraceParametersV7, 4) + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + uint32 captureStartIndex; + uint32 captureStopIndex; + CaptureTriggerMode captureMode; + + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 captureDriverCodeObjects : 1; + uint32 enableSpm : 1; + uint32 reserved : 28; + }; + uint32 u32All; + } flags; + + // Begin Tag + uint32 beginTagHigh; + uint32 beginTagLow; + + // End Tag + uint32 endTagHigh; + uint32 endTagLow; + + // Begin/End Marker Strings + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + + // Target 
pipeline hash + uint32 pipelineHashHi; + uint32 pipelineHashLo; + + // Shader Engine Mask + uint32 seMask; + }; + + DD_CHECK_SIZE(TraceParametersV7, 564); + + DD_NETWORK_STRUCT(SpmCounterId, 4) + { + uint32 blockId : kSpmBlockIdBits; + uint32 instanceId : kSpmInstanceIdBits; + uint32 eventId : kSpmEventIdBits; + }; + + DD_CHECK_SIZE(SpmCounterId, 4); + + static_assert( + kSpmBlockIdBits + kSpmInstanceIdBits + kSpmEventIdBits == 8 * sizeof(SpmCounterId), + "SpmCounterId is wasting bits"); + + enum struct ProfilingStatus : uint32 + { + NotAvailable = 0, + Available, + Enabled, + Count + }; + + /////////////////////// + // RGP Payloads + + DD_NETWORK_STRUCT(ExecuteTraceRequestPayload, 4) + { + TraceParameters parameters; + }; + + DD_CHECK_SIZE(ExecuteTraceRequestPayload, 12); + + DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV2, 4) + { + TraceParametersV2 parameters; + }; + + DD_CHECK_SIZE(ExecuteTraceRequestPayloadV2, 16); + + DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV3, 4) + { + TraceParametersV3 parameters; + }; + + DD_CHECK_SIZE(ExecuteTraceRequestPayloadV3, 12); + + DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV4, 4) + { + TraceParametersV4 parameters; + }; + + DD_CHECK_SIZE(ExecuteTraceRequestPayloadV4, 540); + + DD_NETWORK_STRUCT(ExecuteTraceRequestPayloadV5, 4) + { + TraceParametersV5 parameters; + }; + + DD_CHECK_SIZE(ExecuteTraceRequestPayloadV5, 552); + + DD_NETWORK_STRUCT(TraceDataChunkPayload, 4) + { + TraceDataChunk chunk; + }; + + DD_CHECK_SIZE(TraceDataChunkPayload, kMaxTraceDataChunkSize + sizeof(int32)); + + DD_NETWORK_STRUCT(TraceDataSentinelPayload, 4) + { + Result result; + }; + + DD_CHECK_SIZE(TraceDataSentinelPayload, 4); + + DD_NETWORK_STRUCT(TraceDataHeaderPayload, 4) + { + Result result; + uint32 numChunks; + uint32 sizeInBytes; + }; + + DD_CHECK_SIZE(TraceDataHeaderPayload, 12); + + DD_NETWORK_STRUCT(QueryProfilingStatusResponsePayload, 4) + { + ProfilingStatus status; + }; + + DD_CHECK_SIZE(QueryProfilingStatusResponsePayload, 4); + + 
DD_NETWORK_STRUCT(EnableProfilingResponsePayload, 4) + { + Result result; + }; + + DD_CHECK_SIZE(EnableProfilingResponsePayload, 4); + + DD_NETWORK_STRUCT(QueryTraceParametersResponsePayload, 4) + { + Result result; + TraceParametersV6 parameters; + }; + + DD_CHECK_SIZE(QueryTraceParametersResponsePayload, 564); + + DD_NETWORK_STRUCT(QueryTraceParametersResponsePayloadV2, 4) + { + Result result; + TraceParametersV7 parameters; + }; + + DD_CHECK_SIZE(QueryTraceParametersResponsePayloadV2, 568); + + DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayload, 4) + { + TraceParametersV6 parameters; + }; + + DD_CHECK_SIZE(UpdateTraceParametersRequestPayload, 560); + + DD_NETWORK_STRUCT(UpdateTraceParametersRequestPayloadV2, 4) + { + TraceParametersV7 parameters; + }; + + DD_CHECK_SIZE(UpdateTraceParametersRequestPayloadV2, 564); + + DD_NETWORK_STRUCT(UpdateTraceParametersResponsePayload, 4) + { + Result result; + }; + + DD_CHECK_SIZE(UpdateTraceParametersResponsePayload, 4); + + DD_NETWORK_STRUCT(UpdateSpmConfigRequestPayload, 4) + { + uint32 sampleFrequency; + uint32 memoryLimitInMb; + uint32 numDataPayloads; + }; + + DD_CHECK_SIZE(UpdateSpmConfigRequestPayload, 12); + + DD_NETWORK_STRUCT(UpdateSpmConfigDataPayload, 4) + { + uint32 numCounters; + SpmCounterId counters[kMaxSpmCountersPerUpdate]; + }; + + DD_CHECK_SIZE(UpdateSpmConfigDataPayload, 1284); + + DD_NETWORK_STRUCT(UpdateSpmConfigResponsePayload, 4) + { + Result result; + }; + + DD_CHECK_SIZE(UpdateSpmConfigResponsePayload, 4); + + DD_NETWORK_STRUCT(RGPPayload, 4) + { + RGPMessage command; + // pad out to 4 bytes for alignment requirements + char padding[3]; + union + { + ExecuteTraceRequestPayload executeTraceRequest; + ExecuteTraceRequestPayloadV2 executeTraceRequestV2; + ExecuteTraceRequestPayloadV3 executeTraceRequestV3; + ExecuteTraceRequestPayloadV4 executeTraceRequestV4; + ExecuteTraceRequestPayloadV5 executeTraceRequestV5; + TraceDataChunkPayload traceDataChunk; + TraceDataSentinelPayload traceDataSentinel; 
+ TraceDataHeaderPayload traceDataHeader; + QueryProfilingStatusResponsePayload queryProfilingStatusResponse; + EnableProfilingResponsePayload enableProfilingStatusResponse; + QueryTraceParametersResponsePayload queryTraceParametersResponse; + QueryTraceParametersResponsePayloadV2 queryTraceParametersResponseV2; + UpdateTraceParametersRequestPayload updateTraceParametersRequest; + UpdateTraceParametersRequestPayloadV2 updateTraceParametersRequestV2; + UpdateTraceParametersResponsePayload updateTraceParametersResponse; + UpdateSpmConfigRequestPayload updateSpmConfigRequest; + UpdateSpmConfigDataPayload updateSpmConfigData; + UpdateSpmConfigResponsePayload updateSpmConfigResponse; + }; + }; + + DD_CHECK_SIZE(RGPPayload, kMaxPayloadSizeInBytes); + } +} + +#pragma pack(pop) diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h index 7dbc74f15d..05002b2c8b 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/rgpServer.h @@ -1,174 +1,174 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "baseProtocolServer.h" -#include "util/vector.h" - -#include "rgpProtocol.h" - -namespace DevDriver -{ - namespace RGPProtocol - { - enum class TraceStatus : uint32 - { - Idle = 0, - Pending, - Running, - Finishing, - Aborting - }; - - struct ServerTraceParametersInfo - { - uint32 gpuMemoryLimitInMb; - uint32 numPreparationFrames; - uint32 captureStartIndex; - uint32 captureStopIndex; - CaptureTriggerMode captureMode; - - union - { - struct - { - uint32 enableInstructionTokens : 1; - uint32 allowComputePresents : 1; - uint32 captureDriverCodeObjects : 1; - uint32 enableSpm : 1; - uint32 reserved : 28; - }; - uint32 u32All; - } flags; - - uint64 beginTag; - uint64 endTag; - - char beginMarker[kMarkerStringLength]; - char endMarker[kMarkerStringLength]; - - uint64 pipelineHash; - -#if DD_VERSION_SUPPORTS(GPUOPEN_RGP_SPM_COUNTERS_VERSION) - uint32 seMask; -#endif - }; - - struct ServerSpmCounterId - { - uint32 blockId; - uint32 instanceId; - uint32 eventId; - }; - - struct ServerSpmConfig - { - uint32 sampleFrequency; - uint32 memoryLimitInMb; - }; - - typedef bool (*PFN_ValidateSpmConfig)(void* pUserdata, const ServerSpmConfig* pConfig, const Vector* pCounterData); - struct ValidateSpmCallbackInfo - { - void* pUserdata; - PFN_ValidateSpmConfig pfnValidateSpmConfig; - }; - - struct RGPSession; - - class RGPServer : public BaseProtocolServer - { - public: - explicit RGPServer(IMsgChannel* pMsgChannel); - ~RGPServer(); - - void Finalize() override; - - bool AcceptSession(const SharedPointer& pSession) override; - void SessionEstablished(const SharedPointer& pSession) override; - void UpdateSession(const SharedPointer& pSession) override; - void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; - - // Returns true if traces are currently enabled. 
- bool TracesEnabled(); - - // Allows remote clients to request traces. - Result EnableTraces(); - - // Disable support for traces. - Result DisableTraces(); - - // Returns true if a client has requested a trace and it has not been started yet. - bool IsTracePending(); - - // Returns true if a client has requested a trace and it is currently running. - bool IsTraceRunning(); - - // Returns true if the server is currently outputting trace results to a client. - bool IsTraceOutputInProgress(); - - // Starts a new trace. This will only succeed if a trace was previously pending. - Result BeginTrace(); - - // Ends a trace. This will only succeed if a trace was previously in progress. - Result EndTrace(); - - // Aborts a trace. This will only succeed if a trace was previously in progress. - Result AbortTrace(); - - // Writes data into the current trace. This can only be performed when there is a trace in progress. - Result WriteTraceData(const uint8* pTraceData, size_t traceDataSize); - - // Returns the current profiling status on the rgp server. - ProfilingStatus QueryProfilingStatus(); - - // Returns the current trace parameters on the rgp server. 
- ServerTraceParametersInfo QueryTraceParameters(); - - // Populates the provided structure with the current perf counter config and returns data for each counter - // in the provided vector - Result QuerySpmConfig(ServerSpmConfig* pConfig, Vector* pCounterData); - - // Sets a validation callback that will be used to validate SPM configuration data - void SetSpmValidationCallback(const ValidateSpmCallbackInfo& callback); - - private: - void LockData(); - void UnlockData(); - void ClearCurrentSession(); - Result UpdateSpmConfig(const ServerSpmConfig& config, const Vector& counters); - - Platform::Mutex m_mutex; - TraceStatus m_traceStatus; - RGPSession* m_pCurrentSessionData; - ProfilingStatus m_profilingStatus; - ServerTraceParametersInfo m_traceParameters; - ServerSpmConfig m_spmConfig; - Vector m_spmCounterData; - ValidateSpmCallbackInfo m_spmValidationCb; - }; - } -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include "baseProtocolServer.h" +#include "util/vector.h" + +#include "rgpProtocol.h" + +namespace DevDriver +{ + namespace RGPProtocol + { + enum class TraceStatus : uint32 + { + Idle = 0, + Pending, + Running, + Finishing, + Aborting + }; + + struct ServerTraceParametersInfo + { + uint32 gpuMemoryLimitInMb; + uint32 numPreparationFrames; + uint32 captureStartIndex; + uint32 captureStopIndex; + CaptureTriggerMode captureMode; + + union + { + struct + { + uint32 enableInstructionTokens : 1; + uint32 allowComputePresents : 1; + uint32 captureDriverCodeObjects : 1; + uint32 enableSpm : 1; + uint32 reserved : 28; + }; + uint32 u32All; + } flags; + + uint64 beginTag; + uint64 endTag; + + char beginMarker[kMarkerStringLength]; + char endMarker[kMarkerStringLength]; + + uint64 pipelineHash; + +#if DD_VERSION_SUPPORTS(GPUOPEN_RGP_SPM_COUNTERS_VERSION) + uint32 seMask; +#endif + }; + + struct ServerSpmCounterId + { + uint32 blockId; + uint32 instanceId; + uint32 eventId; + }; + + struct ServerSpmConfig + { + uint32 sampleFrequency; + uint32 memoryLimitInMb; + }; + + typedef bool (*PFN_ValidateSpmConfig)(void* pUserdata, const ServerSpmConfig* pConfig, const Vector* pCounterData); + struct ValidateSpmCallbackInfo + { + void* pUserdata; + PFN_ValidateSpmConfig pfnValidateSpmConfig; + }; + + struct RGPSession; + + class RGPServer : public BaseProtocolServer + { + public: + explicit RGPServer(IMsgChannel* pMsgChannel); + ~RGPServer(); + + void Finalize() override; + + bool AcceptSession(const SharedPointer& pSession) override; + void SessionEstablished(const 
SharedPointer& pSession) override; + void UpdateSession(const SharedPointer& pSession) override; + void SessionTerminated(const SharedPointer& pSession, Result terminationReason) override; + + // Returns true if traces are currently enabled. + bool TracesEnabled(); + + // Allows remote clients to request traces. + Result EnableTraces(); + + // Disable support for traces. + Result DisableTraces(); + + // Returns true if a client has requested a trace and it has not been started yet. + bool IsTracePending(); + + // Returns true if a client has requested a trace and it is currently running. + bool IsTraceRunning(); + + // Returns true if the server is currently outputting trace results to a client. + bool IsTraceOutputInProgress(); + + // Starts a new trace. This will only succeed if a trace was previously pending. + Result BeginTrace(); + + // Ends a trace. This will only succeed if a trace was previously in progress. + Result EndTrace(); + + // Aborts a trace. This will only succeed if a trace was previously in progress. + Result AbortTrace(); + + // Writes data into the current trace. This can only be performed when there is a trace in progress. + Result WriteTraceData(const uint8* pTraceData, size_t traceDataSize); + + // Returns the current profiling status on the rgp server. + ProfilingStatus QueryProfilingStatus(); + + // Returns the current trace parameters on the rgp server. 
+ ServerTraceParametersInfo QueryTraceParameters(); + + // Populates the provided structure with the current perf counter config and returns data for each counter + // in the provided vector + Result QuerySpmConfig(ServerSpmConfig* pConfig, Vector* pCounterData); + + // Sets a validation callback that will be used to validate SPM configuration data + void SetSpmValidationCallback(const ValidateSpmCallbackInfo& callback); + + private: + void LockData(); + void UnlockData(); + void ClearCurrentSession(); + Result UpdateSpmConfig(const ServerSpmConfig& config, const Vector& counters); + + Platform::Mutex m_mutex; + TraceStatus m_traceStatus; + RGPSession* m_pCurrentSessionData; + ProfilingStatus m_profilingStatus; + ServerTraceParametersInfo m_traceParameters; + ServerSpmConfig m_spmConfig; + Vector m_spmCounterData; + ValidateSpmCallbackInfo m_spmValidationCb; + }; + } +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h index e299725e56..fcc817123f 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/systemProtocols.h @@ -1,217 +1,217 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include "gpuopen.h" - -namespace DevDriver -{ - namespace SystemProtocol - { - /////////////////////// - // GPU Open System Protocol - enum struct SystemMessage : MessageCode - { - Unknown = 0, - ClientConnected, - ClientDisconnected, - Ping, - Pong, - QueryClientInfo, - ClientInfo, - Halted, - Count, - }; - } - - namespace SessionProtocol - { - /////////////////////// - // GPU Open Session Protocol - enum struct SessionMessage : MessageCode - { - Unknown = 0, - Syn, - SynAck, - Fin, - Data, - Ack, - Rst, - Count - }; - - typedef uint8 SessionVersion; - // Session protocol 2 lets session servers return session version as part of the synack - DD_STATIC_CONST SessionVersion kSessionProtocolVersionSynAckVersion = 2; - // Session protocol 1 lets session clients specify a max range supported as part of the syn - DD_STATIC_CONST SessionVersion kSessionProtocolRangeVersion = 1; - // current version is 2 - DD_STATIC_CONST SessionVersion kSessionProtocolVersion = kSessionProtocolVersionSynAckVersion; - // not mentioned is session version 0. It only supported min version in SynAck, servers reporting it cannot - // cleanly terminate in response to a Fin packet. - - // tripwire - this intentionally will break if the message version changes. 
Since that implies a breaking change, we need to address - // to re-baseline this as version 0 and update the SynPayload struct at the same time - static_assert(kMessageVersion == 1011, "Session packets need to be cleaned up as part of the next protocol version"); - - DD_NETWORK_STRUCT(SynPayload, 4) - { - Version minVersion; - Protocol protocol; - // pad out to 4 bytes - SessionVersion sessionVersion; - - // New fields read if sessionVersion != 0 - Version maxVersion; - // pad out to 8 bytes - uint8 reserved[2]; - }; - - DD_CHECK_SIZE(SynPayload, 8); - - // - // SynPayloadV2 is here so that we can use it with the next breaking message bus change. - // - //DD_NETWORK_STRUCT(SynPayloadV2, 4) - //{ - // Protocol protocol; - // SessionVersion sessionVersion; - // Version minVersion; - // Version maxVersion; - // // pad out to 8 bytes - // uint8 reserved[2]; - //}; - - //DD_CHECK_SIZE(SynPayloadV2, 8); - - DD_NETWORK_STRUCT(SynAckPayload, 8) - { - Sequence sequence; - SessionId initialSessionId; - Version version; - SessionVersion sessionVersion; - uint8 reserved[1]; - }; - - DD_CHECK_SIZE(SynAckPayload, 16); - } - - namespace ClientManagementProtocol - { - - /////////////////////// - // GPU Open ClientManagement Protocol - enum struct ManagementMessage : MessageCode - { - Unknown = 0, - ConnectRequest, - ConnectResponse, - DisconnectNotification, - DisconnectResponse, - SetClientFlags, - SetClientFlagsResponse, - QueryStatus, - QueryStatusResponse, - KeepAlive, - Count - }; - - DD_STATIC_CONST MessageBuffer kOutOfBandMessage = - { - { // header - kBroadcastClientId, //srcClientId - kBroadcastClientId, //dstClientId - Protocol::ClientManagement, //protocolId - 0, //messageId - 0, //windowSize - 0, //payloadSize - 0, //sessionId - kMessageVersion //sequence - }, - {} // payload - }; - - inline bool IsOutOfBandMessage(const MessageBuffer &message) - { - // an out of band message is denoted by both the dstClientId and srcClientId - // being initialized to 
kBroadcastClientId. - static_assert(kBroadcastClientId == 0, "Error, kBroadcastClientId is non-zero. IsOutOfBandMessage needs to be fixed"); - return ((message.header.dstClientId | message.header.srcClientId) == kBroadcastClientId); - } - - inline bool IsValidOutOfBandMessage(const MessageBuffer &message) - { - // an out of band message is only valid if the sequence field is initialized with the correct version - // and the protocolId is equal to the receiving client's Protocol::ClientManagement value - return ((message.header.sequence == kMessageVersion) & - (message.header.protocolId == Protocol::ClientManagement)); - } - - DD_NETWORK_STRUCT(ConnectRequestPayload, 4) - { - StatusFlags initialClientFlags; - uint8 padding[2]; - Component componentType; - uint8 reserved[3]; - }; - - DD_CHECK_SIZE(ConnectRequestPayload, 8); - - DD_NETWORK_STRUCT(ConnectResponsePayload, 4) - { - Result result; - ClientId clientId; - // pad this out to 8 bytes for future expansion - uint8 padding[2]; - }; - - DD_CHECK_SIZE(ConnectResponsePayload, 8); - - DD_NETWORK_STRUCT(SetClientFlagsPayload, 4) - { - StatusFlags flags; - uint8 padding[2]; - }; - - DD_CHECK_SIZE(SetClientFlagsPayload, 4); - - DD_NETWORK_STRUCT(SetClientFlagsResponsePayload, 4) - { - Result result; - }; - - DD_CHECK_SIZE(SetClientFlagsResponsePayload, 4); - - DD_NETWORK_STRUCT(QueryStatusResponsePayload, 4) - { - Result result; - StatusFlags flags; - uint8 reserved[2]; - }; - - DD_CHECK_SIZE(QueryStatusResponsePayload, 8); - } -} +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include "gpuopen.h" + +namespace DevDriver +{ + namespace SystemProtocol + { + /////////////////////// + // GPU Open System Protocol + enum struct SystemMessage : MessageCode + { + Unknown = 0, + ClientConnected, + ClientDisconnected, + Ping, + Pong, + QueryClientInfo, + ClientInfo, + Halted, + Count, + }; + } + + namespace SessionProtocol + { + /////////////////////// + // GPU Open Session Protocol + enum struct SessionMessage : MessageCode + { + Unknown = 0, + Syn, + SynAck, + Fin, + Data, + Ack, + Rst, + Count + }; + + typedef uint8 SessionVersion; + // Session protocol 2 lets session servers return session version as part of the synack + DD_STATIC_CONST SessionVersion kSessionProtocolVersionSynAckVersion = 2; + // Session protocol 1 lets session clients specify a max range supported as part of the syn + DD_STATIC_CONST SessionVersion kSessionProtocolRangeVersion = 1; + // current version is 2 + DD_STATIC_CONST SessionVersion kSessionProtocolVersion = kSessionProtocolVersionSynAckVersion; + // not mentioned is session version 0. It only supported min version in SynAck, servers reporting it cannot + // cleanly terminate in response to a Fin packet. + + // tripwire - this intentionally will break if the message version changes. 
Since that implies a breaking change, we need to address + // to re-baseline this as version 0 and update the SynPayload struct at the same time + static_assert(kMessageVersion == 1011, "Session packets need to be cleaned up as part of the next protocol version"); + + DD_NETWORK_STRUCT(SynPayload, 4) + { + Version minVersion; + Protocol protocol; + // pad out to 4 bytes + SessionVersion sessionVersion; + + // New fields read if sessionVersion != 0 + Version maxVersion; + // pad out to 8 bytes + uint8 reserved[2]; + }; + + DD_CHECK_SIZE(SynPayload, 8); + + // + // SynPayloadV2 is here so that we can use it with the next breaking message bus change. + // + //DD_NETWORK_STRUCT(SynPayloadV2, 4) + //{ + // Protocol protocol; + // SessionVersion sessionVersion; + // Version minVersion; + // Version maxVersion; + // // pad out to 8 bytes + // uint8 reserved[2]; + //}; + + //DD_CHECK_SIZE(SynPayloadV2, 8); + + DD_NETWORK_STRUCT(SynAckPayload, 8) + { + Sequence sequence; + SessionId initialSessionId; + Version version; + SessionVersion sessionVersion; + uint8 reserved[1]; + }; + + DD_CHECK_SIZE(SynAckPayload, 16); + } + + namespace ClientManagementProtocol + { + + /////////////////////// + // GPU Open ClientManagement Protocol + enum struct ManagementMessage : MessageCode + { + Unknown = 0, + ConnectRequest, + ConnectResponse, + DisconnectNotification, + DisconnectResponse, + SetClientFlags, + SetClientFlagsResponse, + QueryStatus, + QueryStatusResponse, + KeepAlive, + Count + }; + + DD_STATIC_CONST MessageBuffer kOutOfBandMessage = + { + { // header + kBroadcastClientId, //srcClientId + kBroadcastClientId, //dstClientId + Protocol::ClientManagement, //protocolId + 0, //messageId + 0, //windowSize + 0, //payloadSize + 0, //sessionId + kMessageVersion //sequence + }, + {} // payload + }; + + inline bool IsOutOfBandMessage(const MessageBuffer &message) + { + // an out of band message is denoted by both the dstClientId and srcClientId + // being initialized to 
kBroadcastClientId. + static_assert(kBroadcastClientId == 0, "Error, kBroadcastClientId is non-zero. IsOutOfBandMessage needs to be fixed"); + return ((message.header.dstClientId | message.header.srcClientId) == kBroadcastClientId); + } + + inline bool IsValidOutOfBandMessage(const MessageBuffer &message) + { + // an out of band message is only valid if the sequence field is initialized with the correct version + // and the protocolId is equal to the receiving client's Protocol::ClientManagement value + return ((message.header.sequence == kMessageVersion) & + (message.header.protocolId == Protocol::ClientManagement)); + } + + DD_NETWORK_STRUCT(ConnectRequestPayload, 4) + { + StatusFlags initialClientFlags; + uint8 padding[2]; + Component componentType; + uint8 reserved[3]; + }; + + DD_CHECK_SIZE(ConnectRequestPayload, 8); + + DD_NETWORK_STRUCT(ConnectResponsePayload, 4) + { + Result result; + ClientId clientId; + // pad this out to 8 bytes for future expansion + uint8 padding[2]; + }; + + DD_CHECK_SIZE(ConnectResponsePayload, 8); + + DD_NETWORK_STRUCT(SetClientFlagsPayload, 4) + { + StatusFlags flags; + uint8 padding[2]; + }; + + DD_CHECK_SIZE(SetClientFlagsPayload, 4); + + DD_NETWORK_STRUCT(SetClientFlagsResponsePayload, 4) + { + Result result; + }; + + DD_CHECK_SIZE(SetClientFlagsResponsePayload, 4); + + DD_NETWORK_STRUCT(QueryStatusResponsePayload, 4) + { + Result result; + StatusFlags flags; + uint8 reserved[2]; + }; + + DD_CHECK_SIZE(QueryStatusResponsePayload, 8); + } +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h index 57f45bb28e..7935107d10 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/protocols/typemap.h @@ -1,168 +1,168 @@ -/* - 
*********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -namespace DevDriver -{ - class IProtocolServer; - class IProtocolClient; - - template - struct ProtocolServerMap - { - typedef IProtocolServer type; - }; - - template - struct ProtocolClientMap - { - typedef IProtocolClient type; - }; - - template - using ProtocolServerType = typename ProtocolServerMap::type; - - template - using ProtocolClientType = typename ProtocolClientMap::type; - - namespace DriverControlProtocol - { - class DriverControlServer; - class DriverControlClient; - } - - template <> - struct ProtocolServerMap - { - typedef DriverControlProtocol::DriverControlServer type; - }; - - template <> - struct ProtocolClientMap - { - typedef DriverControlProtocol::DriverControlClient type; - }; - - namespace RGPProtocol - { - class RGPServer; - class RGPClient; - } - - template <> - struct ProtocolServerMap - { - typedef RGPProtocol::RGPServer type; - }; - - template <> - struct ProtocolClientMap - { - typedef RGPProtocol::RGPClient type; - }; - - namespace EventProtocol - { - class EventServer; - class EventClient; - } - - template <> - struct ProtocolServerMap - { - typedef EventProtocol::EventServer type; - }; - - template <> - struct ProtocolClientMap - { - typedef EventProtocol::EventClient type; - }; - - namespace ETWProtocol - { - class ETWServer; - class ETWClient; - } - - namespace SettingsURIService - { - class SettingsService; - } - - namespace InfoURIService - { - class InfoService; - } - - template <> - struct ProtocolServerMap - { - typedef ETWProtocol::ETWServer type; - }; - - template <> - struct ProtocolClientMap - { - typedef ETWProtocol::ETWClient type; - }; - - namespace TransferProtocol - { - class TransferServer; - class TransferClient; - } - - template <> - struct ProtocolServerMap - { - typedef TransferProtocol::TransferServer type; - }; - - template <> - struct ProtocolClientMap - { - 
typedef TransferProtocol::TransferClient type; - }; - - namespace URIProtocol - { - class URIServer; - class URIClient; - } - - template <> - struct ProtocolServerMap - { - typedef URIProtocol::URIServer type; - }; - - template <> - struct ProtocolClientMap - { - typedef URIProtocol::URIClient type; - }; -} +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +namespace DevDriver +{ + class IProtocolServer; + class IProtocolClient; + + template + struct ProtocolServerMap + { + typedef IProtocolServer type; + }; + + template + struct ProtocolClientMap + { + typedef IProtocolClient type; + }; + + template + using ProtocolServerType = typename ProtocolServerMap::type; + + template + using ProtocolClientType = typename ProtocolClientMap::type; + + namespace DriverControlProtocol + { + class DriverControlServer; + class DriverControlClient; + } + + template <> + struct ProtocolServerMap + { + typedef DriverControlProtocol::DriverControlServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef DriverControlProtocol::DriverControlClient type; + }; + + namespace RGPProtocol + { + class RGPServer; + class RGPClient; + } + + template <> + struct ProtocolServerMap + { + typedef RGPProtocol::RGPServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef RGPProtocol::RGPClient type; + }; + + namespace EventProtocol + { + class EventServer; + class EventClient; + } + + template <> + struct ProtocolServerMap + { + typedef EventProtocol::EventServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef EventProtocol::EventClient type; + }; + + namespace ETWProtocol + { + class ETWServer; + class ETWClient; + } + + namespace SettingsURIService + { + class SettingsService; + } + + namespace InfoURIService + { + class InfoService; + } + + template <> + struct ProtocolServerMap + { + typedef ETWProtocol::ETWServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef ETWProtocol::ETWClient type; + }; + + namespace TransferProtocol + { + class TransferServer; + class TransferClient; + } + + template <> + struct ProtocolServerMap + { + typedef TransferProtocol::TransferServer type; + }; + + template <> + struct ProtocolClientMap + { + 
typedef TransferProtocol::TransferClient type; + }; + + namespace URIProtocol + { + class URIServer; + class URIClient; + } + + template <> + struct ProtocolServerMap + { + typedef URIProtocol::URIServer type; + }; + + template <> + struct ProtocolClientMap + { + typedef URIProtocol::URIClient type; + }; +} diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h index 87d500174e..ed16ad2714 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddMetroHash.h @@ -1,86 +1,86 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include -#include - -namespace DevDriver -{ - -namespace MetroHash -{ -/// 128-bit hash structure -struct Hash -{ - union - { - uint32 dwords[4]; ///< Output hash in dwords. - uint8 bytes[16]; ///< Output hash in bytes. - }; -}; - -// Compacts a 128-bit hash into a 64-bit one by XOR'ing the low and high 64-bits together. -inline uint64 Compact64( - const Hash* pHash) -{ - return (static_cast(pHash->dwords[3] ^ pHash->dwords[1]) | - (static_cast(pHash->dwords[2] ^ pHash->dwords[0]) << 32)); -} - -// Compacts a 64-bit hash checksum into a 32-bit one by XOR'ing each 32-bit chunk together. -inline uint32 Compact32( - const Hash* pHash) -{ - return pHash->dwords[3] ^ pHash->dwords[2] ^ pHash->dwords[1] ^ pHash->dwords[0]; -} - -// Compacts a 64-bit hash checksum into a 32-bit one by XOR'ing each 32-bit chunk together. 
-inline uint32 Compact32( - const uint64 hash) -{ - return static_cast(hash) ^ static_cast(hash >> 32); -} - -inline uint64 MetroHash64(const uint8* pData, const uint64 dataSize) -{ - uint64 hash = 0; - Util::MetroHash64::Hash(pData, dataSize, reinterpret_cast(&hash)); - return hash; -} - -inline uint32 MetroHash32(const uint8* pData, const uint64 dataSize) -{ - return Compact32(MetroHash64(pData, dataSize)); -} - -inline uint64 HashCStr64(const char* pString) -{ - return MetroHash64(reinterpret_cast(pString), strlen(pString)); -} - -} // MetroHash -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ + +namespace MetroHash +{ +/// 128-bit hash structure +struct Hash +{ + union + { + uint32 dwords[4]; ///< Output hash in dwords. + uint8 bytes[16]; ///< Output hash in bytes. + }; +}; + +// Compacts a 128-bit hash into a 64-bit one by XOR'ing the low and high 64-bits together. +inline uint64 Compact64( + const Hash* pHash) +{ + return (static_cast(pHash->dwords[3] ^ pHash->dwords[1]) | + (static_cast(pHash->dwords[2] ^ pHash->dwords[0]) << 32)); +} + +// Compacts a 64-bit hash checksum into a 32-bit one by XOR'ing each 32-bit chunk together. +inline uint32 Compact32( + const Hash* pHash) +{ + return pHash->dwords[3] ^ pHash->dwords[2] ^ pHash->dwords[1] ^ pHash->dwords[0]; +} + +// Compacts a 64-bit hash checksum into a 32-bit one by XOR'ing each 32-bit chunk together. +inline uint32 Compact32( + const uint64 hash) +{ + return static_cast(hash) ^ static_cast(hash >> 32); +} + +inline uint64 MetroHash64(const uint8* pData, const uint64 dataSize) +{ + uint64 hash = 0; + Util::MetroHash64::Hash(pData, dataSize, reinterpret_cast(&hash)); + return hash; +} + +inline uint32 MetroHash32(const uint8* pData, const uint64 dataSize) +{ + return Compact32(MetroHash64(pData, dataSize)); +} + +inline uint64 HashCStr64(const char* pString) +{ + return MetroHash64(reinterpret_cast(pString), strlen(pString)); +} + +} // MetroHash +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h index 13fa37a215..5008db7748 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/ddStructuredReader.h @@ -1,335 
+1,335 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -#pragma once - -#include - -namespace DevDriver -{ - // The value half of a key-value pair from a StructuredReader. - // This always wraps a valid IValue pointer, but the value semantically stored may be empty. (e.g. a Json null) - class StructuredValue - { - public: - // In order to avoid leaking internal headers, we treat this member as an opaque data type. - // Its size and alignment is checked in the cpp file. - /// This is an internal type, only exposed due to limitations in C++ semantics. 
- struct OpaqueNode - { - void* blob[2] = {}; - }; - - ~StructuredValue() = default; - - StructuredValue() - : m_opaque() - {} - - explicit StructuredValue(OpaqueNode opaque) - : m_opaque(opaque) - {} - - StructuredValue(StructuredValue&& other) = default; - StructuredValue(const StructuredValue& other) = default; - - StructuredValue& operator=(StructuredValue&& other) = default; - StructuredValue& operator=(const StructuredValue& other) = default; - - enum class Type - { - Null = 0, - Array, - Map, - Str, - Bool, - Int, - Uint, - Double, - Float, - }; - - // Type of data contained in this node. - Type GetType() const; - - const char* GetTypeString() const - { - switch (GetType()) - { - case StructuredValue::Type::Null: return "Null"; - - case StructuredValue::Type::Array: return "Array"; - case StructuredValue::Type::Map: return "Map"; - case StructuredValue::Type::Str: return "Str"; - - case StructuredValue::Type::Bool: return "Bool"; - - case StructuredValue::Type::Int: return "Int"; - case StructuredValue::Type::Uint: return "Uint"; - - case StructuredValue::Type::Double: return "Double"; - case StructuredValue::Type::Float: return "Float"; - default: - DD_WARN_ALWAYS(); - return "Unknown"; - } - } - - // Create a new empty value - StructuredValue MakeNull() const; - - // Return whether this is an empty, or "null" node. - bool IsNull() const; - - /// ===== Unsigned Integer Types - - /// Returns true when this node contains a Uint8. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetUint8(uint8* pValue) const; - // Returns true when this node contains a Uint16. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetUint16(uint16* pValue) const; - // Returns true when this node contains a Uint32. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetUint32(uint32* pValue) const; - // Returns true when this node contains a Uint64. 
If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetUint64(uint64* pValue) const; - - /// ===== Signed Integer Types - - /// Returns true when this node contains a Int8. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetInt8(int8* pValue) const; - // Returns true when this node contains a Int16. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetInt16(int16* pValue) const; - // Returns true when this node contains a Int32. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetInt32(int32* pValue) const; - // Returns true when this node contains a Int64. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetInt64(int64* pValue) const; - - /// ===== Floating Point Types - - /// Returns true when this node contains a Float. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetFloat(float* pValue) const; - // Returns true when this node contains a Double. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetDouble(double* pValue) const; - - /// ===== Other Types - - // Returns true when this node contains a Bool. If pValue is not NULL, copy out the node's value - DD_NODISCARD bool GetBool(bool* pValue) const; - - // Copy a string value from a node into a buffer - // If the StructuredValue is not a string, - // false is returned and no writes occur - // - // If pStringSize is not NULL, - // the string length is written and processing continues - // - // If pBuffer is not NULL, - // not more than bufferSize bytes (including a NULL terminator) are written. - // If the buffer is large enough to hold the entire string, - // true is returned - // - // If both pBuffer and pStringSize are NULL and the value *is* a string, - // true is returned - // - // TODO: ... this is complicated. Should we use a Result? - // The other types are simple enough that they benefit from using bool instead of a Result, but Strings may not. 
- DD_NODISCARD bool GetStringCopy(char* pBuffer, size_t bufferSize, size_t* pStringSize) const; - - template - DD_NODISCARD bool GetStringCopy(char(&buffer)[BufferSize]) const - { - return GetStringCopy(buffer, BufferSize, nullptr); - } - - // Return a NULL-terminated string from the backing messagepack data. - // This will fail and return NULL if the embedded string does not end with a NULL byte. Use GetStringCopy() if this is the case. - DD_NODISCARD const char* GetStringPtr() const; - - // Lookup a value in a map by a string key - // If the key does not exist, returns false and writes a Null value to `*pValue` - DD_NODISCARD bool GetValueByKey(const char* pKey, StructuredValue* pValue) const; - - // Lookup a value in an array. - // If `index` is out of bounds, returns false and writes a Null value to `*pValue` - DD_NODISCARD bool GetValueByIndex(size_t index, StructuredValue* pValue) const; - - // Query information about Maps and Arrays - - // Returns whether this node has key-value pairs - bool IsMap() const; - - // Returns whether this node has numeric indices - bool IsArray() const; - - // Returns the length of the array if this node is an array, otherwise 0. - size_t GetArrayLength() const; - - // Get-methods with defaults - // If you don't want to check the `bool` value anyway, prefer these. 
- - uint8 GetUint8Or(uint8 defaultValue) const - { - const bool ok = GetUint8(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - uint16 GetUint16Or(uint16 defaultValue) const - { - const bool ok = GetUint16(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - uint32 GetUint32Or(uint32 defaultValue) const - { - const bool ok = GetUint32(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - uint64 GetUint64Or(uint64 defaultValue) const - { - const bool ok = GetUint64(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - int8 GetInt8Or(int8 defaultValue) const - { - const bool ok = GetInt8(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - int16 GetInt16Or(int16 defaultValue) const - { - const bool ok = GetInt16(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - int32 GetInt32Or(int32 defaultValue) const - { - const bool ok = GetInt32(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - int64 GetInt64Or(int64 defaultValue) const - { - const bool ok = GetInt64(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - float GetFloatOr(float defaultValue) const - { - const bool ok = GetFloat(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - double GetDoubleOr(double defaultValue) const - { - const bool ok = GetDouble(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - bool GetBoolOr(bool defaultValue) const - { - const bool ok = GetBool(&defaultValue); - DD_UNUSED(ok); - return defaultValue; - } - - // Index methods - - StructuredValue operator[](const char* pKey) const - { - StructuredValue next = MakeNull(); - - // Ignore the result of this fetch, `next` is already an empty value - const bool ok = GetValueByKey(pKey, &next); - DD_UNUSED(ok); - - return next; - } - - StructuredValue operator[](size_t index) const - { - StructuredValue next = MakeNull(); - - // Ignore the result of this fetch, `next` is already an empty value - const bool ok = 
GetValueByIndex(index, &next); - DD_UNUSED(ok); - - return next; - } - - template - StructuredValue operator[](T index) const - { - // This overload exists so that we don't get ambiguous calls when calling operator[] with integer types. - // If the type of the index can't be statically cast to a size_t, this will fail to compile. - // Note: pointer types cannot be static_cast()'d, which is great! - return this->operator[](static_cast(index)); - } - - private: - bool ResetInternalErrorStateImpl(const char* pFile, int line, const char* pCallingFunction) const; - - OpaqueNode m_opaque; - }; - - // Top level container of structured data - class IStructuredReader - { - public: - virtual ~IStructuredReader() {}; - - DD_NODISCARD static Result CreateFromJson( - const void* pBytes, - size_t numBytes, - const AllocCb& allocCb, - IStructuredReader** ppReader - ); - - DD_NODISCARD static Result CreateFromMessagePack( - const uint8* pBytes, - size_t numBytes, - const AllocCb& allocCb, - IStructuredReader** ppReader - ); - - static void Destroy(IStructuredReader **ppReader); - - /// Get the root object being read - virtual StructuredValue GetRoot() const = 0; - - /// Get the allocation callbacks - virtual const AllocCb& GetAllocCb() const = 0; - }; - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +#pragma once + +#include + +namespace DevDriver +{ + // The value half of a key-value pair from a StructuredReader. + // This always wraps a valid IValue pointer, but the value semantically stored may be empty. (e.g. a Json null) + class StructuredValue + { + public: + // In order to avoid leaking internal headers, we treat this member as an opaque data type. + // Its size and alignment is checked in the cpp file. + /// This is an internal type, only exposed due to limitations in C++ semantics. 
+ struct OpaqueNode + { + void* blob[2] = {}; + }; + + ~StructuredValue() = default; + + StructuredValue() + : m_opaque() + {} + + explicit StructuredValue(OpaqueNode opaque) + : m_opaque(opaque) + {} + + StructuredValue(StructuredValue&& other) = default; + StructuredValue(const StructuredValue& other) = default; + + StructuredValue& operator=(StructuredValue&& other) = default; + StructuredValue& operator=(const StructuredValue& other) = default; + + enum class Type + { + Null = 0, + Array, + Map, + Str, + Bool, + Int, + Uint, + Double, + Float, + }; + + // Type of data contained in this node. + Type GetType() const; + + const char* GetTypeString() const + { + switch (GetType()) + { + case StructuredValue::Type::Null: return "Null"; + + case StructuredValue::Type::Array: return "Array"; + case StructuredValue::Type::Map: return "Map"; + case StructuredValue::Type::Str: return "Str"; + + case StructuredValue::Type::Bool: return "Bool"; + + case StructuredValue::Type::Int: return "Int"; + case StructuredValue::Type::Uint: return "Uint"; + + case StructuredValue::Type::Double: return "Double"; + case StructuredValue::Type::Float: return "Float"; + default: + DD_WARN_ALWAYS(); + return "Unknown"; + } + } + + // Create a new empty value + StructuredValue MakeNull() const; + + // Return whether this is an empty, or "null" node. + bool IsNull() const; + + /// ===== Unsigned Integer Types + + /// Returns true when this node contains a Uint8. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint8(uint8* pValue) const; + // Returns true when this node contains a Uint16. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint16(uint16* pValue) const; + // Returns true when this node contains a Uint32. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint32(uint32* pValue) const; + // Returns true when this node contains a Uint64. 
If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetUint64(uint64* pValue) const; + + /// ===== Signed Integer Types + + /// Returns true when this node contains a Int8. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt8(int8* pValue) const; + // Returns true when this node contains a Int16. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt16(int16* pValue) const; + // Returns true when this node contains a Int32. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt32(int32* pValue) const; + // Returns true when this node contains a Int64. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetInt64(int64* pValue) const; + + /// ===== Floating Point Types + + /// Returns true when this node contains a Float. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetFloat(float* pValue) const; + // Returns true when this node contains a Double. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetDouble(double* pValue) const; + + /// ===== Other Types + + // Returns true when this node contains a Bool. If pValue is not NULL, copy out the node's value + DD_NODISCARD bool GetBool(bool* pValue) const; + + // Copy a string value from a node into a buffer + // If the StructuredValue is not a string, + // false is returned and no writes occur + // + // If pStringSize is not NULL, + // the string length is written and processing continues + // + // If pBuffer is not NULL, + // not more than bufferSize bytes (including a NULL terminator) are written. + // If the buffer is large enough to hold the entire string, + // true is returned + // + // If both pBuffer and pStringSize are NULL and the value *is* a string, + // true is returned + // + // TODO: ... this is complicated. Should we use a Result? + // The other types are simple enough that they benefit from using bool instead of a Result, but Strings may not. 
+ DD_NODISCARD bool GetStringCopy(char* pBuffer, size_t bufferSize, size_t* pStringSize) const; + + template + DD_NODISCARD bool GetStringCopy(char(&buffer)[BufferSize]) const + { + return GetStringCopy(buffer, BufferSize, nullptr); + } + + // Return a NULL-terminated string from the backing messagepack data. + // This will fail and return NULL if the embedded string does not end with a NULL byte. Use GetStringCopy() if this is the case. + DD_NODISCARD const char* GetStringPtr() const; + + // Lookup a value in a map by a string key + // If the key does not exist, returns false and writes a Null value to `*pValue` + DD_NODISCARD bool GetValueByKey(const char* pKey, StructuredValue* pValue) const; + + // Lookup a value in an array. + // If `index` is out of bounds, returns false and writes a Null value to `*pValue` + DD_NODISCARD bool GetValueByIndex(size_t index, StructuredValue* pValue) const; + + // Query information about Maps and Arrays + + // Returns whether this node has key-value pairs + bool IsMap() const; + + // Returns whether this node has numeric indices + bool IsArray() const; + + // Returns the length of the array if this node is an array, otherwise 0. + size_t GetArrayLength() const; + + // Get-methods with defaults + // If you don't want to check the `bool` value anyway, prefer these. 
+ + uint8 GetUint8Or(uint8 defaultValue) const + { + const bool ok = GetUint8(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint16 GetUint16Or(uint16 defaultValue) const + { + const bool ok = GetUint16(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint32 GetUint32Or(uint32 defaultValue) const + { + const bool ok = GetUint32(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + uint64 GetUint64Or(uint64 defaultValue) const + { + const bool ok = GetUint64(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int8 GetInt8Or(int8 defaultValue) const + { + const bool ok = GetInt8(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int16 GetInt16Or(int16 defaultValue) const + { + const bool ok = GetInt16(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int32 GetInt32Or(int32 defaultValue) const + { + const bool ok = GetInt32(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + int64 GetInt64Or(int64 defaultValue) const + { + const bool ok = GetInt64(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + float GetFloatOr(float defaultValue) const + { + const bool ok = GetFloat(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + double GetDoubleOr(double defaultValue) const + { + const bool ok = GetDouble(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + bool GetBoolOr(bool defaultValue) const + { + const bool ok = GetBool(&defaultValue); + DD_UNUSED(ok); + return defaultValue; + } + + // Index methods + + StructuredValue operator[](const char* pKey) const + { + StructuredValue next = MakeNull(); + + // Ignore the result of this fetch, `next` is already an empty value + const bool ok = GetValueByKey(pKey, &next); + DD_UNUSED(ok); + + return next; + } + + StructuredValue operator[](size_t index) const + { + StructuredValue next = MakeNull(); + + // Ignore the result of this fetch, `next` is already an empty value + const bool ok = 
GetValueByIndex(index, &next); + DD_UNUSED(ok); + + return next; + } + + template + StructuredValue operator[](T index) const + { + // This overload exists so that we don't get ambiguous calls when calling operator[] with integer types. + // If the type of the index can't be statically cast to a size_t, this will fail to compile. + // Note: pointer types cannot be static_cast()'d, which is great! + return this->operator[](static_cast(index)); + } + + private: + bool ResetInternalErrorStateImpl(const char* pFile, int line, const char* pCallingFunction) const; + + OpaqueNode m_opaque; + }; + + // Top level container of structured data + class IStructuredReader + { + public: + virtual ~IStructuredReader() {}; + + DD_NODISCARD static Result CreateFromJson( + const void* pBytes, + size_t numBytes, + const AllocCb& allocCb, + IStructuredReader** ppReader + ); + + DD_NODISCARD static Result CreateFromMessagePack( + const uint8* pBytes, + size_t numBytes, + const AllocCb& allocCb, + IStructuredReader** ppReader + ); + + static void Destroy(IStructuredReader **ppReader); + + /// Get the root object being read + virtual StructuredValue GetRoot() const = 0; + + /// Get the allocation callbacks + virtual const AllocCb& GetAllocCb() const = 0; + }; + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h index 5e43c9fb83..fc022a9401 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/hashFunc.h @@ -1,102 +1,102 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include -#include - -namespace DevDriver -{ - -/// ==================================================================================================================== -/// Hashes the bytes of a Key using MetroHash -template -struct DefaultHashFunc -{ - uint32 operator()(const Key& key) const - { - return MetroHash::MetroHash32(reinterpret_cast(&key), sizeof(Key)); - } -}; - -/// ==================================================================================================================== -/// Hashes a const char* CString Key using Metrohash -template<> -struct DefaultHashFunc -{ - uint32 operator()(const char* pKey) const - { - // We cannot pass NULL strings to strlen() and friends, so guard against it anyway. 
- uint32 hash = 0; - DD_ASSERT(pKey != nullptr); - if (pKey != nullptr) - { - hash = MetroHash::MetroHash32(reinterpret_cast(pKey), strlen(pKey)); - } - - return hash; - } -}; - -/// Pointer keys are usually a mistake, so this version is explicitly 'delete'd -/// Overload this template if you're sure you need this. (See: const char* above) -template -struct DefaultHashFunc -{ - uint32 operator()(const T* pKey) const = delete; -}; - -/// Generic compare functor for types that have defined the comparison operator -/// -/// Used by @ref HashBase to prevent defining compare functions for each type. -template -struct DefaultEqualFunc -{ - bool operator()(const Key& key1, const Key& key2) const { return (key1 == key2); } -}; - -/// String compare functor for use with C-style strings -template<> -struct DefaultEqualFunc -{ - bool operator()(const char* pKey1, const char* pKey2) const - { - DD_ASSERT(pKey1 != nullptr); - DD_ASSERT(pKey2 != nullptr); - return (strcmp(pKey1, pKey2) == 0); - } -}; - -/// Generic compare functor for types with arbitrary size -/// -/// Used by @ref HashBase to prevent defining compare functions for each type. -template -struct BitwiseEqualFunc -{ - bool operator()(const Key& key1, const Key& key2) const { return (memcmp(&key1, &key2, sizeof(Key)) == 0); } -}; - -} // namespace DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ + +/// ==================================================================================================================== +/// Hashes the bytes of a Key using MetroHash +template +struct DefaultHashFunc +{ + uint32 operator()(const Key& key) const + { + return MetroHash::MetroHash32(reinterpret_cast(&key), sizeof(Key)); + } +}; + +/// ==================================================================================================================== +/// Hashes a const char* CString Key using Metrohash +template<> +struct DefaultHashFunc +{ + uint32 operator()(const char* pKey) const + { + // We cannot pass NULL strings to strlen() and friends, so guard against it anyway. 
+ uint32 hash = 0; + DD_ASSERT(pKey != nullptr); + if (pKey != nullptr) + { + hash = MetroHash::MetroHash32(reinterpret_cast(pKey), strlen(pKey)); + } + + return hash; + } +}; + +/// Pointer keys are usually a mistake, so this version is explicitly 'delete'd +/// Overload this template if you're sure you need this. (See: const char* above) +template +struct DefaultHashFunc +{ + uint32 operator()(const T* pKey) const = delete; +}; + +/// Generic compare functor for types that have defined the comparison operator +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct DefaultEqualFunc +{ + bool operator()(const Key& key1, const Key& key2) const { return (key1 == key2); } +}; + +/// String compare functor for use with C-style strings +template<> +struct DefaultEqualFunc +{ + bool operator()(const char* pKey1, const char* pKey2) const + { + DD_ASSERT(pKey1 != nullptr); + DD_ASSERT(pKey2 != nullptr); + return (strcmp(pKey1, pKey2) == 0); + } +}; + +/// Generic compare functor for types with arbitrary size +/// +/// Used by @ref HashBase to prevent defining compare functions for each type. +template +struct BitwiseEqualFunc +{ + bool operator()(const Key& key1, const Key& key2) const { return (memcmp(&key1, &key2, sizeof(Key)) == 0); } +}; + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h index 4b00e1e34e..5f92ed61de 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/sharedptr.h @@ -1,291 +1,291 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include - -namespace DevDriver -{ - // Helper structure that sets value to true if T is not abstract and is constructable using the provided - // arguments, otherwise it returns false. We use this to prevent the Create() function from being defined - // for classes that are not creatable. This in turn prevents code from being generated that causes - // Container to be defined, which leads to compile errors even if the client application never attempts - // to directly create an object of that type. 
- template - struct CanConstruct - { - DD_STATIC_CONST bool Value = !Platform::IsAbstract::Value && Platform::IsConstructible::Value; - }; - - // SharedPointerBase is the common parent class used by SharedPointer<> - // It implements common functions (e.g., pointer management) but cannot ever be - // used directly. The purpose of this separation is to enable SharedPointer to perform - // typecasts between derived and base types by casting through SharedPointerBase. - class SharedPointerBase - { - // SharedPointer is a subclass, but all need to have access to the protected + private members - template - friend class SharedPointer; - // public functions available to all subclasses - public: - // check to see if the class has been set - bool IsNull() const { return m_pObject == nullptr; } - - // clear the pointer and, if required, delete the underlying allocation - void Clear() - { - if (m_pContainer != nullptr) - { - if (m_pContainer->Release() == 0) - { - // ContainerBase has been declared with a virtual destructor, which guarantees - // that the specific ContainerBase subclass destructor is called - DD_DELETE(m_pContainer, m_pContainer->GetAllocCb()); - } - m_pContainer = nullptr; - m_pObject = nullptr; - } - } - - protected: - // Inner class that provides a standardized reference counted container interface - // Subclassed by SharedPointer to include an actual object - class ContainerBase - { - public: - // Construct container and initialize ref count to zero. This class should never be - // constructed directly by anything other than a subclass. - constexpr ContainerBase(const AllocCb &allocCb) - : m_allocCb(allocCb) - , m_refCount(0) - { - //DD_PRINT(LogLevel::Never, "Created reference counted container %i", m_refCount); - } - - // Destroy the container. Since this class is never directly created, this ensures - // subclasses (and the contained object) are always destroyed correctly. 
- virtual ~ContainerBase() - { - DD_ASSERT(m_refCount == 0); - DD_PRINT(LogLevel::Never, "Deleted reference counted container %i", m_refCount); - } - - // Increments the reference count of the container - int32 Retain(void) - { - DD_ASSERT(m_refCount >= 0); - int32 result = Platform::AtomicIncrement(&m_refCount); - DD_ASSERT(result >= 1); - DD_PRINT(LogLevel::Never, "Incremented reference count: %i", result); - return result; - } - - // Decrements the reference count of the container - int32 Release(void) - { - int32 result = Platform::AtomicDecrement(&m_refCount); - DD_ASSERT(result >= 0); - DD_PRINT(LogLevel::Never, "Decremented reference count: %i", result); - return result; - } - - // Returns the reference count of the container - int32 QueryReferenceCount(void) const - { - return m_refCount; - } - - // Retrieve the allocator callbacks so it can be destroyed - const AllocCb& GetAllocCb() const { return m_allocCb; } - private: - // Allocator callbacks - const AllocCb m_allocCb; - // Reference count - Platform::Atomic m_refCount; - - }; - - // Default constructor that is constexpr. Allows the compiler to inline this if it wants to. - constexpr SharedPointerBase() - : m_pContainer(nullptr) - , m_pObject(nullptr) - { - } - - // Initialize the object using the provided pointer - SharedPointerBase(ContainerBase* pContainer, void* pObject) - : m_pContainer(pContainer) - , m_pObject(pObject) - { - // We should always have a valid object if the container is valid. - DD_ASSERT((m_pContainer == nullptr) || (m_pObject != nullptr)); - - // If we have a valid container, increment the reference count. 
- if (m_pContainer != nullptr) - { - m_pContainer->Retain(); - } - } - - // Copy constructor copies the container pointer and increments the reference count - SharedPointerBase(const SharedPointerBase &right) - : SharedPointerBase(right.m_pContainer, right.m_pObject) - { - } - - // Move constructor takes the container pointer and clears the other container's pointer - SharedPointerBase(SharedPointerBase &&right) - : m_pContainer(Platform::Exchange(right.m_pContainer, nullptr)) - , m_pObject(Platform::Exchange(right.m_pObject, nullptr)) - { - } - - // On deletion of the object clear the pointer - ~SharedPointerBase() - { - Clear(); - } - private: - // Pointer to the shared container - ContainerBase* m_pContainer; - // Pointer to the object inside the shared container. We keep a copy of this to allow - // direct access to the object since we might not know the actual parent type of it. - void* m_pObject; - }; - - template - class SharedPointer : public SharedPointerBase - { - public: - // Create SharedPointer object with the default constructor - constexpr SharedPointer() : SharedPointerBase() {}; - - SharedPointer(const SharedPointer&) = default; - - // Copy conversion constructor. Creates a new object if you can cast from type U to type T. - template ::Value>::Type> - SharedPointer(const SharedPointer &right) - : SharedPointerBase(Platform::Forward(right)) - { - } - - // Move conversion constructor. Takes ownership of the shared container if you can cast from type U to type T. - template ::Value>::Type> - SharedPointer(SharedPointer &&right) - : SharedPointerBase(Platform::Forward(right)) - { - } - - // Assignment operator to allow copy + swap idiom - SharedPointer &operator= (SharedPointer right) - { - m_pContainer = Platform::Exchange(right.m_pContainer, m_pContainer); - m_pObject = Platform::Exchange(right.m_pObject, m_pObject); - return *this; - } - - // Indirection operator. Returns a const reference to the object in the shared container. 
- // This operator is unsafe to use if the container hasn't been allocated. - T& operator*() const - { - DD_ASSERT(m_pObject != nullptr); - return *Get(); - } - - // Member of pointer operator. Returns a pointer to the object in the shared container. - // This operator is unsafe to use if the container hasn't been allocated. - T* operator->() const - { - DD_ASSERT(m_pObject != nullptr); - return Get(); - } - - // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertable to T. - template ::Value>::Type> - bool operator== (const SharedPointer< U >&right) const - { - return m_pObject == right.m_pObject; - } - - // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertable to T. - template ::Value>::Type> - bool operator!= (const SharedPointer< U >&right) const - { - return m_pObject != right.m_pObject; - } - - // Get a pointer to the contained object - T* Get() const - { - return static_cast(m_pObject); - } - - // Returns the reference count for the container - int32 QueryReferenceCount() const - { - return m_pContainer->QueryReferenceCount(); - } - - // Create a SharedPointer using the provided allocator callbacks and arguments - // This function is only valid if the class is not a valid class - template::Value>::Type> - static SharedPointer Create(const AllocCb& allocCb, Args&&... args) - { - SharedPointer result; - Container *pContainer = - DD_NEW(Container, allocCb)(allocCb, Platform::Forward(args)...); - - if (pContainer != nullptr) - { - result = SharedPointer(pContainer, &pContainer->m_object); - } - return result; - } - private: - // Templated Container class that inherents the type from the outer (SharedPointer) class - class Container : public ContainerBase - { - public: - // Constructor that initializes ContainerBase class and the object using the provided parameters - template - explicit constexpr Container(const AllocCb& allocCb, Args&&... 
args) - : ContainerBase(allocCb) - , m_object(Platform::Forward(args)...) - { - } - - // Actual object that the SharedPointer instance encapsulates - T m_object; - }; - - // Private constructor to allow direct initialization using an externally created Container - explicit SharedPointer(Container* pContainer, T* pObject) - : SharedPointerBase(static_cast(pContainer), pObject) - { - } - }; -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ + +#pragma once + +#include + +namespace DevDriver +{ + // Helper structure that sets value to true if T is not abstract and is constructable using the provided + // arguments, otherwise it returns false. We use this to prevent the Create() function from being defined + // for classes that are not creatable. This in turn prevents code from being generated that causes + // Container to be defined, which leads to compile errors even if the client application never attempts + // to directly create an object of that type. + template + struct CanConstruct + { + DD_STATIC_CONST bool Value = !Platform::IsAbstract::Value && Platform::IsConstructible::Value; + }; + + // SharedPointerBase is the common parent class used by SharedPointer<> + // It implements common functions (e.g., pointer management) but cannot ever be + // used directly. The purpose of this separation is to enable SharedPointer to perform + // typecasts between derived and base types by casting through SharedPointerBase. 
+ class SharedPointerBase + { + // SharedPointer is a subclass, but all need to have access to the protected + private members + template + friend class SharedPointer; + // public functions available to all subclasses + public: + // check to see if the class has been set + bool IsNull() const { return m_pObject == nullptr; } + + // clear the pointer and, if required, delete the underlying allocation + void Clear() + { + if (m_pContainer != nullptr) + { + if (m_pContainer->Release() == 0) + { + // ContainerBase has been declared with a virtual destructor, which guarantees + // that the specific ContainerBase subclass destructor is called + DD_DELETE(m_pContainer, m_pContainer->GetAllocCb()); + } + m_pContainer = nullptr; + m_pObject = nullptr; + } + } + + protected: + // Inner class that provides a standardized reference counted container interface + // Subclassed by SharedPointer to include an actual object + class ContainerBase + { + public: + // Construct container and initialize ref count to zero. This class should never be + // constructed directly by anything other than a subclass. + constexpr ContainerBase(const AllocCb &allocCb) + : m_allocCb(allocCb) + , m_refCount(0) + { + //DD_PRINT(LogLevel::Never, "Created reference counted container %i", m_refCount); + } + + // Destroy the container. Since this class is never directly created, this ensures + // subclasses (and the contained object) are always destroyed correctly. 
+ virtual ~ContainerBase() + { + DD_ASSERT(m_refCount == 0); + DD_PRINT(LogLevel::Never, "Deleted reference counted container %i", m_refCount); + } + + // Increments the reference count of the container + int32 Retain(void) + { + DD_ASSERT(m_refCount >= 0); + int32 result = Platform::AtomicIncrement(&m_refCount); + DD_ASSERT(result >= 1); + DD_PRINT(LogLevel::Never, "Incremented reference count: %i", result); + return result; + } + + // Decrements the reference count of the container + int32 Release(void) + { + int32 result = Platform::AtomicDecrement(&m_refCount); + DD_ASSERT(result >= 0); + DD_PRINT(LogLevel::Never, "Decremented reference count: %i", result); + return result; + } + + // Returns the reference count of the container + int32 QueryReferenceCount(void) const + { + return m_refCount; + } + + // Retrieve the allocator callbacks so it can be destroyed + const AllocCb& GetAllocCb() const { return m_allocCb; } + private: + // Allocator callbacks + const AllocCb m_allocCb; + // Reference count + Platform::Atomic m_refCount; + + }; + + // Default constructor that is constexpr. Allows the compiler to inline this if it wants to. + constexpr SharedPointerBase() + : m_pContainer(nullptr) + , m_pObject(nullptr) + { + } + + // Initialize the object using the provided pointer + SharedPointerBase(ContainerBase* pContainer, void* pObject) + : m_pContainer(pContainer) + , m_pObject(pObject) + { + // We should always have a valid object if the container is valid. + DD_ASSERT((m_pContainer == nullptr) || (m_pObject != nullptr)); + + // If we have a valid container, increment the reference count. 
+ if (m_pContainer != nullptr) + { + m_pContainer->Retain(); + } + } + + // Copy constructor copies the container pointer and increments the reference count + SharedPointerBase(const SharedPointerBase &right) + : SharedPointerBase(right.m_pContainer, right.m_pObject) + { + } + + // Move constructor takes the container pointer and clears the other container's pointer + SharedPointerBase(SharedPointerBase &&right) + : m_pContainer(Platform::Exchange(right.m_pContainer, nullptr)) + , m_pObject(Platform::Exchange(right.m_pObject, nullptr)) + { + } + + // On deletion of the object clear the pointer + ~SharedPointerBase() + { + Clear(); + } + private: + // Pointer to the shared container + ContainerBase* m_pContainer; + // Pointer to the object inside the shared container. We keep a copy of this to allow + // direct access to the object since we might not know the actual parent type of it. + void* m_pObject; + }; + + template + class SharedPointer : public SharedPointerBase + { + public: + // Create SharedPointer object with the default constructor + constexpr SharedPointer() : SharedPointerBase() {}; + + SharedPointer(const SharedPointer&) = default; + + // Copy conversion constructor. Creates a new object if you can cast from type U to type T. + template ::Value>::Type> + SharedPointer(const SharedPointer &right) + : SharedPointerBase(Platform::Forward(right)) + { + } + + // Move conversion constructor. Takes ownership of the shared container if you can cast from type U to type T. + template ::Value>::Type> + SharedPointer(SharedPointer &&right) + : SharedPointerBase(Platform::Forward(right)) + { + } + + // Assignment operator to allow copy + swap idiom + SharedPointer &operator= (SharedPointer right) + { + m_pContainer = Platform::Exchange(right.m_pContainer, m_pContainer); + m_pObject = Platform::Exchange(right.m_pObject, m_pObject); + return *this; + } + + // Indirection operator. Returns a const reference to the object in the shared container. 
+ // This operator is unsafe to use if the container hasn't been allocated. + T& operator*() const + { + DD_ASSERT(m_pObject != nullptr); + return *Get(); + } + + // Member of pointer operator. Returns a pointer to the object in the shared container. + // This operator is unsafe to use if the container hasn't been allocated. + T* operator->() const + { + DD_ASSERT(m_pObject != nullptr); + return Get(); + } + + // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertable to T. + template ::Value>::Type> + bool operator== (const SharedPointer< U >&right) const + { + return m_pObject == right.m_pObject; + } + + // Templated comparison operator. Allows comparing shared pointer objects so long as U is convertable to T. + template ::Value>::Type> + bool operator!= (const SharedPointer< U >&right) const + { + return m_pObject != right.m_pObject; + } + + // Get a pointer to the contained object + T* Get() const + { + return static_cast(m_pObject); + } + + // Returns the reference count for the container + int32 QueryReferenceCount() const + { + return m_pContainer->QueryReferenceCount(); + } + + // Create a SharedPointer using the provided allocator callbacks and arguments + // This function is only valid if the class is not a valid class + template::Value>::Type> + static SharedPointer Create(const AllocCb& allocCb, Args&&... args) + { + SharedPointer result; + Container *pContainer = + DD_NEW(Container, allocCb)(allocCb, Platform::Forward(args)...); + + if (pContainer != nullptr) + { + result = SharedPointer(pContainer, &pContainer->m_object); + } + return result; + } + private: + // Templated Container class that inherents the type from the outer (SharedPointer) class + class Container : public ContainerBase + { + public: + // Constructor that initializes ContainerBase class and the object using the provided parameters + template + explicit constexpr Container(const AllocCb& allocCb, Args&&... 
args) + : ContainerBase(allocCb) + , m_object(Platform::Forward(args)...) + { + } + + // Actual object that the SharedPointer instance encapsulates + T m_object; + }; + + // Private constructor to allow direct initialization using an externally created Container + explicit SharedPointer(Container* pContainer, T* pObject) + : SharedPointerBase(static_cast(pContainer), pObject) + { + } + }; +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h index 46692c9a0b..4452fd0eca 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/string.h @@ -1,292 +1,292 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ - -#pragma once - -#include -#include - -namespace DevDriver -{ -// A String class that stores the string inline with a compile-time maximum size. -// This class facilitiates passing bounded sized C Strings around without dynamic allocation. It has POD semantics -// when copied or passed by value into functions, and can be stored in a vector. -template -class FixedString -{ -private: - char m_data[FixedSize]; - -public: - FixedString() - { - // Only the first byte needs to be initialized - we actively do not want to zero the entire array! - m_data[0] = 0; - } - - FixedString(const FixedString&) = default; - FixedString(FixedString&&) = default; - - ~FixedString() {} - - FixedString& operator=(FixedString& pOther) - { - Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); - return *this; - } - - FixedString& operator=(FixedString&& pOther) - { - Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); - return *this; - } - - bool operator==(const FixedString& other) const - { - return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) == 0; - } - - bool operator!=(const FixedString& other) const - { - return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) != 0; - } - - // Create a FixedString from a C String, truncating the copy if pString is too long - FixedString(const char* pString) { Platform::Strncpy(m_data, pString, sizeof(m_data)); } - - // Return a pointer to the inline C String. - const char* AsCStr() const { return m_data; } - - char* AsCStr() { return m_data; } - - // Computes the length of the string. - // Note! This is an O(N) operation! 
- size_t Size() const { return strlen(m_data); } -}; - -// Sanity check for class size. -static_assert(sizeof(FixedString<16>) == 16, "FixedString<16> should be exactly 16 bytes"); - -/// ==================================================================================================================== -// Hashes a FixedString<> Key using Metrohash -template -struct DefaultHashFunc> -{ - uint32 operator()(const FixedString& key) const { return DefaultHashFunc()(key.AsCStr()); } -}; - -/// ==================================================================================================================== -/// Utility functions for strings -/// ==================================================================================================================== - -enum struct HexStringFmt -{ - Lowercase, - Uppercase, -}; - -/// ==================================================================================================================== -// Encode not more than `numBytes` from `pBytes` into hexadecimal, storing not more than stringBufferSize characters -// into pStringBuffer. -// -// This is the compliment of DecodeFromHexString() and is suitable for saving large binary blocks in text formats such -// as Json. -// -// This function NULL terminates its output if it writes anything. -// Hex pairs are written to `pStrBuff` in pairs - either both digits are written or neither is. A lone nibble -// is never written to the buffer. -// Thus, Hex strings are always an even length (+ a NULL byte) -// -// Returns the number of characters written out through `pStrBuff` (including the NULL terminator). 
-template -inline size_t EncodeToHexString(const void* pBytesIn, size_t numBytes, char* pStrBuff, size_t strBuffSize) -{ - const uint8* pBytes = static_cast(pBytesIn); - - // Character offset that we've written into pStrBuff - size_t charsProcessed = 0; - - if ((pBytes != nullptr) && (numBytes != 0) && (pStrBuff != nullptr) && (strBuffSize != 0)) - { - - // Both lookups are indexed by nibble - constexpr const char kHexStringLookupLower[] = "0123456789abcdef"; - constexpr const char kHexStringLookupUpper[] = "0123456789ABCDEF"; - - // This is the index where our next character pair goes. - // We save this outside of the loop to NULL terminate correctly. - size_t strIdx = 0; - for (size_t byteIdx = 0; byteIdx < numBytes; byteIdx += 1) - { - // We're going to write two bytes this loop, but need to exit early if we're out of bounds. - // We need room for: - // - the high nibble - // - the low nibble - // - the NULL terminator - // Offsets (from stdIdx) of 0, 1, and 2 must be within the buffer bounds. - if ((strIdx + 2) < strBuffSize) - { - const uint8 byte = pBytes[byteIdx]; - - if (fmt == HexStringFmt::Lowercase) - { - pStrBuff[strIdx + 0] = kHexStringLookupLower[byte >> 4]; // High nibble first - pStrBuff[strIdx + 1] = kHexStringLookupLower[byte & 0xf]; // Low nibble - } - else - { - pStrBuff[strIdx + 0] = kHexStringLookupUpper[byte >> 4]; // High nibble first - pStrBuff[strIdx + 1] = kHexStringLookupUpper[byte & 0xf]; // Low nibble - } - - strIdx += 2; - } - else - { - break; - } - } - - pStrBuff[strIdx] = '\0'; - charsProcessed += strIdx + 1; // Hex characters (if any) + NULL - } - - return charsProcessed; -} - -/// ==================================================================================================================== -// Helper function that translates hex digits into numeric values. 
-// Returns 0xff if the value is not a hex digit -#if DD_CPLUSPLUS_SUPPORTS(CPP17) -constexpr uint8 HexDigitToValue(char c) -#else -inline uint8 HexDigitToValue(char c) -#endif -{ - // We use a switch case here to get the point across - // gcc9, clang8, and MSVC all turn this into a lookup table indexing with c (sometimes subtracting from it first) - - switch (c) - { - // clang-format off - case '0': - case '1': case '2': case '3': - case '4': case '5': case '6': - case '7': case '8': case '9': - return c - '0'; - - case 'a': case 'A': return 0xa; - case 'b': case 'B': return 0xb; - case 'c': case 'C': return 0xc; - case 'd': case 'D': return 0xd; - case 'e': case 'E': return 0xe; - case 'f': case 'F': return 0xf; - - default: - return 0xff; - // clang-format on - } -} - -#if DD_CPLUSPLUS_SUPPORTS(CPP17) - static_assert(HexDigitToValue('Z') == 0xff); - - static_assert(HexDigitToValue('0') == 0); - static_assert(HexDigitToValue('1') == 1); - static_assert(HexDigitToValue('2') == 2); - static_assert(HexDigitToValue('3') == 3); - static_assert(HexDigitToValue('4') == 4); - static_assert(HexDigitToValue('5') == 5); - static_assert(HexDigitToValue('6') == 6); - static_assert(HexDigitToValue('7') == 7); - static_assert(HexDigitToValue('8') == 8); - static_assert(HexDigitToValue('9') == 9); - - static_assert(HexDigitToValue('a') == 10); - static_assert(HexDigitToValue('b') == 11); - static_assert(HexDigitToValue('c') == 12); - static_assert(HexDigitToValue('d') == 13); - static_assert(HexDigitToValue('e') == 14); - static_assert(HexDigitToValue('f') == 15); - - static_assert(HexDigitToValue('A') == 10); - static_assert(HexDigitToValue('B') == 11); - static_assert(HexDigitToValue('C') == 12); - static_assert(HexDigitToValue('D') == 13); - static_assert(HexDigitToValue('E') == 14); - static_assert(HexDigitToValue('F') == 15); -#endif - -/// ==================================================================================================================== -// Decode not 
more than `strLength` hex characters from `pStrBuff` into their binary representation, storing -// not more than `numBytes` into `pBytesOut`. -// -// This is the compliment of EncodeToHexString() and is suitable for decoding large binary blocks out of text formats -// such as Json. -// -// Returns the number of bytes written out through `pBytesOut`. -inline size_t DecodeFromHexString(const char* pStrBuff, size_t strLength, void* pBytesOut, size_t numBytes) -{ - uint8* pBytes = static_cast(pBytesOut); - - // Byte offset that we've written into pBytes - size_t bytesProcessed = 0; - - // Note: Only even-length hex strings are supported - if ((strLength % 2 == 0) && (pBytes != nullptr) && (numBytes != 0) && (pStrBuff != nullptr) && (strLength != 0)) - { - size_t byteIdx = 0; - - // Process two characters (one byte) per iteration. - // This loop is bounded on two sizes: the string buffer and the byte buffer - for (size_t strIdx = 0; - ((strIdx + 1) < strLength) && (byteIdx < numBytes); - strIdx += 2, byteIdx += 1) - { - const uint8 hi = HexDigitToValue(pStrBuff[strIdx + 0]); // High nibble first - const uint8 lo = HexDigitToValue(pStrBuff[strIdx + 1]); // Low nibble - - if ((lo != 0xff) && (hi != 0xff)) - { - pBytes[byteIdx] = (hi << 4) | lo; - bytesProcessed += 1; - } - else - { - // Non-hex digit encountered, this is a parsing error. - // This log statement is compiled out, but may be useful for debugging something funny. - DD_PRINT(LogLevel::Never, - "[DecodeFromHexString] Expected hex digits ([0-9a-fA-F]), but found \"%c%c\"", - pStrBuff[strIdx + 0], - pStrBuff[strIdx + 1]); - break; - } - } - } - - return bytesProcessed; -} - -} // namespace DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include +#include + +namespace DevDriver +{ +// A String class that stores the string inline with a compile-time maximum size. +// This class facilitiates passing bounded sized C Strings around without dynamic allocation. It has POD semantics +// when copied or passed by value into functions, and can be stored in a vector. +template +class FixedString +{ +private: + char m_data[FixedSize]; + +public: + FixedString() + { + // Only the first byte needs to be initialized - we actively do not want to zero the entire array! 
+ m_data[0] = 0; + } + + FixedString(const FixedString&) = default; + FixedString(FixedString&&) = default; + + ~FixedString() {} + + FixedString& operator=(FixedString& pOther) + { + Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); + return *this; + } + + FixedString& operator=(FixedString&& pOther) + { + Platform::Strncpy(m_data, pOther.m_data, sizeof(m_data)); + return *this; + } + + bool operator==(const FixedString& other) const + { + return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) == 0; + } + + bool operator!=(const FixedString& other) const + { + return strncmp(this->AsCStr(), other.AsCStr(), FixedSize) != 0; + } + + // Create a FixedString from a C String, truncating the copy if pString is too long + FixedString(const char* pString) { Platform::Strncpy(m_data, pString, sizeof(m_data)); } + + // Return a pointer to the inline C String. + const char* AsCStr() const { return m_data; } + + char* AsCStr() { return m_data; } + + // Computes the length of the string. + // Note! This is an O(N) operation! + size_t Size() const { return strlen(m_data); } +}; + +// Sanity check for class size. 
+static_assert(sizeof(FixedString<16>) == 16, "FixedString<16> should be exactly 16 bytes"); + +/// ==================================================================================================================== +// Hashes a FixedString<> Key using Metrohash +template +struct DefaultHashFunc> +{ + uint32 operator()(const FixedString& key) const { return DefaultHashFunc()(key.AsCStr()); } +}; + +/// ==================================================================================================================== +/// Utility functions for strings +/// ==================================================================================================================== + +enum struct HexStringFmt +{ + Lowercase, + Uppercase, +}; + +/// ==================================================================================================================== +// Encode not more than `numBytes` from `pBytes` into hexadecimal, storing not more than stringBufferSize characters +// into pStringBuffer. +// +// This is the compliment of DecodeFromHexString() and is suitable for saving large binary blocks in text formats such +// as Json. +// +// This function NULL terminates its output if it writes anything. +// Hex pairs are written to `pStrBuff` in pairs - either both digits are written or neither is. A lone nibble +// is never written to the buffer. +// Thus, Hex strings are always an even length (+ a NULL byte) +// +// Returns the number of characters written out through `pStrBuff` (including the NULL terminator). 
+template +inline size_t EncodeToHexString(const void* pBytesIn, size_t numBytes, char* pStrBuff, size_t strBuffSize) +{ + const uint8* pBytes = static_cast(pBytesIn); + + // Character offset that we've written into pStrBuff + size_t charsProcessed = 0; + + if ((pBytes != nullptr) && (numBytes != 0) && (pStrBuff != nullptr) && (strBuffSize != 0)) + { + + // Both lookups are indexed by nibble + constexpr const char kHexStringLookupLower[] = "0123456789abcdef"; + constexpr const char kHexStringLookupUpper[] = "0123456789ABCDEF"; + + // This is the index where our next character pair goes. + // We save this outside of the loop to NULL terminate correctly. + size_t strIdx = 0; + for (size_t byteIdx = 0; byteIdx < numBytes; byteIdx += 1) + { + // We're going to write two bytes this loop, but need to exit early if we're out of bounds. + // We need room for: + // - the high nibble + // - the low nibble + // - the NULL terminator + // Offsets (from stdIdx) of 0, 1, and 2 must be within the buffer bounds. + if ((strIdx + 2) < strBuffSize) + { + const uint8 byte = pBytes[byteIdx]; + + if (fmt == HexStringFmt::Lowercase) + { + pStrBuff[strIdx + 0] = kHexStringLookupLower[byte >> 4]; // High nibble first + pStrBuff[strIdx + 1] = kHexStringLookupLower[byte & 0xf]; // Low nibble + } + else + { + pStrBuff[strIdx + 0] = kHexStringLookupUpper[byte >> 4]; // High nibble first + pStrBuff[strIdx + 1] = kHexStringLookupUpper[byte & 0xf]; // Low nibble + } + + strIdx += 2; + } + else + { + break; + } + } + + pStrBuff[strIdx] = '\0'; + charsProcessed += strIdx + 1; // Hex characters (if any) + NULL + } + + return charsProcessed; +} + +/// ==================================================================================================================== +// Helper function that translates hex digits into numeric values. 
+// Returns 0xff if the value is not a hex digit +#if DD_CPLUSPLUS_SUPPORTS(CPP17) +constexpr uint8 HexDigitToValue(char c) +#else +inline uint8 HexDigitToValue(char c) +#endif +{ + // We use a switch case here to get the point across + // gcc9, clang8, and MSVC all turn this into a lookup table indexing with c (sometimes subtracting from it first) + + switch (c) + { + // clang-format off + case '0': + case '1': case '2': case '3': + case '4': case '5': case '6': + case '7': case '8': case '9': + return c - '0'; + + case 'a': case 'A': return 0xa; + case 'b': case 'B': return 0xb; + case 'c': case 'C': return 0xc; + case 'd': case 'D': return 0xd; + case 'e': case 'E': return 0xe; + case 'f': case 'F': return 0xf; + + default: + return 0xff; + // clang-format on + } +} + +#if DD_CPLUSPLUS_SUPPORTS(CPP17) + static_assert(HexDigitToValue('Z') == 0xff); + + static_assert(HexDigitToValue('0') == 0); + static_assert(HexDigitToValue('1') == 1); + static_assert(HexDigitToValue('2') == 2); + static_assert(HexDigitToValue('3') == 3); + static_assert(HexDigitToValue('4') == 4); + static_assert(HexDigitToValue('5') == 5); + static_assert(HexDigitToValue('6') == 6); + static_assert(HexDigitToValue('7') == 7); + static_assert(HexDigitToValue('8') == 8); + static_assert(HexDigitToValue('9') == 9); + + static_assert(HexDigitToValue('a') == 10); + static_assert(HexDigitToValue('b') == 11); + static_assert(HexDigitToValue('c') == 12); + static_assert(HexDigitToValue('d') == 13); + static_assert(HexDigitToValue('e') == 14); + static_assert(HexDigitToValue('f') == 15); + + static_assert(HexDigitToValue('A') == 10); + static_assert(HexDigitToValue('B') == 11); + static_assert(HexDigitToValue('C') == 12); + static_assert(HexDigitToValue('D') == 13); + static_assert(HexDigitToValue('E') == 14); + static_assert(HexDigitToValue('F') == 15); +#endif + +/// ==================================================================================================================== +// Decode not 
more than `strLength` hex characters from `pStrBuff` into their binary representation, storing +// not more than `numBytes` into `pBytesOut`. +// +// This is the compliment of EncodeToHexString() and is suitable for decoding large binary blocks out of text formats +// such as Json. +// +// Returns the number of bytes written out through `pBytesOut`. +inline size_t DecodeFromHexString(const char* pStrBuff, size_t strLength, void* pBytesOut, size_t numBytes) +{ + uint8* pBytes = static_cast(pBytesOut); + + // Byte offset that we've written into pBytes + size_t bytesProcessed = 0; + + // Note: Only even-length hex strings are supported + if ((strLength % 2 == 0) && (pBytes != nullptr) && (numBytes != 0) && (pStrBuff != nullptr) && (strLength != 0)) + { + size_t byteIdx = 0; + + // Process two characters (one byte) per iteration. + // This loop is bounded on two sizes: the string buffer and the byte buffer + for (size_t strIdx = 0; + ((strIdx + 1) < strLength) && (byteIdx < numBytes); + strIdx += 2, byteIdx += 1) + { + const uint8 hi = HexDigitToValue(pStrBuff[strIdx + 0]); // High nibble first + const uint8 lo = HexDigitToValue(pStrBuff[strIdx + 1]); // Low nibble + + if ((lo != 0xff) && (hi != 0xff)) + { + pBytes[byteIdx] = (hi << 4) | lo; + bytesProcessed += 1; + } + else + { + // Non-hex digit encountered, this is a parsing error. + // This log statement is compiled out, but may be useful for debugging something funny. 
+ DD_PRINT(LogLevel::Never, + "[DecodeFromHexString] Expected hex digits ([0-9a-fA-F]), but found \"%c%c\"", + pStrBuff[strIdx + 0], + pStrBuff[strIdx + 1]); + break; + } + } + } + + return bytesProcessed; +} + +} // namespace DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h index faae3006a0..82c10bbc43 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/shared/legacy/inc/util/vector.h @@ -1,605 +1,605 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ - -#pragma once - -#include - -#if !DD_PLATFORM_WINDOWS_KM -#include -#endif -#include - -namespace DevDriver -{ - template - class Vector - { - public: - class Iterator; - - // The capacity this Vector can hold without allocating extra space. - static constexpr size_t DefaultCapacity = defaultCapacity; - - // Standard constructor - explicit Vector(const AllocCb& allocCb) - : m_pData(m_data) - , m_size(0) - , m_capacity(defaultCapacity) - , m_allocCb(allocCb) - { - } - - // Move constructor - Vector(Vector &&rhs) - : m_pData(m_data) // default initialize it to the default allocation - , m_size(Platform::Exchange(rhs.m_size, (size_t)0)) // move the rhs size value into ours - , m_capacity(defaultCapacity) // initialize the capacity to default - , m_allocCb(rhs.m_allocCb) // copy the allocator callback - { - // if the vector will fit inside the default allocation, move it into it - if (m_size <= defaultCapacity) - { - for (size_t index = 0; index < m_size; index++) - { - m_data[index] = Platform::Move(rhs.m_pData[index]); - } - } - else // otherwise, we want to move the allocation + replace the capacity - { - m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); - m_capacity = Platform::Exchange(rhs.m_capacity, defaultCapacity); - } - } - - // Destructor - ~Vector() - { - Clear(); - } - - void operator=(Vector&& rhs) - { - Swap(rhs); - } - - // Convenience methods - size_t Size() const { return m_size; } - size_t Capacity() const { return m_capacity; } - bool IsEmpty() const { return (m_size == 0); } - - // Subscript operator - T& operator[](size_t index) { DD_ASSERT(index < m_size); return m_pData[index]; } - const T& operator[](size_t index) const { DD_ASSERT(index < m_size); return m_pData[index]; } - - // Insert elements into the back of the Vector - template - bool PushBack(Args&&... 
args) - { - bool result = false; - Reserve(m_size + 1); - if (m_size < m_capacity) - { - m_pData[m_size] = T(Platform::Forward(args)...); - ++m_size; - result = true; - } - return result; - } - - // Insert elements from another Vector to the back of the Vector - bool Append(const Vector& other) - { - return Append(other.Data(), other.Size()); - } - - template - bool Append(const T (&buffer)[Len]) - { - return Append(buffer, Len); - } - - // This is un-used by default, but may be overloaded for some Ts - bool Append(const T* pTs); - - // Insert elements from a buffer to the back of the Vector - // An empty slice (countOfTs == 0) is effectively a no-op - bool Append(const T* pTs, size_t countOfTs) - { - // Check that we get a valid pointer. If this fires, we'll crash but this is more visible than just crashing - // in a memcpy below. - if (countOfTs != 0) - { - DD_ASSERT(pTs != nullptr); - } - - // Pre-allocate all the new elements, since we know how many there are. - const size_t oldSize = Grow(countOfTs); - - // Some types can be bulk-transferred with a memcpy. - // Instead of letting the compiler guess, we dictate when dealing with Pods. - if (Platform::IsPod::Value) - { - memcpy(&m_pData[oldSize], pTs, (sizeof(T) * countOfTs)); - } - else - { - for (size_t i = 0; i < countOfTs; ++i) - { - m_pData[oldSize + i] = pTs[i]; - } - } - - // Pretend Grow() cannot fail, since we cannot check allocation failure with it right now. 
- return true; - } - - // Pop elements out of the Vector - bool PopBack(T* pData) - { - bool result = !IsEmpty(); - if (result) - { - --m_size; - if (pData != nullptr) - { - *pData = Platform::Move(m_pData[m_size]); - } - } - return result; - } - - // Pop elements out of the Vector - bool PopFront(T* pData) - { - bool result = !IsEmpty(); - if (result) - { - if (pData != nullptr) - { - *pData = Platform::Move(m_pData[0]); - } - - --m_size; - - if (m_size > 0) - { - for (size_t i = 0; i < m_size; i++) - { - m_pData[i] = Platform::Move(m_pData[i + 1]); - } - } - } - return result; - } - - // Remove the object at the specified index. Does not maintain order. - void Remove(size_t index) - { - DD_ASSERT(index < m_size); - - const size_t lastIndex = m_size - 1; - - // If the index is the last index, we move the last element into it's place - if (index != lastIndex) - { - m_pData[index] = Platform::Move(m_pData[lastIndex]); - } - // Otherwise, if it is the last element and not a POD we replace it with a default constructed object - else if (!Platform::IsPod::Value) - { - m_pData[index] = T(); - } - - --m_size; - } - - // Remove all instances of the specified object from the vector. Does not maintain order. - size_t Remove(const T& object) - { - size_t numRemoved = 0; - - for (size_t index = m_size; index > 0; index--) - { - if (m_pData[index - 1] == object) - { - Remove(index - 1); - numRemoved++; - } - } - return numRemoved; - } - - // Free all memory - void Clear() - { - if (m_pData != m_data) - { - // If the object is not a POD we explicitly destroy all objects prior to freeing the allocation. - if (!Platform::IsPod::Value) - { - for (size_t i = 0; i < m_capacity; i++) - { - m_pData[i].~T(); - } - } - DD_FREE(m_pData, m_allocCb); - m_pData = m_data; - m_capacity = defaultCapacity; - m_size = 0; - } - else - { - Reset(); - } - } - - // Clears all objects stored, but doesn't free memory. 
- void Reset() - { - // If the object is not a POD we need to destroy all instances and replace them with default constructed - // instances. - if (!Platform::IsPod::Value) - { - for (size_t index = 0; index < m_size; index++) - { - m_pData[index] = T(); - } - } - m_size = 0; - } - - // Swaps the contents of the current vector with the provided vector - void Swap(Vector& rhs) - { - // If we can, we swap allocations directly - if ((m_pData != m_data) && (rhs.m_pData != rhs.m_data)) - { - m_pData = Platform::Exchange(rhs.m_pData, m_pData); - } - // Else if the other object is using the default allocation we move it's contents here - // and give ownership of our allocation to it - else if (m_pData != m_data) - { - for (size_t index = 0; index < rhs.m_size; index++) - { - m_data[index] = Platform::Move(rhs.m_data[index]); - } - rhs.m_pData = Platform::Exchange(m_pData, m_data); - } - // Else if this object is using the default allocation we move our data into it's allocation - // and take ownership of our allocation to it - else if (rhs.m_pData != rhs.m_data) - { - for (size_t index = 0; index < m_size; index++) - { - rhs.m_data[index] = Platform::Move(m_data[index]); - } - m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); - } - // Otherwise we just exchange all the objects that we need to - else - { - for (size_t index = 0; index < Platform::Max(m_size, rhs.m_size); index++) - { - m_data[index] = Platform::Exchange(rhs.m_data[index], m_data[index]); - } - } - - // Finally, we exchange the rest of the data - m_allocCb = Platform::Exchange(rhs.m_allocCb, m_allocCb); - m_capacity = Platform::Exchange(rhs.m_capacity, m_capacity); - m_size = Platform::Exchange(rhs.m_size, m_size); - } - - // Get a pointer to the beginning of the data - // - // Returns nullptr if there is no data available - const T* Data() const - { - return (Size() != 0) ? 
m_pData : nullptr; - } - - // Get a pointer to the beginning of the data - // - // Returns nullptr if there is no data available - T* Data() - { - return (Size() != 0) ? m_pData : nullptr; - } - - // Allocates enough memory to hold the specified number of elements - void Reserve(size_t newSize) - { - if (m_capacity < newSize) - { - const size_t newCapacity = Platform::Pow2Pad(Platform::Max(newSize, (size_t)1)); - const size_t allocSize = sizeof(T) * newCapacity; - T* pData = static_cast(DD_MALLOC(allocSize, alignof(T), m_allocCb)); - - DD_ASSERT(pData != nullptr); - - // If the struct is not a POD, then we need to construct objects - if (is_type_trivial() == false) - { - size_t i = 0; - // First, we move all existing objects into the vector. - for (; i < m_size; i++) - { - new(&pData[i]) T(Platform::Move(m_pData[i])); - } - // Then we construct new objects with the remaining memory. - for (; i < newCapacity; i++) - { - new(&pData[i]) T(); - } - } - // Otherwise, we just copy the existing data into the new vector and call it good. - else - { - // Need to use reinterpret_cast here because gcc can't seem to evaluate - // `is_trivial_v` at compile-time, thus generating a no-class-memaccess warning. - // `if constexpr` fixes the issue, but AMDLOG's toolchain doesn't support c++17. - std::memcpy(reinterpret_cast(pData), m_pData, m_size * sizeof(T)); - } - - if (m_pData != m_data) - { - // If the object wasn't a POD we need to destroy all instances before freeing the memory. - if (!Platform::IsPod::Value) - { - for (size_t i = 0; i < m_capacity; i++) - { - m_pData[i].~T(); - } - } - DD_FREE(m_pData, m_allocCb); - } - m_pData = pData; - m_capacity = newCapacity; - } - } - - // Resizes the vector. Implicitly destroys objects if newSize is smaller than the existing size. 
- void Resize(size_t newSize) - { - // TODO: Reserve should return whether allocation failed - Reserve(newSize); - - // If the object isn't a POD and we are shrinking the size, we need to replace destroyed objects with - // default constructed instances. - if (!Platform::IsPod::Value) - { - for (size_t i = newSize; i < m_size; i++) - { - m_pData[i] = T(); - } - } - m_size = newSize; - } - - // Resizes the vector, zeroing additional elements - // - // Warning: This will break badly if your type cannot be safely memset() to 0! - void ResizeAndZero(size_t newSize) - { - // TODO: Reserve should return whether allocation failed - Reserve(newSize); - - if (newSize > m_size) - { - memset(&m_pData[m_size], 0, (newSize - m_size) * sizeof(T)); - } - - m_size = newSize; - } - - // Grows the vector by the specified number of elements and returns the previous size - size_t Grow(size_t numElements) - { - const size_t oldSize = m_size; - - Resize(m_size + numElements); - - return oldSize; - } - - // Iterator creation function - Iterator Begin() const - { - return CreateIterator(0); - } - - // Iterator creation function - constexpr Iterator End() const - { - return Iterator(nullptr, 0); - } - - // Iterator creation function - Iterator CreateIterator(size_t index) const - { - if (index < m_size) - return Iterator(this, index); - return End(); - } - - // Finds the first index for the provided object - Iterator Find(const T& object) const - { - auto it = Begin(); - for (; it != End(); ++it) - { - if (*it == object) - { - break; - } - } - return it; - } - - // Removes the element represented by the provided iterator. Does not maintain order. - Iterator Remove(const Iterator& it) - { - DD_ASSERT(it.m_pContainer == this); - - Remove(it.m_index); - if (it.m_index < m_size) - return it; - - return End(); - } - - /// Returns the internal allocator - const AllocCb& GetAllocCb() const { return m_allocCb; } - - private: - // Disallow copy construct. 
- Vector(Vector& rhs) = delete; - - // This indirection fixes the warning comparision of a constant with another constant. This should be - // replace with `if constexpr` once AMDLog upgrades to support C++17. - constexpr bool is_type_trivial() - { -#if !DD_PLATFORM_WINDOWS_KM - return std::is_trivial_v; -#else - // not available in kernel so treat every type as nontrivial - return false; -#endif - } - - T m_data[defaultCapacity]; - T* m_pData; - size_t m_size; - size_t m_capacity; - AllocCb m_allocCb; - }; - - // Iterator class for the Vector type - template - class Vector::Iterator - { - friend Vector; - public: - // Comparison operators - bool operator==(const Iterator& rhs) const - { - return ((m_pContainer == rhs.m_pContainer) && (m_index == rhs.m_index)); - } - - bool operator!=(const Iterator& rhs) const - { - return ((m_pContainer != rhs.m_pContainer) || (m_index != rhs.m_index)); - } - - // Prefix operator to increment the iterator - Iterator& operator++() - { - if (m_pContainer != nullptr) - { - m_index += 1; - if (m_index >= m_pContainer->m_size) - { - m_index = 0; - m_pContainer = nullptr; - } - } - return *this; - } - - // Addition operator to add to the iterator - Iterator& operator+(size_t value) - { - if (m_pContainer != nullptr) - { - m_index += value; - if (m_index >= m_pContainer->m_size) - { - m_index = 0; - m_pContainer = nullptr; - } - } - return *this; - } - - // Indirection operator - T& operator*() const - { - DD_ASSERT(m_pContainer != nullptr); - return m_pContainer->m_pData[m_index]; - } - - // Member of pointer operator. Returns a pointer to the object in the shared container. 
- T* operator->() const - { - DD_ASSERT(m_pContainer != nullptr); - return &m_pContainer->m_pData[m_index]; - } - private: - // Constructor is private to ensure it cannot be created by anything other than the Vector itself - Iterator(const Vector* pContainer, size_t index) : - m_pContainer(pContainer), - m_index(index) - { - }; - - const Vector* m_pContainer; - size_t m_index; - }; - - // - // functions necessary for C++ ranged based for loop support - // - - // Implement begin() function for range-based for loops - template - inline typename Vector::Iterator begin(Vector& rhs) - { - return rhs.Begin(); - } - - // Implement end() function for range-based for loops - template - inline constexpr typename Vector::Iterator end(const Vector& rhs) - { - return rhs.End(); - } - - // Specialized functions for using Vector<> like a String - template <> - inline bool Vector::Append(const char* pStr) - { - return Append(pStr, strlen(pStr)); - } - - template <> - template - inline bool Vector::Append(const char (&str)[Len]) - { - return Append(str, strlen(str)); - } - -} // DevDriver +/* + *********************************************************************************************************************** + * + * Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ + +#pragma once + +#include + +#if !DD_PLATFORM_WINDOWS_KM +#include +#endif +#include + +namespace DevDriver +{ + template + class Vector + { + public: + class Iterator; + + // The capacity this Vector can hold without allocating extra space. + static constexpr size_t DefaultCapacity = defaultCapacity; + + // Standard constructor + explicit Vector(const AllocCb& allocCb) + : m_pData(m_data) + , m_size(0) + , m_capacity(defaultCapacity) + , m_allocCb(allocCb) + { + } + + // Move constructor + Vector(Vector &&rhs) + : m_pData(m_data) // default initialize it to the default allocation + , m_size(Platform::Exchange(rhs.m_size, (size_t)0)) // move the rhs size value into ours + , m_capacity(defaultCapacity) // initialize the capacity to default + , m_allocCb(rhs.m_allocCb) // copy the allocator callback + { + // if the vector will fit inside the default allocation, move it into it + if (m_size <= defaultCapacity) + { + for (size_t index = 0; index < m_size; index++) + { + m_data[index] = Platform::Move(rhs.m_pData[index]); + } + } + else // otherwise, we want to move the allocation + replace the capacity + { + m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); + m_capacity = Platform::Exchange(rhs.m_capacity, defaultCapacity); + } + } + + // Destructor + ~Vector() + { + Clear(); + } + + void operator=(Vector&& rhs) + { + Swap(rhs); + } + + // Convenience 
methods + size_t Size() const { return m_size; } + size_t Capacity() const { return m_capacity; } + bool IsEmpty() const { return (m_size == 0); } + + // Subscript operator + T& operator[](size_t index) { DD_ASSERT(index < m_size); return m_pData[index]; } + const T& operator[](size_t index) const { DD_ASSERT(index < m_size); return m_pData[index]; } + + // Insert elements into the back of the Vector + template + bool PushBack(Args&&... args) + { + bool result = false; + Reserve(m_size + 1); + if (m_size < m_capacity) + { + m_pData[m_size] = T(Platform::Forward(args)...); + ++m_size; + result = true; + } + return result; + } + + // Insert elements from another Vector to the back of the Vector + bool Append(const Vector& other) + { + return Append(other.Data(), other.Size()); + } + + template + bool Append(const T (&buffer)[Len]) + { + return Append(buffer, Len); + } + + // This is un-used by default, but may be overloaded for some Ts + bool Append(const T* pTs); + + // Insert elements from a buffer to the back of the Vector + // An empty slice (countOfTs == 0) is effectively a no-op + bool Append(const T* pTs, size_t countOfTs) + { + // Check that we get a valid pointer. If this fires, we'll crash but this is more visible than just crashing + // in a memcpy below. + if (countOfTs != 0) + { + DD_ASSERT(pTs != nullptr); + } + + // Pre-allocate all the new elements, since we know how many there are. + const size_t oldSize = Grow(countOfTs); + + // Some types can be bulk-transferred with a memcpy. + // Instead of letting the compiler guess, we dictate when dealing with Pods. + if (Platform::IsPod::Value) + { + memcpy(&m_pData[oldSize], pTs, (sizeof(T) * countOfTs)); + } + else + { + for (size_t i = 0; i < countOfTs; ++i) + { + m_pData[oldSize + i] = pTs[i]; + } + } + + // Pretend Grow() cannot fail, since we cannot check allocation failure with it right now. 
+ return true; + } + + // Pop elements out of the Vector + bool PopBack(T* pData) + { + bool result = !IsEmpty(); + if (result) + { + --m_size; + if (pData != nullptr) + { + *pData = Platform::Move(m_pData[m_size]); + } + } + return result; + } + + // Pop elements out of the Vector + bool PopFront(T* pData) + { + bool result = !IsEmpty(); + if (result) + { + if (pData != nullptr) + { + *pData = Platform::Move(m_pData[0]); + } + + --m_size; + + if (m_size > 0) + { + for (size_t i = 0; i < m_size; i++) + { + m_pData[i] = Platform::Move(m_pData[i + 1]); + } + } + } + return result; + } + + // Remove the object at the specified index. Does not maintain order. + void Remove(size_t index) + { + DD_ASSERT(index < m_size); + + const size_t lastIndex = m_size - 1; + + // If the index is the last index, we move the last element into it's place + if (index != lastIndex) + { + m_pData[index] = Platform::Move(m_pData[lastIndex]); + } + // Otherwise, if it is the last element and not a POD we replace it with a default constructed object + else if (!Platform::IsPod::Value) + { + m_pData[index] = T(); + } + + --m_size; + } + + // Remove all instances of the specified object from the vector. Does not maintain order. + size_t Remove(const T& object) + { + size_t numRemoved = 0; + + for (size_t index = m_size; index > 0; index--) + { + if (m_pData[index - 1] == object) + { + Remove(index - 1); + numRemoved++; + } + } + return numRemoved; + } + + // Free all memory + void Clear() + { + if (m_pData != m_data) + { + // If the object is not a POD we explicitly destroy all objects prior to freeing the allocation. + if (!Platform::IsPod::Value) + { + for (size_t i = 0; i < m_capacity; i++) + { + m_pData[i].~T(); + } + } + DD_FREE(m_pData, m_allocCb); + m_pData = m_data; + m_capacity = defaultCapacity; + m_size = 0; + } + else + { + Reset(); + } + } + + // Clears all objects stored, but doesn't free memory. 
+ void Reset() + { + // If the object is not a POD we need to destroy all instances and replace them with default constructed + // instances. + if (!Platform::IsPod::Value) + { + for (size_t index = 0; index < m_size; index++) + { + m_pData[index] = T(); + } + } + m_size = 0; + } + + // Swaps the contents of the current vector with the provided vector + void Swap(Vector& rhs) + { + // If we can, we swap allocations directly + if ((m_pData != m_data) && (rhs.m_pData != rhs.m_data)) + { + m_pData = Platform::Exchange(rhs.m_pData, m_pData); + } + // Else if the other object is using the default allocation we move it's contents here + // and give ownership of our allocation to it + else if (m_pData != m_data) + { + for (size_t index = 0; index < rhs.m_size; index++) + { + m_data[index] = Platform::Move(rhs.m_data[index]); + } + rhs.m_pData = Platform::Exchange(m_pData, m_data); + } + // Else if this object is using the default allocation we move our data into it's allocation + // and take ownership of our allocation to it + else if (rhs.m_pData != rhs.m_data) + { + for (size_t index = 0; index < m_size; index++) + { + rhs.m_data[index] = Platform::Move(m_data[index]); + } + m_pData = Platform::Exchange(rhs.m_pData, rhs.m_data); + } + // Otherwise we just exchange all the objects that we need to + else + { + for (size_t index = 0; index < Platform::Max(m_size, rhs.m_size); index++) + { + m_data[index] = Platform::Exchange(rhs.m_data[index], m_data[index]); + } + } + + // Finally, we exchange the rest of the data + m_allocCb = Platform::Exchange(rhs.m_allocCb, m_allocCb); + m_capacity = Platform::Exchange(rhs.m_capacity, m_capacity); + m_size = Platform::Exchange(rhs.m_size, m_size); + } + + // Get a pointer to the beginning of the data + // + // Returns nullptr if there is no data available + const T* Data() const + { + return (Size() != 0) ? 
m_pData : nullptr; + } + + // Get a pointer to the beginning of the data + // + // Returns nullptr if there is no data available + T* Data() + { + return (Size() != 0) ? m_pData : nullptr; + } + + // Allocates enough memory to hold the specified number of elements + void Reserve(size_t newSize) + { + if (m_capacity < newSize) + { + const size_t newCapacity = Platform::Pow2Pad(Platform::Max(newSize, (size_t)1)); + const size_t allocSize = sizeof(T) * newCapacity; + T* pData = static_cast(DD_MALLOC(allocSize, alignof(T), m_allocCb)); + + DD_ASSERT(pData != nullptr); + + // If the struct is not a POD, then we need to construct objects + if (is_type_trivial() == false) + { + size_t i = 0; + // First, we move all existing objects into the vector. + for (; i < m_size; i++) + { + new(&pData[i]) T(Platform::Move(m_pData[i])); + } + // Then we construct new objects with the remaining memory. + for (; i < newCapacity; i++) + { + new(&pData[i]) T(); + } + } + // Otherwise, we just copy the existing data into the new vector and call it good. + else + { + // Need to use reinterpret_cast here because gcc can't seem to evaluate + // `is_trivial_v` at compile-time, thus generating a no-class-memaccess warning. + // `if constexpr` fixes the issue, but AMDLOG's toolchain doesn't support c++17. + std::memcpy(reinterpret_cast(pData), m_pData, m_size * sizeof(T)); + } + + if (m_pData != m_data) + { + // If the object wasn't a POD we need to destroy all instances before freeing the memory. + if (!Platform::IsPod::Value) + { + for (size_t i = 0; i < m_capacity; i++) + { + m_pData[i].~T(); + } + } + DD_FREE(m_pData, m_allocCb); + } + m_pData = pData; + m_capacity = newCapacity; + } + } + + // Resizes the vector. Implicitly destroys objects if newSize is smaller than the existing size. 
+ void Resize(size_t newSize) + { + // TODO: Reserve should return whether allocation failed + Reserve(newSize); + + // If the object isn't a POD and we are shrinking the size, we need to replace destroyed objects with + // default constructed instances. + if (!Platform::IsPod::Value) + { + for (size_t i = newSize; i < m_size; i++) + { + m_pData[i] = T(); + } + } + m_size = newSize; + } + + // Resizes the vector, zeroing additional elements + // + // Warning: This will break badly if your type cannot be safely memset() to 0! + void ResizeAndZero(size_t newSize) + { + // TODO: Reserve should return whether allocation failed + Reserve(newSize); + + if (newSize > m_size) + { + memset(&m_pData[m_size], 0, (newSize - m_size) * sizeof(T)); + } + + m_size = newSize; + } + + // Grows the vector by the specified number of elements and returns the previous size + size_t Grow(size_t numElements) + { + const size_t oldSize = m_size; + + Resize(m_size + numElements); + + return oldSize; + } + + // Iterator creation function + Iterator Begin() const + { + return CreateIterator(0); + } + + // Iterator creation function + constexpr Iterator End() const + { + return Iterator(nullptr, 0); + } + + // Iterator creation function + Iterator CreateIterator(size_t index) const + { + if (index < m_size) + return Iterator(this, index); + return End(); + } + + // Finds the first index for the provided object + Iterator Find(const T& object) const + { + auto it = Begin(); + for (; it != End(); ++it) + { + if (*it == object) + { + break; + } + } + return it; + } + + // Removes the element represented by the provided iterator. Does not maintain order. + Iterator Remove(const Iterator& it) + { + DD_ASSERT(it.m_pContainer == this); + + Remove(it.m_index); + if (it.m_index < m_size) + return it; + + return End(); + } + + /// Returns the internal allocator + const AllocCb& GetAllocCb() const { return m_allocCb; } + + private: + // Disallow copy construct. 
+ Vector(Vector& rhs) = delete; + + // This indirection fixes the warning comparision of a constant with another constant. This should be + // replace with `if constexpr` once AMDLog upgrades to support C++17. + constexpr bool is_type_trivial() + { +#if !DD_PLATFORM_WINDOWS_KM + return std::is_trivial_v; +#else + // not available in kernel so treat every type as nontrivial + return false; +#endif + } + + T m_data[defaultCapacity]; + T* m_pData; + size_t m_size; + size_t m_capacity; + AllocCb m_allocCb; + }; + + // Iterator class for the Vector type + template + class Vector::Iterator + { + friend Vector; + public: + // Comparison operators + bool operator==(const Iterator& rhs) const + { + return ((m_pContainer == rhs.m_pContainer) && (m_index == rhs.m_index)); + } + + bool operator!=(const Iterator& rhs) const + { + return ((m_pContainer != rhs.m_pContainer) || (m_index != rhs.m_index)); + } + + // Prefix operator to increment the iterator + Iterator& operator++() + { + if (m_pContainer != nullptr) + { + m_index += 1; + if (m_index >= m_pContainer->m_size) + { + m_index = 0; + m_pContainer = nullptr; + } + } + return *this; + } + + // Addition operator to add to the iterator + Iterator& operator+(size_t value) + { + if (m_pContainer != nullptr) + { + m_index += value; + if (m_index >= m_pContainer->m_size) + { + m_index = 0; + m_pContainer = nullptr; + } + } + return *this; + } + + // Indirection operator + T& operator*() const + { + DD_ASSERT(m_pContainer != nullptr); + return m_pContainer->m_pData[m_index]; + } + + // Member of pointer operator. Returns a pointer to the object in the shared container. 
+ T* operator->() const + { + DD_ASSERT(m_pContainer != nullptr); + return &m_pContainer->m_pData[m_index]; + } + private: + // Constructor is private to ensure it cannot be created by anything other than the Vector itself + Iterator(const Vector* pContainer, size_t index) : + m_pContainer(pContainer), + m_index(index) + { + }; + + const Vector* m_pContainer; + size_t m_index; + }; + + // + // functions necessary for C++ ranged based for loop support + // + + // Implement begin() function for range-based for loops + template + inline typename Vector::Iterator begin(Vector& rhs) + { + return rhs.Begin(); + } + + // Implement end() function for range-based for loops + template + inline constexpr typename Vector::Iterator end(const Vector& rhs) + { + return rhs.End(); + } + + // Specialized functions for using Vector<> like a String + template <> + inline bool Vector::Append(const char* pStr) + { + return Append(pStr, strlen(pStr)); + } + + template <> + template + inline bool Vector::Append(const char (&str)[Len]) + { + return Append(str, strlen(str)); + } + +} // DevDriver diff --git a/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h b/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h index 5f0665f047..530c32cf8a 100644 --- a/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h +++ b/shared/amdgpu-windows-interop/pal/shared/devdriver/third_party/dd_crc32/inc/dd_crc32.h @@ -1,77 +1,77 @@ -//--------------------------------------------------------------------- -// CRC32 -// -// Calculate a 32bit crc using a the Sarwate look up table method. The original algorithm was created by -// Dilip V. Sarwate, and is based off of Stephan Brumme's implementation. See also: -// https://dl.acm.org/citation.cfm?doid=63030.63037 -// http://create.stephan-brumme.com/crc32/#sarwate -// -//// Copyright (c) 2011-2016 Stephan Brumme. All rights reserved. 
-//***************************************************************************************************************** -// * This software is provided 'as-is', without any express or implied warranty. In no event will the author be held -// * liable for any damages arising from the use of this software. Permission is granted to anyone to use this -// * software for any purpose, including commercial applications, and to alter it and redistribute it freely, -// * subject to the following restrictions: -// * 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original -// * software -// * 2. If you use this software in a product, an acknowledgment in the product documentation would be -// * appreciated but is not required. -// * 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the -// * original software. -// ***************************************************************************************************************** -// -// Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved -// -// This software program is licensed subject to the BSD License, -// available at http://www.opensource.org/licenses/bsd-license.html. 
-// -// -// Tables for software CRC generation -// - -#include -static inline uint32_t CRC32(const void *pData, size_t length, uint32_t lastCRC = 0) -{ - DD_STATIC_CONST uint32_t lookupTable[256] = - { - 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, - 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, - 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, - 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, - 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, - 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, - 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, - 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, - 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, - 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, - 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, - 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, - 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, - 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, - 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, - 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, - 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, - 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, - 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, - 
0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, - 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, - 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, - 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, - 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, - 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, - 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, - 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, - 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, - 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, - 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, - 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, - 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, - }; - - uint32_t crc = ~lastCRC; // same as lastCRC ^ 0xFFFFFFFF - const unsigned char* DD_RESTRICT pCurrent = (const unsigned char*)pData; - while (length--) - crc = (crc >> 8) ^ lookupTable[(crc & 0xFF) ^ *pCurrent++]; - return ~crc; -} - +//--------------------------------------------------------------------- +// CRC32 +// +// Calculate a 32bit crc using a the Sarwate look up table method. The original algorithm was created by +// Dilip V. Sarwate, and is based off of Stephan Brumme's implementation. See also: +// https://dl.acm.org/citation.cfm?doid=63030.63037 +// http://create.stephan-brumme.com/crc32/#sarwate +// +//// Copyright (c) 2011-2016 Stephan Brumme. All rights reserved. 
+//***************************************************************************************************************** +// * This software is provided 'as-is', without any express or implied warranty. In no event will the author be held +// * liable for any damages arising from the use of this software. Permission is granted to anyone to use this +// * software for any purpose, including commercial applications, and to alter it and redistribute it freely, +// * subject to the following restrictions: +// * 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original +// * software +// * 2. If you use this software in a product, an acknowledgment in the product documentation would be +// * appreciated but is not required. +// * 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the +// * original software. +// ***************************************************************************************************************** +// +// Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved +// +// This software program is licensed subject to the BSD License, +// available at http://www.opensource.org/licenses/bsd-license.html. 
+// +// +// Tables for software CRC generation +// + +#include +static inline uint32_t CRC32(const void *pData, size_t length, uint32_t lastCRC = 0) +{ + DD_STATIC_CONST uint32_t lookupTable[256] = + { + 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535,0x9E6495A3, + 0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD,0xE7B82D07,0x90BF1D91, + 0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D,0x6DDDE4EB,0xF4D4B551,0x83D385C7, + 0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC,0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5, + 0x3B6E20C8,0x4C69105E,0xD56041E4,0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B, + 0x35B5A8FA,0x42B2986C,0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59, + 0x26D930AC,0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, + 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB,0xB6662D3D, + 0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F,0x9FBFE4A5,0xE8B8D433, + 0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB,0x086D3D2D,0x91646C97,0xE6635C01, + 0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E,0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457, + 0x65B0D9C6,0x12B7E950,0x8BBEB8EA,0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65, + 0x4DB26158,0x3AB551CE,0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB, + 0x4369E96A,0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, + 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409,0xCE61E49F, + 0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81,0xB7BD5C3B,0xC0BA6CAD, + 0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739,0x9DD277AF,0x04DB2615,0x73DC1683, + 0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8,0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1, + 0xF00F9344,0x8708A3D2,0x1E01F268,0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7, + 
0xFED41B76,0x89D32BE0,0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5, + 0xD6D6A3E8,0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, + 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF,0x4669BE79, + 0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703,0x220216B9,0x5505262F, + 0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7,0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D, + 0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A,0x9C0906A9,0xEB0E363F,0x72076785,0x05005713, + 0x95BF4A82,0xE2B87A14,0x7BB12BAE,0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21, + 0x86D3D2D4,0xF1D4E242,0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777, + 0x88085AE6,0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, + 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D,0x3E6E77DB, + 0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5,0x47B2CF7F,0x30B5FFE9, + 0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605,0xCDD70693,0x54DE5729,0x23D967BF, + 0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94,0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D, + }; + + uint32_t crc = ~lastCRC; // same as lastCRC ^ 0xFFFFFFFF + const unsigned char* DD_RESTRICT pCurrent = (const unsigned char*)pData; + while (length--) + crc = (crc >> 8) ^ lookupTable[(crc & 0xFF) ^ *pCurrent++]; + return ~crc; +} + diff --git a/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h b/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h index 909291cfc7..80f5ca6617 100644 --- a/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h +++ b/shared/amdgpu-windows-interop/pal/shared/inc/trackedCmdLocation.h @@ -1,187 +1,187 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file trackedCmdLocation.h - * @brief Defines the format used for correlation buffers reported through - * ICmdBufferReporting::CorrelationReportOnSubmit, - * - enum class TrackedCmdLocationMode - * - struct TrackedCmdLocation - * - * Plus the helper functions - * - TrackedCmdLocationGetDeltaInDwords - * - TrackedCmdLocationGetDeltaInBytes - *********************************************************************************************************************** - */ - -#pragma once - -namespace Pal -{ - -namespace CmdDisassembly -{ - -/// @brief enum class TrackedCmdLocationMode -/// Defines how to interpret the unions within struct TrackedCmdLocation -/// -enum class TrackedCmdLocationMode : uint8_t -{ - Invalid = 0, - Before, - After, - Delta, // before and after - ClientId, - ClientEventId -}; - -/// @brief struct TrackedCmdLocation defines the format used for correlation data submitted through -/// ICmdBufferReporting::CorrelationReportOnSubmit, and is two DWORDs in size (uint64_t) -/// -/// @detail struct TrackedCmdLocation has a number of flavors interpreted by its member m_mode -/// -/// For m_mode == TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After or -/// TrackedCmdLocationMode::Delta, m_correlateInternal will be used -/// -/// For m_mode == TrackedCmdLocationMode::ClientId, m_clientId will be used -/// -/// For m_mode == TrackedCmdLocationMode::ClientEventId, m_clientEvent will be used -/// -/// -/// For use as m_correlateInternal -/// m_correlateInternal.m_event refers to an internal function that can be converted to a name via -/// Pal::CmdDisassembly::TrackedCmdSupportBase -/// m_correlateInternal.m_ptr is address within the cmdList being correlated by this -/// TrackedCmdLocation -/// For 
a cmdList with baseAddress and sizeInBytes, m_ptr is in the range -/// [baseAddress, baseAddress+sizeInBytes) -/// m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta -/// And describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair -/// when the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr -/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) -/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written between to the corresponding cmdList -/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the -/// event described by m_event -/// -/// For use as m_clientEvent -/// m_clientEvent.m_clientEventId is a number provided by the client, provided by a call to -/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) -/// The m_ptr for this event will be the next TrackedCmdLocation, which will have -/// m_correlateInternal.m_mode == TrackedCmdLocationMode::Delta -/// m_correlateInternal.m_event == PostClientEvent -/// m_correlateInternal.m_deltaInDWords == 0 -/// -/// For use as m_clientId -/// m_clientId.m_clientId is an identifier use by the client for the cmdList that corresponds to -/// this array of correlation data. This will have been set through a call to -/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) -/// When internal correlation is not active, this will be the first tracked location. Otherwise -/// it will not appear until what tracking occurs during Reset is complete. 
-struct TrackedCmdLocation -{ - static constexpr uint32_t DeltaBitCount = 5; - static constexpr uint32_t MaxDelta = (1LL << DeltaBitCount) - 1; - static constexpr uint32_t DwordDeltaShift = 3; - static constexpr uint8_t PostClientEvent = 0xff; - - static constexpr uint64_t PtrBitCount = 48; - /// NoCorrespondingBaseAddress is set to an impossible pointer value, that still fits in to the 48 fits - /// used for m_correlateInternal.m_ptr; - static constexpr uint64_t NoCorrespondingBaseAddress = (1LL << PtrBitCount) - 1; - - union - { - struct - { - uint64_t m_mode : 3; - }; - - struct - { - uint64_t m_mode : 3; // TrackedCmdLocationMode::Before/After/Delta - uint64_t m_event : 8; // TrackedEvents - uint64_t m_ptr : PtrBitCount; // Note, can probably use two bits fewer, - // since these addresses appear to be at a minimum 4-byte aligned. - uint64_t m_deltaInDWords : DeltaBitCount; - } m_correlateInternal; - - struct - { - uint64_t m_mode : 3; - uint64_t m_clientId : 61; - } m_clientId; - - struct - { - uint64_t m_mode : 3; - uint64_t m_clientEventId : 61; - - } m_clientEvent; - - uint64_t m_all; - }; -}; - -// ===================================================================================================================== -/// @brief Helper funcion to obtain DeltaInDwords from TrackedCmdLocation -/// -/// @detail m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta -/// And describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair -/// when the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr -/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) -/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written between to the corresponding cmdList -/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the -/// event described by m_event -/// -/// -/// @returns 0 in m_mode != TrackedCmdLocationMode::Delta -/// 
m_correlateInternal.m_deltaInDWords otherwise -constexpr uint64_t TrackedCmdLocationGetDeltaInDwords( - const TrackedCmdLocation location) -{ - const TrackedCmdLocationMode mode = static_cast(location.m_mode); - if (mode == TrackedCmdLocationMode::Delta) - { - return location.m_correlateInternal.m_deltaInDWords; - } - else - { - return 0; - } -} - -// ===================================================================================================================== -/// @brief Helper funcion to convert DeltaInDwords from TrackedCmdLocation to "InBytes" -/// -/// @returns 0 in m_mode != TrackedCmdLocationMode::Delta -/// m_correlateInternal.m_deltaInDWords * sizeof(DWORD) otherwise - where DWORD is uint32_t -constexpr uint64_t TrackedCmdLocationGetDeltaInBytes( - const TrackedCmdLocation location) -{ - return TrackedCmdLocationGetDeltaInDwords(location) << TrackedCmdLocation::DwordDeltaShift; -} - -} // namespace CmdDisassembly -} // namespace Pal +/* + *********************************************************************************************************************** + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file trackedCmdLocation.h + * @brief Defines the format used for correlation buffers reported through + * ICmdBufferReporting::CorrelationReportOnSubmit, + * - enum class TrackedCmdLocationMode + * - struct TrackedCmdLocation + * + * Plus the helper functions + * - TrackedCmdLocationGetDeltaInDwords + * - TrackedCmdLocationGetDeltaInBytes + *********************************************************************************************************************** + */ + +#pragma once + +namespace Pal +{ + +namespace CmdDisassembly +{ + +/// @brief enum class TrackedCmdLocationMode +/// Defines how to interpret the unions within struct TrackedCmdLocation +/// +enum class TrackedCmdLocationMode : uint8_t +{ + Invalid = 0, + Before, + After, + Delta, // before and after + ClientId, + ClientEventId +}; + +/// @brief struct TrackedCmdLocation defines the format used for correlation data submitted through +/// ICmdBufferReporting::CorrelationReportOnSubmit, and is two DWORDs in size (uint64_t) +/// +/// @detail struct TrackedCmdLocation has a number of flavors interpreted by its member m_mode +/// +/// For m_mode == TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After or +/// TrackedCmdLocationMode::Delta, m_correlateInternal will be used +/// +/// For m_mode == TrackedCmdLocationMode::ClientId, m_clientId will be used +/// +/// For m_mode == TrackedCmdLocationMode::ClientEventId, m_clientEvent will be used +/// +/// +/// For use as 
m_correlateInternal +/// m_correlateInternal.m_event refers to an internal function that can be converted to a name via +/// Pal::CmdDisassembly::TrackedCmdSupportBase +/// m_correlateInternal.m_ptr is address within the cmdList being correlated by this +/// TrackedCmdLocation +/// For a cmdList with baseAddress and sizeInBytes, m_ptr is in the range +/// [baseAddress, baseAddress+sizeInBytes) +/// m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta +/// And describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair +/// when the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr +/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) +/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written between to the corresponding cmdList +/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the +/// event described by m_event +/// +/// For use as m_clientEvent +/// m_clientEvent.m_clientEventId is a number provided by the client, provided by a call to +/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) +/// The m_ptr for this event will be the next TrackedCmdLocation, which will have +/// m_correlateInternal.m_mode == TrackedCmdLocationMode::Delta +/// m_correlateInternal.m_event == PostClientEvent +/// m_correlateInternal.m_deltaInDWords == 0 +/// +/// For use as m_clientId +/// m_clientId.m_clientId is an identifier use by the client for the cmdList that corresponds to +/// this array of correlation data. This will have been set through a call to +/// IAmdExtCmdDisassembly::IssueClientEvent(clientId, clientEventId) +/// When internal correlation is not active, this will be the first tracked location. Otherwise +/// it will not appear until what tracking occurs during Reset is complete. 
+struct TrackedCmdLocation +{ + static constexpr uint32_t DeltaBitCount = 5; + static constexpr uint32_t MaxDelta = (1LL << DeltaBitCount) - 1; + static constexpr uint32_t DwordDeltaShift = 3; + static constexpr uint8_t PostClientEvent = 0xff; + + static constexpr uint64_t PtrBitCount = 48; + /// NoCorrespondingBaseAddress is set to an impossible pointer value, that still fits in to the 48 fits + /// used for m_correlateInternal.m_ptr; + static constexpr uint64_t NoCorrespondingBaseAddress = (1LL << PtrBitCount) - 1; + + union + { + struct + { + uint64_t m_mode : 3; + }; + + struct + { + uint64_t m_mode : 3; // TrackedCmdLocationMode::Before/After/Delta + uint64_t m_event : 8; // TrackedEvents + uint64_t m_ptr : PtrBitCount; // Note, can probably use two bits fewer, + // since these addresses appear to be at a minimum 4-byte aligned. + uint64_t m_deltaInDWords : DeltaBitCount; + } m_correlateInternal; + + struct + { + uint64_t m_mode : 3; + uint64_t m_clientId : 61; + } m_clientId; + + struct + { + uint64_t m_mode : 3; + uint64_t m_clientEventId : 61; + + } m_clientEvent; + + uint64_t m_all; + }; +}; + +// ===================================================================================================================== +/// @brief Helper funcion to obtain DeltaInDwords from TrackedCmdLocation +/// +/// @detail m_correlateInternal.m_deltaInDWords is only used when m_mode == TrackedCmdLocationMode::Delta +/// And describes a TrackedCmdLocationMode::Before, TrackedCmdLocationMode::After pair +/// when the m_ptr corresponding to TrackedCmdLocationMode::Before is m_ptr +/// and for TrackedCmdLocationMode::After is m_ptr + m_deltaInDWords * sizeof(DWORD) +/// m_deltaInDWords = 0 if no DWORDS/PM4Packets were written between to the corresponding cmdList +/// between TrackedCmdLocationMode::Before and TrackedCmdLocationMode::After for the +/// event described by m_event +/// +/// +/// @returns 0 in m_mode != TrackedCmdLocationMode::Delta +/// 
m_correlateInternal.m_deltaInDWords otherwise +constexpr uint64_t TrackedCmdLocationGetDeltaInDwords( + const TrackedCmdLocation location) +{ + const TrackedCmdLocationMode mode = static_cast(location.m_mode); + if (mode == TrackedCmdLocationMode::Delta) + { + return location.m_correlateInternal.m_deltaInDWords; + } + else + { + return 0; + } +} + +// ===================================================================================================================== +/// @brief Helper funcion to convert DeltaInDwords from TrackedCmdLocation to "InBytes" +/// +/// @returns 0 in m_mode != TrackedCmdLocationMode::Delta +/// m_correlateInternal.m_deltaInDWords * sizeof(DWORD) otherwise - where DWORD is uint32_t +constexpr uint64_t TrackedCmdLocationGetDeltaInBytes( + const TrackedCmdLocation location) +{ + return TrackedCmdLocationGetDeltaInDwords(location) << TrackedCmdLocation::DwordDeltaShift; +} + +} // namespace CmdDisassembly +} // namespace Pal diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt b/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt index e4f30c2a54..544afca91d 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/CMakeLists.txt @@ -1,70 +1,70 @@ -## - ####################################################################################################################### - # - # Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
- # - ####################################################################################################################### - -cmake_minimum_required(VERSION 3.1...3.21) - -project(MetroHash VERSION 1.0.0 LANGUAGES CXX) - -option(METROHASH_ENABLE_WERROR "Build with -Werror enabled" OFF) - -add_library(metrohash STATIC "") - -target_include_directories(metrohash PUBLIC src) - -target_sources(metrohash PRIVATE src/metrohash64.cpp - src/metrohash128.cpp) - - -set_target_properties(metrohash PROPERTIES CXX_STANDARD 11 - CXX_STANDARD_REQUIRED ON - CXX_EXTENSIONS OFF - POSITION_INDEPENDENT_CODE ON) - - -if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - - if(METROHASH_ENABLE_WERROR) - target_compile_options(metrohash PRIVATE -Werror) - endif() - - # [GCC] Exceptions - # https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_exceptions.html - # - # [GCC] Options Controlling C++ Dialect - # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/C_002b_002b-Dialect-Options.html - target_compile_options(metrohash PRIVATE - -fno-exceptions # Disable exception handling support. - -fno-rtti) # Disable run-time type information support. - - # [GCC] Options to Request or Suppress Warnings - # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/Warning-Options.html - target_compile_options(metrohash PRIVATE - -Wall - -Wextra - -Wpedantic) - -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - - # [MSVC] Exception Handling Model - # - # [MSVC] Enable Run-Time Type Information - # - # [MSVC] Buffer Security Check - target_compile_options(metrohash PRIVATE - /EHsc # Catches only C++ exceptions and assumes - # functions declared as extern "C" never throw a C++ exception. - /GR- # Disables run-time type information. - /GS-) # Disables detection of buffer overruns. - - # [MSVC] Warning Level - target_compile_options(metrohash PRIVATE - /W4 # Enable warning level 4. - /WX) # Treat warnings as errors. 
- -else() - message(FATAL_ERROR "Compiler ${CMAKE_CXX_COMPILER_ID} is not supported!") -endif() +## + ####################################################################################################################### + # + # Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + # + ####################################################################################################################### + +cmake_minimum_required(VERSION 3.1...3.21) + +project(MetroHash VERSION 1.0.0 LANGUAGES CXX) + +option(METROHASH_ENABLE_WERROR "Build with -Werror enabled" OFF) + +add_library(metrohash STATIC "") + +target_include_directories(metrohash PUBLIC src) + +target_sources(metrohash PRIVATE src/metrohash64.cpp + src/metrohash128.cpp) + + +set_target_properties(metrohash PROPERTIES CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + POSITION_INDEPENDENT_CODE ON) + + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + + if(METROHASH_ENABLE_WERROR) + target_compile_options(metrohash PRIVATE -Werror) + endif() + + # [GCC] Exceptions + # https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_exceptions.html + # + # [GCC] Options Controlling C++ Dialect + # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/C_002b_002b-Dialect-Options.html + target_compile_options(metrohash PRIVATE + -fno-exceptions # Disable exception handling support. + -fno-rtti) # Disable run-time type information support. + + # [GCC] Options to Request or Suppress Warnings + # https://gcc.gnu.org/onlinedocs/gcc-8.1.0/gcc/Warning-Options.html + target_compile_options(metrohash PRIVATE + -Wall + -Wextra + -Wpedantic) + +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + + # [MSVC] Exception Handling Model + # + # [MSVC] Enable Run-Time Type Information + # + # [MSVC] Buffer Security Check + target_compile_options(metrohash PRIVATE + /EHsc # Catches only C++ exceptions and assumes + # functions declared as extern "C" never throw a C++ exception. 
+ /GR- # Disables run-time type information. + /GS-) # Disables detection of buffer overruns. + + # [MSVC] Warning Level + target_compile_options(metrohash PRIVATE + /W4 # Enable warning level 4. + /WX) # Treat warnings as errors. + +else() + message(FATAL_ERROR "Compiler ${CMAKE_CXX_COMPILER_ID} is not supported!") +endif() diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE b/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE index 29f81d812f..261eeb9e9f 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/LICENSE @@ -1,201 +1,201 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md b/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md index 3401a286dd..2ac16b1437 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/README.md @@ -1,56 +1,56 @@ -## MetroHash: Faster, Better Hash Functions - -MetroHash is a set of state-of-the-art hash functions for *non-cryptographic* use cases. They are notable for being algorithmically generated in addition to their exceptional performance. The set of published hash functions may be expanded in the future, having been selected from a very large set of hash functions that have been constructed this way. - -* Fastest general-purpose functions for bulk hashing. -* Fastest general-purpose functions for small, variable length keys. -* Robust statistical bias profile, similar to the MD5 cryptographic hash. -* Hashes can be constructed incrementally (**new**) -* 64-bit, 128-bit, and 128-bit CRC variants currently available. -* Optimized for modern x86-64 microarchitectures. -* Elegant, compact, readable functions. - -You can read more about the design and history [here](http://www.jandrewrogers.com/2015/05/27/metrohash/). - -## News - -### 23 October 2018 - -The project has been re-licensed under Apache License v2.0. 
The purpose of this license change is consistency with the imminent release of MetroHash v2.0, which is also licensed under the Apache license. - -### 27 July 2015 - -Two new 64-bit and 128-bit algorithms add the ability to construct hashes incrementally. In addition to supporting incremental construction, the algorithms are slightly superior to the prior versions. - -A big change is that these new algorithms are implemented as C++ classes that support both incremental and stateless hashing. These classes also have a static method for verifying the implementation against the test vectors built into the classes. Implementations are now fully contained by their respective headers e.g. "metrohash128.h". - -*Note: an incremental version of the 128-bit CRC version is on its way but is not included in this push.* - -**Usage Example For Stateless Hashing** - -`MetroHash128::Hash(key, key_length, hash_ptr, seed)` - -**Usage Example For Incremental Hashing** - -`MetroHash128 hasher;` -`hasher.Update(partial_key, partial_key_length);` -`...` -`hasher.Update(partial_key, partial_key_length);` -`hasher.Finalize(hash_ptr);` - -An `Initialize(seed)` method allows the hasher objects to be reused. - - -### 27 May 2015 - -Six hash functions have been included in the initial release: - -* 64-bit hash functions, "metrohash64_1" and "metrohash64_2" -* 128-bit hash functions, "metrohash128_1" and "metrohash128_2" -* 128-bit hash functions using CRC instructions, "metrohash128crc_1" and "metrohash128crc_2" - -Hash functions in the same family are effectively statistically unique. In other words, if you need two hash functions for a bloom filter, you can use "metrohash64_1" and "metrohash64_2" in the same implementation without issue. An unbounded set of statistically unique functions can be generated in each family. The functions in this repo were generated specifically for public release. - -The hash function generation software made no effort toward portability. 
While these hash functions should be easily portable to big-endian microarchitectures, they have not been tested on them and the performance optimization algorithms were not targeted at them. ARM64 microarchitectures might be a worthwhile hash function generation targets if I had the hardware. - - +## MetroHash: Faster, Better Hash Functions + +MetroHash is a set of state-of-the-art hash functions for *non-cryptographic* use cases. They are notable for being algorithmically generated in addition to their exceptional performance. The set of published hash functions may be expanded in the future, having been selected from a very large set of hash functions that have been constructed this way. + +* Fastest general-purpose functions for bulk hashing. +* Fastest general-purpose functions for small, variable length keys. +* Robust statistical bias profile, similar to the MD5 cryptographic hash. +* Hashes can be constructed incrementally (**new**) +* 64-bit, 128-bit, and 128-bit CRC variants currently available. +* Optimized for modern x86-64 microarchitectures. +* Elegant, compact, readable functions. + +You can read more about the design and history [here](http://www.jandrewrogers.com/2015/05/27/metrohash/). + +## News + +### 23 October 2018 + +The project has been re-licensed under Apache License v2.0. The purpose of this license change is consistency with the imminent release of MetroHash v2.0, which is also licensed under the Apache license. + +### 27 July 2015 + +Two new 64-bit and 128-bit algorithms add the ability to construct hashes incrementally. In addition to supporting incremental construction, the algorithms are slightly superior to the prior versions. + +A big change is that these new algorithms are implemented as C++ classes that support both incremental and stateless hashing. These classes also have a static method for verifying the implementation against the test vectors built into the classes. 
Implementations are now fully contained by their respective headers e.g. "metrohash128.h". + +*Note: an incremental version of the 128-bit CRC version is on its way but is not included in this push.* + +**Usage Example For Stateless Hashing** + +`MetroHash128::Hash(key, key_length, hash_ptr, seed)` + +**Usage Example For Incremental Hashing** + +`MetroHash128 hasher;` +`hasher.Update(partial_key, partial_key_length);` +`...` +`hasher.Update(partial_key, partial_key_length);` +`hasher.Finalize(hash_ptr);` + +An `Initialize(seed)` method allows the hasher objects to be reused. + + +### 27 May 2015 + +Six hash functions have been included in the initial release: + +* 64-bit hash functions, "metrohash64_1" and "metrohash64_2" +* 128-bit hash functions, "metrohash128_1" and "metrohash128_2" +* 128-bit hash functions using CRC instructions, "metrohash128crc_1" and "metrohash128crc_2" + +Hash functions in the same family are effectively statistically unique. In other words, if you need two hash functions for a bloom filter, you can use "metrohash64_1" and "metrohash64_2" in the same implementation without issue. An unbounded set of statistically unique functions can be generated in each family. The functions in this repo were generated specifically for public release. + +The hash function generation software made no effort toward portability. While these hash functions should be easily portable to big-endian microarchitectures, they have not been tested on them and the performance optimization algorithms were not targeted at them. ARM64 microarchitectures might be a worthwhile hash function generation targets if I had the hardware. 
+ + diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash b/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash index de855571f1..3c5ee421fd 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/make/Makefile.metrohash @@ -1,20 +1,20 @@ -#### -# -# Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. -# -#### - -#----------------------------------------------------------------------- -# Common MetroHash Implementation Files -#----------------------------------------------------------------------- - -vpath %.cpp $(METROHASH_DEPTH)/src - -CPPFILES += metrohash64.cpp \ - metrohash128.cpp - -#----------------------------------------------------------------------- -# Common MetroHash Includes -#----------------------------------------------------------------------- - -LCXXINCS += -I$(METROHASH_DEPTH)/src +#### +# +# Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. +# +#### + +#----------------------------------------------------------------------- +# Common MetroHash Implementation Files +#----------------------------------------------------------------------- + +vpath %.cpp $(METROHASH_DEPTH)/src + +CPPFILES += metrohash64.cpp \ + metrohash128.cpp + +#----------------------------------------------------------------------- +# Common MetroHash Includes +#----------------------------------------------------------------------- + +LCXXINCS += -I$(METROHASH_DEPTH)/src diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h index 072ff21d7a..ffab03216b 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash.h @@ -1,24 +1,24 @@ -// metrohash.h -// -// Copyright 2015-2018 J. 
Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef METROHASH_METROHASH_H -#define METROHASH_METROHASH_H - -#include "metrohash64.h" -#include "metrohash128.h" -#include "metrohash128crc.h" - -#endif // #ifndef METROHASH_METROHASH_H +// metrohash.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef METROHASH_METROHASH_H +#define METROHASH_METROHASH_H + +#include "metrohash64.h" +#include "metrohash128.h" +#include "metrohash128crc.h" + +#endif // #ifndef METROHASH_METROHASH_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp index 1cdbd703a5..2763de5f88 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.cpp @@ -1,419 +1,419 @@ -// metrohash128.cpp -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
- -#include -#include "platform.h" -#include "metrohash128.h" - -namespace Util -{ - -const char * MetroHash128::test_string = "012345678901234567890123456789012345678901234567890123456789012"; - -const uint8_t MetroHash128::test_seed_0[16] = { - 0xC7, 0x7C, 0xE2, 0xBF, 0xA4, 0xED, 0x9F, 0x9B, - 0x05, 0x48, 0xB2, 0xAC, 0x50, 0x74, 0xA2, 0x97 - }; - -const uint8_t MetroHash128::test_seed_1[16] = { - 0x45, 0xA3, 0xCD, 0xB8, 0x38, 0x19, 0x9D, 0x7F, - 0xBD, 0xD6, 0x8D, 0x86, 0x7A, 0x14, 0xEC, 0xEF - }; - - - -MetroHash128::MetroHash128(const uint64_t seed) -{ - Initialize(seed); -} - - -void MetroHash128::Initialize(const uint64_t seed) -{ - // initialize internal hash registers - state.v[0] = (static_cast(seed) - k0) * k3; - state.v[1] = (static_cast(seed) + k1) * k2; - state.v[2] = (static_cast(seed) + k0) * k2; - state.v[3] = (static_cast(seed) - k1) * k3; - - // initialize total length of input - bytes = 0; -} - - -void MetroHash128::Update(const uint8_t * const buffer, const uint64_t length) -{ - const uint8_t * ptr = reinterpret_cast(buffer); - const uint8_t * const end = ptr + length; - - // input buffer may be partially filled - if (bytes % 32) - { - uint64_t fill = 32 - (bytes % 32); - if (fill > length) - fill = length; - - memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); - ptr += fill; - bytes += fill; - - // input buffer is still partially filled - if ((bytes % 32) != 0) return; - - // process full input buffer - state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; - state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; - state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; - state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; - } - - // bulk update - bytes += (end - ptr); - while (ptr <= (end - 32)) - { - // process directly from the source, bypassing the input buffer - 
state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; - state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; - state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; - state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; - } - - // store remaining bytes in input buffer - if (ptr < end) - memcpy(input.b, ptr, end - ptr); -} - - -void MetroHash128::Finalize(uint8_t * const hash) -{ - // finalize bulk loop, if used - if (bytes >= 32) - { - state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 21) * k1; - state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 21) * k0; - state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 21) * k1; - state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 21) * k0; - } - - // process any bytes remaining in the input buffer - const uint8_t * ptr = reinterpret_cast(input.b); - const uint8_t * const end = ptr + (bytes % 32); - - if ((end - ptr) >= 16) - { - state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; - state.v[1] += read_u64(ptr) * k2; ptr += 8; state.v[1] = rotate_right(state.v[1],33) * k3; - state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 45) * k1; - state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 45) * k0; - } - - if ((end - ptr) >= 8) - { - state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; - state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 27) * k1; - } - - if ((end - ptr) >= 4) - { - state.v[1] += read_u32(ptr) * k2; ptr += 4; state.v[1] = rotate_right(state.v[1],33) * k3; - state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 46) * k0; - } - - if ((end - ptr) >= 2) - { - state.v[0] += read_u16(ptr) * k2; ptr += 2; state.v[0] = rotate_right(state.v[0],33) 
* k3; - state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 22) * k1; - } - - if ((end - ptr) >= 1) - { - state.v[1] += read_u8 (ptr) * k2; state.v[1] = rotate_right(state.v[1],33) * k3; - state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 58) * k0; - } - - state.v[0] += rotate_right((state.v[0] * k0) + state.v[1], 13); - state.v[1] += rotate_right((state.v[1] * k1) + state.v[0], 37); - state.v[0] += rotate_right((state.v[0] * k2) + state.v[1], 13); - state.v[1] += rotate_right((state.v[1] * k3) + state.v[0], 37); - - bytes = 0; - - // do any endian conversion here - - memcpy(hash, state.v, 16); -} - - -void MetroHash128::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) -{ - const uint8_t * ptr = reinterpret_cast(buffer); - const uint8_t * const end = ptr + length; - - uint64_t v[4]; - - v[0] = (static_cast(seed) - k0) * k3; - v[1] = (static_cast(seed) + k1) * k2; - - if (length >= 32) - { - v[2] = (static_cast(seed) + k0) * k2; - v[3] = (static_cast(seed) - k1) * k3; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 21) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 21) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 21) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 21) * k0; - } - - if ((end - ptr) >= 16) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; - v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 45) * k1; - v[1] ^= rotate_right((v[1] * k3) + v[0], 45) * k0; - } - - if ((end - ptr) >= 8) - { - v[0] += read_u64(ptr) * 
k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 27) * k1; - } - - if ((end - ptr) >= 4) - { - v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 46) * k0; - } - - if ((end - ptr) >= 2) - { - v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; - } - - if ((end - ptr) >= 1) - { - v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 58) * k0; - } - - v[0] += rotate_right((v[0] * k0) + v[1], 13); - v[1] += rotate_right((v[1] * k1) + v[0], 37); - v[0] += rotate_right((v[0] * k2) + v[1], 13); - v[1] += rotate_right((v[1] * k3) + v[0], 37); - - // do any endian conversion here - - memcpy(hash, v, 16); -} - - -bool MetroHash128::ImplementationVerified() -{ - uint8_t hash[16]; - const uint8_t * key = reinterpret_cast(MetroHash128::test_string); - - // verify one-shot implementation - MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 0); - if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; - - MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 1); - if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; - - // verify incremental implementation - MetroHash128 metro; - - metro.Initialize(0); - metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); - metro.Finalize(hash); - if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; - - metro.Initialize(1); - metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); - metro.Finalize(hash); - if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; - - return true; -} - - -void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xC83A91E1; - static const uint64_t k1 = 0x8648DBDB; - 
static const uint64_t k2 = 0x7BDEC03B; - static const uint64_t k3 = 0x2F5870A5; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t v[4]; - - v[0] = ((static_cast(seed) - k0) * k3) + len; - v[1] = ((static_cast(seed) + k1) * k2) + len; - - if (len >= 32) - { - v[2] = ((static_cast(seed) + k0) * k2) + len; - v[3] = ((static_cast(seed) - k1) * k3) + len; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 26) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 26) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 26) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; - } - - if ((end - ptr) >= 16) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; - v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 17) * k1; - v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; - } - - if ((end - ptr) >= 8) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 20) * k1; - } - - if ((end - ptr) >= 4) - { - v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; - } - - if ((end - ptr) >= 2) - { - v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 24) * k1; - } - - if ((end - ptr) >= 1) - { - v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 24) * k0; - } - - v[0] += rotate_right((v[0] * k0) + v[1], 13); - v[1] 
+= rotate_right((v[1] * k1) + v[0], 37); - v[0] += rotate_right((v[0] * k2) + v[1], 13); - v[1] += rotate_right((v[1] * k3) + v[0], 37); - - // do any endian conversion here - - memcpy(out, v, 16); -} - - -void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xD6D018F5; - static const uint64_t k1 = 0xA2AA033B; - static const uint64_t k2 = 0x62992FC1; - static const uint64_t k3 = 0x30BC5B29; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t v[4]; - - v[0] = ((static_cast(seed) - k0) * k3) + len; - v[1] = ((static_cast(seed) + k1) * k2) + len; - - if (len >= 32) - { - v[2] = ((static_cast(seed) + k0) * k2) + len; - v[3] = ((static_cast(seed) - k1) * k3) + len; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; - } - - if ((end - ptr) >= 16) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; - v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],29) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; - v[1] ^= rotate_right((v[1] * k3) + v[0], 29) * k0; - } - - if ((end - ptr) >= 8) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; - } - - if ((end - ptr) >= 4) - { - v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],29) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 25) * 
k0; - } - - if ((end - ptr) >= 2) - { - v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],29) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; - } - - if ((end - ptr) >= 1) - { - v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],29) * k3; - v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; - } - - v[0] += rotate_right((v[0] * k0) + v[1], 33); - v[1] += rotate_right((v[1] * k1) + v[0], 33); - v[0] += rotate_right((v[0] * k2) + v[1], 33); - v[1] += rotate_right((v[1] * k3) + v[0], 33); - - // do any endian conversion here - - memcpy(out, v, 16); -} - -} // Util +// metrohash128.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include "platform.h" +#include "metrohash128.h" + +namespace Util +{ + +const char * MetroHash128::test_string = "012345678901234567890123456789012345678901234567890123456789012"; + +const uint8_t MetroHash128::test_seed_0[16] = { + 0xC7, 0x7C, 0xE2, 0xBF, 0xA4, 0xED, 0x9F, 0x9B, + 0x05, 0x48, 0xB2, 0xAC, 0x50, 0x74, 0xA2, 0x97 + }; + +const uint8_t MetroHash128::test_seed_1[16] = { + 0x45, 0xA3, 0xCD, 0xB8, 0x38, 0x19, 0x9D, 0x7F, + 0xBD, 0xD6, 0x8D, 0x86, 0x7A, 0x14, 0xEC, 0xEF + }; + + + +MetroHash128::MetroHash128(const uint64_t seed) +{ + Initialize(seed); +} + + +void MetroHash128::Initialize(const uint64_t seed) +{ + // initialize internal hash registers + state.v[0] = (static_cast(seed) - k0) * k3; + state.v[1] = (static_cast(seed) + k1) * k2; + state.v[2] = (static_cast(seed) + k0) * k2; + state.v[3] = (static_cast(seed) - k1) * k3; + + // initialize total length of input + bytes = 0; +} + + +void MetroHash128::Update(const uint8_t * const buffer, const uint64_t length) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + // input buffer may be partially filled + if (bytes % 32) + { + uint64_t fill = 32 - (bytes % 32); + if (fill > length) + fill = length; + + memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); + ptr += fill; + bytes += fill; + + // input buffer is still partially filled + if ((bytes % 32) != 0) return; + + // process full input buffer + state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // bulk update + bytes += (end - ptr); + while (ptr <= (end - 32)) + { + // process directly from the source, bypassing the input buffer + 
state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // store remaining bytes in input buffer + if (ptr < end) + memcpy(input.b, ptr, end - ptr); +} + + +void MetroHash128::Finalize(uint8_t * const hash) +{ + // finalize bulk loop, if used + if (bytes >= 32) + { + state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 21) * k1; + state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 21) * k0; + state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 21) * k1; + state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 21) * k0; + } + + // process any bytes remaining in the input buffer + const uint8_t * ptr = reinterpret_cast(input.b); + const uint8_t * const end = ptr + (bytes % 32); + + if ((end - ptr) >= 16) + { + state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; + state.v[1] += read_u64(ptr) * k2; ptr += 8; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 45) * k1; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 45) * k0; + } + + if ((end - ptr) >= 8) + { + state.v[0] += read_u64(ptr) * k2; ptr += 8; state.v[0] = rotate_right(state.v[0],33) * k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 27) * k1; + } + + if ((end - ptr) >= 4) + { + state.v[1] += read_u32(ptr) * k2; ptr += 4; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 46) * k0; + } + + if ((end - ptr) >= 2) + { + state.v[0] += read_u16(ptr) * k2; ptr += 2; state.v[0] = rotate_right(state.v[0],33) 
* k3; + state.v[0] ^= rotate_right((state.v[0] * k2) + state.v[1], 22) * k1; + } + + if ((end - ptr) >= 1) + { + state.v[1] += read_u8 (ptr) * k2; state.v[1] = rotate_right(state.v[1],33) * k3; + state.v[1] ^= rotate_right((state.v[1] * k3) + state.v[0], 58) * k0; + } + + state.v[0] += rotate_right((state.v[0] * k0) + state.v[1], 13); + state.v[1] += rotate_right((state.v[1] * k1) + state.v[0], 37); + state.v[0] += rotate_right((state.v[0] * k2) + state.v[1], 13); + state.v[1] += rotate_right((state.v[1] * k3) + state.v[0], 37); + + bytes = 0; + + // do any endian conversion here + + memcpy(hash, state.v, 16); +} + + +void MetroHash128::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + uint64_t v[4]; + + v[0] = (static_cast(seed) - k0) * k3; + v[1] = (static_cast(seed) + k1) * k2; + + if (length >= 32) + { + v[2] = (static_cast(seed) + k0) * k2; + v[3] = (static_cast(seed) - k1) * k3; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 21) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 21) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 21) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 21) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 45) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 45) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * 
k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 27) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 46) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 58) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 13); + v[1] += rotate_right((v[1] * k1) + v[0], 37); + v[0] += rotate_right((v[0] * k2) + v[1], 13); + v[1] += rotate_right((v[1] * k3) + v[0], 37); + + // do any endian conversion here + + memcpy(hash, v, 16); +} + + +bool MetroHash128::ImplementationVerified() +{ + uint8_t hash[16]; + const uint8_t * key = reinterpret_cast(MetroHash128::test_string); + + // verify one-shot implementation + MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 0); + if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; + + MetroHash128::Hash(key, strlen(MetroHash128::test_string), hash, 1); + if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; + + // verify incremental implementation + MetroHash128 metro; + + metro.Initialize(0); + metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash128::test_seed_0, 16) != 0) return false; + + metro.Initialize(1); + metro.Update(reinterpret_cast(MetroHash128::test_string), strlen(MetroHash128::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash128::test_seed_1, 16) != 0) return false; + + return true; +} + + +void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + 
static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 26) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 26) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 26) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 17) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 20) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],33) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 24) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],33) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 24) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 13); + v[1] 
+= rotate_right((v[1] * k1) + v[0], 37); + v[0] += rotate_right((v[0] * k2) + v[1], 13); + v[1] += rotate_right((v[1] * k3) + v[0], 37); + + // do any endian conversion here + + memcpy(out, v, 16); +} + + +void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 29) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 29) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] += read_u32(ptr) * k2; ptr += 4; v[1] = rotate_right(v[1],29) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 25) * 
k0; + } + + if ((end - ptr) >= 2) + { + v[0] += read_u16(ptr) * k2; ptr += 2; v[0] = rotate_right(v[0],29) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] += read_u8 (ptr) * k2; v[1] = rotate_right(v[1],29) * k3; + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 33); + v[1] += rotate_right((v[1] * k1) + v[0], 33); + v[0] += rotate_right((v[0] * k2) + v[1], 33); + v[1] += rotate_right((v[1] * k3) + v[0], 33); + + // do any endian conversion here + + memcpy(out, v, 16); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h index e109dccf92..1517024b18 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128.h @@ -1,85 +1,85 @@ -// metrohash128.h -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef METROHASH_METROHASH_128_H -#define METROHASH_METROHASH_128_H - -#include - -namespace Util -{ - -class MetroHash128 -{ -public: - static const uint32_t bits = 128; - - // Constructor initializes the same as Initialize() - MetroHash128(const uint64_t seed=0); - - // Initializes internal state for new hash with optional seed - void Initialize(const uint64_t seed=0); - - // Update the hash state with a string of bytes. If the length - // is sufficiently long, the implementation switches to a bulk - // hashing algorithm directly on the argument buffer for speed. - void Update(const uint8_t * buffer, const uint64_t length); - - // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. - template - void Update(const T& object) - { - Update(reinterpret_cast(&object), sizeof(object)); - } - - // Constructs the final hash and writes it to the argument buffer. - // After a hash is finalized, this instance must be Initialized()-ed - // again or the behavior of Update() and Finalize() is undefined. - void Finalize(uint8_t * const hash); - - // A non-incremental function implementation. This can be significantly - // faster than the incremental implementation for some usage patterns. - static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); - - // Does implementation correctly execute test vectors? 
- static bool ImplementationVerified(); - - // test vectors -- Hash(test_string, seed=0) => test_seed_0 - static const char * test_string; - static const uint8_t test_seed_0[16]; - static const uint8_t test_seed_1[16]; - -private: - static const uint64_t k0 = 0xC83A91E1; - static const uint64_t k1 = 0x8648DBDB; - static const uint64_t k2 = 0x7BDEC03B; - static const uint64_t k3 = 0x2F5870A5; - - struct { uint64_t v[4]; } state; - struct { uint8_t b[32]; } input; - uint64_t bytes; -}; - - -// Legacy 128-bit hash functions -- do not use -void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); -void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); - -} // Util - -#endif // #ifndef METROHASH_METROHASH_128_H +// metrohash128.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#ifndef METROHASH_METROHASH_128_H +#define METROHASH_METROHASH_128_H + +#include + +namespace Util +{ + +class MetroHash128 +{ +public: + static const uint32_t bits = 128; + + // Constructor initializes the same as Initialize() + MetroHash128(const uint64_t seed=0); + + // Initializes internal state for new hash with optional seed + void Initialize(const uint64_t seed=0); + + // Update the hash state with a string of bytes. 
If the length + // is sufficiently long, the implementation switches to a bulk + // hashing algorithm directly on the argument buffer for speed. + void Update(const uint8_t * buffer, const uint64_t length); + + // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. + template + void Update(const T& object) + { + Update(reinterpret_cast(&object), sizeof(object)); + } + + // Constructs the final hash and writes it to the argument buffer. + // After a hash is finalized, this instance must be Initialized()-ed + // again or the behavior of Update() and Finalize() is undefined. + void Finalize(uint8_t * const hash); + + // A non-incremental function implementation. This can be significantly + // faster than the incremental implementation for some usage patterns. + static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); + + // Does implementation correctly execute test vectors? + static bool ImplementationVerified(); + + // test vectors -- Hash(test_string, seed=0) => test_seed_0 + static const char * test_string; + static const uint8_t test_seed_0[16]; + static const uint8_t test_seed_1[16]; + +private: + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + struct { uint64_t v[4]; } state; + struct { uint8_t b[32]; } input; + uint64_t bytes; +}; + + +// Legacy 128-bit hash functions -- do not use +void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_128_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp index 521ac4f8df..4e988223d1 100644 --- 
a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.cpp @@ -1,178 +1,178 @@ -// metrohash128crc.cpp -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include "metrohash.h" -#include "platform.h" - -namespace Util -{ - -void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xC83A91E1; - static const uint64_t k1 = 0x8648DBDB; - static const uint64_t k2 = 0x7BDEC03B; - static const uint64_t k3 = 0x2F5870A5; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t v[4]; - - v[0] = ((static_cast(seed) - k0) * k3) + len; - v[1] = ((static_cast(seed) + k1) * k2) + len; - - if (len >= 32) - { - v[2] = ((static_cast(seed) + k0) * k2) + len; - v[3] = ((static_cast(seed) - k1) * k3) + len; - - do - { - v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; - v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; - v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; - v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; - v[0] ^= rotate_right(((v[0] 
+ v[2]) * k0) + v[3], 34) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; - } - - if ((end - ptr) >= 16) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; - v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],34) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; - v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0; - } - - if ((end - ptr) >= 8) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],36) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1; - } - - if ((end - ptr) >= 4) - { - v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; - v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0; - } - - if ((end - ptr) >= 2) - { - v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; - v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1; - } - - if ((end - ptr) >= 1) - { - v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); - v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; - } - - v[0] += rotate_right((v[0] * k0) + v[1], 11); - v[1] += rotate_right((v[1] * k1) + v[0], 26); - v[0] += rotate_right((v[0] * k0) + v[1], 11); - v[1] += rotate_right((v[1] * k1) + v[0], 26); - - memcpy(out, v, 16); -} - - -void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xEE783E2F; - static const uint64_t k1 = 0xAD07C493; - static const uint64_t k2 = 0x797A90BB; - static const uint64_t k3 = 0x2E4B2E1B; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t v[4]; - - v[0] = ((static_cast(seed) - k0) * k3) + len; - v[1] = ((static_cast(seed) + k1) * k2) + len; - - if (len >= 32) - { - v[2] = ((static_cast(seed) + k0) * k2) + len; - v[3] = ((static_cast(seed) - k1) * k3) + len; - - do - { - v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; - v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; - v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; - v[3] ^= _mm_crc32_u64(v[3], 
read_u64(ptr)); ptr += 8; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0; - } - - if ((end - ptr) >= 16) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],41) * k3; - v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],41) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1; - v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0; - } - - if ((end - ptr) >= 8) - { - v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; - v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1; - } - - if ((end - ptr) >= 4) - { - v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; - v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0; - } - - if ((end - ptr) >= 2) - { - v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; - v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1; - } - - if ((end - ptr) >= 1) - { - v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); - v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; - } - - v[0] += rotate_right((v[0] * k0) + v[1], 15); - v[1] += rotate_right((v[1] * k1) + v[0], 27); - v[0] += rotate_right((v[0] * k0) + v[1], 15); - v[1] += rotate_right((v[1] * k1) + v[0], 27); - - memcpy(out, v, 16); -} - -} // Util +// metrohash128crc.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include "metrohash.h" +#include "platform.h" + +namespace Util +{ + +void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; + v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; + v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; + v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 34) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],34) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],36) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; + v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0; + } + + if ((end 
- ptr) >= 2) + { + v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; + v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); + v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 11); + v[1] += rotate_right((v[1] * k1) + v[0], 26); + v[0] += rotate_right((v[0] * k0) + v[1], 11); + v[1] += rotate_right((v[1] * k1) + v[0], 26); + + memcpy(out, v, 16); +} + + +void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xEE783E2F; + static const uint64_t k1 = 0xAD07C493; + static const uint64_t k2 = 0x797A90BB; + static const uint64_t k3 = 0x2E4B2E1B; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t v[4]; + + v[0] = ((static_cast(seed) - k0) * k3) + len; + v[1] = ((static_cast(seed) + k1) * k2) + len; + + if (len >= 32) + { + v[2] = ((static_cast(seed) + k0) * k2) + len; + v[3] = ((static_cast(seed) - k1) * k3) + len; + + do + { + v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8; + v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8; + v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8; + v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0; + } + + if ((end - ptr) >= 16) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],41) * k3; + v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1],41) * k3; + v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1; + v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0; + } + + if ((end - ptr) >= 8) + { + v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0],34) * k3; + v[0] ^= 
rotate_right((v[0] * k2) + v[1], 22) * k1; + } + + if ((end - ptr) >= 4) + { + v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4; + v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0; + } + + if ((end - ptr) >= 2) + { + v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2; + v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1; + } + + if ((end - ptr) >= 1) + { + v[1] ^= _mm_crc32_u64(v[0], read_u8 (ptr)); + v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0; + } + + v[0] += rotate_right((v[0] * k0) + v[1], 15); + v[1] += rotate_right((v[1] * k1) + v[0], 27); + v[0] += rotate_right((v[0] * k0) + v[1], 15); + v[1] += rotate_right((v[1] * k1) + v[0], 27); + + memcpy(out, v, 16); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h index d19e9f98e3..7877394d3b 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash128crc.h @@ -1,33 +1,33 @@ -// metrohash128crc.h -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. 
- -#ifndef METROHASH_METROHASH_128_CRC_H -#define METROHASH_METROHASH_128_CRC_H - -#include - -namespace Util -{ - -// Legacy 128-bit hash functions -void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); -void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); - -} // Util - -#endif // #ifndef METROHASH_METROHASH_128_CRC_H +// metrohash128crc.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#ifndef METROHASH_METROHASH_128_CRC_H +#define METROHASH_METROHASH_128_CRC_H + +#include + +namespace Util +{ + +// Legacy 128-bit hash functions +void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_128_CRC_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp index 9a1deb76b5..aec3563dd2 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.cpp @@ -1,415 +1,415 @@ -// metrohash64.cpp -// -// Copyright 2015-2018 J. 
Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. - -#include "platform.h" -#include "metrohash64.h" - -#include - -namespace Util -{ - -const char * MetroHash64::test_string = "012345678901234567890123456789012345678901234567890123456789012"; - -const uint8_t MetroHash64::test_seed_0[8] = { 0x6B, 0x75, 0x3D, 0xAE, 0x06, 0x70, 0x4B, 0xAD }; -const uint8_t MetroHash64::test_seed_1[8] = { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0xB9, 0xB8, 0xDF }; - - - -MetroHash64::MetroHash64(const uint64_t seed) -{ - Initialize(seed); -} - - -void MetroHash64::Initialize(const uint64_t seed) -{ - vseed = (static_cast(seed) + k2) * k0; - - // initialize internal hash registers - state.v[0] = vseed; - state.v[1] = vseed; - state.v[2] = vseed; - state.v[3] = vseed; - - // initialize total length of input - bytes = 0; -} - - -void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length) -{ - const uint8_t * ptr = reinterpret_cast(buffer); - const uint8_t * const end = ptr + length; - - // input buffer may be partially filled - if (bytes % 32) - { - uint64_t fill = 32 - (bytes % 32); - if (fill > length) - fill = length; - - memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); - ptr += fill; - bytes += fill; - - // input buffer is still partially filled - if ((bytes % 32) != 0) return; - - // process full input buffer - state.v[0] += read_u64(&input.b[ 0]) * k0; 
state.v[0] = rotate_right(state.v[0],29) + state.v[2]; - state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; - state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; - state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; - } - - // bulk update - bytes += static_cast(end - ptr); - while (ptr <= (end - 32)) - { - // process directly from the source, bypassing the input buffer - state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; - state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; - state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; - state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; - } - - // store remaining bytes in input buffer - if (ptr < end) - memcpy(input.b, ptr, static_cast(end - ptr)); -} - - -void MetroHash64::Finalize(uint8_t * const hash) -{ - // finalize bulk loop, if used - if (bytes >= 32) - { - state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 37) * k1; - state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 37) * k0; - state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 37) * k1; - state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 37) * k0; - - state.v[0] = vseed + (state.v[0] ^ state.v[1]); - } - - // process any bytes remaining in the input buffer - const uint8_t * ptr = reinterpret_cast(input.b); - const uint8_t * const end = ptr + (bytes % 32); - - if ((end - ptr) >= 16) - { - state.v[1] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3; - state.v[2] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[2] = rotate_right(state.v[2],29) * k3; - state.v[1] ^= rotate_right(state.v[1] * k0, 21) + 
state.v[2]; - state.v[2] ^= rotate_right(state.v[2] * k3, 21) + state.v[1]; - state.v[0] += state.v[2]; - } - - if ((end - ptr) >= 8) - { - state.v[0] += read_u64(ptr) * k3; ptr += 8; - state.v[0] ^= rotate_right(state.v[0], 55) * k1; - } - - if ((end - ptr) >= 4) - { - state.v[0] += read_u32(ptr) * k3; ptr += 4; - state.v[0] ^= rotate_right(state.v[0], 26) * k1; - } - - if ((end - ptr) >= 2) - { - state.v[0] += read_u16(ptr) * k3; ptr += 2; - state.v[0] ^= rotate_right(state.v[0], 48) * k1; - } - - if ((end - ptr) >= 1) - { - state.v[0] += read_u8 (ptr) * k3; - state.v[0] ^= rotate_right(state.v[0], 37) * k1; - } - - state.v[0] ^= rotate_right(state.v[0], 28); - state.v[0] *= k0; - state.v[0] ^= rotate_right(state.v[0], 29); - - bytes = 0; - - // do any endian conversion here - - memcpy(hash, state.v, 8); -} - - -void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) -{ - const uint8_t * ptr = reinterpret_cast(buffer); - const uint8_t * const end = ptr + length; - - uint64_t h = (static_cast(seed) + k2) * k0; - - if (length >= 32) - { - uint64_t v[4]; - v[0] = h; - v[1] = h; - v[2] = h; - v[3] = h; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 37) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 37) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; - h += v[0] ^ v[1]; - } - - if ((end - ptr) >= 16) - { - uint64_t v0 = h + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; - uint64_t v1 = h + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; - 
v0 ^= rotate_right(v0 * k0, 21) + v1; - v1 ^= rotate_right(v1 * k3, 21) + v0; - h += v1; - } - - if ((end - ptr) >= 8) - { - h += read_u64(ptr) * k3; ptr += 8; - h ^= rotate_right(h, 55) * k1; - } - - if ((end - ptr) >= 4) - { - h += read_u32(ptr) * k3; ptr += 4; - h ^= rotate_right(h, 26) * k1; - } - - if ((end - ptr) >= 2) - { - h += read_u16(ptr) * k3; ptr += 2; - h ^= rotate_right(h, 48) * k1; - } - - if ((end - ptr) >= 1) - { - h += read_u8 (ptr) * k3; - h ^= rotate_right(h, 37) * k1; - } - - h ^= rotate_right(h, 28); - h *= k0; - h ^= rotate_right(h, 29); - - memcpy(hash, &h, 8); -} - - -bool MetroHash64::ImplementationVerified() -{ - uint8_t hash[8]; - const uint8_t * key = reinterpret_cast(MetroHash64::test_string); - - // verify one-shot implementation - MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 0); - if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; - - MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 1); - if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; - - // verify incremental implementation - MetroHash64 metro; - - metro.Initialize(0); - metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); - metro.Finalize(hash); - if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; - - metro.Initialize(1); - metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); - metro.Finalize(hash); - if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; - - return true; -} - - -void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xC83A91E1; - static const uint64_t k1 = 0x8648DBDB; - static const uint64_t k2 = 0x7BDEC03B; - static const uint64_t k3 = 0x2F5870A5; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t hash = ((static_cast(seed) + k2) * k0) + len; - - if (len >= 32) - { - uint64_t v[4]; - 
v[0] = hash; - v[1] = hash; - v[2] = hash; - v[3] = hash; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; - hash += v[0] ^ v[1]; - } - - if ((end - ptr) >= 16) - { - uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1; - uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2; - v0 ^= rotate_right(v0 * k0, 35) + v1; - v1 ^= rotate_right(v1 * k3, 35) + v0; - hash += v1; - } - - if ((end - ptr) >= 8) - { - hash += read_u64(ptr) * k3; ptr += 8; - hash ^= rotate_right(hash, 33) * k1; - - } - - if ((end - ptr) >= 4) - { - hash += read_u32(ptr) * k3; ptr += 4; - hash ^= rotate_right(hash, 15) * k1; - } - - if ((end - ptr) >= 2) - { - hash += read_u16(ptr) * k3; ptr += 2; - hash ^= rotate_right(hash, 13) * k1; - } - - if ((end - ptr) >= 1) - { - hash += read_u8 (ptr) * k3; - hash ^= rotate_right(hash, 25) * k1; - } - - hash ^= rotate_right(hash, 33); - hash *= k0; - hash ^= rotate_right(hash, 33); - - memcpy(out, &hash, 8); -} - - -void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) -{ - static const uint64_t k0 = 0xD6D018F5; - static const uint64_t k1 = 0xA2AA033B; - static const uint64_t k2 = 0x62992FC1; - static const uint64_t k3 = 0x30BC5B29; - - const uint8_t * ptr = reinterpret_cast(key); - const uint8_t * const end = ptr + len; - - uint64_t hash = ((static_cast(seed) + k2) * k0) + len; - - if (len >= 32) - { - uint64_t v[4]; - v[0] = 
hash; - v[1] = hash; - v[2] = hash; - v[3] = hash; - - do - { - v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; - v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; - v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; - v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; - } - while (ptr <= (end - 32)); - - v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1; - v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0; - v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1; - v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; - hash += v[0] ^ v[1]; - } - - if ((end - ptr) >= 16) - { - uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; - uint64_t v1 = hash + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; - v0 ^= rotate_right(v0 * k0, 34) + v1; - v1 ^= rotate_right(v1 * k3, 34) + v0; - hash += v1; - } - - if ((end - ptr) >= 8) - { - hash += read_u64(ptr) * k3; ptr += 8; - hash ^= rotate_right(hash, 36) * k1; - } - - if ((end - ptr) >= 4) - { - hash += read_u32(ptr) * k3; ptr += 4; - hash ^= rotate_right(hash, 15) * k1; - } - - if ((end - ptr) >= 2) - { - hash += read_u16(ptr) * k3; ptr += 2; - hash ^= rotate_right(hash, 15) * k1; - } - - if ((end - ptr) >= 1) - { - hash += read_u8 (ptr) * k3; - hash ^= rotate_right(hash, 23) * k1; - } - - hash ^= rotate_right(hash, 28); - hash *= k0; - hash ^= rotate_right(hash, 29); - - memcpy(out, &hash, 8); -} - -} // Util +// metrohash64.cpp +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#include "platform.h" +#include "metrohash64.h" + +#include + +namespace Util +{ + +const char * MetroHash64::test_string = "012345678901234567890123456789012345678901234567890123456789012"; + +const uint8_t MetroHash64::test_seed_0[8] = { 0x6B, 0x75, 0x3D, 0xAE, 0x06, 0x70, 0x4B, 0xAD }; +const uint8_t MetroHash64::test_seed_1[8] = { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0xB9, 0xB8, 0xDF }; + + + +MetroHash64::MetroHash64(const uint64_t seed) +{ + Initialize(seed); +} + + +void MetroHash64::Initialize(const uint64_t seed) +{ + vseed = (static_cast(seed) + k2) * k0; + + // initialize internal hash registers + state.v[0] = vseed; + state.v[1] = vseed; + state.v[2] = vseed; + state.v[3] = vseed; + + // initialize total length of input + bytes = 0; +} + + +void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + // input buffer may be partially filled + if (bytes % 32) + { + uint64_t fill = 32 - (bytes % 32); + if (fill > length) + fill = length; + + memcpy(input.b + (bytes % 32), ptr, static_cast(fill)); + ptr += fill; + bytes += fill; + + // input buffer is still partially filled + if ((bytes % 32) != 0) return; + + // process full input buffer + state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + 
state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // bulk update + bytes += static_cast(end - ptr); + while (ptr <= (end - 32)) + { + // process directly from the source, bypassing the input buffer + state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2]; + state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3]; + state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0]; + state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1]; + } + + // store remaining bytes in input buffer + if (ptr < end) + memcpy(input.b, ptr, static_cast(end - ptr)); +} + + +void MetroHash64::Finalize(uint8_t * const hash) +{ + // finalize bulk loop, if used + if (bytes >= 32) + { + state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 37) * k1; + state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 37) * k0; + state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 37) * k1; + state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 37) * k0; + + state.v[0] = vseed + (state.v[0] ^ state.v[1]); + } + + // process any bytes remaining in the input buffer + const uint8_t * ptr = reinterpret_cast(input.b); + const uint8_t * const end = ptr + (bytes % 32); + + if ((end - ptr) >= 16) + { + state.v[1] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3; + state.v[2] = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[2] = rotate_right(state.v[2],29) * k3; + state.v[1] ^= rotate_right(state.v[1] * k0, 21) + state.v[2]; + state.v[2] ^= rotate_right(state.v[2] * k3, 21) + state.v[1]; + state.v[0] += state.v[2]; + } + + if ((end - ptr) >= 8) + { + state.v[0] += 
read_u64(ptr) * k3; ptr += 8; + state.v[0] ^= rotate_right(state.v[0], 55) * k1; + } + + if ((end - ptr) >= 4) + { + state.v[0] += read_u32(ptr) * k3; ptr += 4; + state.v[0] ^= rotate_right(state.v[0], 26) * k1; + } + + if ((end - ptr) >= 2) + { + state.v[0] += read_u16(ptr) * k3; ptr += 2; + state.v[0] ^= rotate_right(state.v[0], 48) * k1; + } + + if ((end - ptr) >= 1) + { + state.v[0] += read_u8 (ptr) * k3; + state.v[0] ^= rotate_right(state.v[0], 37) * k1; + } + + state.v[0] ^= rotate_right(state.v[0], 28); + state.v[0] *= k0; + state.v[0] ^= rotate_right(state.v[0], 29); + + bytes = 0; + + // do any endian conversion here + + memcpy(hash, state.v, 8); +} + + +void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed) +{ + const uint8_t * ptr = reinterpret_cast(buffer); + const uint8_t * const end = ptr + length; + + uint64_t h = (static_cast(seed) + k2) * k0; + + if (length >= 32) + { + uint64_t v[4]; + v[0] = h; + v[1] = h; + v[2] = h; + v[3] = h; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 37) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 37) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0; + h += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = h + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; + uint64_t v1 = h + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; + v0 ^= rotate_right(v0 * k0, 21) + v1; + v1 ^= rotate_right(v1 * k3, 21) + v0; + h += v1; + } + + if ((end - ptr) >= 8) + { + h += read_u64(ptr) * k3; ptr 
+= 8; + h ^= rotate_right(h, 55) * k1; + } + + if ((end - ptr) >= 4) + { + h += read_u32(ptr) * k3; ptr += 4; + h ^= rotate_right(h, 26) * k1; + } + + if ((end - ptr) >= 2) + { + h += read_u16(ptr) * k3; ptr += 2; + h ^= rotate_right(h, 48) * k1; + } + + if ((end - ptr) >= 1) + { + h += read_u8 (ptr) * k3; + h ^= rotate_right(h, 37) * k1; + } + + h ^= rotate_right(h, 28); + h *= k0; + h ^= rotate_right(h, 29); + + memcpy(hash, &h, 8); +} + + +bool MetroHash64::ImplementationVerified() +{ + uint8_t hash[8]; + const uint8_t * key = reinterpret_cast(MetroHash64::test_string); + + // verify one-shot implementation + MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 0); + if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; + + MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 1); + if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; + + // verify incremental implementation + MetroHash64 metro; + + metro.Initialize(0); + metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false; + + metro.Initialize(1); + metro.Update(reinterpret_cast(MetroHash64::test_string), strlen(MetroHash64::test_string)); + metro.Finalize(hash); + if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false; + + return true; +} + + +void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t hash = ((static_cast(seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] 
+= read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; + hash += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1; + uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2; + v0 ^= rotate_right(v0 * k0, 35) + v1; + v1 ^= rotate_right(v1 * k3, 35) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + hash += read_u64(ptr) * k3; ptr += 8; + hash ^= rotate_right(hash, 33) * k1; + + } + + if ((end - ptr) >= 4) + { + hash += read_u32(ptr) * k3; ptr += 4; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += read_u16(ptr) * k3; ptr += 2; + hash ^= rotate_right(hash, 13) * k1; + } + + if ((end - ptr) >= 1) + { + hash += read_u8 (ptr) * k3; + hash ^= rotate_right(hash, 25) * k1; + } + + hash ^= rotate_right(hash, 33); + hash *= k0; + hash ^= rotate_right(hash, 33); + + memcpy(out, &hash, 8); +} + + +void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + const uint8_t * ptr = reinterpret_cast(key); + const uint8_t * const end = ptr + len; + + uint64_t hash = ((static_cast(seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2]; + v[1] += 
read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3]; + v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0]; + v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1; + v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0; + v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1; + v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; + hash += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3; + uint64_t v1 = hash + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3; + v0 ^= rotate_right(v0 * k0, 34) + v1; + v1 ^= rotate_right(v1 * k3, 34) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + hash += read_u64(ptr) * k3; ptr += 8; + hash ^= rotate_right(hash, 36) * k1; + } + + if ((end - ptr) >= 4) + { + hash += read_u32(ptr) * k3; ptr += 4; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += read_u16(ptr) * k3; ptr += 2; + hash ^= rotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 1) + { + hash += read_u8 (ptr) * k3; + hash ^= rotate_right(hash, 23) * k1; + } + + hash ^= rotate_right(hash, 28); + hash *= k0; + hash ^= rotate_right(hash, 29); + + memcpy(out, &hash, 8); +} + +} // Util diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h index 583f5f3753..43926bbf20 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/metrohash64.h @@ -1,86 +1,86 @@ -// metrohash64.h -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. - -#ifndef METROHASH_METROHASH_64_H -#define METROHASH_METROHASH_64_H - -#include - -namespace Util -{ - -class MetroHash64 -{ -public: - static const uint32_t bits = 64; - - // Constructor initializes the same as Initialize() - MetroHash64(const uint64_t seed=0); - - // Initializes internal state for new hash with optional seed - void Initialize(const uint64_t seed=0); - - // Update the hash state with a string of bytes. If the length - // is sufficiently long, the implementation switches to a bulk - // hashing algorithm directly on the argument buffer for speed. - void Update(const uint8_t * buffer, const uint64_t length); - - // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. - template - void Update(const T& object) - { - Update(reinterpret_cast(&object), sizeof(object)); - } - - // Constructs the final hash and writes it to the argument buffer. - // After a hash is finalized, this instance must be Initialized()-ed - // again or the behavior of Update() and Finalize() is undefined. - void Finalize(uint8_t * const hash); - - // A non-incremental function implementation. This can be significantly - // faster than the incremental implementation for some usage patterns. - static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); - - // Does implementation correctly execute test vectors? 
- static bool ImplementationVerified(); - - // test vectors -- Hash(test_string, seed=0) => test_seed_0 - static const char * test_string; - static const uint8_t test_seed_0[8]; - static const uint8_t test_seed_1[8]; - -private: - static const uint64_t k0 = 0xD6D018F5; - static const uint64_t k1 = 0xA2AA033B; - static const uint64_t k2 = 0x62992FC1; - static const uint64_t k3 = 0x30BC5B29; - - struct { uint64_t v[4]; } state; - struct { uint8_t b[32]; } input; - uint64_t bytes; - uint64_t vseed; -}; - - -// Legacy 64-bit hash functions -- do not use -void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); -void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); - -} // Util - -#endif // #ifndef METROHASH_METROHASH_64_H +// metrohash64.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#ifndef METROHASH_METROHASH_64_H +#define METROHASH_METROHASH_64_H + +#include + +namespace Util +{ + +class MetroHash64 +{ +public: + static const uint32_t bits = 64; + + // Constructor initializes the same as Initialize() + MetroHash64(const uint64_t seed=0); + + // Initializes internal state for new hash with optional seed + void Initialize(const uint64_t seed=0); + + // Update the hash state with a string of bytes. 
If the length + // is sufficiently long, the implementation switches to a bulk + // hashing algorithm directly on the argument buffer for speed. + void Update(const uint8_t * buffer, const uint64_t length); + + // Updates the hash state with the specified object. Modified by Advanced Micro Devices, Inc. + template + void Update(const T& object) + { + Update(reinterpret_cast(&object), sizeof(object)); + } + + // Constructs the final hash and writes it to the argument buffer. + // After a hash is finalized, this instance must be Initialized()-ed + // again or the behavior of Update() and Finalize() is undefined. + void Finalize(uint8_t * const hash); + + // A non-incremental function implementation. This can be significantly + // faster than the incremental implementation for some usage patterns. + static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0); + + // Does implementation correctly execute test vectors? + static bool ImplementationVerified(); + + // test vectors -- Hash(test_string, seed=0) => test_seed_0 + static const char * test_string; + static const uint8_t test_seed_0[8]; + static const uint8_t test_seed_1[8]; + +private: + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + struct { uint64_t v[4]; } state; + struct { uint8_t b[32]; } input; + uint64_t bytes; + uint64_t vseed; +}; + + +// Legacy 64-bit hash functions -- do not use +void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + +} // Util + +#endif // #ifndef METROHASH_METROHASH_64_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h index e927906657..3762acdd14 100644 --- 
a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/platform.h @@ -1,62 +1,62 @@ -// platform.h -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. - -#ifndef METROHASH_PLATFORM_H -#define METROHASH_PLATFORM_H - -#include -#include - -namespace Util -{ - -// rotate right idiom recognized by most compilers -inline static uint64_t rotate_right(uint64_t v, unsigned k) -{ - return (v >> k) | (v << (64 - k)); -} - -inline static uint64_t read_u64(const void * const ptr) -{ - uint64_t val; - memcpy(&val, ptr, sizeof(val)); - return val; -} - -inline static uint64_t read_u32(const void * const ptr) -{ - uint32_t val; - memcpy(&val, ptr, sizeof(val)); - return static_cast(val); -} - -inline static uint64_t read_u16(const void * const ptr) -{ - uint16_t val; - memcpy(&val, ptr, sizeof(val)); - return static_cast(val); -} - -inline static uint64_t read_u8 (const void * const ptr) -{ - return static_cast(*reinterpret_cast(ptr)); -} - -} // Util - -#endif // #ifndef METROHASH_PLATFORM_H +// platform.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Modifications Copyright© 2019 Advanced Micro Devices, Inc. All rights reserved. + +#ifndef METROHASH_PLATFORM_H +#define METROHASH_PLATFORM_H + +#include +#include + +namespace Util +{ + +// rotate right idiom recognized by most compilers +inline static uint64_t rotate_right(uint64_t v, unsigned k) +{ + return (v >> k) | (v << (64 - k)); +} + +inline static uint64_t read_u64(const void * const ptr) +{ + uint64_t val; + memcpy(&val, ptr, sizeof(val)); + return val; +} + +inline static uint64_t read_u32(const void * const ptr) +{ + uint32_t val; + memcpy(&val, ptr, sizeof(val)); + return static_cast(val); +} + +inline static uint64_t read_u16(const void * const ptr) +{ + uint16_t val; + memcpy(&val, ptr, sizeof(val)); + return static_cast(val); +} + +inline static uint64_t read_u8 (const void * const ptr) +{ + return static_cast(*reinterpret_cast(ptr)); +} + +} // Util + +#endif // #ifndef METROHASH_PLATFORM_H diff --git a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h index 2fc7b0b23a..ba56b316ae 100644 --- a/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h +++ b/shared/amdgpu-windows-interop/pal/shared/metrohash/src/testvector.h @@ -1,63 +1,63 @@ -// testvector.h -// -// Copyright 2015-2018 J. Andrew Rogers -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef METROHASH_TESTVECTOR_H -#define METROHASH_TESTVECTOR_H - -#include "metrohash.h" - - -typedef void (*HashFunction) (const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * hash); - -struct TestVectorData -{ - HashFunction function; - uint32_t bits; - const char * key; - uint32_t seed; - uint8_t hash[64]; -}; - -// The test vector string is selected such that it will properly exercise every -// internal branch of the hash function. Currently that requires a string with -// a length of (at least) 63 bytes. - -static const char * test_key_63 = "012345678901234567890123456789012345678901234567890123456789012"; - -// The hash assumes a little-endian architecture. Treating the hash results -// as an array of uint64_t should enable conversion for big-endian implementations. 
-const TestVectorData TestVector [] = -{ - // seed = 0 - { metrohash64_1, 64, test_key_63, 0, "658F044F5C730E40" }, - { metrohash64_2, 64, test_key_63, 0, "073CAAB960623211" }, - { metrohash128_1, 128, test_key_63, 0, "ED9997ED9D0A8B0FF3F266399477788F" }, - { metrohash128_2, 128, test_key_63, 0, "7BBA6FE119CF35D45507EDF3505359AB" }, - { metrohash128crc_1, 128, test_key_63, 0, "B329ED67831604D3DFAC4E4876D8262F" }, - { metrohash128crc_2, 128, test_key_63, 0, "0502A67E257BBD77206BBCA6BBEF2653" }, - - // seed = 1 - { metrohash64_1, 64, test_key_63, 1, "AE49EBB0A856537B" }, - { metrohash64_2, 64, test_key_63, 1, "CF518E9CF58402C0" }, - { metrohash128_1, 128, test_key_63, 1, "DDA6BA67F7DE755EFDF6BEABECCFD1F4" }, - { metrohash128_2, 128, test_key_63, 1, "2DA6AF149A5CDBC12B09DB0846D69EF0" }, - { metrohash128crc_1, 128, test_key_63, 1, "E8FAB51AF19F18A7B10D0A57D4276DF2" }, - { metrohash128crc_2, 128, test_key_63, 1, "2D54F87181A0CF64B02C50D95692BC19" }, -}; - - - -#endif // #ifndef METROHASH_TESTVECTOR_H +// testvector.h +// +// Copyright 2015-2018 J. Andrew Rogers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef METROHASH_TESTVECTOR_H +#define METROHASH_TESTVECTOR_H + +#include "metrohash.h" + + +typedef void (*HashFunction) (const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * hash); + +struct TestVectorData +{ + HashFunction function; + uint32_t bits; + const char * key; + uint32_t seed; + uint8_t hash[64]; +}; + +// The test vector string is selected such that it will properly exercise every +// internal branch of the hash function. Currently that requires a string with +// a length of (at least) 63 bytes. + +static const char * test_key_63 = "012345678901234567890123456789012345678901234567890123456789012"; + +// The hash assumes a little-endian architecture. Treating the hash results +// as an array of uint64_t should enable conversion for big-endian implementations. +const TestVectorData TestVector [] = +{ + // seed = 0 + { metrohash64_1, 64, test_key_63, 0, "658F044F5C730E40" }, + { metrohash64_2, 64, test_key_63, 0, "073CAAB960623211" }, + { metrohash128_1, 128, test_key_63, 0, "ED9997ED9D0A8B0FF3F266399477788F" }, + { metrohash128_2, 128, test_key_63, 0, "7BBA6FE119CF35D45507EDF3505359AB" }, + { metrohash128crc_1, 128, test_key_63, 0, "B329ED67831604D3DFAC4E4876D8262F" }, + { metrohash128crc_2, 128, test_key_63, 0, "0502A67E257BBD77206BBCA6BBEF2653" }, + + // seed = 1 + { metrohash64_1, 64, test_key_63, 1, "AE49EBB0A856537B" }, + { metrohash64_2, 64, test_key_63, 1, "CF518E9CF58402C0" }, + { metrohash128_1, 128, test_key_63, 1, "DDA6BA67F7DE755EFDF6BEABECCFD1F4" }, + { metrohash128_2, 128, test_key_63, 1, "2DA6AF149A5CDBC12B09DB0846D69EF0" }, + { metrohash128crc_1, 128, test_key_63, 1, "E8FAB51AF19F18A7B10D0A57D4276DF2" }, + { metrohash128crc_2, 128, test_key_63, 1, "2D54F87181A0CF64B02C50D95692BC19" }, +}; + + + +#endif // #ifndef METROHASH_TESTVECTOR_H