/* *********************************************************************************************************************** * * Copyright (c) 2014-2025 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * **********************************************************************************************************************/ /** *********************************************************************************************************************** * @file palCmdBuffer.h * @brief Defines the Platform Abstraction Library (PAL) ICmdBuffer interface and related types. 
***********************************************************************************************************************
 */

#pragma once

#include "pal.h"
#include "palDevice.h"
#include "palGpuMemory.h"
#include "palImage.h"
#include "palMsaaState.h"
#include "palPipeline.h"
#include "palQueryPool.h"
#include "palCmdTracking.h"

/// HSA kernel dispatch packet typedef
typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
/// AMD kernel code typedef
typedef struct amd_kernel_code_s amd_kernel_code_t;
/// AMD kernel descriptor
namespace llvm {namespace amdhsa {struct kernel_descriptor_t;} }

// Forward declarations of Util types that are only referenced by pointer in this header.
namespace Util
{
class VirtualLinearAllocator;
class Event;
}

namespace Pal
{

// Forward declarations.
class IBorderColorPalette;
class ICmdAllocator;
class ICmdBuffer;
class IColorBlendState;
class IColorTargetView;
class IDepthStencilState;
class IDepthStencilView;
class IGpuEvent;
class IGpuMemory;
class IIndirectCmdGenerator;
class IMsaaState;
class IPerfExperiment;
class IQueue;
class IQueryPool;
enum class PerfTraceMarkerType : uint32;
enum class PointOrigin : uint32;
struct VideoCodecInfo;
struct VideoCodecAuxInfo;

/// Specifies a pipeline bind point (i.e., compute or graphics).
enum class PipelineBindPoint : uint32
{
    Compute  = 0x0,  ///< Compute pipeline bind point.
    Graphics = 0x1,  ///< Graphics pipeline bind point.
    Count            ///< Implicitly 0x2, one past the last bind point.
};

/// Fully specifies a type of graphics primitive and vertex ordering for geometry.
enum class PrimitiveTopology : uint8
{
    PointList        = 0x0,
    LineList         = 0x1,
    LineStrip        = 0x2,
    TriangleList     = 0x3,
    TriangleStrip    = 0x4,
    RectList         = 0x5,   ///< Each rect is three 2D axis-aligned rectangle vertices.
    QuadList         = 0x6,
    QuadStrip        = 0x7,
    LineListAdj      = 0x8,
    LineStripAdj     = 0x9,
    TriangleListAdj  = 0xA,
    TriangleStripAdj = 0xB,
    Patch            = 0xC,
    TriangleFan      = 0xD,
    LineLoop         = 0xE,
    Polygon          = 0xF,
    TwoDRectList     = 0x10,  ///< Each rect is the bounding box of an arbitrary 2D triangle.
                              ///  Support is optional, see support2DRectList in DeviceProperties.
    Count                     ///< Implicitly 0x11, one past the last topology.
};

/// Specifies how triangle primitives should be rasterized.
enum class FillMode : uint8
{
    Points    = 0x0,
    Wireframe = 0x1,
    Solid     = 0x2,
    Count             ///< Implicitly 0x3, one past the last fill mode.
};

/// Specifies the triangle face direction that should result in culled primitives.
enum class CullMode : uint8
{
    _None        = 0x0,   ///< All triangles are rasterized.
    Front        = 0x1,   ///< Front facing triangles are culled.
    Back         = 0x2,   ///< Back facing triangles are culled.
    FrontAndBack = 0x3,   ///< All triangles are culled.

    // Unfortunately for Linux clients, X.h includes a "#define None 0" macro.  Clients have their choice of either
    // undefing None before including this header or using _None when dealing with PAL.
#ifndef None
    None         = _None, ///< All triangles are rasterized.
#endif
};

/// Specifies vertex winding order corresponding to a front facing triangle.  @see CullMode.
enum class FaceOrientation : uint8
{
    Ccw = 0x0,  ///< Counter-clockwise vertex winding primitives are front facing.
    Cw  = 0x1   ///< Clockwise vertex winding primitives are front facing.
};

/// Specifies which vertex of a primitive is the _provoking vertex_.  This impacts which vertex's "flat" VS outputs
/// are passed to the PS (i.e., flat shading).
enum class ProvokingVertex : uint8
{
    First = 0x0,  ///< The first vertex of the primitive is the provoking vertex.
    Last  = 0x1   ///< The last vertex of the primitive is the provoking vertex.
};

/// Specifies bit size of each element in an index buffer.
enum class IndexType : uint32
{
    Idx8  = 0x0,  ///< 8-bit index elements.
    Idx16 = 0x1,  ///< 16-bit index elements.
    Idx32 = 0x2,  ///< 32-bit index elements.
    Count         ///< Implicitly 0x3, one past the last index type.
};

/// Specifies a memory atomic operation that can be performed from command buffers with ICmdBuffer::CmdMemoryAtomic().
///
/// The 32-bit operations occupy values 0x00-0x0A; the 64-bit versions of the same operations follow in the same
/// order at 0x0B-0x15.
enum class AtomicOp : uint32
{
    AddInt32  = 0x00,
    SubInt32  = 0x01,
    MinUint32 = 0x02,
    MaxUint32 = 0x03,
    MinSint32 = 0x04,
    MaxSint32 = 0x05,
    AndInt32  = 0x06,
    OrInt32   = 0x07,
    XorInt32  = 0x08,
    IncUint32 = 0x09,
    DecUint32 = 0x0A,
    AddInt64  = 0x0B,
    SubInt64  = 0x0C,
    MinUint64 = 0x0D,
    MaxUint64 = 0x0E,
    MinSint64 = 0x0F,
    MaxSint64 = 0x10,
    AndInt64  = 0x11,
    OrInt64   = 0x12,
    XorInt64  = 0x13,
    IncUint64 = 0x14,
    DecUint64 = 0x15,
    Count             ///< Implicitly 0x16, one past the last atomic operation.
};

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928
/// Specifies the point in the GPU pipeline where an action should take place.
///
/// Relevant operations include setting GPU events, waiting on GPU events in hardware, or writing timestamps.
///
/// @note The numeric values of these enums are ordered such that a "newState < oldState" comparison will generally
///       yield true if a stall is necessary to resolve a hazard between those two pipe points.  This guideline does
///       not hold up when comparing PreRasterization or PostPs with PostCs, as CS work is not properly pipelined with
///       graphics shader work.
///
/// @note Several enumerators are aliases that share a numeric value with another pipe point (HwPipePreIndexBuffer,
///       HwPipePostIndexBuffer, HwPipePreCs and HwPipePreBlt).
///
/// @see ICmdBuffer::CmdSetEvent()
/// @see ICmdBuffer::CmdResetEvent()
/// @see ICmdBuffer::CmdPredicateEvent()
/// @see ICmdBuffer::CmdBarrier()
/// @see ICmdBuffer::CmdWriteTimestamp()
/// @see ICmdBuffer::CmdWriteImmediate()
enum HwPipePoint : uint32
{
    HwPipeTop              = 0x0,                    ///< Earliest possible point in the GPU pipeline (CP PFP), can be
                                                     ///  used as wait point for indirect args and index buffer fetch.
    HwPipePostPrefetch     = 0x1,                    ///< Indirect arguments have been fetched for all prior
                                                     ///  draws/dispatches (CP ME).
    HwPipePreRasterization = 0x2,                    ///< All prior generated VS/HS/DS/GS waves have completed, can be
                                                     ///  used as release point for VB/IB fetch and streamout target.
    HwPipePostPs           = 0x3,                    ///< All prior generated PS waves have completed.
                                                     ///  Only valid as a pipe point to wait on (release point).
    HwPipePreColorTarget   = 0x4,                    ///< Represents the same point in pipe to HwPipePostPs, but provides
                                                     ///  clients with a better option to accurately specify the pipeline
                                                     ///  sync request.  And PAL uses it as entry-point to add partial
                                                     ///  flushes to prevent write-after-read hazard from corner cases.
                                                     ///  Only valid as a wait point (acquire point).
    HwPipePreIndexBuffer   = HwPipeTop,              ///< As late as possible before index buffer fetches (CP PFP).
    HwPipePostIndexBuffer  = HwPipePreRasterization, ///< All prior index buffer fetches have completed.
    // The following points apply to compute-specific work:
    HwPipePreCs            = HwPipePostPrefetch,     ///< As late as possible before CS waves are launched (CP ME).
    HwPipePostCs           = 0x5,                    ///< All prior generated CS waves have completed.
    // The following points apply to BLT-specific work:
    HwPipePreBlt           = HwPipePostPrefetch,     ///< As late as possible before BLT operations are launched.
    HwPipePostBlt          = 0x6,                    ///< All prior requested BLTs have completed.
    HwPipeBottom           = 0x7,                    ///< All prior GPU work (graphics, compute, or BLT) has completed.
    HwPipePointCount                                 ///< Implicitly 0x8, one past HwPipeBottom.
};
#endif

/// Bitmask values that can be OR'ed together to specify a synchronization scope.  See srcStageMask and dstStageMask in
/// @ref AcquireReleaseInfo.
///
/// When specifying an execution dependency at a synchronization point where previous operations must *happen-before*
/// future operations, a mask of these flags specifies a *synchronization scope* that restricts which stages of prior
/// draws, dispatches, or BLTs must *happen-before* which stages of future draws, dispatches, or BLTs.
///
/// Note that flag numerical order does not indicate any happens-before or happens-after relationships.  Clients should
/// not compare flags numerically to judge execution order, only barriers can guarantee execution ordering.
enum PipelineStageFlag : uint32
{
    PipelineStageTopOfPipe         = 0x00000001,
    PipelineStageFetchIndirectArgs = 0x00000002,
    PipelineStagePostPrefetch      = 0x00000004,
    PipelineStageFetchIndices      = 0x00000008,
    PipelineStageStreamOut         = 0x00000010,
    PipelineStageVs                = 0x00000020,
    PipelineStageHs                = 0x00000040,
    PipelineStageDs                = 0x00000080,
    PipelineStageGs                = 0x00000100,
    PipelineStagePs                = 0x00000200,
    PipelineStageSampleRate        = 0x00000400,
    PipelineStageEarlyDsTarget     = 0x00000800,
    PipelineStageLateDsTarget      = 0x00001000,
    PipelineStageColorTarget       = 0x00002000,
    PipelineStageCs                = 0x00004000,
    PipelineStageBlt               = 0x00008000,
    PipelineStageBottomOfPipe      = 0x00010000,
    /// Combined mask of the early and late depth/stencil target stages.
    PipelineStageDsTarget          = PipelineStageEarlyDsTarget | PipelineStageLateDsTarget,
    /// Mask with all 17 individual stage bits above (0x00000001 through 0x00010000) set.
    PipelineStageAllStages         = 0x0001FFFF
};

/// Bitmask values that can be ORed together to specify all potential usages of an image at a point in time.
/// Such a mask should be specified in the usages field of ImageLayout.  These combined usages can be examined by PAL
/// to infer the layout (i.e., compression state) of the image.
///
/// @note There is no layout corresponding to CmdClear*().  The layout flags passed to those functions will determine
///       the expected image layout at that time, and the CmdClear*() implementation will execute a clear that keeps the
///       layout the same.
enum ImageLayoutUsageFlags : uint32
{
    LayoutUninitializedTarget  = 0x00000001,  ///< Initial state of any image that can be used as a color or
                                              ///  depth/stencil target.  A layout transition out of this state will
                                              ///  likely result in a mask RAM initialization BLT.  If this bit is
                                              ///  set, no other bits may be set.
    LayoutColorTarget          = 0x00000002,  ///< Color target bound via CmdBindTargets().  This bit is exclusive
                                              ///  with LayoutDepthStencilTarget.
    LayoutDepthStencilTarget   = 0x00000004,  ///< Depth/stencil target bound via CmdBindTargets().  This bit is
                                              ///  exclusive with LayoutColorTarget.
    LayoutShaderRead           = 0x00000008,  ///< Any shader read state including texture, UAV, constant buffer,
                                              ///  vertex buffer.
    LayoutShaderFmaskBasedRead = 0x00000010,  ///< Images in this state support the load_fptr AMD IL instruction,
                                              ///  which will read decompressed fmask in order to access compressed
                                              ///  MSAA color data from a shader.
    LayoutShaderWrite          = 0x00000020,  ///< Writeable UAV.
    LayoutCopySrc              = 0x00000040,  ///< CmdCopyImage(), CmdCopyImageToMemory(), CmdScaledCopyImage() or
                                              ///  CmdCopyTiledImageToMemory() source image.
    LayoutCopyDst              = 0x00000080,  ///< CmdCopyImage(), CmdCopyMemoryToImage(), CmdScaledCopyImage() or
                                              ///  CmdCopyMemoryToTiledImage() destination image.
    LayoutResolveSrc           = 0x00000100,  ///< CmdResolveImage() source.
    LayoutResolveDst           = 0x00000200,  ///< CmdResolveImage() destination.
    LayoutPresentWindowed      = 0x00000400,  ///< Windowed-mode IQueue::Present().
    LayoutPresentFullscreen    = 0x00000800,  ///< Fullscreen (flip) present.  Layout must be supported by the
                                              ///  display engine.
    LayoutUncompressed         = 0x00001000,  ///< Metadata fully decompressed/expanded layout.
    LayoutSampleRate           = 0x00002000,  ///< CmdBindSampleRateImage() source.
    LayoutAllUsages            = 0x00003FFF   ///< Mask with all 14 usage bits above set.
};

/// Bitmask values that can be ORed together to specify all potential engines an image might be used on.  Such a
/// mask should be specified in the engines field of ImageLayout.
///
/// If the client API is unable to determine which engines might be used, it should specify all possible engines
/// corresponding to the usage flags.
enum ImageLayoutEngineFlags : uint32
{
    LayoutUniversalEngine       = 0x1,
    LayoutComputeEngine         = 0x2,
    LayoutDmaEngine             = 0x4,
    LayoutVideoEncodeEngine     = 0x8,
    LayoutVideoDecodeEngine     = 0x10,
    LayoutVideoJpegDecodeEngine = 0x20,
    LayoutAllEngines            = 0x3F   ///< Mask with all six engine bits above set.
};

/// Bitmask values that can be ORed together to specify previous output usage and upcoming input usages of an image or
/// GPU memory in a ICmdBuffer::CmdBarrier() call to ensure cache coherency between those usages.
///
/// @note The numeric values of CoherStreamOut and later flags depend on PAL_CLIENT_INTERFACE_MAJOR_VERSION: interface
///       versions before 914 additionally define CoherCeLoad and CoherCeDump, which shifts the following flags up by
///       two bit positions.
enum CacheCoherencyUsageFlags : uint32
{
    CoherCpu                = 0x00000001,  ///< Data read or written by CPU.
    CoherShaderRead         = 0x00000002,  ///< Data read by a GPU shader.
    CoherShaderWrite        = 0x00000004,  ///< Data written by a GPU shader.
    CoherCopySrc            = 0x00000008,  ///< Source of a ICmdBuffer::CmdCopy*() call.
    CoherCopyDst            = 0x00000010,  ///< Destination of a ICmdBuffer::CmdCopy*() call.
    CoherColorTarget        = 0x00000020,  ///< Color target.
    CoherDepthStencilTarget = 0x00000040,  ///< Depth stencil target.
    CoherResolveSrc         = 0x00000080,  ///< Source of a CmdResolveImage() call.
    CoherResolveDst         = 0x00000100,  ///< Destination of a CmdResolveImage() call.
    CoherClear              = 0x00000200,  ///< Destination of a CmdClear() call.
    CoherIndirectArgs       = 0x00000400,  ///< Source argument data read by CmdDrawIndirect() and similar functions.
    CoherIndexData          = 0x00000800,  ///< Index buffer data.
    CoherQueueAtomic        = 0x00001000,  ///< Destination of a CmdMemoryAtomic() call.
    CoherTimestamp          = 0x00002000,  ///< Destination of a CmdWriteTimestamp() call.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 914
    CoherStreamOut          = 0x00004000,  ///< Data written as stream output.
    CoherMemory             = 0x00008000,  ///< Data read or written directly from/to memory.
    CoherSampleRate         = 0x00010000,  ///< CmdBindSampleRateImage() source.
    CoherPresent            = 0x00020000,  ///< Source of present.
    CoherCp                 = 0x00080000,  ///< HW Command Processor (CP) encompassing the front-end command
                                           ///  processing of any queue, including SDMA.
    CoherAllUsages          = 0x000FFFFF,  ///< Mask of every bit position up to and including CoherCp.
#else
    CoherCeLoad             = 0x00004000,  ///< Source of a CmdLoadCeRam() call.
    CoherCeDump             = 0x00008000,  ///< Destination of CmdDumpCeRam() call.
    CoherStreamOut          = 0x00010000,  ///< Data written as stream output.
    CoherMemory             = 0x00020000,  ///< Data read or written directly from/to memory.
    CoherSampleRate         = 0x00040000,  ///< CmdBindSampleRateImage() source.
    CoherPresent            = 0x00080000,  ///< Source of present.
    CoherCp                 = 0x00200000,  ///< HW Command Processor (CP) encompassing the front-end command
                                           ///  processing of any queue, including SDMA.
    CoherAllUsages          = 0x003FFFFF,  ///< Mask of every bit position up to and including CoherCp.
#endif
    CoherShader             = CoherShaderRead  | CoherShaderWrite,  ///< Shader read or write.
    CoherCopy               = CoherCopySrc     | CoherCopyDst,      ///< Copy source or destination.
    CoherResolve            = CoherResolveSrc  | CoherResolveDst,   ///< Resolve source or destination.
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearColorImage().
enum ClearColorImageFlags : uint32
{
    ColorClearAutoSync   = 0x00000001,  ///< PAL will automatically insert required barrier synchronization before
                                        ///  and after the clear assuming all subresources to be cleared are currently
                                        ///  ready for rendering as a color target (as is required by API convention in
                                        ///  DX12).  Allows reduced sync costs in some situations since PAL knows
                                        ///  the details of how the clear will be performed.
    ColorClearForceSlow  = 0x00000002,  ///< Force these to use slow clears.
    ColorClearSkipIfSlow = 0x00000004,  ///< Only issue the clear if it is a fast clear.
    ColorClearAllFlags   = 0x00000007   ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdClearDepthStencil().
enum ClearDepthStencilFlags : uint32
{
    DsClearAutoSync = 0x00000001,  ///< PAL will automatically insert required barrier synchronization before
                                   ///  and after the clear assuming all subresources to be cleared are currently
                                   ///  ready for rendering as a depth/stencil target (as is required by API convention
                                   ///  in DX12).  Allows reduced sync costs in some situations since PAL knows the
                                   ///  details of how the clear will be performed.
    DsClearAllFlags = 0x00000001   ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Bitmask values for the flags parameter of ICmdBuffer::CmdResolveImage().
enum ResolveImageFlags : uint32
{
    ImageResolveInvertY   = 0x00000001,  ///< PAL will invert the y-axis (flip upside down) of the resolved region to
                                         ///  the destination image.
    ImageResolveDstAsSrgb = 0x00000002,  ///< If set, a non-srgb destination image will be treated as srgb format.
                                         ///  The flag cannot be set when @ref ImageResolveDstAsNorm is set.
    ImageResolveDstAsNorm = 0x00000004,  ///< If set, a srgb destination image will be treated as non-srgb format.
                                         ///  The flag cannot be set when @ref ImageResolveDstAsSrgb is set.
    ImageResolveSrcAsNorm = 0x00000008,  ///< If set, a srgb source image will be treated as non-srgb format.
    ImageResolveAllFlags  = 0x0000000F   ///< Clients should NOT use it, for internal static_assert purpose only.
};

/// Specifies properties for creation of an ICmdBuffer object.  Input structure to IDevice::CreateCmdBuffer().
struct CmdBufferCreateInfo
{
    ICmdAllocator* pCmdAllocator;  ///< The command buffer will use this command allocator to allocate all GPU memory.
                                   ///  If the client specifies a null pCmdAllocator, it must call ICmdBuffer::Reset
                                   ///  with a non-null pCmdAllocator before calling ICmdBuffer::Begin.
    QueueType      queueType;      ///< Type of queue commands in this command buffer will target.
                                   ///  This defines the set of allowed actions in the command buffer.
    QueuePriority  queuePriority;  ///< Priority level of the queue this command buffer will target.
    EngineType     engineType;     ///< Type of engine the queue commands will run on.

    union
    {
        struct
        {
            /// Indicates that this command buffer will be a "nested" command buffer, instead of a normal, "root"
            /// command buffer.  Nested command buffers differ from root command buffers in how they are sent to the
            /// GPU for execution: root command buffers must be submitted to the hardware by calling
            /// @ref IQueue::Submit, whereas nested command buffers can only be submitted by being executed by a root
            /// command buffer.
            ///
            /// Currently, only Universal and Compute command buffers can be nested.  Nesting DMA command buffers is
            /// meaningless and unsupported.  It is an error to attempt to create a nested DMA command buffer.
            ///
            /// @see ICmdBuffer::CmdExecuteNestedCmdBuffers.
            uint32 nested               :  1;

            /// Dedicated CUs are reserved for this queue.  Thus we have to skip CU mask programming.
            uint32 realtimeComputeUnits :  1;

            /// Target queue uses dispatch tunneling.
            uint32 dispatchTunneling    :  1;

            /// Indicates that each subsequent Dispatch command is desired to be executed in alternating
            /// order of forward and reverse workgroup walk order.  This can improve cache locality when
            /// subsequent Dispatches consume data from the previous Dispatch and the overall footprint
            /// does not fit in cache.
            /// This is a best effort as not all implementations or Queues may support this.
            uint32 dispatchPingPongWalk :  1;

            /// Reserved for future use.
            uint32 reserved             : 28;
        };

        /// Flags packed as 32-bit uint.
        uint32 u32All;
    } flags;  ///< Command buffer creation flags.
};

/// Specifies which states will not be bound in a nested command buffer, and instead must be inherited from the calling
/// root-level command buffer.
union InheritedStateFlags { struct { /// Color and depth target views are inherited from the root-level command buffer. The nested command buffer /// should not modify this state. uint32 targetViewState : 1; /// Occlusion query is inherited from the root-level command buffer. The nested command buffer /// should not modify this state. uint32 occlusionQuery : 1; /// Predication is inherited from the root-level command buffer. The nested command buffer should not modify /// this state. uint32 predication : 1; /// Reserved for future usage. uint32 reserved : 29; }; /// Flags packed as 32-bit uint. uint32 u32All; }; /// Specifies parameters inherited from primary command buffer into nested command buffer. struct InheritedStateParams { #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 891 uint32 colorTargetCount; ///< Number of color targets bound in the /// root-level command buffer. SwizzledFormat colorTargetSwizzledFormats[MaxColorTargets]; ///< Format and swizzle for each color /// target. uint32 sampleCount[MaxColorTargets]; ///< Sample count for each color target. #endif InheritedStateFlags stateFlags; ///< States that are inherited from the /// calling root-level command buffer. }; /// Specifies optional hints to control command buffer building optimizations. union CmdBufferBuildFlags { struct { /// Optimize command buffer building for large sets of draw or dispatch operations that are GPU front-end /// limited. These optimizations include removing redundant PM4 commands and reducing the VGT prim group size. /// This flag might increase the CPU overhead of building command buffers. uint32 optimizeGpuSmallBatch : 1; /// Optimize command buffer building for exclusive command buffer submission. Command buffers built with this /// flag cannot be submitted if they have already been submitted previously unless the caller guarantees that /// they are no longer in use. This flag allows PAL to modify the contents of command buffers during /// submission. 
uint32 optimizeExclusiveSubmit : 1; /// Optimize command buffer building for single command buffer submission. Command buffers built with this flag /// cannot be submitted more than once. This flag allows PAL to modify the contents of command buffers during /// submission. This flag is a stricter version of optimizeExclusiveSubmit, it is not necessary to set /// optimizeExclusiveSubmit if this flag is set. uint32 optimizeOneTimeSubmit : 1; /// Indicates that the client is providing custom tessellation distribution settings. If set, it is the clients /// responsibility to ensure all 5 (isoline, triangle, quad, donut, trapezoid) factors are provided. uint32 optimizeTessDistributionFactors : 1; /// Attempt to prefetch shader code into cache before launching draws or dispatches with a freshly bound /// pipeline object. This optimization might increase the CPU overhead of building command buffers and/or /// introduce additional front-end GPU bottlenecks. uint32 prefetchShaders : 1; /// Attempt to prefetch the command buffer into cache to avoid bottlenecking the GPU front-end. /// This optimization might slightly increase the overhead of some GPU copies and other front-end reads/writes. uint32 prefetchCommands : 1; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 /// Indicates the command buffer will use one or more constant engine commands: CmdLoadCeRam(), CmdDumpCeRam(), /// or CmdWriteCeRam() uint32 usesCeRamCmds : 1; #else uint32 placeholder914 : 1; #endif /// Indicates that the client would prefer that this nested command buffer not be launched using an IB2 packet. /// The calling command buffer will either inline this command buffer into itself or use IB chaining based on if /// the optimizeExclusiveSubmit flag is also set. This flag is ignored for root command buffers. uint32 disallowNestedLaunchViaIb2 : 1; /// placeholder uint32 placeholder1 : 2; /// Enable TMZ mode to allow reading TMZ protected allocations. 
If this command buffer attempts to write /// non-TMZ memory, the results are undefined. Only valid for graphics and compute. uint32 enableTmz : 1; uint32 placeholder3 : 1; /// If set, internal operations such as blits, copies, etc. will not affect active Query results. /// Otherwise they may affect the results. uint32 disableQueryInternalOps : 1; uint32 optimizeContextStatesPerBin : 1; uint32 optimizePersistentStatesPerBin : 1; /// Reserved for future use. uint32 reserved : 16; }; /// Flags packed as 32-bit uint. uint32 u32All; }; /// Specifies tessellation accum factors. union TessDistributionFactors { struct { /// The following 3 factors are used by hardware when distributed tessellation is active: the min tess factors for /// each patch processed by a VGT are accumulated. When the sum exceeds this threshold, the next patch is sent to a /// different VGT. uint32 isoDistributionFactor : 8; uint32 triDistributionFactor : 8; ///< Recommended to be higher than quad factor. uint32 quadDistributionFactor : 8; /// Used by the hardware when distributed tessellation is in DONUT mode: the min tess factor for each patch is /// tested against this threshold to determine whether a patch gets split up. If the patch isn't split, it still /// increments the accumulator for the Patch distribution factor. uint32 donutDistributionFactor : 5; /// Used when the distribution mode is TRAPEZOID for quad and tri domain types. The number of donuts in the patch /// are compared against this value to detemine whether this donut gets split up into trapezoids (needs the patch to /// be in donut mode). A value of 0 or 1 will be treated as 2. The innermost donut is never allowed to be broken /// into trapezoids. uint32 trapDistributionFactor : 3; }; /// Values packed as 32-bit uint. uint32 u32All; }; /// Specifies options that direct command buffer building. struct CmdBufferBuildInfo { /// Command buffer build flags, specifies optional hints to control command buffer build optimizations. 
CmdBufferBuildFlags flags; /// Command buffer inherited state and params. If non-null, related state is assumed set in root-level and nested /// command buffer should not modify the software states. Any software params that may be needed within nested /// command buffer needs to be provided here. const InheritedStateParams* pInheritedState; /// If non-null, the command buffer will begin with all states set as they are in this previously built command /// buffer. Any state specified in pInheritedState is excluded if it is also provided. const ICmdBuffer* pStateInheritCmdBuffer; /// Optional allocator for PAL to use when allocating temporary memory during command buffer building. PAL will /// stop using this allocator once command building ends. If no allocator is provided PAL will use an internally /// managed allocator instead which may be less efficient. PAL will use this allocator in two ways: /// + Temporary storage within a single command building call. PAL will rewind the allocator before returning to /// free all memory allocated within the call. /// + Temporary storage for the entire command building period. When Begin() is called, PAL will save the current /// position of the allocator and rewind the allocator to that point when End() is called. If the client also /// wishes to allocate temporary storage that lasts between command building function calls they must allocate it /// before calling Begin() or PAL will accidentally free it. Util::VirtualLinearAllocator* pMemAllocator; /// Optional tessellation distribution factors that will overwrite PAL set defaults. Clients must also set the /// optimizeTessDistributionFactors flag for these custom factors to take effect. /// Nested command buffers inherit this value from the primary. TessDistributionFactors clientTessDistributionFactors; /// Number of context states per PBB bin. /// Client must also set @ref CmdBufferBuildFlags::optimizeContextStatesPerBin for this to take effect. 
uint8 contextStatesPerBin; /// Number of persistent states per PBB bin. /// Client must also set @ref CmdBufferBuildFlags::optimizePersistentStatesPerBin for this to take effect. uint8 persistentStatesPerBin; /// Client/app data handle. This can have an arbitrary value and is used to uniquely identify this command buffer. uint64 execMarkerClientHandle; }; /// Specifies info on how a compute shader should use resources. struct DynamicComputeShaderInfo { float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. This can be used to selectively /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] /// and specify the maximum number of waves per compute unit. If the hardware has one wave /// limit control for multiple shader stages PAL will select the most strict limit. /// This option is converted internally to set set HW WavesPerSh setting and the non-integer /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for /// example specify less number of waves than number of CUs per shader array. uint32 maxThreadGroupsPerCu; ///< Override the maximum number of threadgroups that a particular CS can run on, /// throttling it, to enable more graphics work to complete. 0 disables the limit. uint32 tgScheduleCountPerCu; ///< Override the number of threadgroups to schedule on a single compute unit before /// moving to the next compute unit. 0 selects optimal default. uint32 ldsBytesPerTg; ///< Override the amount of LDS space used per thread-group for this pipeline, in bytes. /// Zero indicates that the LDS size determined at pipeline-compilation time will be used. }; /// Specifies info on how a graphics shader should use resources. struct DynamicGraphicsShaderInfo { float maxWavesPerCu; ///< Limits the number of waves in flight per compute unit. 
This can be used to selectively /// throttle certain workloads that bottleneck multiqueue applications. For ease of use, a /// value of zero means no limit is set. The remaining valid values are in the range (0, 40] /// and specify the maximum number of waves per compute unit. If the hardware has one wave /// limit control for multiple shader stages PAL will select the most strict limit. /// This option is converted internally to set HW WavesPerSh setting and the non-integer /// maxWavesPerCu value provides more flexibility to allow arbitrary WavesPerSh value; for /// example specify less number of waves than number of CUs per shader array. }; /// Specifies dynamic states of a graphics pipeline struct DynamicGraphicsState { uint32 colorWriteMask; ///< Color target write mask. 4b / RT (8 count) struct { uint32 switchWinding : 1; ///< Whether to reverse vertex ordering for tessellation. uint32 depthClipNearEnable : 1; ///< Enable clipping based on Near Z coordinate. uint32 depthClipFarEnable : 1; ///< Enable clipping based on Far Z coordinate. uint32 alphaToCoverageEnable : 1; ///< Enable alpha to coverage. uint32 perpLineEndCapsEnable : 1; ///< Forces the use of perpendicular line end caps as opposed to /// axis-aligned line end caps during line rasterization. uint32 rasterizerDiscardEnable : 1; ///< Whether to kill all rasterized pixels. uint32 dualSourceBlendEnable : 1; ///< Enable dual source blend uint32 vertexBufferCount : 6; ///< Number vertex buffer slots accessed by this pipeline LogicOp logicOp : 4; ///< Logic operation to perform. DepthRange depthRange : 1; ///< Specifies Z dimensions of screen space (i.e., post viewport /// transform: 0 to 1 or -1 to 1). DepthClampMode depthClampMode : 2; ///< Depth clamping behavior. uint32 reserved1 : 7; ///< Reserved uint32 reserved : 5; ///< Reserved for future use. }; union { struct { uint32 depthClampMode : 1; ///< Whether to enable dynamic state depthClampMode. 
uint32 depthRange : 1; ///< Whether to enable dynamic state depthRange. uint32 logicOp : 1; ///< Whether to enable dynamic state logicOp. uint32 colorWriteMask : 1; ///< Whether to enable dynamic state colorWriteMask. uint32 switchWinding : 1; ///< Whether to enable dynamic state switchWinding. uint32 depthClipMode : 1; ///< Whether to enable dynamic state depthClipNear/FarEnable. uint32 alphaToCoverageEnable : 1; ///< Whether to enable dynamic state alphaToCoverageEnable. uint32 perpLineEndCapsEnable : 1; ///< Whether to enable dynamic state perpLineEndCapsEnable. uint32 rasterizerDiscardEnable : 1; ///< Whether to enable dynamic state rasterizerDiscardEnable. uint32 dualSourceBlendEnable : 1; ///< Whether to enable dynamic state dualSourceBlendEnable uint32 vertexBufferCount : 1; ///< Whether to enable dynamic state vertexBufferCount. uint32 reserved1 : 1; ///< Reserved. uint32 reserved : 20; ///< Reserved for future use. }; uint32 u32All; } enable; }; /// Specifies info on how graphics shaders should use resources. struct DynamicGraphicsShaderInfos { union { // VS/HS/DS/GS or TS/MS are active struct { DynamicGraphicsShaderInfo vs; ///< Dynamic Vertex shader information. DynamicGraphicsShaderInfo hs; ///< Dynamic Hull shader information. DynamicGraphicsShaderInfo ds; ///< Dynamic Domain shader information. DynamicGraphicsShaderInfo gs; ///< Dynamic Geometry shader information. }; struct { DynamicGraphicsShaderInfo ts; ///< Dynamic Task shader information. DynamicGraphicsShaderInfo ms; ///< Dynamic Mesh shader information. }; }; DynamicGraphicsShaderInfo ps; ///< Dynamic Pixel shader information. union { struct { uint8 vs : 1; // If set, there is dynamic VS shader info. uint8 hs : 1; // If set, there is dynamic HS shader info. uint8 ds : 1; // If set, there is dynamic DS shader info. uint8 gs : 1; // If set, there is dynamic GS shader info. uint8 ps : 1; // If set, there is dynamic PS shader info. uint8 ts : 1; // If set, there is dynamic TS shader info. 
uint8 ms : 1; ///< If set, there is dynamic MS shader info. uint8 reserved : 1; ///< Reserved. }; uint8 u8All; ///< Flags packed as an 8-bit uint. } enable; ///< Indicates which of the per-stage members above carry dynamic shader info. }; /// Specifies parameters for binding a pipeline. /// @see ICmdBuffer::CmdBindPipeline struct PipelineBindParams { PipelineBindPoint pipelineBindPoint; ///< Specifies which type of pipeline is to be bound (compute or graphics). const IPipeline* pPipeline; ///< New pipeline to be bound. Can be null in order to unbind a previously /// bound pipeline without binding a new one. uint64 apiPsoHash; ///< 64-bit identifier provided by client driver based on the Pipeline State /// Object. There exists a many-to-one correlation for ApiPsoHash to /// internalPipelineHash to map the two. union { DynamicComputeShaderInfo cs; ///< Dynamic Compute shader information. struct { DynamicGraphicsShaderInfos gfxShaderInfo; ///< Dynamic graphics shader information. DynamicGraphicsState gfxDynState; ///< Dynamic graphics pipeline state. }; }; }; /// Specifies per-MRT color target view and current image state. Used as input to ICmdBuffer::CmdBindTargets(). struct ColorTargetBindInfo { const IColorTargetView* pColorTargetView; ///< Color target view to bind. ImageLayout imageLayout; ///< Specifies the current image layout based on bitmasks of currently /// allowed operations and engines that may perform those operations. /// At minimum, the LayoutColorTarget usage flag and /// LayoutUniversalEngine engine flag must be set. }; /// Specifies depth/stencil view and current image state of the depth and stencil planes. Used as input to /// ICmdBuffer::CmdBindTargets(). struct DepthStencilBindInfo { const IDepthStencilView* pDepthStencilView; ///< Depth/stencil target view to bind. ImageLayout depthLayout; ///< Specifies the current image layout of the depth plane based on /// bitmasks of currently allowed operations and engines that may /// perform those operations. At minimum, the /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine /// engine flag must be set. Ignored if the specified view does not /// have a depth plane.
ImageLayout stencilLayout; ///< Specifies the current image layout of the stencil plane based on /// bitmasks of currently allowed operations and engines that may /// perform those operations. At minimum, the /// LayoutDepthStencilTarget usage flag and LayoutUniversalEngine /// engine flag must be set. Ignored if the specified view does not /// have a stencil plane. }; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 /// Represents a GPU memory or image transition as part of a barrier. /// /// A single transition will ensure cache coherency of dirty data in the specific set of source caches with the /// specified set of destination caches. The source and destination designation is relative to the barrier itself /// and does not indicate whether a particular cache is a read or write cache. /// /// Typically a transition flushes written data from the source caches into the destination caches and thus the source /// cache mask typically only contains write caches. However, the client is encouraged to include flags for any prior /// read-only cache accesses as PAL may be able to optimize its cache operations. /// /// If both cache masks are zero the client is indicating that no cache coherency operations are required but PAL /// may still issue cache operations for internal reasons. /// /// In addition, the client can change an image's layout usage/engine flags which may result in a metadata blt. /// /// @note There is no range provided to control the range of addresses that will be flushed/invalidated in GPU caches. struct BarrierTransition { uint32 srcCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing previous write operations whose /// results need to be visible for subsequent operations. Flags for prior read operations /// may be included as well and may be used for internal optimizations.
uint32 dstCacheMask; ///< Bitmask of @ref CacheCoherencyUsageFlags describing the operations expected to read /// and/or write data flushed from the caches indicated by the srcCacheMask. struct { const IImage* pImage; ///< If non-null, indicates this transition only applies to the specified image. /// The remaining members of this structure are ignored if this member is null. SubresRange subresRange; ///< Subset of pImage this transition applies to. If newLayout includes @ref /// LayoutUninitializedTarget this range must cover all subresources of pImage /// unless the perSubresInit image create flag was specified. ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and /// engines up to this point. These masks imply the previous compression state. No /// usage flags should ever be set in oldLayout.usages that correspond to usages /// that are not supported by the engine that is performing the transition. The /// queue type performing the transition must be set in oldLayout.engines. ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and /// engines after this point. These masks imply the upcoming compression state. A /// difference between the oldLayout and newLayout usages may result in a /// decompression. /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a /// grid where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum /// valid position (not quite to the bottom/right border of the pixel). /// Can be left null for non-MSAA images or when a valid MsaaQuadSamplePattern is bound prior to the /// CmdBarrier call. const MsaaQuadSamplePattern* pQuadSamplePattern; } imageInfo; ///< Image-specific transition information.
}; /// Describes a barrier as inserted by a call to ICmdBuffer::CmdBarrier(). /// /// A barrier can be used to 1) stall GPU execution at a specified point to resolve a data hazard, 2) flush/invalidate /// GPU caches to ensure data coherency, and/or 3) compress/decompress image resources as necessary when changing how /// the GPU will use the image. /// /// This structure directly specifies how #1 is performed. #2 and #3 are managed by the list of @ref BarrierTransition /// structures passed in pTransitions. struct BarrierInfo { /// Determines at what point the GPU should stall until all specified waits and transitions have completed. If the /// specified wait point is unavailable, PAL will wait at the closest available earlier point. HwPipePoint waitPoint; uint32 pipePointWaitCount; ///< Number of entries in pPipePoints. const HwPipePoint* pPipePoints; ///< The barrier will stall until the hardware pipeline has cleared /// up to each point specified in this array. One entry in this /// array is typically enough, but CS and GFX operate in parallel /// at certain stages. uint32 gpuEventWaitCount; ///< Number of entries in ppGpuEvents. const IGpuEvent** ppGpuEvents; ///< The barrier will stall until each GPU event in this array is /// in the set state. uint32 rangeCheckedTargetWaitCount; ///< Number of entries in ppTargets. const IImage** ppTargets; ///< The barrier will stall until all previous rendering with any /// color or depth/stencil image in this list bound as a target /// has completed. If any entry is nullptr, the barrier performs /// a full range sync. uint32 transitionCount; ///< Number of entries in pTransitions. const BarrierTransition* pTransitions; ///< List of image/memory transitions to process. See /// @ref BarrierTransition. The same subresource should never /// be specified more than once in the list of transitions. /// PAL assumes that all specified subresources are unique.
uint32 globalSrcCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined /// (bitwise logical union) with the @ref srcCacheMask field belonging to every /// element in @ref pTransitions. If this is zero or if there are no transitions, /// then no global cache flags are applied to any transition. uint32 globalDstCacheMask; ///< This is a global bitmask of @ref CacheCoherencyUsageFlags which is combined /// (bitwise logical union) with the @ref dstCacheMask field belonging to every /// element in @ref pTransitions. If this is zero or if there are no transitions, /// then no global cache flags are applied to any transition. uint32 reason; ///< The reason that the barrier was invoked. }; #endif /// Specifies execution dependencies, *availability* and/or *visibility* operations on a section of an IGpuMemory /// object that does not contain valid IImage data. PAL may assume image data is not present and skip certain /// cache operations. /// /// PAL specifies these execution dependencies using pairs of synchronization scope bitmasks of /// @ref PipelineStageFlag values. The barrier's execution dependencies are only applied to state in this barrier. /// Memory coherency operations or layout transitions in other barriers will ignore this barrier's execution /// dependencies. /// /// PAL specifies the availability and visibility operations using pairs of access scope bitmasks of /// @ref CacheCoherencyUsageFlags values. /// The source mask (named srcAccessMask or srcGlobalAccessMask) describes which prior write operations should be made /// available (i.e., written back from local caches to the LLC). The destination mask (named dstAccessMask or /// dstGlobalAccessMask) describes which upcoming read/write operations need visibility (i.e., invalidate /// corresponding local caches above the LLC). These masks may be zero if no cache operations are needed.
/// /// In general, PAL executes the availability and visibility operations in isolation because the CmdRelease functions /// require that the destination masks be zero and the CmdAcquire functions require that the source masks be zero. /// In essence, CmdRelease implements the availability operations and CmdAcquire implements the visibility operations. /// However, CmdReleaseThenAcquire sees both masks and thus can optimize its cache operations. /// /// To facilitate cache optimizations, the client is encouraged to add flags corresponding to prior read operations /// in the relevant source mask(s). Unlike the usual write operation flags, these read flags are entirely optional /// and do not impact correctness; if they are omitted PAL will simply issue the full set of cache operations. /// If they are provided PAL may detect cases where future read operations use the same caches as the prior read /// operations and thus can skip the usual visibility operations. /// /// Note that, /// 1. If the client does provide read operation flags in a source mask they *must* guarantee that the same flags /// were provided to a prior barrier's destination mask(s). Incorrect behavior may occur otherwise. /// 2. One @ref MemBarrier or @ref ImgBarrier object can only be applied to a single resource, otherwise PAL's internal /// optimization may be incorrect. Don't OR multiple resource transitions' stage or access mask into one /// @ref MemBarrier or @ref ImgBarrier when making a PAL barrier call. However, you are allowed to OR multiple resource /// transitions' stage or access mask into the global transition mask. /// /// This struct is used by @ref AcquireReleaseInfo. struct MemBarrier { #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 union { struct { uint32 globallyAvailable : 1; ///< Normally, data made available is in the GPU LLC. When this bit is /// set, available means in memory, available to all clients in the /// system.
This is useful for rare cases like mid command buffer /// synchronization with the CPU or another external device. uint32 reserved : 31; ///< Reserved for future use. }; uint32 u32All; ///< Flags packed as a 32-bit uint. } flags; ///< Flags controlling the memory barrier. #endif #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 GpuMemSubAllocInfo memory; ///< Specifies a portion of an IGpuMemory object this memory barrier affects. /// Zero values in this structure indicate a full range barrier operation. #endif uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization /// scope that must be confirmed complete as part of a release. Must be /// 0 when passed in to CmdAcquire or CmdAcquireEvent. uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization /// scope of operations to be performed after the acquire. Must be /// 0 when passed in to CmdRelease or CmdReleaseEvent. uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the /// availability operation, as defined in the struct comment header. /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the /// visibility operation, as defined in the struct comment header. /// This mask must be 0 when passed to CmdRelease or CmdReleaseEvent. }; /// Specifies the required layout transition, execution dependencies, *availability*, and/or *visibility* operations /// on a subresource of an IImage object. /// /// See the header comment on @ref MemBarrier for a full description of the execution dependencies, availability and /// visibility operations, including what rules the clients must follow when filling out srcAccessMask and /// dstAccessMask. /// /// This struct is used by @ref AcquireReleaseInfo. struct ImgBarrier { const IImage* pImage; ///< Relevant image resource for this barrier.
SubresRange subresRange; ///< Selects a range of planes/slices/mips the barrier affects. If newLayout /// includes @ref LayoutUninitializedTarget this range must cover all subresources of /// pImage unless the perSubresInit image create flag was specified. #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 880 Box box; ///< Restricts the barrier to a sub-section of each subresource. The Z offset/extent /// must be 0 for 1D/2D images, and the Y offset/extent must be 0 for 1D images. A /// box with zero extents will be ignored, and the barrier will affect the entire /// subresource range. This box may be used to restrict ranges of cache flushes or /// invalidations, or may restrict what data is decompressed. However, the /// implementation may not be able to optimize particular cases and may expand the /// barrier to cover the entire subresource range. Specifying a subregion with a box /// when newLayout includes @ref LayoutUninitializedTarget is not supported. #endif uint32 srcStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization /// scope that must be confirmed complete as part of a release. Must be /// 0 when passed in to CmdAcquire or CmdAcquireEvent. uint32 dstStageMask; ///< Bitmask of PipelineStageFlag values defining the synchronization /// scope of operations to be performed after the acquire. Must be /// 0 when passed in to CmdRelease or CmdReleaseEvent. uint32 srcAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the /// availability operation, as defined in the @ref MemBarrier comment header. /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. uint32 dstAccessMask; ///< CacheCoherencyUsageFlags mask which defines the access scope for the /// visibility operation, as defined in the @ref MemBarrier comment header. /// This mask must be 0 when passed to CmdRelease or CmdReleaseEvent.
ImageLayout oldLayout; ///< Specifies the current image layout based on bitmasks of allowed operations and /// engines up to this point. These masks imply the previous compression state. No /// usage flags should ever be set in oldLayout.usages that correspond to usages /// that are not supported by the engine that is performing the transition. The /// engine type performing the transition must be set in oldLayout.engines. Can set /// both oldLayout and newLayout to zero value for no layout transition case. ImageLayout newLayout; ///< Specifies the upcoming image layout based on bitmasks of allowed operations and /// engines after this point. These masks imply the upcoming compression state. A /// difference between the oldLayout and newLayout usages may result in a /// decompression. PAL's implementation will ensure the results of any layout /// operations are consistent with the requested availability and visibility /// operations. Can set both oldLayout and newLayout to zero value for no layout /// transition case. /// Specifies a custom sample pattern over a 2x2 pixel quad. The position for each sample is specified on a grid /// where the pixel center is <0,0>, the top left corner of the pixel is <-8,-8>, and <7,7> is the maximum valid /// position (not quite to the bottom/right border of the pixel). Can be left null for non-MSAA images or when a /// valid IMsaaState is bound prior to the barrier call. const MsaaQuadSamplePattern* pQuadSamplePattern; }; /// Input structure to CmdRelease(), CmdReleaseEvent(), CmdAcquire(), CmdAcquireEvent(), and CmdReleaseThenAcquire(). /// It describes the execution dependencies, memory dependencies, and image layout transitions that must be resolved. /// /// Global transition doesn't have buffer or image info so it will assume the worst case and the barrier operations may /// not be optimal (e.g.
metadata may be misaligned and need issue LLC flush/invalidation). It's suggested that if /// clients know the buffer or image info, try setting up the barrier call with the full buffer or image transition /// info (including stageMask and accessMask) instead of global transition for optimal performance. /// /// Clients may OR multiple MemBarrier into a single MemBarrier on full range barrier cases for simplicity and to save /// CPU overhead. To allow more optimization chances (e.g. skip unnecessary stalls for read only transitions) in PAL, /// it's suggested to split the single grouped MemBarrier into two separate grouped MemBarriers: one is a read only /// MemBarrier and the other is a writeable MemBarrier; both are then passed together to the barrier call. struct AcquireReleaseInfo { uint32 srcGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global /// synchronization scope that must be confirmed complete as part of a /// release. Must be 0 when passed in to CmdAcquire or CmdAcquireEvent. uint32 dstGlobalStageMask; ///< Bitmask of PipelineStageFlag values defining the global /// synchronization scope of operations to be performed after the /// acquire. Must be 0 when passed in to CmdRelease or CmdReleaseEvent. uint32 srcGlobalAccessMask; ///< *Access scope* for the global availability operation. Serves the /// same purpose as srcAccessMask in @ref MemBarrier, but will cause /// all relevant caches to be flushed without range checking. /// This mask must be 0 when passed to CmdAcquire or CmdAcquireEvent. uint32 dstGlobalAccessMask; ///< *Access scope* for the global visibility operation. Serves the /// same purpose as dstAccessMask in @ref MemBarrier, but will cause /// all relevant caches to be invalidated without range checking. /// This mask must be 0 when passed to CmdRelease or CmdReleaseEvent. uint32 memoryBarrierCount; ///< Number of entries in pMemoryBarriers.
const MemBarrier* pMemoryBarriers; ///< Describes memory dependencies specific to a range of a particular /// IGpuMemory object. uint32 imageBarrierCount; ///< Number of entries in pImageBarriers. const ImgBarrier* pImageBarriers; ///< Describes memory dependencies and image layout transitions required /// for a subresource range of a particular IImage object. uint32 reason; ///< The reason that the barrier was invoked. /// See @ref Developer::BarrierReason for internal reason codes, though /// clients may define their own as well. }; /// Specifies barrier type, global (potentially mixed cases of buffer and image), buffer or image. enum class BarrierType : uint32 { Global, Buffer, Image }; /// Number of all HW opaque release token types. constexpr uint32 NumReleaseTokenTypes = 4; /// Synchronization token structure for CmdRelease() and CmdAcquire(). /// /// Clients should pass the ReleaseToken returned by CmdRelease() to CmdAcquire() directly without changing the value. /// If a resource with given subresource range has multiple ReleaseToken, all related ReleaseToken should be passed to /// CmdAcquire(). /// /// Passing ReleaseToken { .fenceValue = N; .type = T } into CmdAcquire() will wait for all prior releases with /// .fenceValue <= N for .type == T. Resource with a large number of subresources may introduce lots of ReleaseToken /// potentially (e.g. released per subresource). No need to track all ReleaseToken for each resource since clients /// can optimize this based on the fact that release type and fenceValue are exposed for each ReleaseToken: define a /// ReleaseToken array with size @ref NumReleaseTokenTypes, only track ReleaseToken with the largest fenceValue per /// each release type; and then passing the tracked array ReleaseToken values to CmdAcquire() is enough. union ReleaseToken { struct { uint32 fenceValue : 24; ///< Release fence value per token type. uint32 type : 8; ///< Release token type (HW opaque).
Note that please increase the number of bits if /// it can't hold all types, see @ref NumReleaseTokenTypes for details. }; uint32 u32All; ///< Token fields packed as a 32-bit uint. }; /// Specifies parameters for a copy from one range of a source GPU memory allocation to a range of the same size in a /// destination GPU memory allocation. Used as an input to ICmdBuffer::CmdCopyMemory(). struct MemoryCopyRegion { gpusize srcOffset; ///< Offset in bytes into the source GPU memory allocation to copy data from. gpusize dstOffset; ///< Offset in bytes into the destination GPU memory allocation to copy data to. gpusize copySize; ///< Amount of data to copy in bytes. }; /// Specifies parameters for an image copy from one region in a source image subresource to a region of the same size in /// a destination image subresource. Used as input to ICmdBuffer::CmdCopyImage(). /// If the region describes a copy between a 2D and a 3D image, extent.depth and numSlices must be equal and may be /// larger than 1. struct ImageCopyRegion { SubresId srcSubres; ///< Selects the source subresource. Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. SubresId dstSubres; ///< Selects the destination subresource. Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination /// subresource. Extent3d extent; ///< Size of the copy region in pixels. uint32 numSlices; ///< Number of slices the copy will span. }; /// Specifies parameters for a copy between an image and a GPU memory allocation. The same structure is used regardless /// of direction; it is an input for both ICmdBuffer::CmdCopyImageToMemory() and ICmdBuffer::CmdCopyMemoryToImage(). struct MemoryImageCopyRegion { SubresId imageSubres; ///< Selects the image subresource. Offset3d imageOffset; ///< Pixel offset to the start of the chosen subresource region. Extent3d imageExtent; ///< Size of the image region in pixels. uint32 numSlices; ///< Number of slices the copy will span.
gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. }; /// Specifies parameters for a copy between a PRT and a GPU memory allocation. The same structure is used regardless /// of direction; it is an input for both ICmdBuffer::CmdCopyTiledImageToMemory() and /// ICmdBuffer::CmdCopyMemoryToTiledImage(). struct MemoryTiledImageCopyRegion { SubresId imageSubres; ///< Selects the image subresource; must not be a part of the packed mip tail. Offset3d imageOffset; ///< Tile offset to the start of the chosen subresource region. Extent3d imageExtent; ///< Size of the image region in tiles. uint32 numSlices; ///< Number of slices the copy will span. gpusize gpuMemoryOffset; ///< Offset in bytes to the start of the copy region in the GPU memory allocation. gpusize gpuMemoryRowPitch; ///< Offset in bytes between the same X position on two consecutive lines. gpusize gpuMemoryDepthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. }; /// Used by copy operations to temporarily interpret a range of GPU memory as a "typed buffer". A typed buffer is /// essentially a linear image with a caller-defined row pitch and depth pitch. Typed buffer copies do not require /// the GPU memory objects to be created with the "typedBuffer" flag. struct TypedBufferInfo { SwizzledFormat swizzledFormat; ///< The pixels in this buffer have this format. gpusize offset; ///< Offset in bytes to the start of the copy region in the buffer's GPU memory /// allocation. gpusize rowPitch; ///< Offset in bytes between the same X position on two consecutive lines.
gpusize depthPitch; ///< Offset in bytes between the same X,Y position of two consecutive slices. }; /// Specifies parameters for a copy from one region of a typed buffer to a region of the same size in a destination /// typed buffer. Used as an input to ICmdBuffer::CmdCopyTypedBuffer(). struct TypedBufferCopyRegion { TypedBufferInfo srcBuffer; ///< How to interpret the source GPU memory allocation as a typed buffer. TypedBufferInfo dstBuffer; ///< How to interpret the destination GPU memory allocation as a typed buffer. Extent3d extent; ///< Size of the copy region in pixels. }; /// Specifies parameters for a scaled copy between an image and a typed buffer. The same structure is used regardless /// of direction; it is an input for ICmdBuffer::CmdScaledCopyTypedBufferToImage(). struct TypedBufferImageScaledCopyRegion { SubresId imageSubres; ///< Selects the image subresource. Offset2d imageOffset; ///< Pixel offset to the start of the chosen subresource region. Extent2d imageExtent; ///< Size of the image region in pixels. TypedBufferInfo bufferInfo; ///< How to interpret the GPU memory allocation as a typed buffer. Extent2d bufferExtent; ///< Size of the typed buffer region in pixels. SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. }; /// Specifies parameters for a scaled image copy from one region in a source image subresource to a region in the /// destination image subresource. Used as an input to ICmdBuffer::CmdScaledCopyImage(). struct ImageScaledCopyRegion { SubresId srcSubres; ///< Selects the source subresource. #if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 uint32 srcSlices; ///< Number of source image slices to read across. #endif union { Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. Offset3dFloat srcOffsetFloat; ///< Alternative representation in floating point. }; union { SignedExtent3d srcExtent; ///< Signed size of the source region in pixels.
A negative size indicates /// a copy in the reverse direction. Extent3dFloat srcExtentFloat; ///< Alternative representation in floating point. }; SubresId dstSubres; ///< Selects the destination subresource. #if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 887 uint32 dstSlices; ///< Number of destination image slices to write. #endif union { Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. Offset3dFloat dstOffsetFloat; ///< Alternative representation in floating point. }; union { SignedExtent3d dstExtent; ///< Signed size of the destination region in pixels. A negative size /// indicates a copy in the reverse direction. Extent3dFloat dstExtentFloat; ///< Alternative representation in floating point. }; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 887 uint32 numSlices; ///< Number of slices the copy will span. Only declared for interface versions below /// 887; later versions declare srcSlices and dstSlices instead. #endif SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. /// The specified format needs to have been included in the "pViewFormats" list /// specified at image-creation time, otherwise the result might be incorrect. }; /// Specifies parameters for a color-space-conversion copy from one region in a source image subresource to a region in /// a destination image subresource. Used as an input to ICmdBuffer::CmdColorSpaceConversionCopy(). struct ColorSpaceConversionRegion { Offset2d srcOffset; ///< Offset to the start of the chosen region in the source subresource(s). SignedExtent2d srcExtent; ///< Signed size of the source region in pixels. A negative size indicates a copy /// in the reverse direction. Offset2d dstOffset; ///< Offset to the start of the chosen region in the destination subresource(s). SignedExtent2d dstExtent; ///< Signed size of the destination region in pixels. A negative size indicates a /// copy in the reverse direction. SubresId rgbSubres; ///< Selects the first subresource of the RGB image where the copy will begin.
This /// can either be the source or destination of the copy, depending on whether the /// copy is performing an RGB->YUV or YUV->RGB conversion. uint32 yuvStartSlice; ///< Array slice of the YUV image where the copy will begin. All planes of planar /// YUV images will be implicitly involved in the copy. This can either be the /// source or destination of the copy, depending on whether the copy is performing /// an RGB->YUV or YUV->RGB conversion. uint32 sliceCount; ///< Number of slices the copy will span. }; /// Specifies the color-space-conversion table used when converting between YUV and RGB Image formats. Used as an input /// to ICmdBuffer::CmdColorSpaceConversionCopy. struct ColorSpaceConversionTable { float table[3][4]; ///< Values forming the conversion table matrix, which has three rows and four columns. For RGB /// to YUV conversions, the conversion shader uses the following expressions to evaluate the /// YUV color: /// Y = dot( [R G B 1], [row #0] ) /// U = dot( [R G B 1], [row #1] ) /// V = dot( [R G B 1], [row #2] ) /// For YUV to RGB conversions, the conversion shader uses the following expressions to /// evaluate the RGB color: /// R = dot( [Y U V 1], [row #0] ) /// G = dot( [Y U V 1], [row #1] ) /// B = dot( [Y U V 1], [row #2] ) /// A fourth row is not needed because alpha is copied directly between the RGB and YUV colors. }; /// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy /// to perform a YUV to RGB color space conversion. Represents the BT.601 standard (standard-definition TV). extern const ColorSpaceConversionTable DefaultCscTableYuvToRgb; /// Default color-space-conversion table usable by PAL clients when calling ICmdBuffer::CmdColorSpaceConversionCopy /// to perform an RGB to YUV color space conversion. Represents the BT.601 standard (standard-definition TV). extern const ColorSpaceConversionTable DefaultCscTableRgbToYuv; /// Specifies flags controlling GPU copy behavior.
Format related flags are ignored by DMA queues. enum CopyControlFlags : uint32 { CopyFormatConversion = 0x1, ///< Requests that the copy convert between two compatible formats. This is ignored /// unless both formats support @ref FormatFeatureFormatConversion. CopyRawSwizzle = 0x2, ///< If possible, raw copies will swizzle from the source channel format into the /// destination channel format (e.g., RGBA to BGRA). CopyEnableScissorTest = 0x4, ///< If set, do scissor test using the specified scissor rectangle. CopyControlAllFlags = 0x7 ///< Clients should NOT use this; it exists for internal static_assert purposes only. }; /// Specifies parameters for a resolve of one region in an MSAA source image to a region of the same size in a single /// sample destination image. Used as an input to ICmdBuffer::CmdResolveImage(). struct ImageResolveRegion { uint32 srcPlane; ///< The source color, depth, or stencil plane. uint32 srcSlice; ///< Selects the source starting slice. Offset3d srcOffset; ///< Offset to the start of the chosen region in the source subresource. uint32 dstPlane; ///< The destination color, depth, or stencil plane. uint32 dstMipLevel; ///< Selects destination mip level. uint32 dstSlice; ///< Selects the destination starting slice. Offset3d dstOffset; ///< Offset to the start of the chosen region in the destination subresource. Extent3d extent; ///< Size of the resolve region in pixels. uint32 numSlices; ///< Number of slices to be resolved. SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret both subresources using this format and swizzle. /// The format must match both subresources' native formats. const MsaaQuadSamplePattern* pQuadSamplePattern; ///< Specifies sample pattern for MSAA depth image. It must be a /// valid pointer if image was created with sampleLocsAlwaysKnown /// flag set. }; /// A list of the types of PRT+ resolves that can be performed.
enum class PrtPlusResolveType : uint32
{
    Decode = 0x0,  ///< Translate from AMD HW format to format of destination image.
    Encode = 0x1,  ///< Translate from source image to AMD HW format.
    Count  = 0x2,
};

/// Input structure to the CmdResolvePrtPlusImage function.
///
/// The source and destination subresources are selected either by a SubresId (client interface version 938 and newer)
/// or by separate mip-level and slice fields (older interface versions).
struct PrtPlusImageResolveRegion
{
    Offset3d srcOffset;    ///< Offset to the start of the chosen region in the source subresource.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938
    SubresId srcSubresId;  ///< Selects the source subresource.
#else
    uint32   srcMipLevel;  ///< Selects source mip level.
    uint32   srcSlice;     ///< Selects the source starting slice.
#endif

    Offset3d dstOffset;    ///< Offset to the start of the chosen region in the destination subresource.
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 938
    SubresId dstSubresId;  ///< Selects the destination subresource.
#else
    uint32   dstMipLevel;  ///< Selects destination mip level.
    uint32   dstSlice;     ///< Selects the destination starting slice.
#endif

    Extent3d extent;       ///< Size of the resolve region in pixels.
    uint32   numSlices;    ///< Number of slices to be resolved.
};

/// Input structure to ICmdBuffer::CmdResolvePrtPlusImageToBuffer()
struct PrtPlusImageToBufferResolveRegion
{
    SubresId srcSubresId;  ///< Selects the source subresource.
    gpusize  dstOffset;    ///< Offset into destination subresource.
    Extent3d extent;       ///< Size of the resolve region in pixels.
    uint32   numSlices;    ///< Number of slices to be resolved.
};

/// Input structure to ICmdBuffer::CmdResolvePrtPlusBufferToImage()
struct PrtPlusBufferToImageResolveRegion
{
    gpusize  srcOffset;    ///< Offset into source subresource.
    SubresId dstSubresId;  ///< Selects the destination subresource.
    Extent3d extent;       ///< Size of the resolve region in pixels.
    uint32   numSlices;    ///< Number of slices to be resolved.
};

/// Specifies how the individual samples of a multisampled source are combined to produce the resolve result.
enum class ResolveMode : uint32
{
    Average = 0x0,  ///< Resolve result is an average of all the individual samples.
    Minimum = 0x1,  ///< Resolve result is the minimum value of all individual samples.
    Maximum = 0x2,  ///< Resolve result is the maximum value of all individual samples.
    Count   = 0x4,  // NOTE(review): 0x3 is not assigned above, so Count exceeds the number of listed modes -- confirm
                    // whether a retired mode used to occupy 0x3 before treating Count as the number of valid modes.
};

/// Specifies width of immediate data to be written out.
enum class ImmediateDataWidth : uint32
{
    ImmediateData32Bit = 0x0,
    ImmediateData64Bit = 0x1,
    Count              = 0x2,
};

/// Specifies flags controlling GPU query behavior.
union QueryControlFlags
{
    struct
    {
        /// Controls accuracy of query data collection. Available only for occlusion queries. If set, occlusion query
        /// is guaranteed to return imprecise non-zero value if any samples pass the depth and stencil test. Using
        /// imprecise occlusion query results could improve rendering performance while an occlusion query is active.
        uint32 impreciseData : 1;
        uint32 reserved      : 31;  ///< Reserved for future use.
    };
    uint32 u32All;  ///< Flags packed as 32-bit uint.
};

/// Specifies layout of GPU memory used as an input to CmdDrawIndirectMulti.
struct DrawIndirectArgs
{
    uint32 vertexCount;    ///< Number of vertices to draw.
    uint32 instanceCount;  ///< Number of instances to draw.
    uint32 firstVertex;    ///< Starting index value for the draw. Indices passed to the vertex shader will range from
                           ///  firstVertex to firstVertex + vertexCount - 1.
    uint32 firstInstance;  ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from
                           ///  firstInstance to firstInstance + instanceCount - 1.
};

/// Specifies layout of GPU memory used as an input to CmdDrawIndexedIndirectMulti.
///
/// Indices passed to the vertex shader will be:
///
/// + IndexBuffer[firstIndex] + vertexOffset
/// + IndexBuffer[firstIndex + 1] + vertexOffset,
/// + ...
/// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset
struct DrawIndexedIndirectArgs
{
    uint32 indexCount;     ///< Number of indices to draw.
    uint32 instanceCount;  ///< Number of instances to draw.
    uint32 firstIndex;     ///< Starting index buffer slot for the draw.
    int32  vertexOffset;   ///< Offset added to the index fetched from the index buffer before it is passed to the
                           ///  vertex shader.
    uint32 firstInstance;  ///< Starting instance for the draw. Instance IDs passed to the vertex shader will range from
                           ///  firstInstance to firstInstance + instanceCount - 1.
};

/// Specifies layout of GPU memory used as an input to CmdDispatchIndirect.
struct DispatchIndirectArgs
{
    uint32 x;  ///< Threadgroups to dispatch in the X dimension.
    uint32 y;  ///< Threadgroups to dispatch in the Y dimension.
    uint32 z;  ///< Threadgroups to dispatch in the Z dimension.
};

/// Specifies layout of GPU memory used as an input to CmdDispatchMeshIndirect.
using DispatchMeshIndirectArgs = DispatchIndirectArgs;

/// Specifies the CPU virtual address of an array and the stride in bytes between array elements.
struct CpuVirtAddrAndStride
{
    const void* pCpuVirtAddr;  ///< CPU virtual address of the 0th array element.
    struct
    {
        uint64 stride : 32;  ///< Distance between array elements in bytes.
        uint64 _pad   : 32;  ///< Padding for structure alignment.
    };
};

/// Specifies the GPU virtual address of an array and the stride in bytes between array elements.
struct GpuVirtAddrAndStride
{
    gpusize gpuVirtAddr;  ///< GPU virtual address of the 0th array element.
    struct
    {
        uint64 stride : 32;  ///< Distance between array elements in bytes.
        uint64 _pad   : 32;  ///< Padding for structure alignment.
    };
};

/// Flags to describe a dispatch
union DispatchInfoFlags
{
    struct
    {
        uint32 devDriverOverlay : 1;   ///< Flag indicates this dispatch draws the DevDriver overlay.
        uint32 reserved         : 31;  ///< Reserved for future use.
    };
    uint32 u32All;  ///< Flags packed as 32-bit uint.
};

/// Specifies the different stages at which a combiner can choose between different shading rates.
enum class VrsCombinerStage : uint32 { ProvokingVertex, ///< Chooses between the shading rate specified by the VrsRateParams struct and the shader /// rate provided by the provoking vertex. Primitive, ///< Chooses between previous combiner stage and the shader rate associated with the primitive Image, ///< Chooses between previous combiner stage and the shader rate associated with an image PsIterSamples, ///< Chooses between previous combiner stage and the PS_ITER_SAMPLES rate. Max }; /// Specifies the different possible shading rates. Not all are supported on all HW; see the supportedVrsRates /// entry in the gfxipProperties structure. enum class VrsShadingRate : uint32 { _16xSsaa = 0x0, _8xSsaa = 0x1, _4xSsaa = 0x2, _2xSsaa = 0x3, _1x1 = 0x4, _1x2 = 0x5, _2x1 = 0x6, _2x2 = 0x7, Count }; /// Indices into the centerOffset array member of the VrsCenterState structure. enum class VrsCenterRates : uint32 { _1x1 = 0x0, _1x2 = 0x1, _2x1 = 0x2, _2x2 = 0x3, Max = 0x4, }; /// Specifies the different ways in which a combiner can choose between two different shading rate inputs. enum class VrsCombiner : uint32 { Passthrough = 0, ///< Keep previous shading rate. Override = 1, ///< C.xy = B.xy Min = 2, ///< min(A.xy, B.xy) Max = 3, ///< max(A.xy, B.xy) Sum = 4, ///< min(maxRate, A.xy + B.xy) Count }; /// Structure for defining paramters to the CmdSetPerDrawVrsRate function. struct VrsRateParams { /// The shading rate to be bound to the render state. VrsShadingRate shadingRate; /// The state of all the combiners. VrsCombiner combinerState[static_cast(VrsCombinerStage::Max)]; union { struct { uint32 exposeVrsPixelsMask : 1; ///< Controls how the shader input mask of a coarse pixel is generated. /// 0 : Bitwise OR of all fine pixel`s mask /// 1 : Pack fine pixels` coverage mask into iMask. Layout based /// on VRS rate uint32 reserved : 31; }; uint32 u32All; ///< Flags packed as 32-bit uint. 
} flags; ///< Flags controlling VRS rate parameters }; /// Structure for defininig paramters to the CmdSetVrsCenterState function. struct VrsCenterState { /// The offset is scaled by the coarse pixel size and then added to the center location /// Center offsets are specified as two 4 bits signed integer value representing a location on a 16x16 grid gd. /// The offset is scaled by the coarse pixel size and then added to the center location /// 1x1, 1x2, 2x1 and 2x2 shading rates can all have their own unique offsets Offset2d centerOffset[static_cast(VrsCenterRates::Max)]; union { struct { uint32 overrideCenterSsaa : 1; ///< Override center interpolants to be evaluated at the sample /// position. uint32 overrideCentroidSsaa : 1; ///< Override centroid interpolants to be evaluated at the centroid /// of each sample group being iterated (simply the sample position /// in the typical case of 1-sample groups). uint32 alwaysComputeCentroid : 1; ///< Don't assume the centroid of a fully covered shading region is /// the center. It is possible all samples could be lit but the /// center is not lit for certain combinations of centerOffset[] /// values and programmable sample positions uint32 reserved : 29; ///< Reserved for future HW }; uint32 u32All; ///< Flags packed as 32-bit uint. } flags; ///< Flags controlling VRS center state }; /// @internal /// Function pointer type definition for setting pipeline-accessible user data entries to the specified values. Each /// command buffer object has one such callback per pipeline bind point, so the bind point is implicit. /// /// @see ICmdBuffer::CmdSetUserData(). typedef void (PAL_STDCALL *CmdSetUserDataFunc)( ICmdBuffer* pCmdBuffer, uint32 firstEntry, uint32 entryCount, const uint32* pEntryValues); /// @internal Function pointer type definition for issuing non-indexed draws. /// /// @see ICmdBuffer::CmdDraw(). 
typedef void (PAL_STDCALL *CmdDrawFunc)(
    ICmdBuffer* pCmdBuffer,
    uint32      firstVertex,
    uint32      vertexCount,
    uint32      firstInstance,
    uint32      instanceCount,
    uint32      drawId);

/// @internal Function pointer type definition for issuing draws auto.
///
/// @see ICmdBuffer::CmdDrawOpaque().
typedef void (PAL_STDCALL *CmdDrawOpaqueFunc)(
    ICmdBuffer* pCmdBuffer,
    gpusize     streamOutFilledSizeVa,
    uint32      streamOutOffset,
    uint32      stride,
    uint32      firstInstance,
    uint32      instanceCount);

/// @internal Function pointer type definition for issuing indexed draws.
///
/// @see ICmdBuffer::CmdDrawIndexed().
typedef void (PAL_STDCALL *CmdDrawIndexedFunc)(
    ICmdBuffer* pCmdBuffer,
    uint32      firstIndex,
    uint32      indexCount,
    int32       vertexOffset,
    uint32      firstInstance,
    uint32      instanceCount,
    uint32      drawId);

/// @internal Function pointer type definition for issuing indirect draws.
///
/// @see ICmdBuffer::CmdDrawIndirectMulti().
typedef void (PAL_STDCALL *CmdDrawIndirectMultiFunc)(
    ICmdBuffer*          pCmdBuffer,
    GpuVirtAddrAndStride gpuVirtAddrAndStride,
    uint32               maximumCount,
    gpusize              countGpuAddr);

/// @internal Function pointer type definition for issuing indexed, indirect draws.
///
/// @see ICmdBuffer::CmdDrawIndexedIndirectMulti().
typedef void (PAL_STDCALL *CmdDrawIndexedIndirectMultiFunc)(
    ICmdBuffer*          pCmdBuffer,
    GpuVirtAddrAndStride gpuVirtAddrAndStride,
    uint32               maximumCount,
    gpusize              countGpuAddr);

/// @internal Function pointer type definition for issuing direct dispatches.
///
/// @see ICmdBuffer::CmdDispatch().
typedef void (PAL_STDCALL *CmdDispatchFunc)(
    ICmdBuffer*       pCmdBuffer,
    DispatchDims      size,
    DispatchInfoFlags infoFlags);

/// @internal Function pointer type definition for issuing indirect dispatches.
///
/// @see ICmdBuffer::CmdDispatchIndirect().
typedef void (PAL_STDCALL *CmdDispatchIndirectFunc)(
    ICmdBuffer* pCmdBuffer,
    gpusize     gpuVirtAddr);

/// @internal Function pointer type definition for issuing direct dispatches with threadgroup offsets.
///
/// @see ICmdBuffer::CmdDispatchOffset().
typedef void (PAL_STDCALL *CmdDispatchOffsetFunc)(
    ICmdBuffer*  pCmdBuffer,
    DispatchDims offset,
    DispatchDims launchSize,
    DispatchDims logicalSize);

/// @internal Function pointer type definition for issuing direct mesh dispatches.
///
/// @see ICmdBuffer::CmdDispatchMesh().
typedef void (PAL_STDCALL *CmdDispatchMeshFunc)(
    ICmdBuffer*  pCmdBuffer,
    DispatchDims size);

/// @internal Function pointer type definition for issuing indirect mesh dispatches.
///
/// @see ICmdBuffer::CmdDispatchMeshIndirectMulti().
typedef void (PAL_STDCALL *CmdDispatchMeshIndirectMultiFunc)(
    ICmdBuffer*          pCmdBuffer,
    GpuVirtAddrAndStride gpuVirtAddrAndStride,
    uint32               maximumCount,
    gpusize              countGpuAddr);

/// This struct provides the parameters of all the supported features for kernel dispatch
struct DispatchAqlParams
{
    const hsa_kernel_dispatch_packet_t* pAqlPacket;  ///< Pointer to AQL packet contains the essential
                                                     ///  information (size of workgroup, grid, data
                                                     ///  segments, handle of kernel code object, kernel
                                                     ///  arguments) of the kernel to be dispatched.
    gpusize scratchAddr;                             ///< GPU VM scratch buffer address
    uint32  scratchSize;                             ///< Scratch buffer size
    uint32  scratchOffset;                           ///< Scratch buffer offset from the base for generic
                                                     ///  address space
#if PAL_CLIENT_INTERFACE_MAJOR_VERSION >= 920
    const llvm::amdhsa::kernel_descriptor_t* pCpuAqlCode;  ///< AMD kernel descriptor on CPU for PM4 emulation
#else
    const amd_kernel_code_t* pCpuAqlCode;            ///< AMD kernel code object on CPU for PM4 emulation
#endif
    gpusize hsaQueueVa;                              ///< GPU VM address where amd_queue_t is allocated
    uint32  wavesPerSh;                              ///< Waves Per Shader Array
    bool    useAtc;                                  ///< Indicates whether ATC bit in registers should be set
                                                     ///  ATC bit is used for indicating if the address
                                                     ///  is GPUVM(=0) or SVM(=1). Valid only in gfx6 and older
    uint64  kernargSegmentSize;                      ///< The size of kernarg segment that holds the
                                                     ///  values of the arguments to the kernels
    uint32  workitemPrivateSegmentSize;              ///< The amount of fixed private address
                                                     ///  space memory required for a workitem.
    uint32  aqlPacketIndex;                          ///< AQL ID in QueueCreateInfo.aqlPacketList for debugger.
                                                     ///  Note: debugger support enabled for Navi3x+
};

/// @internal Function pointer type definition for issuing AQL dispatches.
///
/// @see ICmdBuffer::CmdDispatchAql().
typedef void (PAL_STDCALL *CmdDispatchAqlFunc)(
    ICmdBuffer*              pCmdBuffer,
    const DispatchAqlParams& dispatchInfo);

/// Specifies input assembler state for draws.
/// @see ICmdBuffer::CmdSetInputAssemblyState
struct InputAssemblyStateParams
{
    PrimitiveTopology topology;           ///< Defines how vertices should be interpreted and rendered by
                                          ///  the graphics pipeline.
    uint8  patchControlPoints;            ///< # of control points per patch. [0-32] valid. Should be set to
                                          ///  0 by clients if topology is not PrimitiveTopology::Patch.
    bool   primitiveRestartEnable;        ///< Enables the index specified by primitiveRestartIndex to _cut_
                                          ///  a primitive (i.e., triangle strip) and begin a new primitive
                                          ///  with the next index.
    bool   primitiveRestartMatchAllBits;  ///< Specifies which bits from primitiveRestartIndex to use.
                                          ///    false - only check relevant bits based on index type
                                          ///    true  - check all 32 bits irrespective of index type
    uint32 primitiveRestartIndex;         ///< When primitiveRestartEnable is true, this is the index value
                                          ///  that will restart a primitive. When using a 16-bit index
                                          ///  buffer, the upper 16 bits of this value will be ignored.
};

/// Specifies parameters for controlling triangle rasterization.
/// @see ICmdBuffer::CmdSetTriangleRasterState
struct TriangleRasterStateParams
{
    struct
    {
        FillMode        frontFillMode   : 2;  ///< Whether front-facing triangles should be rendered solid or wireframe.
        FillMode        backFillMode    : 2;  ///< Whether back-facing triangles should be rendered solid or wireframe.
        CullMode        cullMode        : 2;  ///< Specifies which, if any, triangles should be culled based on whether
                                              ///  they are front or back facing.
        FaceOrientation frontFace       : 1;  ///< Specifies the vertex winding that results in a front-facing triangle.
        ProvokingVertex provokingVertex : 1;  ///< Specifies whether the first or last vertex of a primitive is the
                                              ///  provoking vertex as it affects flat shading.
    };

    union
    {
        struct
        {
            uint8 frontDepthBiasEnable : 1;  ///< Enable depth bias (i.e. polygon offset) for front-facing
                                             ///  triangle-based primitives
            uint8 backDepthBiasEnable  : 1;  ///< Enable depth bias (i.e. polygon offset) for back-facing
                                             ///  triangle-based primitives
            uint8 reserved             : 6;  ///< Reserved for future use.
        };
        uint8 u8All;  ///< Flags packed as 8-bit uint.
    } flags;  ///< Triangle raster state flags.
};

/// Specifies parameters for controlling point and line rasterization.
/// @see ICmdBuffer::CmdSetPointLineRasterState
struct PointLineRasterStateParams
{
    float pointSize;     ///< Width of a point primitive in pixels.
    float lineWidth;     ///< Width of a line primitive in pixels.
    float pointSizeMin;  ///< Minimum width of a point primitive in pixels.
    float pointSizeMax;  ///< Maximum width of a point primitive in pixels.
};

/// Specifies parameters for controlling line stippling.
/// @see ICmdBuffer::CmdSetLineStippleState
struct LineStippleStateParams
{
    uint16 lineStippleValue;  ///< Line stipple bit pattern.
    uint32 lineStippleScale;  ///< Line stipple repeat factor.
};

/// Specifies parameters for setting up depth bias. Depth Bias is used to ensure a primitive can properly be displayed
/// (without Z fighting) in front (or behind) of the previously rendered co-planar primitive. This is useful for decal
/// or shadow rendering.
/// @see ICmdBuffer::CmdSetDepthBiasState
struct DepthBiasParams
{
    float depthBias;             ///< Base depth bias to be added to each fragment's Z value. In units of the
                                 ///  minimum delta representable in the bound depth buffer.
    float depthBiasClamp;        ///< Maximum allowed depth bias result. Prevents polygons viewed at a sharp value
                                 ///  from generating very large biases.
    float slopeScaledDepthBias;  ///< Factor multiplied by the depth slope (change in Z coord per x/y pixel) to
                                 ///  create more bias for "steep" polygons. This result is applied to the final
                                 ///  Z value in addition to the base depthBias parameter.
};

/// Specifies parameters for setting the value range to be used for depth bounds testing.
/// @see ICmdBuffer::CmdSetDepthBounds
struct DepthBoundsParams
{
    float min;  ///< Minimum depth value in passing range (closest).
    float max;  ///< Maximum depth value in passing range (farthest).
};

/// Specifies parameters for setting bit-masks applied to stencil buffer reads and writes.
/// @see ICmdBuffer::CmdSetStencilRefMasks
struct StencilRefMaskParams
{
    uint8 frontRef;        ///< Stencil reference value for front-facing polygons.
    uint8 frontReadMask;   ///< Bitmask to restrict stencil buffer reads for front-facing polygons.
    uint8 frontWriteMask;  ///< Bitmask to restrict stencil buffer writes for front-facing polygons.
    uint8 frontOpValue;    ///< Stencil operation value for front-facing polygons.
                           ///  This is the value used as a parameter for a given stencil operation.
                           ///  For example: StencilOp::IncWrap will use this value when incrementing the current
                           ///  stencil contents. Typically, this would be set to one, but on AMD hardware,
                           ///  this register is 8 bits so there is a greater flexibility.

    uint8 backRef;         ///< Stencil reference value for back-facing polygons.
    uint8 backReadMask;    ///< Bitmask to restrict stencil buffer reads for back-facing polygons.
    uint8 backWriteMask;   ///< Bitmask to restrict stencil buffer writes for back-facing polygons.
    uint8 backOpValue;     ///< Stencil operation value for back-facing polygons - See description of frontOpValue
                           ///  for further details.

    union
    {
        uint8 u8All;  ///< Flags packed as a 8-bit uint.

        struct
        {
            uint8 updateFrontRef       : 1;  ///< Updating reference value for front-facing polygons.
            uint8 updateFrontReadMask  : 1;  ///< Updating read mask value for front-facing polygons.
            uint8 updateFrontWriteMask : 1;  ///< Updating write mask value for front-facing polygons.
            uint8 updateFrontOpValue   : 1;  ///< Updating stencil op value for front-facing polygons.
            uint8 updateBackRef        : 1;  ///< Updating reference value for back-facing polygons.
            uint8 updateBackReadMask   : 1;  ///< Updating read mask value for back-facing polygons.
            uint8 updateBackWriteMask  : 1;  ///< Updating write mask value for back-facing polygons.
            uint8 updateBackOpValue    : 1;  ///< Updating stencil op value for back-facing polygons.
        };
    } flags;  ///< Flags to indicate which of the stencil state values are being updated.
};

/// HiS always exposes two pretests.
constexpr uint32 NumHiSPretests = 2;

/// Hierarchical stencil (HiS) allows work to be discarded by the stencil test at tile rate in certain cases.
/// In order to use HiS, the client will define a set of pretests that will be performed whenever a particular stencil
/// buffer is written. The stencil image will track the results of the pretest for each 8x8 tile, keeping a record of
/// whether any pixel in the tile "may-pass" or "may-fail" the specified pretest. When stencil testing is enabled,
/// the hardware may be able to discard whole tiles early based on what it can glean from the HiS pretest states.
///
/// Each stencil image has two pretest slots per mip level. Pretest slots are reset when an initialization barrier
/// targets their mip level on the stencil plane. The client can then pass this struct to @ref CmdUpdateHiSPretests
/// to bind one or more valid pretests. It is legal to bind a pretest over a reset slot at any point.
///
/// @warning Except in special cases, it is illegal to bind a pretest on top of an existing pretest.
///
/// It is only legal to bind a new pretest on top of an existing pretest if:
///   1.
///      All array slices within the given mip have been reset using an initialization barrier.
///   2. The client guarantees that they will rewrite all stencil values in all array slices within the given mip
///      before the next draw with stencil testing enabled by doing either:
///      a. One or more calls to @ref CmdClearDepthStencil.
///      b. One or more draws with the stencil test disabled and stencil writes enabled.
///
/// Once pretests are selected via @ref CmdUpdateHiSPretests the client should keep track of which tests were enabled
/// on each stencil image and provide them to every call to @ref CmdClearDepthStencil. This is optional but PAL will
/// not be able to generate HiS optimized clears unless it is given the current pretests.
///
/// @warning The pretests provided to @ref CmdUpdateHiSPretests are applied to all mips of all subresource ranges.
///          If the client varies pretests between mips they must guarantee that the given pretests were bound to all
///          mips in the given subresource ranges.
///
/// This feature works best if the future stencil test behavior is known, either directly told via an API extension
/// or via an app profile in the client layer. For example, if the application 1) clears stencil, 2) does a pass to
/// write stencil, 3) then does a final pass that masks rendering based on the stencil value being > 0, ideally we
/// would choose a pretest of func=Greater, mask=0xFF, and value=0 so that #2 would update the stencil image with
/// per-tile data that lets #3 be accelerated at maximum efficiency.
///
/// In absence of app-specific knowledge, the following algorithm may be a good generic approach:
///   1. When the stencil image is cleared, set pretest #0 to func=Equal, mask=0xFF, and value set to the clear value.
///   2. On the first draw with stencil writes enabled, set pretest #1 with the mask set to the app's current stencil
///      mask, and
///      a. If the stencil op is INC or DEC, set func=GreaterEqual and value the same as in #1.
///      b.
///         If the stencil op is REPLACE, set func=Equal and set value to the app's current stencil ref value.
///
/// Note that HiS can only be beneficial for GPU performance so clients that do not want to implement app profiles or
/// generic heuristics should at least hard-code both tests to something simple.
struct HiSPretests
{
    struct
    {
        CompareFunc func;  ///< This function is used to compare the pretest value with the image's stencil value.
                           ///  The expression is evaluated with the pretest value as the left-hand operand and the
                           ///  image's stencil value as the right-hand operand.
        uint8 mask;        ///< This value is ANDed with both stencil values before evaluating the comparison.
        uint8 value;       ///< The pretest value, used as the left-hand operand in the comparison.
        bool  isValid;     ///< True if this pretest contains valid information. Set to false to skip this test.
    } test[NumHiSPretests];  ///< The set of pretest slots.
};

/// Specifies coordinates for setting up single user clip plane.
/// @see ICmdBuffer::CmdSetUserClipPlanes
struct UserClipPlane
{
    float x;  ///< Plane coordinate x
    float y;  ///< Plane coordinate y
    float z;  ///< Plane coordinate z
    float w;  ///< Plane coordinate w
};

/// Specifies parameters for setting the constant factor to be used by the blend hardware when programmed with the
/// Blend::ConstantColor, Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend
/// coefficients.
/// @see ICmdBuffer::CmdSetBlendConst
struct BlendConstParams
{
    float blendConst[4];  ///< 4-component RGBA float specifying the new blend constant.
};

/// Specifies the parameters for a single viewport
struct Viewport
{
    float       originX;   ///< X coordinate for the viewport's origin.
    float       originY;   ///< Y coordinate for the viewport's origin.
    float       width;     ///< Width of the viewport.
    float       height;    ///< Height of the viewport.
    float       minDepth;  ///< Minimum depth value of the viewport. Must be in the [0..1] range.
    float       maxDepth;  ///< Maximum depth value of the viewport. Must be in the [0..1] range.
    PointOrigin origin;    ///< Origin of the viewport relative to NDC. UpperLeft or LowerLeft.
};

/// Specifies the viewport transform parameters for setting a single viewport.
/// @see ICmdBuffer::CmdSetViewport
struct ViewportParams
{
    uint32     count;             ///< Number of viewports.
    float      horzDiscardRatio;  ///< The ratio between guardband discard rect width and viewport width.
                                  ///  For all guard band ratio settings, values less than 1.0f are illegal.
                                  ///  Value FLT_MAX opens the guardband as wide as the HW supports.
                                  ///  Value 1.0f disables the guardband.
    float      vertDiscardRatio;  ///< The ratio between guardband discard rect height and viewport height.
    float      horzClipRatio;     ///< The ratio between guardband clip rect width and viewport width.
    float      vertClipRatio;     ///< The ratio between guardband clip rect height and viewport height.
    DepthRange depthRange;        ///< Specifies the target range of Z values

    // Define viewports array at the end of the structure as it is common to only access the first N from the CPU.
    Viewport   viewports[MaxViewports];  ///< Array of descriptors for each viewport.
};

/// Specifies the parameters for specifying the scissor rectangle.
struct ScissorRectParams
{
    uint32 count;                   ///< Number of scissor rectangles.
    Rect   scissors[MaxViewports];  ///< Array of scissor regions corresponding to each viewport.
};

/// Specifies parameters for setting the global scissor rectangle.
/// @see ICmdBuffer::CmdSetGlobalScissor
struct GlobalScissorParams
{
    Rect scissorRegion;  ///< Rectangle of the global scissor window.
};

/// Specifies parameters for binding the color targets and depth target.
/// @see ICmdBuffer::CmdBindTargets
struct BindTargetParams
{
    uint32               colorTargetCount;               ///< Number of color targets to bind.
    ColorTargetBindInfo  colorTargets[MaxColorTargets];  ///< Array of color target descriptors.
    DepthStencilBindInfo depthTarget;                    ///< Describes the depth target bind info.
};

/// Specifies parameters for binding the stream-output targets.
/// @see ICmdBuffer::CmdBindStreamOutTargets
struct BindStreamOutTargetParams
{
    struct
    {
        gpusize gpuVirtAddr;  ///< GPU virtual address of this stream-output target. Must be DWORD-aligned. If
                              ///  this is zero, 'size' is ignored and the target is considered un-bound.
        gpusize size;         ///< Size of this stream-output target, in bytes. Must be DWORD-aligned.
    } target[MaxStreamOutTargets];  ///< Describes the stream-output target for each buffer slot.
};

/// Specifies the different types of predication ops available.
enum class PredicateType : uint32
{
    Zpass     = 1,  ///< Enable occlusion predicate
    PrimCount = 2,  ///< Enable streamout predicate
    Boolean64 = 3,  ///< CP PFP treats memory as a 64bit integer which is either false (0) or true, DX12 style.
    Boolean32 = 4,  ///< CP PFP treats memory as a 32bit integer which is either false (0) or true, Vulkan style.
    Count
};

/// Bitfield structure used to specify masks for functions that operate on depth and/or stencil planes of an image.
union DepthStencilSelectFlags
{
    struct
    {
        /// Select Depth.
        uint32 depth    : 1;
        /// Select Stencil.
        uint32 stencil  : 1;
        /// Reserved for future usage.
        uint32 reserved : 30;
    };

    /// Flags packed as 32-bit uint.
    uint32 u32All;
};

/// Specifies information related to clearing a bound color target. Input structure to CmdClearBoundColorTargets().
struct BoundColorTarget
{
    uint32         targetIndex;     ///< Render target index where the target image is currently bound.
    SwizzledFormat swizzledFormat;  ///< Format and swizzle of the target image.
    uint32         samples;         ///< Sample count for the target.
    uint32         fragments;       ///< Fragment count for the target.
    ClearColor     clearValue;      ///< Clear color value.
};

/// Specifies clear region to clear a bound target. Input structure to CmdClearBoundColorTargets() and
/// CmdClearBoundDepthStencilTargets()
struct ClearBoundTargetRegion
{
    Rect   rect;        ///< The 2D region to clear.
    uint32 startSlice;  ///< The starting slice to clear.
    uint32 numSlices;   ///< The number of slices to clear.
};

/// Specifies flags controlling CmdSaveComputeState and CmdRestoreComputeState. PAL clients must be aware that saving
/// and restoring specific state in a nested command buffer may not be supported. The rule is simple: if the client
/// requires that the caller leak the given state to the callee, PAL will not support saving and restoring that state.
enum ComputeStateFlags : uint32
{
    ComputeStatePipelineAndUserData = 0x1,  ///< Selects the bound compute pipeline, all non-indirect user data, and all
                                            ///  kernel arguments (if applicable). Note that the current user data will
                                            ///  be invalidated on CmdSaveComputeState.
    ComputeStateBorderColorPalette  = 0x2,  ///< Selects the bound border color palette that affects compute pipelines.
    ComputeStateAll                 = 0x3,  ///< Selects all state
};

/// Provides dynamic command buffer flags during submission
/// The following flags are used for Frame Pacing when delay time is configured to be calculated by KMD.
/// (Currently DX clients require this).
/// For clients that do not need Frame Pacing with KMD calculated delay time, they can ignore these flags:
///
/// - frameBegin and frameEnd : Client's presenting queue should track its present state,
///   and set frameBegin flag on the first command buffer after present,
///   set frameEnd flag on the last command buffer before present. (Could be the Present command buffer itself.)
///   We don't need to set them on queues other than the presenting queue.
/// - P2PCmd : Mark a P2P copy command. KMD could use this flag for adjustments for its frame time calculation.
///   For the current frame time algorithm, clients should only set this flag on SW compositing copy command.
///   But KMD may adjust their algorithm, and clients should update the flag depending on KMD needs.
///
/// The following flags are used for Direct Capture.
///
/// - captureBegin and captureEnd : Direct capture info should be filled if any of these is set.
And captureEnd flag /// also notifies KMD that the on-screen primary is safe to release. struct CmdBufInfo { union { struct { uint32 isValid : 1; ///< Indicate if this CmdBufInfo is valid and should be submitted uint32 frameBegin : 1; ///< First command buffer after Queue creation or Present. uint32 dfSpmTraceBegin : 1; ///< This command buffer begins a DF SPM trace. uint32 dfSpmTraceEnd : 1; ///< This command buffer ends a DF SPM trace. uint32 frameEnd : 1; ///< Last command buffer before Present. uint32 p2pCmd : 1; ///< Is P2P copy command. See CmdBufInfo comments for details. uint32 captureBegin : 1; ///< This command buffer begins a Direct Capture frame capture. uint32 captureEnd : 1; ///< This command buffer ends a Direct Capture frame capture. uint32 rayTracingExecuted : 1; ///< This command buffer contains ray tracing work. uint32 preflip : 1; ///< This command buffer has pre-flip access to DirectCapture resource uint32 postflip : 1; ///< This command buffer has post-flip access to DirectCapture resource uint32 privateFlip : 1; ///< Need to flip to a private primary surface for DirectCapture feature uint32 vpBltExecuted : 1; ///< This command buffer comtains VP Blt work. uint32 disableDccRejected : 1; ///< Reject KMD's DisableDcc request to avoid writing to front buffer. 
uint32 noFlip : 1; ///< No flip when DirectCapture access submission completes uint32 frameGenIndex : 4; ///< Index of the DirectCapture feature generated frames uint32 noRenderPresent : 1; ///< Last command buffer before present which is no render present or not uint32 motionVectorPropChanged : 1; ///< Indicates whether motion vector properties changed uint32 depthPropChanged : 1; ///< Indicates whether depth properties changed uint32 cameraPropChanged : 1; ///< Indicates whether camera matrix properties changed uint32 capturePrimary : 1; ///< Has Direct Capture primary surface capture uint32 captureMotionVector : 1; ///< Has Direct Capture motion vector capture uint32 captureDepth : 1; ///< Has Direct Capture depth capture uint32 captureCamera : 1; ///< Has Direct Capture camera matrix capture uint32 hudLessImagePropChanged : 1; ///< Indicates whether HUD less image properties changed uint32 captureHudLessImage : 1; ///< Has Direct Capture HUD less image capture uint32 reserved : 3; ///< Reserved for future usage. }; uint32 u32All; ///< Flags packed as uint32. }; const IGpuMemory* pPrimaryMemory; ///< The primary's gpu memory object used for passing its allocation handle /// to KMD for pre-flip primary access (PFPA). If frame metadata flags /// specifies that primaryHandle should be sent, clients should set this to /// current frame pending primary's IGpuMemory object on the creating GPU /// for the frameEnd command. Otherwise set this to nullptr. const IGpuMemory* pDirectCapMemory; ///< The Direct Capture gpu memory object. It should be set if flag /// captureBegin or captureEnd is set. Otherwise set this to nullptr. const IGpuMemory* pPrivFlipMemory; ///< The gpu memory object of the private flip primary surface for the /// DirectCapture feature. const Util::Event* pEarlyPresentEvent; ///< The 'early present' event object. This variable can be nullptr. uint64 frameIndex; ///< The frame index of this command buffer. 
It is only required for the /// DirectCapture feature uint32 vidPnSourceId; ///< The display source id for the DirectCapture feature. Clients must set /// a valid vidPnSourceId when privateFlip flag is set and pDirectCapMemory /// is nullptr. uint64 frameId; ///< Present frame index, incremented at each present const IGpuMemory* pMotionVectorMemory; ///< The motion vector gpu memory object for the DirectCapture feature. const IGpuMemory* pDepthMemory; ///< The depth gpu memory object for the DirectCapture feature. const IGpuMemory* pCameraMemory; ///< The camera gpu memory object for the DirectCapture feature. const IGpuMemory* pHudLessImageMemory; ///< The HUD less image gpu memory object for DirectCapture. }; /// Specifies rotation angle between two images. Used as input to ICmdBuffer::CmdScaledCopyImage. enum class ImageRotation : uint32 { Ccw0 = 0x0, ///< Counter clockwise degree 0 Ccw90 = 0x1, ///< Counter clockwise degree 90 Ccw180 = 0x2, ///< Counter clockwise degree 180 Ccw270 = 0x3, ///< Counter clockwise degree 270 Count }; /// Describes a color-key value which can control a pixel get copied or ignored during a CmdScaledCopyImage operation. struct ColorKey { uint32 u32Color[4]; ///< The color value for each channel }; /// Uniquely identifies the target of the a Present operation (swap chain / destination window / etc.) so that PAL's debug /// layers can track frames-per-second or other statistics correctly when applications render to multiple displays or /// windows. Client drivers which don't care about this can always specify a key value of 0. using UniquePresentKey = uint64; /// Convert an OS window handle to a unique present key. inline UniquePresentKey PresentKeyFromOsWindowHandle(OsWindowHandle handle) #if defined(_WIN32) { return reinterpret_cast(handle); } #else { return handle.win; } #endif /// Convert any pointer to a unique present key. 
template constexpr inline UniquePresentKey PresentKeyFromPointer(T* ptr) { return reinterpret_cast(ptr); } /// Specifies the input parameters for debug overlay's visual confirm. This struct is not functional. /// The client is expected to default initialize this struct and then fill out any state that makes /// sense under its presentation model. PAL will process any valid input and ignore fields that are /// default initialized. struct CmdPostProcessDebugOverlayInfo { PresentMode presentMode; ///< The Presentation Mode of the application. WsiPlatform wsiPlatform; ///< The WsiPlatform that Swap Chain works upon UniquePresentKey presentKey; ///< Identifies the window/swap chain, etc. used to present. }; /// Specifies the input parameters for ICmdBuffer::CmdPostProcessFrame. struct CmdPostProcessFrameInfo { union { struct { uint32 srcIsTypedBuffer : 1; ///< True if the source is a typed buffer instead of an image. uint32 reserved : 31; ///< Reserved for future usage. }; uint32 u32All; ///< Flags packed as uint32. } flags; union { const IImage* pSrcImage; ///< The image to postprocess (prior to presenting). const IGpuMemory* pSrcTypedBuffer; ///< The typed buffer to postprocess. /// Must have been created as a typed buffer. }; CmdPostProcessDebugOverlayInfo debugOverlay; FullScreenFrameMetadataControlFlags fullScreenFrameMetadataControlFlags; Pal::ImageLayout srcImageLayout; }; /// External flags for ScaledCopyImage. union ScaledCopyFlags { struct { uint32 srcColorKey : 1; ///< If set, enables source color-keying by using the value in the ColorKey member. /// That is, any pixel in the source image that matches the color key should not be /// copied to the destination image, and all of the source pixels that do not match /// the color key should be copied. Mutually exclusive with dstColorKey. uint32 dstColorKey : 1; ///< If set, enables destination color-keying by using the value in the ColorKey /// member. 
That is, any pixel in the destination image that matches the color key /// should be replaced with the corresponding pixel from the source image, and all of /// the destination pixels that do not match the color key should not be replaced. /// Mutually exclusive with srcColorKey. uint32 srcAlpha : 1; ///< If set, use alpha channel in source surface as blend factor. /// color = src alpha * src color + (1.0 - src alpha) * dst color. uint32 dstAsSrgb : 1; ///< If set, a non-srgb destination image will be treated as srgb format. /// Cannot be set if @ref dstAsNorm is set. uint32 dstAsNorm : 1; ///< If set, a srgb destination image will be treated as non-srgb format. /// Cannot be set if @ref dstAsSrgb is set. uint32 scissorTest : 1; ///< If set, do scissor test using the specified scissor rectangle. uint32 coordsInFloat : 1; ///< If set, copy regions are represented in floating point type. uint32 srcAsNorm : 1; ///< If set, an srgb source image will be treated as non-srgb format. /// Cannot be set if @ref srcAsSrgb is set. uint32 srcAsSrgb : 1; ///< If set, a non-srgb source image will be treated as srgb format. /// Cannot be set if @ref srcAsNorm is set. uint32 reserved : 23; ///< reserved for future usage. }; uint32 u32All; ///< Flags packed as uint32. }; /// Input structure to @ref ICmdBuffer::CmdScaledCopyImage. Specifies parameters needed to execute CmdScaledCopyImage. struct ScaledCopyInfo { const IImage* pSrcImage; ///< The source image to blt from. ImageLayout srcImageLayout; ///< The source image layout. const IImage* pDstImage; ///< The dest image to blt to. ImageLayout dstImageLayout; ///< The dest image layout. uint32 regionCount; ///< Copy region array size. const ImageScaledCopyRegion* pRegions; ///< Region array to copy. TexFilter filter; ///< Controlling how a given texture is sampled. ImageRotation rotation; ///< Rotation option between two images. const ColorKey* pColorKey; ///< Color key value. const Rect* pScissorRect; ///< Scissor test rectangle. 
ScaledCopyFlags flags; ///< Copy flags, identifies the type of blt to perform.
};

/// Input structure to @ref ICmdBuffer::CmdGenerateMipmaps. Specifies parameters needed to execute CmdGenerateMipmaps.
struct GenMipmapsInfo
{
    const IImage*  pImage;         ///< Populate mips in this image by reading from existing higher-level mips.
    ImageLayout    baseMipLayout;  ///< The layout of all slices in the read-only base mip; must include LayoutCopySrc.
    ImageLayout    genMipLayout;   ///< The layout of all slices and mips that will be generated; must include
                                   ///  LayoutCopySrc and LayoutCopyDst.
    SubresRange    range;          ///< Which subresources should be generated from earlier mips. The starting mipLevel
                                   ///  must never be zero because there would be no larger mip to read.
    TexFilter      filter;         ///< Controls texture sampling during mip generation. Linear texture filtering is
                                   ///  only supported for images with non-integer formats.
    SwizzledFormat swizzledFormat; ///< If not Undefined, reinterpret all subresources using this format and swizzle.
                                   ///  The specified format needs to have been included in the "pViewFormats" list
                                   ///  specified at image-creation time, otherwise the result might be incorrect.
};

/// Defines a single memory range to prefetch using CmdPrimeGpuCaches.
struct PrimeGpuCacheRange
{
    gpusize gpuVirtAddr;         ///< Base GPU virtual address to be prefetched.
    gpusize size;                ///< Number of bytes to prefetch. Clients should keep range sizes small relative
                                 ///  to the GPU caches (e.g., tccSizeInBytes); the PAL implementation may clamp
                                 ///  prefetched ranges if they are too large for the cache being prefetched.
    uint32  usageMask;           ///< Bitmask of CacheCoherencyUsageFlags defining the usage to prefetch for.
                                 ///  E.g., if the mask includes CoherShader, then PAL will attempt to prefetch
                                 ///  into caches that are on the shader core's data path. This mask must be a
                                 ///  subset of the dstCacheMask specified in the last barrier operation executed
                                 ///  on this memory range. Performing the cache prefetch is considered a read
                                 ///  operation of the specified usage, and so must be properly accounted for
                                 ///  in future barrier memory dependencies for this range.
    bool    addrTranslationOnly; ///< If set, only the address translation caches (i.e., TLB) will be primed;
                                 ///  no data caches will be affected. If this is set, the prefetch operation
                                 ///  has no bearing on barrier execution or memory dependencies.
};

/// Magic number tag for payloads in command buffer dumps.
constexpr uint32 CmdBufferPayloadSignature = 0x1337F77D;

/// Maximum size, in DWORDs, of payload data in command buffer dumps.
constexpr uint32 MaxPayloadSize = 254;

/// Payload types used in special embedded NOP packets.
enum class CmdBufferPayloadType : uint32
{
    Integer           = 0, ///< Payload consists of a single 32-bit signed integer.
    UnsignedInteger   = 1, ///< Payload consists of a single 32-bit unsigned integer.
    Integer64         = 2, ///< Payload consists of a single 64-bit signed integer.
    UnsignedInteger64 = 3, ///< Payload consists of a single 64-bit unsigned integer.
    Float             = 4, ///< Payload consists of a single 32-bit floating point number.
    Double            = 5, ///< Payload consists of a single 64-bit double precision floating point number.
    Pointer           = 6, ///< Payload consists of a single 64-bit pointer address.
    String            = 7, ///< Payload consists of a variable length string. Must contain null-terminator.
    Binary            = 8, ///< Payload consists of DWORD-aligned binary data.
};

/// Structure layout for embedded CmdBuffer payloads. This can be embedded into the command stream with the
/// @ref ICmdBuffer::CmdNop() function.
struct CmdBufferPayload
{
    uint32               signature;   ///< Magic number tag indicating the structure to follow.
    uint32               payloadSize; ///< Size of the NOP packet (one DWORD) plus the sizeof this structure and the
                                      ///  payload data to follow.
                                      ///  This value is in DWORDs. Payload size is expected to be under
                                      ///  MaxPayloadSize.
    CmdBufferPayloadType type;        ///< The type of payload.
uint32 payload[1]; ///< Initial DWORD of payload data with the other data to follow.
};

/// Flags controlling which sub-queue(s) of a command buffer should insert an RGP trace marker. Zeroing out this
/// union is invalid, because RGP markers must be sent to at least one sub-queue.
union RgpMarkerSubQueueFlags
{
    struct
    {
        uint32 includeMainSubQueue    :  1; ///< If set, includes the main sub-queue in the RGP marker.
        uint32 includeGangedSubQueues :  1; ///< If set, includes any ganged sub-queues in the RGP marker.
        uint32 reserved               : 30; ///< Reserved for future use.
    };
    uint32 u32All; ///< Flags packed into a uint32
};

/**
 ***********************************************************************************************************************
 * @interface ICmdBuffer
 * @brief     Contains GPU rendering and other commands recorded by PAL on the client's behalf.
 *
 * A command buffer can be executed by the GPU multiple times and recycled, provided the command buffer is not pending
 * execution on the GPU when it is recycled.
 *
 * Command buffers are fully independent and there is no persistence of GPU state between submitted command buffers.
 * When a new command buffer is recorded, the state is undefined. All relevant state must be explicitly set by the
 * client before state-dependent operations such as draws and dispatches.
 *
 * @see IDevice::CreateCmdBuffer()
 ***********************************************************************************************************************
 */
class ICmdBuffer : public IDestroyable
{
public:
    /// Resets the command buffer's previous contents and state, then puts it in the _building_ _state_, allowing new
    /// commands to be recorded.
    ///
    /// If this is a root command buffer, the state will be reset to a "clean slate" with nothing bound. If this is a
    /// nested command buffer, the state is set to an "undefined" state so that all render state can be inherited from
    /// any root command buffer which executes this one.
/// /// @param [in] info Controls how PAL will generate commands for this command buffer. E.g., specifies whether the /// command buffer may be submitted more than once, and controls options for optimizing PM4, etc. /// /// @returns Success if the command buffer was successfully reset and put into the _building_ _state_. Otherwise, /// one of the following error codes may be returned: /// + ErrorInvalidFlags if invalid flags are set in the flags parameter. /// + ErrorIncompleteCommandBuffer if the command buffer is already in the _building_ _state_. virtual Result Begin( const CmdBufferBuildInfo& info) = 0; /// Completes recording of a command buffer in the _building_ _state_, making it _executable_. /// /// @returns Success if the command buffer was successfully made _executable_. Otherwise, one of the following /// errors may be returned: /// + ErrorIncompleteCommandBuffer if the command buffer is not in the _building_ _state_. /// + ErrorBuildingCommandBuffer if some error occurred while building the command buffer, and it could not /// be made _executable_. If this error is returned, the command buffer can not be submitted. virtual Result End() = 0; /// Explicitly resets a command buffer, releasing any internal resources associated with it. /// /// This call must be used to reset command buffers that have previously reported a ErrorIncompleteCommandBuffer /// error. /// /// @note @ref Begin will implicitly cause a command buffer to be reset in addition to putting it in the /// _building_ _state_. This method just gives a way to release resources between when the client knows /// it is done with the command buffer and when it is ready to reuse this command buffer object for /// recording new commands. /// /// @param [in] pCmdAllocator If non-null, all future GPU memory allocations will be done using this allocator. /// Otherwise the command buffer will continue to use its current command allocator. 
///
    /// @param [in] returnGpuMemory If true then all GPU memory associated with this command buffer will be returned
    ///                             to the allocator upon reset. If false data chunks will be retained and reused.
    ///                             Note: This flag must be true if changing command allocators.
    ///
    /// @warning If returnGpuMemory is false, the client must guarantee that this command buffer is not queued for
    ///          execution, is not currently being executed, and that all other command buffers that have referenced
    ///          this command buffer in a @ref CmdExecuteNestedCmdBuffers call have also been reset.
    ///
    /// @returns Success if the command buffer was successfully reset. Otherwise, one of the following errors may be
    ///          returned:
    ///          + ErrorUnknown if an internal PAL error occurs.
    virtual Result Reset(ICmdAllocator* pCmdAllocator, bool returnGpuMemory) = 0;

    /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to CmdAllocateEmbeddedData.
    ///
    /// This is a property of the command buffer and its associated command allocator; it may change if the caller
    /// specifies a different command allocator on Reset().
    ///
    /// @returns How many DWORDs of embedded data the command buffer can allocate at once.
    virtual uint32 GetEmbeddedDataLimit() const = 0;

    /// Queries how many DWORDs of embedded data the command buffer can allocate in one call to
    /// CmdAllocateLargeEmbeddedData.
    ///
    /// @returns Number of DWORDs that can be allocated in one call to CmdAllocateLargeEmbeddedData
    virtual uint32 GetLargeEmbeddedDataLimit() const = 0;

    /// Binds a graphics or compute pipeline to the current command buffer state.
    ///
    /// Graphics pipelines must be compiled for the PAL ABI. Compute pipelines must either be compiled for the PAL ABI
    /// or the HSA ABI, if it's supported. HSA ABI support is indicated by supportHsaAbi in @ref DeviceProperties.
    ///
    /// PAL ABI pipelines and HSA ABI pipelines use different mechanisms to bind inputs and outputs. PAL ABI pipelines
    /// use user data entries set by @ref CmdSetUserData. HSA ABI pipelines use kernel arguments set by @ref
    /// CmdSetKernelArguments. Binding or unbinding a compute pipeline can implicitly modify the user data and kernel
    /// argument state, please read the @ref CmdSetUserData and @ref CmdSetKernelArguments documentation for details.
    ///
    /// @param [in] params Parameters necessary to manage dynamic pipeline shader information.
    virtual void CmdBindPipeline(
        const PipelineBindParams& params) = 0;

    /// Binds the specified MSAA state object to the current command buffer state.
    ///
    /// @param [in] pMsaaState New MSAA state to be bound. Can be null in order to unbind a previously bound MSAA state
    ///                        object without binding a new one.
    virtual void CmdBindMsaaState(
        const IMsaaState* pMsaaState) = 0;

    /// Saves a copy of all of the current command buffer state that is used by graphics workloads. This feature is
    /// intended to give PAL clients a convenient way to issue their own internal graphics workloads without modifying
    /// the application-facing state.
    ///
    /// PAL cannot save multiple layers of state, each call to CmdSaveGraphicsState must be followed by a call to
    /// CmdRestoreGraphicsState before the next call to CmdSaveGraphicsState. Any barriers, resolves, blits, etc are not
    /// allowed while the state is pushed.
    ///
    /// This function can only be called on command buffers that support graphics workloads. All query counters will be
    /// disabled until CmdRestoreGraphicsState is called.
    virtual void CmdSaveGraphicsState() = 0;

    /// Restores all of the command buffer state that is used by graphics workloads. This feature is intended to
    /// give PAL clients a convenient way to issue their own internal graphics workloads without modifying the
    /// application-facing state.
    ///
    /// A call to this function must be preceded by a call to CmdSaveGraphicsState.
    ///
    /// This function can only be called on command buffers that support graphics workloads. All previously disabled
    /// query counters will be reactivated.
    virtual void CmdRestoreGraphicsState() = 0;

    /// Sets the shading rate in the command buffer along with the state of the various combiners.
    ///
    /// @param [in] rateParams New VRS shading rate parameters to be bound.
    virtual void CmdSetPerDrawVrsRate(
        const VrsRateParams& rateParams) = 0;

    /// Sets up parameters regarding how pixel center will be evaluated with VRS.
    ///
    /// @param [in] centerState New VRS parameters to be bound that control how pixel center is defined.
    virtual void CmdSetVrsCenterState(
        const VrsCenterState& centerState) = 0;

    /// Binds the shading rate data in the specified image into the pipeline for use with VRS. Only relevant if the
    /// combiner stage for VrsCombinerStage is set to something other than Passthrough.
    ///
    /// This binding point requires use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageSampleRate
    ///   - CacheCoherency: @ref CoherSampleRate
    ///   - ImageLayout:    @ref LayoutSampleRate
    ///
    /// @param [in] pImage Image that contains sample rate data. Pointer can be NULL to force 1x1 shading rate.
    virtual void CmdBindSampleRateImage(
        const IImage* pImage) = 0;

    /// Binds the specified color/blend state object to the current command buffer state.
    ///
    /// @param [in] pColorBlendState New color/blend state to be bound. Can be null in order to unbind a previously
    ///                              bound color/blend state object without binding a new one.
    virtual void CmdBindColorBlendState(
        const IColorBlendState* pColorBlendState) = 0;

    /// Binds the specified depth/stencil state object to the current command buffer state.
    ///
    /// @param [in] pDepthStencilState New depth/stencil state to be bound. Can be null in order to unbind a previously
    ///                                bound depth/stencil state object without binding a new one.
    virtual void CmdBindDepthStencilState(
        const IDepthStencilState* pDepthStencilState) = 0;

    /// Sets the value range to be used for depth bounds testing.
/// /// The depth bounds test is enabled in the graphics pipeline. When enabled, an additional check will be done that /// will reject a pixel if the pre-existing depth value stored at its destination location is outside of the /// specified bounds. Applications would typically use this feature to optimize shadow volume rendering. /// /// @param [in] params Parameters necessary to set the depth bounds (such as min/max depth). virtual void CmdSetDepthBounds( const DepthBoundsParams& params) = 0; /// Sets pipeline-accessible user data to the specified values. /// /// The values set in user data entries will be interpreted based on the resource mapping specified for each shader /// in the currently bound pipeline. For example, the client can write virtual addresses of tables containing /// SRDs, immediate SRDs that can be loaded without an indirection, or even a small number of immediate ALU /// constants. /// /// The user data values are only used by PAL ABI pipelines. Almost all pipelines used by PAL clients are compiled /// for the PAL ABI, but PAL also supports HSA ABI compute pipelines which use @ref CmdSetKernelArguments instead. /// When an HSA ABI pipeline is bound the current compute user data entries are saved and will be restored if the /// client later binds a PAL ABI compute pipeline. /// /// @warning It's illegal to set compute user data if an HSA ABI pipeline is currently bound. /// /// If no compute pipeline is currently bound PAL assumes the client will bind a PAL ABI pipeline and thus accepts /// user data bindings. Graphics user data are unaffected by all of this because graphics pipelines can only use /// the PAL ABI. /// /// @see PipelineShaderInfo /// @see ResourceMappingNode /// @ingroup ResourceBinding /// /// @param [in] bindPoint Specifies which type of user-date is to be set (i.e., compute or graphics). /// @param [in] firstEntry First user data entry to be updated. 
/// @param [in] entryCount Number of user data entries to update; size of the pEntryValues array. Must be greater /// than zero, and (firstEntry + entryCount) must not extend beyond MaxUserDataEntries. /// @param [in] pEntryValues Array of 32-bit values to be copied into user data. void CmdSetUserData( PipelineBindPoint bindPoint, uint32 firstEntry, uint32 entryCount, const uint32* pEntryValues) { (m_funcTable.pfnCmdSetUserData[static_cast(bindPoint)])(this, firstEntry, entryCount, pEntryValues); } /// Copies all pipeline-accessible user-data from one bind point to another. It is invalid if the source and /// dest parameters refer to the same bind point. /// /// @see CmdSetUserData for how the user-date entries will be interpreted by the pipeline. /// /// @param [in] source Specifies which bind point to copy from. /// @param [in] dest Specifies which bind point to copy into. virtual void CmdDuplicateUserData( PipelineBindPoint source, PipelineBindPoint dest) = 0; /// Sets one or more HSA code object kernel argument values. /// /// If the currently bound compute pipeline was compiled using the HSA compute ABI this function must be used to /// bind that pipeline's arguments. The argument position and value types are static properties of the pipeline /// and must be known by the client. /// /// @note Calling @ref CmdBindPipeline invalidates all prior kernel argument bindings, even if the new pipeline /// also uses the HSA ABI. Any kernel arguments that the client intends to share between pipelines must /// be manually rebound. /// /// @warning It's illegal to call this function if no compute pipeline is bound or if the bound compute pipeline /// uses a different ABI (e.g., the PAL compute ABI). /// /// @ingroup ResourceBinding /// /// @param [in] firstArg The zero-based position of the first kernel argument to bind. /// @param [in] argCount Number of kernel arguments this call binds. /// @param [in] ppValues Array of pointers to kernel argument values. 
virtual void CmdSetKernelArguments(
        uint32            firstArg,
        uint32            argCount,
        const void*const* ppValues) = 0;

    /// Changes one or more of the command buffer's active vertex buffers.
    ///
    /// @note If bufferViews.offsetMode is false, PAL will construct SRDs for each bound vertex buffer which are
    ///       equivalent to the client calling @ref IDevice::CreateUntypedBufferViewSrd on each element of the
    ///       pBuffers parameter.
    ///
    /// Note that vertex buffers require use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageVs
    ///   - CacheCoherency: @ref CoherShaderRead
    ///
    /// @param [in] bufferViews Vertex buffer view descriptors. This parameter defines which vertex mode is used through
    ///                         @ref VertexBufferViews::offsetMode. VertexBufferViews::pVertexBufferViews or
    ///                         VertexBufferViews::pBufferViewInfos must not be nullptr.
    virtual void CmdSetVertexBuffers(
        const VertexBufferViews& bufferViews) = 0;

#if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 877
    /// Legacy overload, only compiled for client interface versions older than 877. Packs its arguments into a
    /// @ref VertexBufferViews with offsetMode set to false and forwards to the overload above.
    ///
    /// @param [in] firstBuffer Stored in VertexBufferViews::firstBuffer.
    /// @param [in] bufferCount Stored in VertexBufferViews::bufferCount.
    /// @param [in] pBuffers    Stored in VertexBufferViews::pBufferViewInfos.
    void CmdSetVertexBuffers(
        uint32                firstBuffer,
        uint32                bufferCount,
        const BufferViewInfo* pBuffers)
    {
        const VertexBufferViews bufferViews =
        {
            .firstBuffer      = firstBuffer,
            .bufferCount      = bufferCount,
            .offsetMode       = false,
            .pBufferViewInfos = pBuffers
        };
        CmdSetVertexBuffers(bufferViews);
    }
#endif

    /// Binds a range of memory for use as index data (i.e., binds an index buffer).
    ///
    /// The GPU virtual address must be index element aligned: 2-byte aligned for 16-bit indices or 4-byte aligned for
    /// 32-bit indices.
    ///
    /// The index buffer binding point requires use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageFetchIndices
    ///   - CacheCoherency: @ref CoherIndexData
    ///
    /// @param [in] gpuAddr    GPU virtual address of the index data. Can be zero to unbind the previously bound data.
    /// @param [in] indexCount Maximum number of indices in the index data; the GPU may read fewer indices.
    /// @param [in] indexType  Specifies whether to use 8-bit, 16-bit or 32-bit index data.
virtual void CmdBindIndexData(
        gpusize   gpuAddr,
        uint32    indexCount,
        IndexType indexType) = 0;

    /// Binds color and depth/stencil targets to the current command buffer state.
    ///
    /// The current layout of each target must also be specified.
    ///
    /// The color target binding points require use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageColorTarget
    ///   - CacheCoherency: @ref CoherColorTarget
    ///   - ImageLayout:    @ref LayoutColorTarget
    ///
    /// The depth and stencil target binding points require use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget
    ///   - CacheCoherency: @ref CoherDepthStencilTarget
    ///   - ImageLayout:    @ref LayoutDepthStencilTarget
    ///
    /// @param [in] params Parameters representing the color and depth/stencil targets to bind to the command buffer.
    virtual void CmdBindTargets(
        const BindTargetParams& params) = 0;

    /// Binds stream-output target buffers to the current command buffer state.
    ///
    /// At draw-time, the stream-output targets must be consistent with the soState parameters specified by the
    /// currently bound graphics pipeline.
    ///
    /// The stream-output target buffers require use of the following barrier flags:
    ///   - PipelineStage:  @ref PipelineStageStreamOut
    ///   - CacheCoherency: @ref CoherStreamOut
    ///
    /// @param [in] params Parameters representing the stream-output target buffers to bind to the command buffer.
    virtual void CmdBindStreamOutTargets(
        const BindStreamOutTargetParams& params) = 0;

    /// Sets the constant factor to be used by the blend hardware when programmed with the Blend::ConstantColor,
    /// Blend::OneMinusConstantColor, Blend::ConstantAlpha, or Blend::OneMinusConstantAlpha blend coefficients.
    ///
    /// @param [in] params Parameters representing the blend constant factor.
    virtual void CmdSetBlendConst(
        const BlendConstParams& params) = 0;

    /// Sets input assembly state for upcoming draws in this command buffer.
/// /// At draw-time, the topology specified with this method must be consistent with the _topologyInfo_ parameters /// specified by the currently bound graphics pipeline. /// /// @param [in] params Parameters representing the input assembly state for upcoming draws. virtual void CmdSetInputAssemblyState( const InputAssemblyStateParams& params) = 0; /// Sets parameters controlling triangle rasterization. /// /// @param [in] params Parameters to set the triangle raster state (such as fill/cull mode). virtual void CmdSetTriangleRasterState( const TriangleRasterStateParams& params) = 0; /// Sets parameters controlling point and line rasterization. /// /// @param [in] params Parameters to set the point and line rasterization state (such as pointSize and lineWidth). virtual void CmdSetPointLineRasterState( const PointLineRasterStateParams& params) = 0; /// Sets parameters controlling line stippling. /// /// @param [in] params Parameters to set the line stipple state. virtual void CmdSetLineStippleState( const LineStippleStateParams& params) = 0; /// Sets depth bias parameters. /// /// Depth bias is used to ensure a primitive can properly be displayed (without Z fighting) in front (or behind) /// of the previously rendered co-planar primitive. This is useful for decal or shadow rendering. /// /// @param [in] params Parameters for setting the depth bias (such as depth bias, depth bias clamp, and slope /// scaled depth bias). virtual void CmdSetDepthBiasState( const DepthBiasParams& params) = 0; /// Sets stencil reference values and mask buffer reads and writes in upcoming draws. Separate reference values /// can be specified for front-facing and back-facing polygons. Update flags should be set for state which needs to /// be updated. All other state will be preserved. /// Setting all the values (reference, read/write masks and stencil op) in the StencilRefMaskParams together /// takes the faster path. 
/// Setting either the ref value, read/write masks or the stencil op value individually takes the slower /// read-modify-write path. /// /// @param [in] params Parameters for setting the stencil read and write masks. virtual void CmdSetStencilRefMasks( const StencilRefMaskParams& params) = 0; /// Sets user defined clip planes, should only be called on universal command buffers. /// /// @param [in] firstPlane The index of first plane in user define clip plane array. /// @param [in] planeCount The count of planes in plane array. /// @param [in] pPlanes Pointer to plane array. virtual void CmdSetUserClipPlanes( uint32 firstPlane, uint32 planeCount, const UserClipPlane* pPlanes) = 0; /// Sets clip rects, should only be called on universal command buffers. /// /// @param [in] clipRule 16-bit clip rule bits are used to determine if pixel shall be discarded or retained. /// For each pixel, a 4-bit index is computed based on which clip rects the pixel is /// inside (bitN represents rectN). Then uses this index to check the corresponding bit /// in clip rule for this pixel - 0 for discarded, 1 for retained. /// @param [in] rectCount The count of rectangles in rect list. This must be less than or equal to /// MaxClipRects (4). /// @param [in] pRectList Pointer to the rect list. virtual void CmdSetClipRects( uint16 clipRule, uint32 rectCount, const Rect* pRectList) = 0; /// Sets user defined MSAA quad-pixel sample pattern, should only be called on universal command buffers /// This should be called before clearing, rendering, barriering and resolving of MSAA DepthStencil image. /// /// @param [in] numSamplesPerPixel Number of samples per pixel /// @param [in] quadSamplePattern The input msaa sample pattern virtual void CmdSetMsaaQuadSamplePattern( uint32 numSamplesPerPixel, const MsaaQuadSamplePattern& quadSamplePattern) = 0; /// Sets the specified viewports to the current command buffer state. 
/// /// @param [in] params Parameters for setting the specified number of viewports. virtual void CmdSetViewports( const ViewportParams& params) = 0; /// Sets the scissor regions corresponding to each viewport to the current command buffer state. /// /// @param [in] params Parameters for setting the specified number of scissor regions. virtual void CmdSetScissorRects( const ScissorRectParams& params) = 0; /// Sets the global scissor rectangle. /// /// @param [in] params Parameters for setting the global scissor rectangle from the top left to bottom right /// coordinate. virtual void CmdSetGlobalScissor( const GlobalScissorParams& params) = 0; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 928 /// Inserts a barrier in the current command stream that can stall GPU execution, flush/invalidate caches, or /// decompress images before further, dependent work can continue in this command buffer. /// /// This operation does not honor the command buffer's predication state, if active. /// /// @param [in] barrierInfo See @ref BarrierInfo for detailed information. virtual void CmdBarrier( const BarrierInfo& barrierInfo) = 0; #endif /// Performs source pipeline stage and cache access optimization based on the acquire/release interface. /// /// @param [in] barrierType Barrier transition type @ref BarrierType. /// @param [in] pImage Image pointer for image transition, required when @ref BarrierType is /// BarrierType::Image. /// @param [in,out] pSrcStageMask A source mask of ORed @ref PipelineStageFlag to optimize, can't be null. /// @param [in,out] pSrcAccessMask A source mask of ORed @ref CacheCoherencyUsageFlags to optimize, can't be null. /// @param [in,out] pDstStageMask A destination mask of ORed @ref PipelineStageFlag to optimize, can't be null. /// @param [in,out] pDstAccessMask A destination mask of ORed @ref CacheCoherencyUsageFlags to optimize. /// /// @returns True if the GL2 cache needs to be flushed and invalidated.
/// /// @note PipelineStageBlt will be converted to more accurate stage(s) based on the underlying implementation of /// outstanding BLTs, but will be left as PipelineStageBlt if the internal outstanding BLTs can't be expressed /// as a client-facing PipelineStage (e.g., if there are CP DMA BLTs in flight). virtual bool OptimizeAcqRelReleaseInfo( BarrierType barrierType, const IImage* pImage, uint32* pSrcStageMask, uint32* pSrcAccessMask, uint32* pDstStageMask, uint32* pDstAccessMask) const = 0; /// Performs the release portion of an acquire/release-based barrier. This releases a set of resources from their /// current usage, while CmdAcquire() is expected to be called to acquire access to the resources for future, /// different usage. /// /// Conceptually, this method will: /// - Ensure the specified source synchronization scope has completed. /// - Ensure all specified resources are available in memory. The availability operation will flush all /// write-back caches to the last-level-cache. /// - Perform any requested layout transitions. /// /// Once all of these operations are complete, the release issues a timestamp event that signals the operation /// completion. The event type and timestamp value are returned to the caller in a packed token (a uint32 when /// PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885, a ReleaseToken otherwise). A corresponding /// CmdAcquire() call is expected to wait on one or a list of such synchronization tokens and perform any necessary /// visibility operations and/or layout transitions that could not be predicted at release-time. /// /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout /// transitions. /// @returns Synchronization token for the release operation. Pass this token to CmdAcquire to confirm completion. #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 virtual uint32 CmdRelease( #else virtual ReleaseToken CmdRelease( #endif const AcquireReleaseInfo& releaseInfo) = 0; /// Performs the acquire portion of an acquire/release-based barrier.
This acquire a set of resources for a new /// set of usages, assuming CmdRelease() was called to release access for the resource's past usage. /// /// Conceptually, this method will: /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all /// relevant caches above the last-level-cache. /// - Perform any requested layout transitions. /// - Ensure the release(s) have completed by waiting on the synchronization token of the release operation. /// /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required /// layout transitions. /// @param [in] syncTokenCount Number of entries in pSyncTokens, can be zero if there is no valid release token. /// @param [in] pSyncTokens Array of synchronization tokens, as returned from CmdRelease, to confirm completion. /// The token value(s) must have been returned by a CmdRelease call in the same command /// buffer. pSyncTokens can be null if syncTokenCount is 0. virtual void CmdAcquire( const AcquireReleaseInfo& acquireInfo, uint32 syncTokenCount, #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 885 const uint32* pSyncTokens) = 0; #else const ReleaseToken* pSyncTokens) = 0; #endif /// Performs the release portion of an acquire/release event-based barrier. This releases a set of resources from /// their current usage, while CmdAcquireEvent() is expected to be called to acquire access to the resources for /// future, different usage. /// /// Conceptually, this method will: /// - Ensure the specified source synchronization scope has completed. /// - Ensure all specified resources are available in memory. The availability operation will flush all /// write-back caches to the last-level-cache. /// - Perform any requested layout transitions. /// /// Once all of these operations are complete, the specified IGpuEvent object will be signaled.
A corresponding /// CmdAcquireEvent() call is expected to wait on this event and perform any necessary visibility operations and/or /// layout transitions that could not be predicted at release-time. /// /// @param [in] releaseInfo Describes the synchronization scope, availability operations, and required layout /// transitions. /// @param [in] pGpuEvent Event to be signaled once the release has completed. Must be a valid (non-null) GPU /// event pointer. Call the CmdRelease()/CmdAcquire() pair instead if you want to release/acquire /// something but no GPU event is available. virtual void CmdReleaseEvent( const AcquireReleaseInfo& releaseInfo, const IGpuEvent* pGpuEvent) = 0; /// Performs the acquire portion of an acquire/release event-based barrier. This acquires a set of resources for a /// new set of usages, assuming CmdReleaseEvent() was called to release access for the resource's past usage. /// /// Conceptually, this method will: /// - Ensure the release(s) have completed by waiting for the specified IGpuEvent early enough in the pipeline to /// support the specified destination synchronization scope. /// - Ensure all specified resources are visible in memory. The visibility operation will invalidate all /// relevant caches above the last-level-cache. /// - Perform any requested layout transitions. /// /// @param [in] acquireInfo Describes the synchronization scope, visibility operations, and the required /// layout transitions. /// @param [in] gpuEventCount Number of entries in ppGpuEvents. /// @param [in] ppGpuEvents Array of one or more events to wait on. Typically these will be set via /// CmdReleaseEvent(), but it's valid to wait on an event set through a different means /// like CmdSetEvent() from the CPU side. Must be a valid (non-null) pointer to an array of /// gpuEventCount valid GPU event pointers. Call CmdReleaseThenAcquire() instead if you want /// to acquire something but no GPU event is available.
virtual void CmdAcquireEvent( const AcquireReleaseInfo& acquireInfo, uint32 gpuEventCount, const IGpuEvent* const* ppGpuEvents) = 0; /// Conceptually equivalent to calling CmdRelease() followed immediately by CmdAcquire(), but it potentially has /// better performance than calling CmdRelease()/CmdAcquire() directly. Can be called in cases where the client/ /// application cannot detect separate release and acquire points for a transition. /// /// @param [in] barrierInfo Describes the synchronization scopes, availability/visibility operations, and the /// required layout transitions. virtual void CmdReleaseThenAcquire( const AcquireReleaseInfo& barrierInfo) = 0; /// Issues an instanced, non-indexed draw call using the command buffer's currently bound graphics state. Results /// in instanceCount * vertexCount vertices being processed. /// /// It is an error if the currently bound pipeline contains a mesh and/or task shader. /// /// @param [in] firstVertex Starting index value for the draw. Indices passed to the vertex shader will range /// from firstVertex to firstVertex + vertexCount - 1. /// @param [in] vertexCount Number of vertices to draw. If zero, the draw will be discarded. /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range /// from firstInstance to firstInstance + instanceCount - 1. /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. /// @param [in] drawId Draw index for the draw. void CmdDraw( uint32 firstVertex, uint32 vertexCount, uint32 firstInstance, uint32 instanceCount, uint32 drawId) { m_funcTable.pfnCmdDraw(this, firstVertex, vertexCount, firstInstance, instanceCount, drawId); } /// Issues a draw opaque call using the command buffer's currently bound graphics state. /// Uses the stream-out target of a previous draw as the input vertex data.
/// The number of vertices = (streamOutFilledSize (value of streamOutFilledSizeVa) - streamOutOffset) / stride /// /// It is an error if the currently bound pipeline contains a mesh and/or task shader. /// /// @param [in] streamOutFilledSizeVa GPU address of the streamOut filled size for the streamOut buffer. /// @param [in] streamOutOffset Offset of the beginning of the streamOut data used as vertices. /// @param [in] stride Stride of the stream data used as vertices. /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader /// will range from firstInstance to firstInstance + instanceCount - 1. /// @param [in] instanceCount Number of instances to draw. If zero, the draw will be discarded. void CmdDrawOpaque( gpusize streamOutFilledSizeVa, uint32 streamOutOffset, uint32 stride, uint32 firstInstance, uint32 instanceCount) { m_funcTable.pfnCmdDrawOpaque(this, streamOutFilledSizeVa, streamOutOffset, stride, firstInstance, instanceCount); } /// Issues an instanced, indexed draw call using the command buffer's currently bound graphics state. Results in /// instanceCount * indexCount vertices being processed. /// /// It is an error if the currently bound pipeline contains a mesh and/or task shader. /// /// Indices passed to the vertex shader will be: /// /// + IndexBuffer[firstIndex] + vertexOffset, /// + IndexBuffer[firstIndex + 1] + vertexOffset, /// + ... /// + IndexBuffer[firstIndex + indexCount - 1] + vertexOffset /// /// @param [in] firstIndex Starting index buffer slot for the draw. /// @param [in] indexCount Number of indices to draw. If zero, the draw will be discarded. /// @param [in] vertexOffset Offset added to the index fetched from the index buffer before it is passed to the /// vertex shader. /// @param [in] firstInstance Starting instance for the draw. Instance IDs passed to the vertex shader will range /// from firstInstance to firstInstance + instanceCount - 1. /// @param [in] instanceCount Number of instances to draw.
If zero, the draw will be discarded. /// @param [in] drawId Draw index for the draw. void CmdDrawIndexed( uint32 firstIndex, uint32 indexCount, int32 vertexOffset, uint32 firstInstance, uint32 instanceCount, uint32 drawId) { m_funcTable.pfnCmdDrawIndexed(this, firstIndex, indexCount, vertexOffset, firstInstance, instanceCount, drawId); } /// Issues instanced, non-indexed draw calls using the command buffer's currently bound graphics state. The draw /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find /// the next indirect args structure in GPU memory. Each draw call will be discarded if its vertexCount or /// instanceCount is zero. /// /// The layout of the argument data is defined in the @ref DrawIndirectArgs structure. /// /// It is an error if the currently bound pipeline contains a mesh and/or task shader. /// /// This function requires use of the following barrier flags on the indirect memory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @see CmdDraw /// @see DrawIndirectArgs /// /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in /// memory from one structure to another. /// The virtual address must be 4-byte aligned. /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr /// is nonzero, the value at that memory location is clamped to /// this maximum. If countGpuAddr is zero, then the number of draws /// issued exactly matches this number. /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. /// Must be 4-byte aligned.
void CmdDrawIndirectMulti( GpuVirtAddrAndStride gpuVirtAddrAndStride, uint32 maximumCount, gpusize countGpuAddr) { m_funcTable.pfnCmdDrawIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); } /// Issues instanced, indexed draw calls using the command buffer's currently bound graphics state. The draw /// arguments come from GPU memory. This command will issue count draw calls, using the provided stride to find /// the next indirect args structure in GPU memory. Each draw call will be discarded if its indexCount or /// instanceCount is zero. /// /// The layout of the argument data is defined in the @ref DrawIndexedIndirectArgs structure. /// /// It is an error if the currently bound pipeline contains a mesh and/or task shader. /// /// This function requires use of the following barrier flags on the indirect memory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @see CmdDrawIndexed /// @see DrawIndexedIndirectArgs /// /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in /// memory from one structure to another. /// The virtual address must be 4-byte aligned. /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr /// is nonzero, the value at that memory location is clamped to /// this maximum. If countGpuAddr is zero, then the number of draws /// issued exactly matches this number. /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. /// Must be 4-byte aligned. void CmdDrawIndexedIndirectMulti( GpuVirtAddrAndStride gpuVirtAddrAndStride, uint32 maximumCount, gpusize countGpuAddr) { m_funcTable.pfnCmdDrawIndexedIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); } #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909 /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state.
/// /// The thread group size is defined in the compute shader. /// /// Supports PAL ABI and HSA ABI pipelines. /// /// This overload is only compiled when PAL_CLIENT_INTERFACE_MAJOR_VERSION < 909 and dispatches with empty /// (default-initialized) info flags. /// /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. inline void CmdDispatch( DispatchDims size) { m_funcTable.pfnCmdDispatch(this, size, {}); } #endif /// Dispatches a compute workload of the given dimensions using the command buffer's currently bound compute state. /// /// The thread group size is defined in the compute shader. /// /// Supports PAL ABI and HSA ABI pipelines. /// /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. /// @param [in] infoFlags Additional information about the dispatch. void CmdDispatch( DispatchDims size, DispatchInfoFlags infoFlags) { m_funcTable.pfnCmdDispatch(this, size, infoFlags); } /// Dispatches a compute workload using the command buffer's currently bound compute state. The dimensions of the /// workload come from GPU memory. The dispatch will be discarded if any of its dimensions are zero. /// /// The layout of the argument data is defined in the @ref DispatchIndirectArgs structure. /// /// @warning Does not support HSA ABI pipelines. /// /// This function requires use of the following barrier flags on the indirect memory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @see CmdDispatch /// @see DispatchIndirectArgs /// /// @param [in] gpuVirtAddr GPU virtual memory address where the indirect argument data is located. /// The virtual address must be 4-byte aligned. void CmdDispatchIndirect( gpusize gpuVirtAddr) { m_funcTable.pfnCmdDispatchIndirect(this, gpuVirtAddr); } /// Dispatches a compute workload of the given dimensions and offsets using the command buffer's currently bound /// compute state. This command allows targeting regions of thread groups without adding the offset computations in /// the shader.
/// /// The caller may also provide a logical thread group count which is larger than the number of groups actually /// launched. If the shader reads the dispatch's thread group count from PAL metadata it will see the logical size, /// not the launch size. /// /// The combination of an offset, launch size, and logical size gives the caller enough flexibility to take an /// incoming dispatch, split it up into sub-dispatches, and execute those sub-dispatches using multiple /// CmdDispatchOffset calls in whatever execution pattern they would like. Note that such an optimization /// would not work if the shader has global logic that makes assumptions about thread group launch order. /// /// The thread group size is defined in the compute shader. /// /// Supports PAL ABI and HSA ABI pipelines. /// /// @param [in] offset The thread group offsets. Set them to zero if you don't want an offset. /// @param [in] launchSize Thread groups to dispatch. If any components are zero the dispatch will be discarded. /// @param [in] logicalSize The thread group dimensions reported to the shader via metadata. void CmdDispatchOffset( DispatchDims offset, DispatchDims launchSize, DispatchDims logicalSize) { m_funcTable.pfnCmdDispatchOffset(this, offset, launchSize, logicalSize); } /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if /// the currently bound graphics pipeline does not contain a mesh and/or task shader. /// /// The thread group size is defined in the mesh shader or task shader. /// /// @param [in] size Thread groups to dispatch. If any components are zero the dispatch will be discarded. void CmdDispatchMesh( DispatchDims size) { m_funcTable.pfnCmdDispatchMesh(this, size); } /// Dispatches a mesh shader workload using the command buffer's currently bound graphics state. It is an error if /// the currently bound graphics pipeline does not contain a mesh shader.
The dimensions of the workload come from /// GPU memory. The dispatch will be discarded if any of its dimensions are zero. /// /// The layout of the argument data is defined in the @ref DispatchMeshIndirectArgs structure. /// /// This function requires use of the following barrier flags on the indirect memory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @see CmdDispatchMesh /// @see DispatchMeshIndirectArgs /// /// @param [in] gpuVirtAddrAndStride GPU virtual address where the indirect argument data is located and stride in /// memory from one structure to another. /// The virtual address must be 4-byte aligned. /// @param [in] maximumCount Maximum count of data structures to loop through. If countGpuAddr /// is nonzero, the value at that memory location is clamped to /// this maximum. If countGpuAddr is zero, then the number of draws /// issued exactly matches this number. /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. /// Must be 4-byte aligned. void CmdDispatchMeshIndirectMulti( GpuVirtAddrAndStride gpuVirtAddrAndStride, uint32 maximumCount, gpusize countGpuAddr) { m_funcTable.pfnCmdDispatchMeshIndirectMulti(this, gpuVirtAddrAndStride, maximumCount, countGpuAddr); } /// Copies multiple regions from one GPU memory allocation to another. /// /// None of the destination regions are allowed to overlap each other, nor are destination and source regions /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping /// will cause undefined results. /// /// This call should be used for buffer memory copy only; don't use it for image memory. /// /// For best performance, offsets and copy sizes should be 4-byte aligned.
/// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// /// @param [in] srcGpuMemory GPU memory allocation where the source regions are located. /// @param [in] dstGpuMemory GPU memory allocation where the destination regions are located. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, and /// copy size. virtual void CmdCopyMemory( const IGpuMemory& srcGpuMemory, const IGpuMemory& dstGpuMemory, uint32 regionCount, const MemoryCopyRegion* pRegions) = 0; /// Copies multiple regions from one GPU memory virtual address to another. /// /// @note The CmdCopyMemory() path should be preferred because it contains more optimizations due to more /// knowledge about the memory itself that is lost when only virtual addresses are passed in. /// /// None of the destination regions are allowed to overlap each other, nor are destination and source regions /// allowed to overlap when the source and destination GPU memory virtual addresses are the same. Any illegal /// overlapping will cause undefined results. /// /// For best performance, addresses, offsets, and copy sizes should be 4-byte aligned. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// /// @param [in] srcGpuVirtAddr GPU memory virtual address where the source regions are located. /// @param [in] dstGpuVirtAddr GPU memory virtual address where the destination regions are located. /// @param [in] regionCount Number of regions to copy; size of the pRegions array.
/// @param [in] pRegions Array of copy regions, each entry specifying a source offset, destination offset, /// and copy size. virtual void CmdCopyMemoryByGpuVa( gpusize srcGpuVirtAddr, gpusize dstGpuVirtAddr, uint32 regionCount, const MemoryCopyRegion* pRegions) = 0; /// Copies multiple regions from one image to another. /// /// The source and destination subresource of a particular region are not allowed to be the same; doing so will produce /// undefined results. Additionally, destination subresources cannot be present more than once per CmdCopyImage() /// call. /// /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image /// extents are specified in compression blocks. /// /// The source and destination images must be of the same type (1D, 2D or 3D), or optionally 2D and 3D with the /// number of slices matching the depth. MSAA source and destination images must have the same number of samples. /// /// Each region must satisfy these restrictions: /// - srcOffset >= 0 and dstOffset >= 0 /// - srcOffset + extent <= srcSubres's extent /// - dstOffset + extent <= dstSubres's extent /// /// Images copied via this function must have x/y/z offsets and width/height/depth extents aligned to the minimum /// tiled copy alignment specified in @ref DeviceProperties for the engine this function is executed on. Note that /// the DMA engine supports tiled copies regardless of the alignment; the reported minimum tiled copy alignments /// are an indication of the minimum alignments for which the copy will be performant. /// /// When the per-engine capability flag supportsMismatchedTileTokenCopy (@see DeviceProperties) is false, /// CmdCopyImage is only valid between two subresources that share the same tileToken (@see SubresLayout).
/// /// Note that the copy can go through clone copy automatically if: /// - Both source and destination images are created with @ref ImageCreateInfo::flags::cloneable = 1 /// - Both source and destination images have the same @ref ImageCreateInfo /// - The source image's layout is compatible with the destination image's layout /// - This is a full image copy /// - Copy flags (@ref CopyControlFlags) are 0. /// /// Basically, a clone copy clones all subresources' data of one image object into another while preserving the image /// layout. It does a raw copy of image data and metadata, and tries to keep the metadata (like DCC/HiZ/HiS) /// unchanged, but this may not hold due to different HW designs. /// E.g., client compression (fragment and ZPlane compression) will be missed during the compute-based raw copy. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. /// /// @param [in] srcImage Image where source regions reside. /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] dstImage Image where destination regions reside. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a source subresource, destination /// subresource, source x/y/z offset, destination x/y/z offset, and copy size in the /// x/y/z dimensions.
/// @param [in] pScissorRect Rectangle for scissor test. /// @param [in] flags A mask of ORed @ref CopyControlFlags that can be used to control copy behavior. virtual void CmdCopyImage( const IImage& srcImage, ImageLayout srcImageLayout, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const ImageCopyRegion* pRegions, const Rect* pScissorRect, uint32 flags) = 0; /// Copies data directly (without format conversion) from a GPU memory object to an image. /// /// For compressed images, the extents are specified in compression blocks. /// /// The size of the data copied from memory is implicitly derived from the image extents. /// /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination /// subresource cannot be present more than once per CmdCopyMemoryToImage() call. /// /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopyDst /// /// @param [in] srcGpuMemory GPU memory where the source data is located. /// @param [in] dstImage Image where destination data will be written. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. 
virtual void CmdCopyMemoryToImage( const IGpuMemory& srcGpuMemory, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const MemoryImageCopyRegion* pRegions) = 0; /// Copies data directly (without format conversion) from an image to a GPU memory object. /// /// For compressed images, the extents are specified in compression blocks. /// /// The size of the data copied to memory is implicitly derived from the image extents. /// /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A /// destination region cannot be present more than once per CmdCopyImageToMemory() call. /// /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopySrc /// /// @param [in] srcImage Image where source data will be read from. /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] dstGpuMemory GPU memory where the destination data will be written. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. virtual void CmdCopyImageToMemory( const IImage& srcImage, ImageLayout srcImageLayout, const IGpuMemory& dstGpuMemory, uint32 regionCount, const MemoryImageCopyRegion* pRegions) = 0; /// Copies data directly (without format conversion) from a GPU memory object to a PRT. /// /// The image offset and extents are in units of tiles. 
@see ImageMemoryLayout for the size of a tile in texels. /// This function always copies entire tiles, even if parts of the tile are internal padding. /// /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that /// operate in texels like the generic CmdCopyMemoryToImage() should be used instead. /// /// The size of the data copied from memory is implicitly derived from the image extents. /// /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination /// subresource cannot be present more than once per CmdCopyMemoryToTiledImage() call. /// /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopyDst /// /// @param [in] srcGpuMemory GPU memory where the source data is located. /// @param [in] dstImage Image where destination data will be written. Must have the "prt" flag set. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, a destination /// subresource, destination x/y/z offset, and copy size in the x/y/z dimensions. virtual void CmdCopyMemoryToTiledImage( const IGpuMemory& srcGpuMemory, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const MemoryTiledImageCopyRegion* pRegions) = 0; /// Copies data directly (without format conversion) from a PRT to a GPU memory object. 
/// /// The image offset and extents are in units of tiles. @see ImageMemoryLayout for the size of a tile in texels. /// This function always copies entire tiles, even if parts of the tile are internal padding. /// /// This function cannot be used to copy any subresources stored in the packed mip tail. Other copy functions that /// operate in texels like the generic CmdCopyImageToMemory() should be used instead. /// /// The size of the data copied to memory is implicitly derived from the image extents. /// /// The destination memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A /// destination region cannot be present more than once per CmdCopyTiledImageToMemory() call. /// /// Each region's imageOffset must be >= 0 and imageOffset + imageExtent must be <= imageSubres's extent. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopySrc /// /// @param [in] srcImage Image where source data will be read from. /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] dstGpuMemory GPU memory where the destination data will be written. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source /// subresource, source x/y/z offset, and copy size in the x/y/z dimensions. 
virtual void CmdCopyTiledImageToMemory( const IImage& srcImage, ImageLayout srcImageLayout, const IGpuMemory& dstGpuMemory, uint32 regionCount, const MemoryTiledImageCopyRegion* pRegions) = 0; /// Copies multiple regions directly (without format conversion) from one typed buffer to another. /// /// For compressed formats, the extents are specified in compression blocks. /// /// The buffer memory offsets have to be aligned to the smaller of their texel sizes or 4 bytes. /// /// None of the destination regions are allowed to overlap each other, nor are destination and source regions /// allowed to overlap when the source and destination GPU memory allocations are the same. Any illegal overlapping /// will cause undefined results. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// /// @param [in] srcGpuMemory GPU memory where the source data is located. /// @param [in] dstGpuMemory GPU memory where the destination data will be written. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a destination offset, a source offset, /// and copy size in the x/y/z dimensions. virtual void CmdCopyTypedBuffer( const IGpuMemory& srcGpuMemory, const IGpuMemory& dstGpuMemory, uint32 regionCount, const TypedBufferCopyRegion* pRegions) = 0; /// Copies data directly (without format conversion) from a 2D typed buffer to a 2D image. /// /// For compressed images, the extents are specified in compression blocks. /// /// The source memory offset has to be aligned to the smaller of the copied texel size or 4 bytes. A destination /// subresource cannot be present more than once per CmdScaledCopyTypedBufferToImage() call. /// /// MSAA resource is unsupported. The client must resolve both resources before calling this function. 
/// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopyDst /// /// @param [in] srcGpuMemory GPU memory where the source data is located. /// @param [in] dstImage Image where destination data will be written. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of copy regions, each entry specifying a source offset, copy size of source /// region, a destination offset, destination subresource, and copy size of destination /// region. virtual void CmdScaledCopyTypedBufferToImage( const IGpuMemory& srcGpuMemory, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const TypedBufferImageScaledCopyRegion* pRegions) = 0; /// Copies a GPU register content to a GPU memory location. /// /// The destination memory offset has to be aligned to 4 bytes. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherMemory /// /// @param [in] srcRegisterOffset Source register offset in bytes /// @param [in] dstGpuMemory GPU memory where the destination data will be written. /// @param [in] dstOffset Destination memory offset in bytes. virtual void CmdCopyRegisterToMemory( uint32 srcRegisterOffset, const IGpuMemory& dstGpuMemory, gpusize dstOffset) = 0; /// Copies multiple scaled regions from one image to another. /// /// The source and destination subresource of a particular region are not allowed to be the same, and will produce /// undefined results. 
Additionally, destination subresources cannot be present more than once per /// CmdScaledCopyImage() call. /// /// For compressed images, the compression block size is used as the pixel size. For compressed images, the image /// extents are specified in compression blocks. /// /// The source and destination images must be of the same type (1D, 2D or 3D). Both single sampled images and /// MSAA images are supported. /// /// Linear texture filtering is only supported for images with non-integer formats. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. /// /// @param [in] copyInfo Specifies parameters needed to execute CmdScaledCopyImage. See /// @ref ScaledCopyInfo for more information. virtual void CmdScaledCopyImage( const ScaledCopyInfo& copyInfo) = 0; /// Automatically generates texture data for a range of subresources such that they may be used as intermediate /// images in a mipmap chain. The existing values in mip N are used to generate mip N+1. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the base mip, @ref CoherCopySrc and @ref CoherCopyDst for the others. /// - ImageLayout: @ref LayoutCopySrc for the base mip, @ref LayoutCopySrc and @ref LayoutCopyDst for the others. /// /// @param [in] genInfo The parameters for CmdGenerateMipmaps. See @ref GenMipmapsInfo for more information. virtual void CmdGenerateMipmaps( const GenMipmapsInfo& genInfo) = 0; /// Copies multiple scaled regions from one image to another, converting between RGB and YUV color spaces during /// the copy. The exact conversion between YUV and RGB is controlled by a caller-specified color-space-conversion /// table. 
/// /// The source and destination images must both be of the 2D type. Only single-sampled images are supported. /// One of the two images involved must have an RGB color format, and the other must have a YUV color format. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// - ImageLayout: @ref LayoutCopySrc for the source and @ref LayoutCopyDst for the destination. /// /// @param [in] srcImage Image where source regions reside. If this is a YUV image, the destination must be /// RGB, and this copy will convert YUV to RGB. Otherwise, the destination must be YUV, /// and the copy will convert RGB to YUV. /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include /// LayoutCopySrc and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] dstImage Image where destination regions reside. If this is a YUV image, the source must be /// RGB, and this copy will convert RGB to YUV. Otherwise, the source must be YUV and /// the copy will convert YUV to RGB. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutCopyDst and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] regionCount Number of regions to copy; size of the pRegions array. /// @param [in] pRegions Array of conversion-copy regions, each entry specifying a source x/y/z offset, source /// x/y/z extent, destination x/y/z offset, destination x/y/z extent, RGB subresource and /// YUV subresource(s). /// @param [in] filter Texture filtering for shader sample instruction. /// @param [in] cscTable Color-space-conversion table which controls how YUV data is converted to a specific /// RGB representation and vice-versa. 
virtual void CmdColorSpaceConversionCopy( const IImage& srcImage, ImageLayout srcImageLayout, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const ColorSpaceConversionRegion* pRegions, TexFilter filter, const ColorSpaceConversionTable& cscTable) = 0; /// Clones data of one image object into another while preserving the image layout. /// /// The source and destination images must be created with identical creation parameters and must specify the /// cloneable flag. The clone operation clones all subresources. /// /// Both resources can be in any layout before the clone operation. After the clone, the source image state is left /// intact and the destination image layout becomes the same as the source. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopySrc for the source and @ref CoherCopyDst for the destination. /// /// @param [in] srcImage Source image. /// @param [in] dstImage Destination image. virtual void CmdCloneImageData( const IImage& srcImage, const IImage& dstImage) = 0; /// Directly updates a range of GPU memory with a small amount of host data. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopyDst /// /// The client is responsible for choosing the proper method for optimal performance. If the size of the data to /// update is 8 bytes or less, CmdWriteImmediate() is preferred. /// /// @param [in] dstGpuMemory GPU memory object to be updated. /// @param [in] dstOffset Byte offset into the GPU memory object to be updated. Must be a multiple of 4. /// @param [in] dataSize Amount of data to write, in bytes. Must be a multiple of 4. /// @param [in] pData Pointer to host data to be copied into the GPU memory. 
virtual void CmdUpdateMemory( const IGpuMemory& dstGpuMemory, gpusize dstOffset, gpusize dataSize, const uint32* pData) = 0; /// Updates marker surface with a DWORD value to indicate an event completion. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherCp /// /// @param [in] dstGpuMemory GPU memory object to be updated. /// @param [in] offset Byte offset into marker address /// @param [in] value Marker DWORD value to be copied to the bus addressable or external physical memory. virtual void CmdUpdateBusAddressableMemoryMarker( const IGpuMemory& dstGpuMemory, gpusize offset, uint32 value) = 0; /// Fills a range of GPU memory with the provided 32-bit data. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopyDst /// /// @param [in] dstGpuMemory GPU memory object to be filled. /// @param [in] dstOffset Byte offset into the GPU memory object to be filled. Must be a multiple of 4. /// @param [in] fillSize Size to fill, in bytes. Must be a multiple of 4. /// @param [in] data 32-bit value to be repeated in the filled range. virtual void CmdFillMemory( const IGpuMemory& dstGpuMemory, gpusize dstOffset, gpusize fillSize, uint32 data) = 0; /// Interprets a range of GPU memory as a color buffer and clears it to the specified clear color. /// /// The maximum clear range is determined by the buffer offset and buffer extent; if any Ranges are specified they /// must be specified in texels with respect to the beginning of the buffer and must not exceed its extent. /// With 96-bit formats, bufferOffset must be specified in bytes. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageCs is expected but the more general @ref PipelineStageBlt is also OK. 
/// - CacheCoherency: @ref CoherShader is expected but the more general @ref CoherClear is also OK. /// /// @param [in] gpuMemory GPU memory to be cleared. /// @param [in] color Specifies the clear color data and how to interpret it. /// @param [in] bufferFormat The format of the color data in the buffer. /// @param [in] bufferOffset The offset to the beginning of the buffer, in units of texels /// (or bytes for 96-bit texels). /// @param [in] bufferExtent The extent of the buffer, in units of texels. /// @param [in] rangeCount Number of ranges within the buffer to clear; size of the pRanges array. /// If zero, the entire view will be cleared and pRanges will be ignored. /// @param [in] pRanges Array of ranges within the GPU memory to clear. virtual void CmdClearColorBuffer( const IGpuMemory& gpuMemory, const ClearColor& color, SwizzledFormat bufferFormat, uint32 bufferOffset, uint32 bufferExtent, uint32 rangeCount = 0, const Range* pRanges = nullptr) = 0; /// Clears the currently bound color targets to the specified clear color. /// /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in /// Vulkan where the color attachments are inherited. /// /// This requires regionCount being specified since resource size is for sure to be known. The bound color targets /// shouldn't have UndefinedSwizzledFormat as their swizzle format. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageColorTarget /// - CacheCoherency: @ref CoherColorTarget /// - ImageLayout: @ref LayoutColorTarget /// /// @param [in] colorTargetCount Number of bound color targets that need to be cleared. /// @param [in] pBoundColorTargets Color target information for the bound color targets. 
/// @param [in] regionCount Number of volumes within the image to clear; size of the pClearRegions array. /// This needs to be non-zero. /// @param [in] pClearRegions Array of volumes within the subresources to clear. virtual void CmdClearBoundColorTargets( uint32 colorTargetCount, const BoundColorTarget* pBoundColorTargets, uint32 regionCount, const ClearBoundTargetRegion* pClearRegions) = 0; /// Clears a color image to the specified clear color. /// /// If any Boxes have been specified, all subresource ranges must contain a single, identical mip level. /// /// The imageLayout can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick /// to layouts that are likely to support compression like @ref LayoutColorTarget. /// /// This function requires use of the following barrier flags if @ref flags includes @ref ColorClearAutoSync: /// - PipelineStage: @ref PipelineStageColorTarget /// - CacheCoherency: @ref CoherColorTarget /// - ImageLayout: @ref LayoutColorTarget /// Otherwise the following barrier flags must be used: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherClear /// /// @param [in] image Image to be cleared. /// @param [in] imageLayout Current allowed usages and engines for the target image. /// @param [in] color Specifies the clear color data and how to interpret it. /// @param [in] clearFormat If clearFormat.format is Undefined (e.g. if UndefinedSwizzledFormat is provided), do not /// reinterpret the subresources' formats. Otherwise, the subresources' formats will be /// reinterpreted according to this parameter. The specified format needs to have been /// included in the "pViewFormats" list specified at image-creation time, otherwise /// corruption may occur. /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. /// @param [in] pRanges Array of subresource ranges to clear. 
/// @param [in] boxCount Number of volumes within the image to clear; size of the pBoxes array. /// If zero, entire subresources will be cleared and pBoxes will be ignored. /// @param [in] pBoxes Array of volumes within the subresources to clear. /// @param [in] flags Mask of ClearColorImageFlags values controlling behavior of the clear. virtual void CmdClearColorImage( const IImage& image, ImageLayout imageLayout, const ClearColor& color, const SwizzledFormat& clearFormat, uint32 rangeCount, const SubresRange* pRanges, uint32 boxCount, const Box* pBoxes, uint32 flags) = 0; /// Clears the currently bound depth/stencil targets to the specified clear values. /// /// This will always result in a slow clear and should only be used when the actual image being cleared is unknown. /// In practice, this is the case when vkCmdClearAttachments() is called in a secondary command buffer in /// Vulkan where the depth/stencil attachment is inherited. /// /// This requires regionCount being specified since resource size is for sure to be known. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget /// - CacheCoherency: @ref CoherDepthStencilTarget /// - ImageLayout: @ref LayoutDepthStencilTarget /// /// @param [in] depth Depth clear value. /// @param [in] stencil Stencil clear value. /// @param [in] stencilWriteMask Stencil write mask to clear specific stencil planes. /// @param [in] samples Sample count. /// @param [in] fragments Fragment count. /// @param [in] flag Selects whether to clear depth, stencil, or both depth and stencil. /// @param [in] regionCount Number of volumes within the bound depth/stencil target to clear. /// @param [in] pClearRegions Array of volumes within the subresources to clear. 
virtual void CmdClearBoundDepthStencilTargets( float depth, uint8 stencil, uint8 stencilWriteMask, uint32 samples, uint32 fragments, DepthStencilSelectFlags flag, uint32 regionCount, const ClearBoundTargetRegion* pClearRegions) = 0; /// Clears a depth/stencil image to the specified clear values. /// /// If any Rects have been specified, all subresource ranges must contain a single, identical mip level. /// /// The layouts can include any valid layout (e.g. not @ref LayoutUninitializedTarget) but it is wise to stick to /// layouts that are likely to support compression like @ref LayoutDepthStencilTarget. /// /// This function requires use of the following barrier flags if @ref flags includes @ref DsClearAutoSync: /// - PipelineStage: @ref PipelineStageEarlyDsTarget and/or @ref PipelineStageLateDsTarget /// - CacheCoherency: @ref CoherDepthStencilTarget /// - ImageLayout: @ref LayoutDepthStencilTarget /// Otherwise the following barrier flags must be used: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherClear /// /// @param [in] image Image to be cleared. /// @param [in] depth Depth clear value. /// @param [in] depthLayout Current allowed usages and engines for the depth plane. /// @param [in] stencil Stencil clear value. /// @param [in] stencilWriteMask Write-mask to apply to the stencil subresource ranges during the clear. /// @param [in] stencilLayout Current allowed usages and engines for the stencil plane. /// @param [in] rangeCount Number of subresource ranges to clear; size of the pRanges array. /// @param [in] pRanges Array of subresource ranges to clear. /// @param [in] rectCount Number of areas within the image to clear; size of the pRects array. If zero, /// the entire subresources will be cleared and pRects will be ignored. /// @param [in] pRects Array of areas within the subresources to clear. /// @param [in] flags Mask of ClearDepthStencilFlags values controlling behavior of the clear. 
virtual void CmdClearDepthStencil( const IImage& image, ImageLayout depthLayout, ImageLayout stencilLayout, float depth, uint8 stencil, uint8 stencilWriteMask, uint32 rangeCount, const SubresRange* pRanges, uint32 rectCount, const Rect* pRects, uint32 flags) = 0; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 910 /// Clears a range of GPU memory to the specified clear color using the specified buffer view SRD. /// /// The maximum clear range is determined by the view; if any Ranges are specified they must fit within the view's /// range. The view must support shader writes. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageCs /// - CacheCoherency: @ref CoherShader /// /// @note You may use the more general @ref PipelineStageBlt and @ref CoherClear if you wish but they may result in /// higher barrier overhead. /// /// @param [in] gpuMemory GPU memory to be cleared. /// @param [in] color Specifies the clear color data and how to interpret it. /// @param [in] pBufferViewSrd The buffer view SRD that will be used to interpret the GPU memory. /// @param [in] rangeCount Number of ranges within the GPU memory to clear; size of the pRanges array. /// If zero, the entire view will be cleared and pRanges will be ignored. /// @param [in] pRanges Array of ranges within the GPU memory to clear. virtual void CmdClearBufferView( const IGpuMemory& gpuMemory, const ClearColor& color, const void* pBufferViewSrd, uint32 rangeCount = 0, const Range* pRanges = nullptr) = 0; /// Clears an image to the specified clear color using the specified image view SRD. /// /// The clear subresource range is determined by the view; if any Rects have been specified, the image view must /// contain a single mip level. The view must support shader writes. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageCs but the more general @ref PipelineStageBlt is also OK. 
/// - CacheCoherency: @ref CoherShader but the more general @ref CoherClear is also OK. /// - ImageLayout: @ref LayoutShaderWrite /// /// @param [in] image Image to be cleared. /// @param [in] imageLayout Current allowed usages and engines for the image, must include LayoutShaderWrite. /// @param [in] color Specifies the clear color data and how to interpret it. /// @param [in] pImageViewSrd The image view SRD that will be used to interpret the image. /// @param [in] rectCount Number of areas within the image to clear; size of the pRects array. /// If zero, entire subresources will be cleared and pRects will be ignored. /// @param [in] pRects Array of areas within the subresources to clear. The begin and end slices to be /// cleared are from SubresRange in pImageViewSrd. virtual void CmdClearImageView( const IImage& image, ImageLayout imageLayout, const ClearColor& color, const void* pImageViewSrd, uint32 rectCount = 0, const Rect* pRects = nullptr) = 0; #endif /// Resolves multiple regions of a multisampled image to a single-sampled image. /// /// The source image must be a 2D multisampled image and the destination must be a single-sampled image. /// The formats of the source and destination images must match unless all regions specify a valid format. /// /// For color images, if the source image has an integer numeric format, a single sample is copied (sample 0). /// /// For depth/stencil images, the resolve is performed by simply copying sample 0 from every source pixel to the /// destination pixel. /// /// The same subresource may not appear more than once in the specified array of regions. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherResolveSrc for the source and @ref CoherResolveDst for the destination. /// - ImageLayout: @ref LayoutResolveSrc for the source and @ref LayoutResolveDst for the destination. /// /// @param [in] srcImage MSAA source image. 
/// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must include /// LayoutResolveSrc and the ImageLayoutEngineFlags corresponding to the engine this /// function is being called on. /// @param [in] dstImage Single-sample destination image. /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutResolveDst and the ImageLayoutEngineFlags corresponding to the engine /// this function is being called on. /// @param [in] resolveMode Resolve mode. /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. /// @param [in] pRegions Specifies src/dst subresources and rectangles. /// @param [in] flags Mask of ResolveImageFlags values controlling behavior of the resolve. virtual void CmdResolveImage( const IImage& srcImage, ImageLayout srcImageLayout, const IImage& dstImage, ImageLayout dstImageLayout, ResolveMode resolveMode, uint32 regionCount, const ImageResolveRegion* pRegions, uint32 flags) = 0; /// Resolves multiple regions of a Sampler Feedback map to another image. /// /// The source image must be: /// Decode: A 2D Feedback map, in which case the destination image will hold the decoded sampled data. /// Encode: A texture, in which case the destination will hold the encoded sampler map. /// /// The formats of the source and destination images must be 8bpp. /// /// The same subresource may not appear more than once in the specified array of regions. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. /// - ImageLayout: @ref LayoutShaderRead for the source and @ref LayoutShaderWrite for the destination. /// /// @param [in] srcImage Source image, depends on resolve type. /// @param [in] srcImageLayout Current allowed usages and engines for the source image. 
These masks must /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine /// this function is being called on. /// @param [in] dstImage Destination image, depends on resolve type /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine /// this function is being called on. /// @param [in] resolveType Resolve type /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. /// @param [in] pRegions Specifies src/dst subresources and rectangles virtual void CmdResolvePrtPlusImage( const IImage& srcImage, ImageLayout srcImageLayout, const IImage& dstImage, ImageLayout dstImageLayout, PrtPlusResolveType resolveType, uint32 regionCount, const PrtPlusImageResolveRegion* pRegions) = 0; /// Encodes a buffer into a Sampler Feedback image. /// /// This interface only supports encoding raw data from a buffer to an encoded Sampler Feedback map /// /// The format of the data in the source buffer and destination image must be 8bpp /// /// The same subresource may not appear more than once in the specified array of regions. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. /// - ImageLayout: @ref LayoutShaderWrite for the destination. /// /// @param [in] srcBuffer Source buffer, depends on the resolve type /// @param [in] dstImage Destination image, depends on resolve type /// @param [in] dstImageLayout Current allowed usages and engines for the destination image. These masks must /// include LayoutShaderWrite and the ImageLayoutEngineFlags corresponding to the engine /// this function is being called on. 
/// @param [in] regionCount Number of regions to resolve, size of pRegions array /// @param [in] pRegions Specifies src/dst subresources and rectangles virtual void CmdResolvePrtPlusBufferToImage( const IGpuMemory& srcBuffer, const IImage& dstImage, ImageLayout dstImageLayout, uint32 regionCount, const PrtPlusBufferToImageResolveRegion* pRegions) = 0; /// Decodes a Sampler Feedback map to a buffer /// /// This interface only supports decoding a Sampler Feedback map to buffer /// /// The format of the data in the source image and destination buffer must be 8bpp /// /// The same subresource may not appear more than once in the specified array of regions. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherShaderRead for the source and @ref CoherShaderWrite for the destination. /// - ImageLayout: @ref LayoutShaderRead for the source /// /// @param [in] srcImage Source image, depends on the resolve type /// @param [in] srcImageLayout Current allowed usages and engines for the source image. These masks must /// include LayoutShaderRead and the ImageLayoutEngineFlags corresponding to the engine /// this function is being called on. /// @param [in] dstBuffer Destination buffer, depends on resolve type /// @param [in] regionCount Number of regions to resolve; size of the pRegions array. /// @param [in] pRegions Specifies src/dst subresources and rectangles. virtual void CmdResolvePrtPlusImageToBuffer( const IImage& srcImage, ImageLayout srcImageLayout, const IGpuMemory& dstBuffer, uint32 regionCount, const PrtPlusImageToBufferResolveRegion* pRegions) = 0; /// Puts the specified event into the _set_ state when all prior GPU work has progressed past the given stages. /// /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. /// /// @param [in] gpuEvent GPU event to be set. 
/// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that /// restricts which stages of prior GPU work must happen before the event is set. The set /// will be performed at the earliest possible stage after the prior stages. virtual void CmdSetEvent( const IGpuEvent& gpuEvent, uint32 stageMask) = 0; /// Puts the specified event into the _reset_ state when all prior GPU work has progressed past the given stages. /// /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. /// /// @param [in] gpuEvent GPU event to be reset. /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that /// restricts which stages of prior GPU work must happen before the event is reset. The /// reset will be performed at the earliest possible stage after the prior stages. virtual void CmdResetEvent( const IGpuEvent& gpuEvent, uint32 stageMask) = 0; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 /// Puts the specified GPU event into the _set_ state when all previous GPU work reaches the specified point in the /// pipeline. /// /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. /// /// @param [in] gpuEvent GPU event to be set. /// @param [in] setPoint Point in the graphics pipeline where the GPU event will be _set_, indicating all prior /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't /// support this operation at the exact specified point, the set will be performed at the /// earliest possible point _after_ the specified point. inline void CmdSetEvent( const IGpuEvent& gpuEvent, HwPipePoint setPoint) { CmdSetEvent(gpuEvent, HwPipePointToStage[setPoint]); } /// Puts the specified GPU event into the _reset_ state when all previous GPU work reaches the specified point in /// the pipeline. /// /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. 
/// /// @param [in] gpuEvent GPU event to be reset. /// @param [in] resetPoint Point in the graphics pipeline where the GPU event will be _reset_, indicating all prior /// issued GPU work has reached at least this point in the pipeline. If the GPU doesn't /// support this operation at the exact specified point, the reset will be performed at the /// earliest possible point _after_ the specified point. /// /// @note This overload looks up resetPoint in HwPipePointToStage and forwards the result to the stage-mask /// overload of CmdResetEvent(). inline void CmdResetEvent( const IGpuEvent& gpuEvent, HwPipePoint resetPoint) { CmdResetEvent(gpuEvent, HwPipePointToStage[resetPoint]); } #endif /// Predicates the subsequent jobs in the command buffer if the event is set. /// /// @param [in] gpuEvent GPU event to be checked. virtual void CmdPredicateEvent( const IGpuEvent& gpuEvent) = 0; /// Performs the specified 32- or 64-bit memory operation. These operations are atomic with respect to shader /// atomic operations. /// /// The data size (32-bit or 64-bit) is determined by the operation type. For 32-bit atomics, only the lower /// 32 bits of srcData are used. /// /// The destination GPU memory offset must be 4-byte aligned for 32-bit atomics and 8-byte aligned for 64-bit /// atomics. /// /// This function requires use of the following barrier flags: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherQueueAtomic /// /// @param [in] dstGpuMemory Destination GPU memory object. /// @param [in] dstOffset Offset into the memory object where the atomic will be performed. /// @param [in] srcData Source data for the atomic operation. Use depends on the atomicOp. /// @param [in] atomicOp Specifies which atomic operation to perform. @see AtomicOp. virtual void CmdMemoryAtomic( const IGpuMemory& dstGpuMemory, gpusize dstOffset, uint64 srcData, AtomicOp atomicOp) = 0; /// Starts a query operation for the given slot of a query pool. /// /// The query slot must have been previously cleared with CmdResetQueryPool() before starting a query. /// /// @note Queries may not span multiple command buffers.
/// /// @param [in] queryPool Query pool for this query. /// @param [in] queryType The type of query this operation will produce. /// @param [in] slot Slot in queryPool where the results of this query should be accumulated. /// @param [in] flags Flags controlling query behavior. @see QueryControlFlags. virtual void CmdBeginQuery( const IQueryPool& queryPool, QueryType queryType, uint32 slot, QueryControlFlags flags) = 0; /// Stops a query operation for the given slot of a query pool. /// /// The query slot must have an open query on it when this is called. /// /// @param [in] queryPool Query pool for this query. /// @param [in] queryType The type of query this operation will produce. /// @param [in] slot Slot in queryPool where the query is running. virtual void CmdEndQuery( const IQueryPool& queryPool, QueryType queryType, uint32 slot) = 0; /// Resolves the results of a range of queries to the specified query type into the specified GPU memory location. /// /// This function requires use of the following barrier flags on @ref dstGpuMemory: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopyDst /// /// This operation does not honor the command buffer's predication state, if active. /// /// @param [in] queryPool Query pool holding the source queries. /// @param [in] flags Flags that control the result data layout and how the results are retrieved. /// @param [in] queryType The type of queries this resolve will produce. /// @param [in] startQuery First slot in queryPool to resolve. /// @param [in] queryCount Number of query pool slots to resolve. /// @param [in] dstGpuMemory Destination GPU memory object. /// @param [in] dstOffset 4-byte aligned offset into dstGpuMemory where the results should be written. /// @param [in] dstStride 4-byte aligned stride between where results are written into dstGpuMemory.
virtual void CmdResolveQuery( const IQueryPool& queryPool, QueryResultFlags flags, QueryType queryType, uint32 startQuery, uint32 queryCount, const IGpuMemory& dstGpuMemory, gpusize dstOffset, gpusize dstStride) = 0; /// Resets a range of slots in a query pool. A query slot must be reset each time before a query can be started /// using that slot. /// /// @param [in] queryPool Query pool to be reset. /// @param [in] startQuery First slot in queryPool to be reset. /// @param [in] queryCount Number of slots to reset. virtual void CmdResetQueryPool( const IQueryPool& queryPool, uint32 startQuery, uint32 queryCount) = 0; /// Writes a GPU performance timestamp to memory when all prior GPU work has progressed past the given stages. /// /// The timestamp data is a 64-bit value that increments once per clock. @ref timestampFrequency in DeviceProperties /// reports the frequency the timestamps are clocked at. Timestamps are only supported by engines that report /// @ref supportsTimestamps in DeviceProperties. /// /// This function requires use of the following barrier flags: /// - PipelineStage: the same flag(s) specified in @ref stageMask. /// - CacheCoherency: @ref CoherTimestamp /// /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. /// /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope that /// restricts which stages of prior GPU work must happen before the timestamp is written. /// The timestamp will be performed at the earliest possible stage after the prior stages. /// Note that the SDMA engine only supports bottom-of-pipe timestamps. /// @param [in] dstGpuMemory GPU memory object where the timestamp should be written. /// @param [in] dstOffset Offset into dstGpuMemory where the timestamp should be written. Must be aligned to /// minTimestampAlignment in DeviceProperties.
virtual void CmdWriteTimestamp( uint32 stageMask, const IGpuMemory& dstGpuMemory, gpusize dstOffset) = 0; /// Writes an immediate value to memory when all prior GPU work has progressed past the given stages. /// /// This function requires use of the following barrier flags: /// - PipelineStage: the same flag(s) specified in @ref stageMask. /// - CacheCoherency: @ref CoherCp /// /// @note Clients should use this version if they're using the CmdRelease/Acquire APIs. /// /// @param [in] stageMask A bitmask of @ref PipelineStageFlag values which defines a synchronization scope /// that restricts which stages of prior GPU work must happen before the immediate /// value is written. The write will occur at the earliest possible stage after /// the prior stages. Note that the SDMA engine only supports bottom-of-pipe writes. /// @param [in] data Value to be written to the GPU address. /// @param [in] dataSize Size of the data to be written out. @see ImmediateDataWidth. /// @param [in] address GPU address where the immediate value should be written. virtual void CmdWriteImmediate( uint32 stageMask, uint64 data, ImmediateDataWidth dataSize, gpusize address) = 0; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 /// Writes a HwPipePostPrefetch or HwPipeBottom timestamp to the specified memory location. /// /// The timestamp data is a 64-bit value that increments once per clock. timestampFrequency in DeviceProperties /// reports the frequency the timestamps are clocked at. /// /// Timestamps are only supported by engines that report supportsTimestamps in DeviceProperties. /// /// This function requires use of the following barrier flags: /// - PipelineStage: the same flag specified in @ref pipePoint. /// - CacheCoherency: @ref CoherTimestamp /// /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. /// /// @param [in] pipePoint Specifies where in the pipeline the timestamp should be sampled and written.
The only /// valid choices are HwPipePostPrefetch and HwPipeBottom. HwPipePostPrefetch timestamps /// are not supported on the SDMA engine, so all timestamps will be executed as /// bottom-of-pipe. /// @param [in] dstGpuMemory GPU memory object where the timestamp should be written. /// @param [in] dstOffset Offset into dstGpuMemory where the timestamp should be written. Must be aligned to /// minTimestampAlignment in DeviceProperties. /// /// @note This overload looks up pipePoint in HwPipePointToStage and forwards the result to the stage-mask /// overload of CmdWriteTimestamp(). inline void CmdWriteTimestamp( HwPipePoint pipePoint, const IGpuMemory& dstGpuMemory, gpusize dstOffset) { CmdWriteTimestamp(HwPipePointToStage[pipePoint], dstGpuMemory, dstOffset); } /// Writes a top-of-pipe or bottom-of-pipe immediate value to the specified memory location. /// /// This function requires use of the following barrier flags: /// - PipelineStage: the same flag specified in @ref pipePoint. /// - CacheCoherency: @ref CoherCp /// /// @note Clients may use this version if they're using the legacy @ref CmdBarrier API. /// /// @note This overload looks up pipePoint in HwPipePointToStage and forwards the result to the stage-mask /// overload of CmdWriteImmediate(). /// /// @param [in] pipePoint Specifies where in the pipeline the immediate value should be written. /// The only valid choices are HwPipeTop, HwPipePostPrefetch and HwPipeBottom. /// Top-of-pipe writes are not supported on the SDMA engine, so all writes /// will be executed as bottom-of-pipe. /// @param [in] data Value to be written to the GPU address. /// @param [in] dataSize Size of the data to be written out. @see ImmediateDataWidth. /// @param [in] address GPU address where the immediate value should be written. inline void CmdWriteImmediate( HwPipePoint pipePoint, uint64 data, ImmediateDataWidth dataSize, gpusize address) { CmdWriteImmediate(HwPipePointToStage[pipePoint], data, dataSize, address); } #endif /// Loads the current stream-out buffer-filled-sizes stored on the GPU from memory, typically from a target of a /// prior CmdSaveBufferFilledSizes() call. /// /// Note that barriers shouldn't be necessary in normal stream-out workflows.
However, if the client wishes to use /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherCp /// /// @param [in] gpuVirtAddr Array of GPU virtual addresses to load each counter from. If any of these are zero, /// the corresponding filled-size counter is not loaded. virtual void CmdLoadBufferFilledSizes( const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; /// Saves the current stream-out buffer-filled-sizes into GPU memory. /// /// Note that barriers shouldn't be necessary in normal stream-out workflows. However, if the client wishes to use /// the @ref gpuVirtAddr allocations in shaders or PAL blts they must use the following barrier flags: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherCp /// /// @param [in] gpuVirtAddr Array of GPU virtual addresses to save each counter into. If any of these are zero, /// the corresponding filled-size counter is not saved. virtual void CmdSaveBufferFilledSizes( const gpusize (&gpuVirtAddr)[MaxStreamOutTargets]) = 0; /// Sets the offset to buffer-filled-size for a stream-out target. /// /// @param [in] bufferId Stream-out buffer ID; it could be in the range [0, MaxStreamOutTargets). /// @param [in] offset The value to be written into the buffer filled size counter. virtual void CmdSetBufferFilledSize( uint32 bufferId, uint32 offset) = 0; /// Binds the specified border color palette for use by samplers. /// /// @param [in] pipelineBindPoint Specifies which pipeline type is affected (i.e., graphics or compute). /// @param [in] pPalette Border color palette object to bind. virtual void CmdBindBorderColorPalette( PipelineBindPoint pipelineBindPoint, const IBorderColorPalette* pPalette) = 0; /// Sets predication for this command buffer to use the specified GPU memory location.
Any draw, dispatch or copy /// operation between this command and the corresponding reset/disable call will be skipped if the value in the /// specified location matches the passed-in predicated value. /// /// This function requires use of the following barrier flags on @ref pGpuMemory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @param [in] pQueryPool pointer to QueryPool obj, not-nullptr means this is a QueryPool based predication /// - Zpass/Occlusion based predication /// - or PrimCount/Streamout based predication /// @param [in] slot Slot to use for setting occlusion predication, valid when pQueryPool is not nullptr /// @param [in] pGpuMemory GPU memory object for the predication value, only valid when pQueryPool is nullptr /// @param [in] offset GPU memory offset for the predication value /// @param [in] predType Predication type. /// @param [in] predPolarity Controls the polarity of the predication test /// true = draw_if_visible_or_no_overflow /// false = draw_if_not_visible_or_overflow /// @param [in] waitResults Hint only valid for Zpass/Occlusion. /// false = wait_until_final_zpass_written /// true = draw_if_not_final_zpass_written /// @param [in] accumulateData true(1) = allow_accumulation of Zpass and PrimCount across command buffer boundaries. /// /// pQueryPool and pGpuMemory should be exclusively set; when both are nullptr, other params will be ignored /// and it means to reset/disable predication so that the following commands can perform normally.
virtual void CmdSetPredication( IQueryPool* pQueryPool, uint32 slot, const IGpuMemory* pGpuMemory, gpusize offset, PredicateType predType, bool predPolarity, bool waitResults, bool accumulateData) = 0; /// Suspends/resumes any active predication for this command buffer. /// /// @param [in] suspend Controls if predication should be paused /// true = suspend active predication /// false = resume active predication /// /// Any suspended predication must be resumed prior to disabling predication using CmdSetPredication with both /// pQueryPool and pGpuMemory set to nullptr. This is only valid on universal and compute command buffers. virtual void CmdSuspendPredication( bool suspend) = 0; /// Begins a conditional block in the current command buffer. All commands between this and the corresponding /// CmdEndIf() (or CmdElse() if it is present) command are executed if the specified condition is true. /// /// This function requires use of the following barrier flags on @ref gpuMemory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. /// @param [in] offset Offset within the memory object where the tested memory location begins. /// @param [in] data Source data to compare against the value in GPU memory. /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. /// @param [in] compareFunc Function controlling how the data operands are compared. virtual void CmdIf( const IGpuMemory& gpuMemory, gpusize offset, uint64 data, uint64 mask, CompareFunc compareFunc) = 0; /// Begins the else-branch of a conditional block in the current command buffer. All commands between this and the /// corresponding CmdEndIf() command are executed if the condition specified in the innermost active conditional /// block is false. virtual void CmdElse() = 0; /// Ends the innermost active conditional block in the current command buffer.
virtual void CmdEndIf() = 0; /// Begins a while loop in the current command buffer. All commands between this and the corresponding CmdEndWhile() /// command are executed repeatedly as long as the specified condition remains true. /// /// This function requires use of the following barrier flags on @ref gpuMemory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. /// @param [in] offset Offset within the memory object where the tested memory location begins. /// @param [in] data Source data to compare against the value in GPU memory. /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. /// @param [in] compareFunc Function controlling how the data operands are compared. virtual void CmdWhile( const IGpuMemory& gpuMemory, gpusize offset, uint64 data, uint64 mask, CompareFunc compareFunc) = 0; /// Ends the innermost active while loop in the current command buffer. virtual void CmdEndWhile() = 0; /// Stalls a command buffer execution based on a condition that compares an immediate value with a value coming from /// a GPU register. /// /// The client (or application) is supposed to do the necessary barriers before calling this function, but for now /// this is only needed to wait on some display- or timer-related registers. /// /// @param [in] registerOffset The offset in bytes of the GPU register to be tested. /// @param [in] data Source data to compare against the value of the GPU register. /// @param [in] mask Mask to apply to the GPU register value (via bitwise AND) prior to comparison. /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never shouldn't /// be used as the hardware does not support it.
virtual void CmdWaitRegisterValue( uint32 registerOffset, uint32 data, uint32 mask, CompareFunc compareFunc) = 0; /// Stalls a command buffer execution based on a condition that compares an immediate value with a value coming from /// a GPU memory location. /// /// The client (or application) is expected to transition the memory to the proper state before calling this /// function. The memory location for the condition must be 4-byte aligned. /// This function requires use of the following barrier flags on @ref gpuVirtAddr: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherCp /// /// @param [in] gpuVirtAddr GPU memory address containing the data to be tested. /// @param [in] data Source data to compare against the value in GPU memory. /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not /// be used as the hardware does not support it. virtual void CmdWaitMemoryValue( gpusize gpuVirtAddr, uint32 data, uint32 mask, CompareFunc compareFunc) = 0; /// Stalls a command buffer execution until an external device writes to the marker surface in the GPU bus /// addressable memory location. /// /// This function requires use of the following barrier flags on @ref gpuMemory: /// - PipelineStage: @ref PipelineStagePostPrefetch /// - CacheCoherency: @ref CoherCp /// /// @param [in] gpuMemory GPU memory object containing the memory location to be tested. /// @param [in] data Source data to compare against the value in GPU memory. /// @param [in] mask Mask to apply to the GPU memory (via bitwise AND) prior to comparison. /// @param [in] compareFunc Function controlling how the data operands are compared. CompareFunc::Never should not /// be used as the hardware does not support it.
virtual void CmdWaitBusAddressableMemoryMarker( const IGpuMemory& gpuMemory, uint32 data, uint32 mask, CompareFunc compareFunc) = 0; /// Begins the specified performance experiment. /// /// @param [in] pPerfExperiment Performance experiment to begin. virtual void CmdBeginPerfExperiment( IPerfExperiment* pPerfExperiment) = 0; /// Updates the sqtt token mask on the specified performance experiment. /// /// @param [in] pPerfExperiment Performance experiment to update. /// @param [in] tokenConfig Updated token and reg mask to apply. /// /// @note This function is only valid to call if pPerfExperiment is a thread trace experiment that is currently /// active. virtual void CmdUpdatePerfExperimentSqttTokenMask( IPerfExperiment* pPerfExperiment, const ThreadTraceTokenConfig& tokenConfig) = 0; /// Updates the sqtt token mask on all running traces, if any. /// /// @note This may overwrite the stall settings (making them more conservative). /// @param [in] tokenConfig Updated token and reg mask to apply. virtual void CmdUpdateSqttTokenMask( const ThreadTraceTokenConfig& tokenConfig) = 0; /// Ends the specified performance experiment. /// /// @param [in] pPerfExperiment Performance experiment to end. virtual void CmdEndPerfExperiment( IPerfExperiment* pPerfExperiment) = 0; /// Inserts a trace marker into the command buffer. /// /// A trace marker can be inserted to mark particular points of interest in a command buffer to be viewed with the /// trace data collected in a performance experiment. /// /// @param [in] markerType Selects one of two generic marker categories ("A" or "B"). /// @param [in] markerData 32-bit marker value to be inserted. virtual void CmdInsertTraceMarker( PerfTraceMarkerType markerType, uint32 markerData) = 0; /// Inserts a set of SQ thread trace markers for consumption by the Radeon GPU Profiler (RGP). /// /// Only supported on Universal and Compute engines. /// /// @param [in] numDwords Number of dwords in pData to be inserted as SQTT markers.
/// @param [in] pData SQTT marker data. See the RGP SQTT Instrumentation Specification for details on how this /// data should be formatted. /// @param [in] subQueueFlags Flags of type RgpMarkerSubQueueFlags; presumably selects which sub-queue(s) the /// markers are inserted on. NOTE(review): semantics are not described here - confirm /// against the RgpMarkerSubQueueFlags declaration. virtual void CmdInsertRgpTraceMarker( RgpMarkerSubQueueFlags subQueueFlags, uint32 numDwords, const void* pData) = 0; /// This function is to be used to copy the DF SPM (MALL SPM) data from the output buffers to an accessible buffer. /// The buffer that HW outputs to is allocated with a special KMD flag and therefore cannot be the same as the /// normal IPerfExperiment buffer so we need a special command to get the data. /// /// The bulk of the implementation for this is done by the KMD. They are in charge of starting and stopping the /// trace as well as all of the register programming. When KMD receives a dfSpmTraceEnd bit from a CmdBufInfo /// flag, they will wait for the command buffer to be completely idle before stopping the trace. Therefore, a /// CmdEndPerfExperiment call does not stop this particular sample, the end of a command buffer with a /// dfSpmTraceEnd does. This means that calling CmdCopyDfSpmTraceData in the same command buffer as /// dfSpmTraceEnd will give you incorrect data. The sample will still be in progress when the copy happens. /// You must call CmdCopyDfSpmTraceData in a separate command buffer after one where the dfSpmTraceEnd bit is /// set. /// /// There is also a metadata buffer that does not need a special KMD flag. It is also stored in a separate buffer /// and is copied along with the output buffer with this command. It contains a uint32 trace size, a uint32 pad, /// a uint64 start trace GPU timestamp and a uint64 stop trace GPU timestamp and is placed at the beginning of the /// dstGpuMemory. /// /// The minimum size of the dstGpuMemory should be the size of the metadata struct plus the size of the DF SPM /// ringSize given to the perf experiment. The SPM data may not fill the entire memory, but the client is /// responsible for parsing the data.
/// /// This function requires use of the following barrier flags on @ref dstGpuMemory: /// - PipelineStage: @ref PipelineStageBlt /// - CacheCoherency: @ref CoherCopyDst /// /// @param [in] perfExperiment The perfExperiment that we will be copying the data from. /// @param [in] dstGpuMemory The memory location that the DF SPM trace data will be copied to. /// @param [in] dstOffset The offset into the destination memory that the data will be copied to. virtual void CmdCopyDfSpmTraceData( const IPerfExperiment& perfExperiment, const IGpuMemory& dstGpuMemory, gpusize dstOffset) = 0; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 914 /// Loads data from the provided GPU Memory object into Constant Engine RAM. /// /// @note The definition provided here is a non-virtual function with an empty body. /// /// @param [in] srcGpuMemory GPU Memory object containing the source data to be loaded to CE RAM. /// @param [in] memOffset Offset within the memory object where the source data is located, /// must be 32-byte aligned. /// @param [in] ramOffset Byte offset destination in CE RAM where the data should be loaded, /// must be 32-byte aligned. /// @param [in] dwordSize Number of DWORDs that should be loaded into CE RAM, must be a multiple of 8. void CmdLoadCeRam( const IGpuMemory& srcGpuMemory, gpusize memOffset, uint32 ramOffset, uint32 dwordSize) {} /// Dumps data from Constant Engine RAM to the provided GPU Memory address which may be located in a GPU ring buffer /// managed by the CE. The CE can be used to automatically handle the synchronization between the DE and CE when /// manipulating a GPU ring buffer. In order for PAL to instruct the CE to handle this, we need to know the current /// position (entry) within the ring buffer being dumped to, as well as the total size (in entries) of the ring. /// /// @param [in] dstGpuMemory GPU Memory object destination where the data should be dumped from CE RAM. /// @param [in] memOffset Offset within the memory object where data should be dumped, must be 4-byte aligned.
/// @param [in] ramOffset Byte offset source in CE RAM for data that should be dumped, must be 4-byte aligned. /// @param [in] dwordSize Number of DWORDs that should be dumped from CE RAM into GPU Memory. /// @param [in] currRingPos Current position (ring entry) in the GPU ring buffer being managed by the CE which the /// dump location corresponds to. /// @param [in] ringSize Number of entries in the GPU ring buffer being managed by the CE. If the memory being /// dumped into is not managed in a ring-like fashion, this should be set to zero. /// /// @note The definition provided here is a non-virtual function with an empty body. void CmdDumpCeRam( const IGpuMemory& dstGpuMemory, gpusize memOffset, uint32 ramOffset, uint32 dwordSize, uint32 currRingPos, uint32 ringSize) {} /// Writes CPU data to Constant Engine RAM. /// /// @note The default definition provided here has an empty body. Unlike CmdLoadCeRam() and CmdDumpCeRam(), this /// function is declared virtual. /// /// @param [in] pSrcData Pointer to the source CPU data to be written to CE RAM. /// @param [in] ramOffset Byte offset in CE RAM where the data should be written, must be 4-byte aligned. /// @param [in] dwordSize Number of DWORDs that should be written from pSrcData into CE RAM. virtual void CmdWriteCeRam( const void* pSrcData, uint32 ramOffset, uint32 dwordSize) {} #endif /// Allocates a chunk of command space that the client can use to embed constant data directly in the command /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within /// this command buffer (e.g., as an SRD table address). /// /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the /// value reported by GetEmbeddedDataLimit(). /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. /// @param [out] pGpuAddress The GPU address of the embedded space. /// /// @returns The DWORD-aligned CPU address of the embedded space.
virtual uint32* CmdAllocateEmbeddedData( uint32 sizeInDwords, uint32 alignmentInDwords, gpusize* pGpuAddress) = 0; /// Allocates a chunk of command space that the client can use to embed constant data directly in the command /// buffer's backing memory. The returned CPU address is valid until ICmdBuffer::End() is called. The GPU address /// is valid until ICmdBuffer::Reset() or ICmdBuffer::Begin() and must only be referenced by work contained within /// this command buffer (e.g., as an SRD table address). /// /// This is the large-allocation counterpart of CmdAllocateEmbeddedData(); its size limit is the value reported by /// GetLargeEmbeddedDataLimit() rather than GetEmbeddedDataLimit(). /// /// @param [in] sizeInDwords Size of the embedded data space in DWORDs. It must be less than or equal to the /// value reported by GetLargeEmbeddedDataLimit(). /// @param [in] alignmentInDwords Minimum GPU address alignment of the embedded space in DWORDs. /// @param [out] pGpuAddress The GPU address of the embedded space. /// /// @returns The DWORD-aligned CPU address of the embedded space. virtual uint32* CmdAllocateLargeEmbeddedData( uint32 sizeInDwords, uint32 alignmentInDwords, gpusize* pGpuAddress) = 0; /// Gets memory from scratch memory and binds it to a GPU event. For now only GpuEventPool and CmdBuffer's internal /// GpuEvent use this path to allocate and bind GPU memory. These use cases assume the bound GPU memory is GPU access /// only, so the client is responsible for resetting the event from the GPU, and cannot call Set(), Reset(), /// GetStatus(). /// /// @param [in] pGpuEvent The GPU event that needs to bind a memory. Must not be nullptr. /// /// @returns Success if the GPU event successfully binds a GPU memory. Otherwise, one of the following errors may /// be returned: /// + ErrorUnknown if an internal PAL error occurs. virtual Result AllocateAndBindGpuMemToEvent( IGpuEvent* pGpuEvent) = 0; /// Issues commands to prime GPU caches shortly before accessing the specified GPU address range(s).
The benefit of /// this prefetching is likely to be platform-dependent based on the GPU's cache hierarchy, memory subsystem, /// available prefetching tools in hardware, etc., so caller beware. /// /// This operation may read data from memory into caches and therefore counts as a general BLT SRC operation with /// regard to barrier execution and memory dependencies. /// /// @param [in] rangeCount Number of entries in pRanges. /// @param [in] pRanges Array of structs defining a memory range and properties controlling prefetching of that /// range. virtual void CmdPrimeGpuCaches( uint32 rangeCount, const PrimeGpuCacheRange* pRanges) = 0; /// Issues commands which execute the specified group of nested command buffers. The observable behavior of this /// operation should be indiscernible from recording the nested command buffers' commands directly into /// this command buffer. Naturally, the queue type of the nested command buffers must match this command buffer. /// /// Conceptually, executing a nested command buffer is similar to calling a subroutine: the root command buffer is /// like the "caller", while the nested ones are the "callees". /// /// If any nested command buffers were allocated from a @ref ICmdAllocator with @ref autoMemoryReuse enabled, /// resetting or destroying those nested command buffers will render them retroactively uncallable. This effectively /// makes the caller command buffer invalid and illegal to submit even if it was otherwise valid and executable in /// the past. If the nested command allocator has autoMemoryReuse disabled, the calls to reset nested command /// buffers remain valid until the allocator itself is reset. /// /// State inheritance/leakage between the caller and callee(s) has the following behavior: /// + The callee only inherits the state specified in the callee CmdBufferBuildInfo. It is up to the client to /// bind any default state necessary when they called @ref ICmdBuffer::Begin() to begin building the callee.
/// By default no state is inherited and all state must be specified by the client. /// + The callee leaks any render and resource-binding state back into the caller after it completes. It is up to /// the client to rebind the caller's state after this operation completes if they don't want state leakage. /// + Both of the above points apply in between callees, if more than one command buffer is being executed by this /// call. /// /// @param [in] cmdBufferCount Number of nested command buffers to execute (i.e., size of the ppCmdBuffers /// array). This must be at least one, otherwise making this call is pointless. /// @param [in,out] ppCmdBuffers Array of nested command buffers to execute. It is an error condition if any /// of the following are true: (Debug assertions are used to check them.) /// + ppCmdBuffers is null. /// + Any member of ppCmdBuffers is null. /// + Any member of ppCmdBuffers is a root command buffer, or has a different /// queue type than this command buffer. virtual void CmdExecuteNestedCmdBuffers( uint32 cmdBufferCount, ICmdBuffer*const* ppCmdBuffers) = 0; /// Saves a copy of some set of the current command buffer state that is used by compute workloads. This feature is /// intended to give PAL clients a convenient way to issue their own internal compute workloads without modifying /// the application-facing state. /// /// PAL cannot save multiple layers of state; each call to CmdSaveComputeState must be followed by a call to /// CmdRestoreComputeState before the next call to CmdSaveComputeState. /// /// This function can only be called on command buffers that support compute workloads. All query counters will be /// disabled until CmdRestoreComputeState is called. /// /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to save. virtual void CmdSaveComputeState( uint32 stateFlags) = 0; /// Restores some set of the command buffer state that is used by compute workloads.
This feature is intended to /// give PAL clients a convenient way to issue their own internal compute workloads without modifying the /// application-facing state. /// /// A call to this function must be preceded by a call to CmdSaveComputeState and the save stateFlags must contain /// all restore stateFlags, otherwise the values of the restored state are undefined. /// /// This function can only be called on command buffers that support compute workloads. All previously disabled /// query counters will be reactivated. /// /// @param [in] stateFlags A mask of ORed @ref ComputeStateFlags indicating which state to restore. virtual void CmdRestoreComputeState( uint32 stateFlags) = 0; /// Issues commands which complete two tasks: using the provided @ref IIndirectCmdGenerator object to translate the /// indirect argument buffer into a format understandable by the GPU; and then executing the generated commands. /// /// The virtual address must be 4-byte aligned. /// /// The indirect argument data offset in memory must be 4-byte aligned. The expected layout of the argument data /// is defined by the @ref IIndirectCmdGenerator object. /// /// It is unsafe to call this method on a command buffer which was not begun with either the optimizeOneTimeSubmit /// or optimizeExclusiveSubmit flags. This is because there is a potential race condition if the same command buffer /// is generating indirect commands on multiple Queues simultaneously. /// /// This function requires use of the following barrier flags on the indirect memory: /// - PipelineStage: @ref PipelineStageFetchIndirectArgs /// - CacheCoherency: @ref CoherIndirectArgs /// /// @param [in] generator Indirect command generator object which can translate the indirect argument buffer /// into a command buffer format which the GPU can understand. /// @param [in] gpuVirtAddr GPU virtual address where the indirect argument data is located. /// @param [in] maximumCount Maximum count of data structures to loop through.
If countGpuAddr is nonzero, the /// value at that memory location is clamped to this maximum. If countGpuAddr is zero, /// then the number of draws issued exactly matches this number. /// @param [in] countGpuAddr GPU virtual address where the number of draws is stored. Must be 4-byte aligned. virtual void CmdExecuteIndirectCmds( const IIndirectCmdGenerator& generator, gpusize gpuVirtAddr, uint32 maximumCount, gpusize countGpuAddr) = 0; /// Updates one or more HiS pretests bound to the given stencil image within a range of mip levels. /// See @ref HiSPretests for a summary of HiS. /// /// @warning Improper use of pretests can cause corruption. Please see @ref HiSPretests for more information. /// /// @param [in] pImage The stencil image that will receive the new pretest(s). /// @param [in] pretests The new pretest(s). /// @param [in] firstMip The beginning of the mip range which will receive the new pretest(s). /// @param [in] numMips The number of mips in the mip range which will receive the new pretest(s). virtual void CmdUpdateHiSPretests( const IImage* pImage, const HiSPretests& pretests, uint32 firstMip, uint32 numMips) = 0; /// Reserve @ref CommandDataAlloc space for external command packets up to a size of @ref sizeInDwords. /// This method is only supported on command buffers for the following queue types: /// NOTE(review): the list of supported queue types appears to be missing from this comment - confirm and fill in. /// /// @warning @ref CmdCommitSpace must be called once after this function is called. /// Failing to pair up these function calls will result in undefined behavior. /// /// @param [in] sizeInDwords Size of the command buffer space to reserve in dwords. /// If this param is 0, the default command stream reserve limit will be used. /// @param [in] reserveInNewChunk Selection to reserve space in a new chunk or current chunk. /// /// @returns A pointer to the reserved command space. virtual uint32* CmdReserveSpace( uint32 sizeInDwords, bool reserveInNewChunk) = 0; /// Ensure data is committed to the command buffer and unused space is reclaimed.
/// This method is only supported on command buffers for the following queue types: /// NOTE(review): the list of supported queue types appears to be missing from this comment - confirm and fill in. /// /// @param [in] pCmdSpace Pointer to the next unused dword in the command buffer. virtual void CmdCommitSpace( uint32* pCmdSpace) = 0; /// Executes any internal postprocessing commands to be performed on a frame, such as drawing the dev driver /// overlay. Calling this prior to presenting (via any path) is a requirement, and must be prior to or /// concurrent with frameEnd if FSFM is applicable. This must be called using the image that will be the /// source of the present. /// /// @param [in] postProcessInfo Information about the frame to be postprocessed. /// @param [out] pAddedGpuWork (Optional) Set to true if commands were added as part of this call. virtual void CmdPostProcessFrame( const CmdPostProcessFrameInfo& postProcessInfo, bool* pAddedGpuWork) = 0; /// Inserts a string embedded inside a NOP packet with a signature that is recognized by tools and can be printed /// inside a command buffer disassembly. Note that this is a real NOP that will really be submitted to the GPU /// and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug dumps. /// /// The maximum length of a string that may be embedded in the command buffer is currently 128 characters, /// including the NUL-terminator. This is defined in the internal command buffer class in MaxCommentStringLength. /// /// @param [in] pComment Pointer to NUL-terminated string that will be inserted into the command buffer. virtual void CmdCommentString( const char* pComment) = 0; /// Inserts the specified payload embedded inside a NOP packet. Note that this is a real NOP that will be submitted /// to the GPU and executed (skipped over) by CP. It will be visible in kernel debugging as well as offline debug /// dumps. /// /// @param [in] pPayload Pointer to binary data to embed. /// @param [in] payloadSize Size of the payload in DWORDs, expected to be under MaxPayloadSize.
virtual void CmdNop( const void* pPayload, uint32 payloadSize) = 0; /// Marks the beginning or end of a user-defined region of GPU work; analyzed post-mortem in crash-dump analysis tools. /// Each 'Begin' marker must be paired with a corresponding 'End' marker; however, markers may be nested by /// inserting multiple 'Begin' markers consecutively. /// /// @warning This function is a no-op if Crash Analysis mode is not enabled. /// /// @param [in] isBegin Whether this is a 'Begin' marker (true) or an 'End' marker (false). /// @param [in] sourceId The application layer ID at which the marker is being created: /// 0x0 => Application /// 0x1 => API (e.g. DX12, Vulkan, etc.) /// 0x2 => PAL /// Developers may use IDs within the range of 10 - 15 to define a custom /// application layer. /// @param [in] pMarkerName A NUL-terminated string containing a name for this marker, used for annotation /// purposes in external tools. Only valid for 'Begin' markers, and will be ignored if /// isBegin is false. /// @param [in] markerNameSize Size of the marker string, in bytes. /// /// @returns Non-zero counter value of the embedded execution marker. /// If Crash Analysis mode is disabled, this will always return zero. virtual uint32 CmdInsertExecutionMarker( bool isBegin, uint8 sourceId, const char* pMarkerName, uint32 markerNameSize) = 0; /// Performs the virtual queue handshake. The host queue will do the following: /// - Wait until the parent kernel is done /// - Change the parent kernel state /// - CP waits until the child counter is 0 /// - CP sends the termination signal to the device queue /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo.
/// /// @param [in] parentState Address of the parent kernel state /// @param [in] newStateValue The new state value of the parent kernel /// @param [in] parentChildCounter Address of the parent child counter /// @param [in] signal Address of the virtual queue signal /// @param [in] dedicatedQueue Runtime uses a dedicated queue for the scheduler /// /// @note This function is to support OpenCL AQL submissions. virtual void CmdVirtualQueueHandshake( gpusize parentState, uint32 newStateValue, gpusize parentChildCounter, gpusize signal, bool dedicatedQueue) = 0; /// Returns GPU address of the loop start with dispatch templates. The pointer will be passed to the /// scheduler kernel for the update of dispatch templates. /// Also initializes common registers for each dispatch template. /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. /// /// @returns GPU address of the loop start with dispatch templates. /// /// @note This function is to support OpenCL AQL submissions. virtual gpusize CmdVirtualQueueDispatcherStart() = 0; /// Programs CP iterator with dispatch templates for device enqueue in OpenCL 2.0. /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. /// /// @param [in] signal Address for the termination signal /// @param [in] loopStart GPU address of the loop start for CP /// @param [in] numTemplates The number of dispatch templates /// /// @note This function is to support OpenCL AQL submissions. virtual void CmdVirtualQueueDispatcherEnd( gpusize signal, gpusize loopStart, uint32 numTemplates) = 0; /// Emulates AQL dispatch with PM4 commands. /// NOTE: Available for compute queues when created with aqlQueue set in the QueueCreateInfo. /// /// @param [in] dispatchInfo Kernel dispatch info /// /// @note This function is to support OpenCL AQL submissions.
void CmdDispatchAql( const DispatchAqlParams& dispatchInfo) { m_funcTable.pfnCmdDispatchAql(this, dispatchInfo); } #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 888 /// XDMA was retired starting in gfx10 so this function has no use anymore. inline void CmdXdmaWaitFlipPending() {} #endif /// Starts thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these /// functions) /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. /// The caller is responsible for setting up valid GPU Profiler panel settings. virtual void CmdStartGpuProfilerLogging() = 0; /// Stops thread-trace/counter-collection - used by GPS Shim's OpenShimInterface via DXCP /// Only valid for the GPU Profiler layer (which is enabled separately by the GPS Shim during usage of these /// functions) /// Only valid for per-draw granularity and hence non-RGP thread-trace formats. /// The caller is responsible for setting up valid GPU Profiler panel settings. virtual void CmdStopGpuProfilerLogging() = 0; /// Sets a mask to control which view instances are enabled for subsequent draws. Should only be called on /// universal command buffers. /// /// @param [in] mask The mask to control which view instances are enabled. virtual void CmdSetViewInstanceMask(uint32 mask) = 0; /// Gets the used size of all chunks, in bytes, for the given CmdAllocType. For CommandDataAlloc with multi-queue scheme, the /// size reported will be the sum of all command streams associated with the command buffer. It's legal to call /// this function while in the command building state. /// /// @param [in] type Allocation type for ICmdAllocator /// /// @returns Used allocation data size in bytes for provided CmdAllocType. virtual uint32 GetUsedSize( CmdAllocType type) const = 0; /// Returns the value of the associated arbitrary client data pointer.
/// Can be used to associate arbitrary data with a particular PAL object. /// /// @returns Pointer to client data. void* GetClientData() const { return m_pClientData; } /// Sets the value of the associated arbitrary client data pointer. /// Can be used to associate arbitrary data with a particular PAL object. /// /// @param [in] pClientData A pointer to arbitrary client data. void SetClientData( void* pClientData) { m_pClientData = pClientData; } /// Gets the internal unique Id of the command buffer. /// This function was originally only for internal debugging, and the Id is not unique across different queue types. /// However, CmdDisassembly::ICmdBufferReporting requires a way to differentiate cmdLists that matches /// the meaning of UniqueId(), in particular, not requiring uniqueness across different queue types. /// /// @returns Unique Id of the command buffer virtual uint32 UniqueId() const = 0; /// Get the number of arrays of tracking data (for correlation) held by this ICmdBuffer. /// For TrackCmdLocationBefore and TrackCmdLocationAfter below, only (idx < GetNumTrackingArrays()) /// can yield a non-nullptr result /// /// @details If GetNumTrackingArrays() returns 0 but GetTrackedCmdLocationArraySizeInBytes() does not, it will /// be possible to create them on this ICmdBuffer using CreateTrackedCmdLocationArray /// /// @returns the number of TrackedCmdLocationArray's held by the ICmdBuffer. virtual uint32 GetNumTrackingArrays() const = 0; /// Get the number of bytes required by CreateTrackedCmdLocationArray. /// /// @details The value returned here accommodates the full number of TrackedCmdLocationArray's to be /// created, from a single contiguous allocation. /// If allocation has not yet occurred, (GetNumTrackingArrays() == 0).
/// If (GetTrackedCmdLocationArraySizeInBytes() > 0) && (GetNumTrackingArrays() == 0) /// this ICmdBuffer supports TrackedCmdLocationArray's, but has not yet allocated them /// If (GetTrackedCmdLocationArraySizeInBytes() == 0), this ICmdBuffer does not support /// TrackedCmdLocationArray's /// /// @returns 0 if TrackedCmdLocationArray's are not supported /// The total number of bytes required by CreateTrackedCmdLocationArray otherwise. virtual uint32 GetTrackedCmdLocationArraySizeInBytes() const = 0; /// Uses the memory pMemory to initialize GetNumTrackingArrays() TrackedCmdLocationArray's on this /// ICmdBuffer. /// /// @param [in] pMemory Address of memory allocated for the purpose of creating TrackedCmdLocationArray's /// This memory should be at least GetTrackedCmdLocationArraySizeInBytes() /// bytes. /// /// @returns Result::Success: Indicates creation was successful /// Result::Unsupported: This ICmdBuffer does not support TrackedCmdLocationArray's /// Confirm (GetTrackedCmdLocationArraySizeInBytes() > 0) before using this function /// Result::ErrorInvalidPointer: pMemory == nullptr /// Result::AlreadyExists: TrackedCmdLocationArray's have already been allocated on /// this ICmdBuffer /// other: Error values originating from Util::Vector::Reserve() or /// Util::Vector::PushBack() /// virtual Result CreateTrackedCmdLocationArray( void* pMemory) = 0; /// Executes the destructors for all TrackedCmdLocationArray's owned by this ICmdBuffer. This should be /// called prior to deleting the memory pMemory that was originally provided to CreateTrackedCmdLocationArray /// /// @param [out] ppAllocatedMemory /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() > 0) /// The original value for pMemory provided in CreateTrackedCmdLocationArray(pMemory) will /// be returned in *ppAllocatedMemory.
ie *ppAllocatedMemory = pMemory /// If (ppAllocatedMemory != nullptr) && (GetNumTrackingArrays() == 0), /// *ppAllocatedMemory = nullptr; /// virtual void DestroyTrackedCmdLocationArray( void** ppAllocatedMemory) = 0; /// Creates a new TrackedCmdLocation in the TrackedCmdLocationArray corresponding to idx /// of type TrackedCmdLocationMode::Begin. This location will be furnished with a pointer to the address of /// the next PM4Packet to be created on the CmdBuffer referred to by idx, and m_event == eventId. /// /// @details Note there are two potential valid corner cases. /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == 0) /// Indicates there is a commandstream but it has not begun building PM4Packets /// This implicitly refers to the baseAddress of the commandstream, once building begins /// (pBeforeResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) /// Indicates there is not yet an associated commandstream. /// /// @param [in] idx The idx corresponding to GetTrackingArray(idx). /// Only (idx < GetNumTrackingArrays()) will yield non-trivial results.
/// @param [in] eventId The code for the event being tracked /// @param [out] pBeforeResult If successful, returns a TrackedCmdLocationRef to a TrackedCmdLocation within /// the TrackedCmdLocationArray corresponding to idx /// Otherwise, if (pBeforeResult != nullptr), is initialized to the default for /// TrackedCmdLocationRef() /// /// @returns /// Pal::Result::Success if successful /// Pal::Result::ErrorInvalidPointer if (pBeforeResult == nullptr) /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) /// Pal::Result::NotFound if there is no tracking array corresponding to idx /// This can occur if CreateTrackedCmdLocationArray has not been /// called - which may mean the feature is disabled /// Pal::Result::ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable /// to allocate memory /// /// virtual Pal::Result TrackCmdLocationBefore( uint32 idx, uint8 eventId, CmdDisassembly::TrackedCmdLocationRef* pBeforeResult) = 0; /// Similar to TrackCmdLocationBefore, TrackCmdLocationAfter creates a TrackedCmdLocation, of type /// TrackedCmdLocationMode::End or TrackedCmdLocationMode::Delta. This location will be furnished with a /// pointer to the address of the next PM4Packet to be created on the CmdBuffer referred to by idx. Type /// TrackedCmdLocationMode::Delta will only occur if parameter before is the last TrackedCmdLocation /// for the TrackedCmdLocationArray corresponding to idx, has the same m_event == eventId, and the change in /// pointer address is small enough to be represented in 6 bits. /// /// @details Note there are two potential valid corner cases. /// (pAfterResult->Get()->m_correlateInternal.m_ptr
== 0) /// Indicates there is a commandstream but it has not begun building PM4Packets /// This implicitly refers to the baseAddress of the commandstream, once building begins /// (pAfterResult->Get()->m_correlateInternal.m_ptr == TrackedCmdLocation::NoCorrespondingBaseAddress) /// Indicates there is not yet an associated commandstream. /// In both of these cases, if parameter "before" had the same value for m_correlateInternal.m_ptr /// and "before" referred to the most recent TrackedCmdLocation, this tracked location will be of /// type TrackedCmdLocationMode::Delta, with (before.Get()->m_correlateInternal.m_deltaInDWords == 0) /// /// @param [in] idx The idx corresponding to GetTrackingArray(idx). /// Only (idx < GetNumTrackingArrays()) can yield non-trivial results. /// @param [in] eventId The code for the event being tracked /// @param [in] before The corresponding location generated by TrackCmdLocationBefore /// This may be CmdDisassembly::TrackedCmdLocationRef() if no location from /// TrackCmdLocationBefore exists (such as on Reset) /// @param [out] pAfterResult Returns a TrackedCmdLocationRef to a TrackedCmdLocation within the TrackedCmdLocationArray /// corresponding to idx /// /// @returns /// Pal::Result::Success if successful /// Pal::Result::ErrorInvalidPointer if (pAfterResult == nullptr) /// Pal::Result::Unsupported if not supported by this implementation of palCmdBuffer /// Pal::Result::ErrorInvalidValue if (idx >= NumCmdStreams()) /// Pal::Result::ErrorInvalidValue if (eventId != before.m_correlateInternal.m_event) /// Pal::Result::NotFound if there is no tracking array corresponding to idx /// This can occur if CreateTrackedCmdLocationArray has not been /// called - which may mean the feature is disabled /// Pal::Result::ErrorOutOfMemory if the TrackedCmdLocationArray corresponding to idx is unable /// to allocate memory /// virtual Pal::Result TrackCmdLocationAfter( uint32 idx, uint8 eventId, CmdDisassembly::TrackedCmdLocationRef before,
CmdDisassembly::TrackedCmdLocationRef* pAfterResult) = 0; /// An accessor function for the TrackedCmdLocationArray corresponding to idx /// /// @param [in] idx There is a CmdDisassembly::TrackedCmdLocationArray* corresponding to each /// sub-cmdBuffer for this cmdBuffer. This idx indexes these in the same fashion. /// Only idx < GetNumTrackingArrays() can yield non-nullptr results. /// /// @returns the TrackedCmdLocationArray corresponding to idx virtual CmdDisassembly::TrackedCmdLocationArray* GetTrackingArray( uint32 idx) const = 0; /// TrackClientEvent operates similarly to TrackCmdLocationBefore and TrackCmdLocationAfter above but /// operates on all TrackedCmdLocationArray's on this CmdBuffer. The TrackedCmdLocation generated here /// corresponds to type TrackedCmdLocationMode::ClientEvent, where clientId is a value the client is using /// to track this cmdBuffer, and clientEventId refers to some event the client is tracking outside of the driver. /// /// @details The first call to TrackClientEvent will include a TrackedCmdLocation with /// (m_mode == TrackedCmdLocationMode::ClientId), to define clientId on this cmdBuffer /// All calls will generate the TrackedCmdLocationMode::ClientEventId with /// (m_clientEvent.m_clientEventId == clientEventId) /// and be followed with a TrackedCmdLocationMode::Delta location with (m_eventId == PostClientEvent (0xFF)) /// /// @param [in] clientId A value the client is using to track this cmdBuffer /// @param [in] clientEventId Identifies an event the client is tracking outside of the driver /// /// @returns /// Result::Success if the tracked client event was successfully recorded /// Result::Unsupported if the implementation of ICmdBuffer does not support tracking /// Result::ErrorInvalidPointer if there was an error encountered determining the cmdList correlation /// requested. This is likely to be an out-of-memory situation. /// Result::AlreadyExists if registering clientId occurred multiple times.
This should only occur for /// race conditions, if the code calling TrackClientEvent is not threadsafe virtual Result TrackClientEvent( uint64 clientId, uint64 clientEventId) = 0; protected: /// @internal Constructor. Prevent use of new operator on this interface. Client must create objects by explicitly /// calling the proper create method. ICmdBuffer() : m_pClientData(nullptr) { } /// @internal Destructor. Prevent use of delete operator on this interface. Client must destroy objects by /// explicitly calling IDestroyable::Destroy() and is responsible for freeing the system memory allocated for the /// object on their own. virtual ~ICmdBuffer() { } /// Structure for function pointers for the ICmdBuffer::Cmd* functions. struct CmdBufferFnTable { /// CmdSetUserData function pointers for each pipeline bind point (one entry per PipelineBindPoint value). CmdSetUserDataFunc pfnCmdSetUserData[static_cast<uint32>(PipelineBindPoint::Count)]; CmdDrawFunc pfnCmdDraw; ///< CmdDraw function pointer. CmdDrawOpaqueFunc pfnCmdDrawOpaque; ///< CmdDrawOpaque function pointer. CmdDrawIndexedFunc pfnCmdDrawIndexed; ///< CmdDrawIndexed function pointer. CmdDrawIndirectMultiFunc pfnCmdDrawIndirectMulti; ///< CmdDrawIndirectMulti function pointer. CmdDrawIndexedIndirectMultiFunc pfnCmdDrawIndexedIndirectMulti; ///< CmdDrawIndexedIndirectMulti func pointer. CmdDispatchFunc pfnCmdDispatch; ///< CmdDispatch function pointer. CmdDispatchIndirectFunc pfnCmdDispatchIndirect; ///< CmdDispatchIndirect function pointer. CmdDispatchOffsetFunc pfnCmdDispatchOffset; ///< CmdDispatchOffset function pointer. CmdDispatchMeshFunc pfnCmdDispatchMesh; ///< CmdDispatchMesh function pointer. CmdDispatchMeshIndirectMultiFunc pfnCmdDispatchMeshIndirectMulti; ///< CmdDispatchMeshIndirectMulti function pointer. CmdDispatchAqlFunc pfnCmdDispatchAql; ///< CmdDispatchAql function pointer. } m_funcTable; ///< Function pointer table for Cmd* functions. private: /// @internal Client data pointer.
This can have an arbitrary value and can be returned by calling GetClientData() /// and set via SetClientData(). /// For non-top-layer objects, this will point to the layer above the current object. void* m_pClientData; #if PAL_CLIENT_INTERFACE_MAJOR_VERSION < 900 /// @internal Back-compat glue for some of the HwPipePoint interfaces in this file: a table, ordered by HwPipePoint /// value, of the corresponding PipelineStage flag for each point. static constexpr uint32 HwPipePointToStage[] = { PipelineStageTopOfPipe, // HwPipeTop = 0x0 PipelineStagePostPrefetch, // HwPipePostPrefetch = 0x1 PipelineStageVs, // HwPipePreRasterization = 0x2 PipelineStagePs, // HwPipePostPs = 0x3 PipelineStageLateDsTarget, // HwPipePreColorTarget = 0x4 PipelineStageCs, // HwPipePostCs = 0x5 PipelineStageBlt, // HwPipePostBlt = 0x6 PipelineStageBottomOfPipe, // HwPipeBottom = 0x7 }; #endif }; } // Pal